xml_row_finder 0.1.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cfc152e03ec51b624550b4b4bf031eae3e7205c42e1dccbdd4d45219d684860d
4
- data.tar.gz: 77be8ae4da89aded4ec73a653e9fe1e3d9815a144a29a37a73158b97174da984
3
+ metadata.gz: 1407d0f852c5a904600ef49f43c93b7a8e5cc4a459bb88d9d2e0bc4c26928887
4
+ data.tar.gz: 3867560288f6ed062b2c59e78bea3e30b7ddaf5217d3a181f864b4fbf28a202f
5
5
  SHA512:
6
- metadata.gz: e83c208f4dfeac7827b9f8ca453baba4f19ff4be9d4341bbd15f46695948afd7af42ae5ad0c71efb994b58832c46b9cf836fcb2d115ffb126a49ebfc26065772
7
- data.tar.gz: 24335ada2416257aa9caebd29b687562e92fe2d0ca2ac71f59824448c98eee018e15f95ee60352ed326440a50cb5445d3fb5bd4cbb1a0c47eed58ffc85ff12ec
6
+ metadata.gz: c19790d1e1c125b0be88831e1a1ab870662b03ad35859fc68b2431390854c0f4e507a8629198fefb1bfd03ed460e75ceb78e278f5de4be43e9dd9a1ab4c03734
7
+ data.tar.gz: d9d5321b0f639868aeca083728f7ebaa6da2e1d101d609aa05076e9e96c0c7e33e2545db0e803e5efd53808514c028b2190a425e08267e36e42cd6b6118adb3a
checksums.yaml.gz.sig CHANGED
Binary file
@@ -2,19 +2,20 @@
2
2
 
3
3
  # file: xml_row_finder.rb
4
4
 
5
- require 'rexle'
6
- require 'rexml'
7
- include REXML
5
+ require 'nokorexi'
8
6
 
9
7
 
10
8
  class XMLRowFinder
11
9
 
12
10
  attr_reader :to_a
13
11
 
14
- def initialize(s, debug: false)
12
+ def initialize(raws, debug: false)
15
13
 
16
14
  @debug = debug
17
- doc = Rexle.new(s)
15
+
16
+ doc = Nokorexi.new(raws, filter: true).to_doc
17
+
18
+ @doc = Rexle.new(doc.root.xml)
18
19
 
19
20
  a = []
20
21
 
@@ -23,51 +24,77 @@ class XMLRowFinder
23
24
  a << e.backtrack.to_xpath
24
25
  end
25
26
 
26
- a2 = a.select{ |e| a.count(e) > 1 }.map {|x| x.split('/')}.uniq
27
+ @to_a = a2 = a.map {|e| [a.count(e), e] }.uniq
28
+ xpath = a2.max_by(&:first).last
27
29
 
28
- # remove parent nodes on the same branch
29
- #
30
- a2.reject!.with_index do |x,i|
31
- next if i == a2.length-1
32
- x == a2[i+1][0..-2]
33
- end
30
+ a3 = xpath.split('/')
31
+ a4 = [xpath]
32
+ p1 = []
34
33
 
35
- # remove elements from rows which only exist once in the document
36
- #
37
- a3 = a2.map do |row|
38
- row.reject do |x|
39
- found = doc.root.xpath('//' + x)
40
- found.length < 2
41
- end
34
+ until (a3.length < 1) do
35
+ p1 << a3.pop; a4 << a3.join('/') + "[%s]" % p1.reverse.join('/')
42
36
  end
43
37
 
44
- # add parent node to the row as a reference for the xpath
38
+ # using Nokogiri since Rexle has a bug with xpath predicates
45
39
  #
46
- a4 = a3.map.with_index do |row,i|
47
- a2[i][-(row.length+1)..-1]
40
+ @doc2 = Nokogiri::XML(doc.root.xml)
41
+
42
+ a5 = a4[0..-2].map do |xpath2|
43
+ [@doc2.xpath(xpath2).length, xpath2]
48
44
  end
49
45
 
50
- # find the parent node attributes
51
- #
52
- @to_a = a4.map do |col|
53
-
54
- # currently using REXML for this XPath since there is a bug in
55
- # Rexle when attempting the following
56
- #
57
- doc2 = Document.new(s)
58
- xpath = "//%s[%s]" % [col[0], col[1..-1].join('/')]
59
- puts 'xpath: ' + xpath.inspect if @debug
60
- r = XPath.first(doc2, xpath)
61
- xpath_a = BacktrackXPath.new(r).to_xpath
62
-
63
- if col.length >= 3
64
- "%s/%s[%s]" % [xpath_a, col[1], col[2..-1].join('/')]
65
- else
66
- "%s/%s" % [xpath_a, col[1]]
67
- end
46
+ puts 'a5: ' + a5.inspect if @debug
47
+ rows_xpath = a5.reverse.detect {|num, xpath2| num > 1}.last
48
+ doc3 = Document.new @doc.root.xml
49
+ @rows = XPath.match(doc3, rows_xpath)
50
+ @xpath = rows_xpath
51
+ #@xpath = BacktrackXPath.new(@rows.first).to_xpath.gsub("[@class='']",'')
52
+
53
+ last_row = XPath.match(doc3, @xpath).last
54
+ puts '@xpath: ' + @xpath.inspect
55
+
56
+ # find the container element
57
+ xpath = @xpath[/^[^\[]+/]
58
+ axpath = xpath.split('/')
59
+
60
+ e = XPath.first(doc3, xpath)
61
+ puts 'e: ' + e.to_s
62
+
63
+ until (e.nil? or e.to_s.include?(last_row.to_s)) do
64
+ axpath.pop
65
+ e = XPath.first(doc3, axpath.join('/'))
68
66
  end
69
-
67
+
68
+ @cont_xpath = axpath.join('/')
69
+
70
70
  end
71
71
 
72
- end
72
+ # returns the container element for all rows
73
+ #
74
+ def body()
75
+ Rexle.new(@doc.element(@cont_xpath).xml)
76
+ end
77
+
78
+ # returns the xpath pointing to the container element for all rows
79
+ #
80
+ def body_xpath()
81
+ @cont_xpath
82
+ end
83
+
84
+ # returns rows
85
+ # object returned: An array of Nokogiri XML Element object
86
+ #
87
+ def rows()
88
+ @rows
89
+ end
90
+
91
+ # returns the xpath pointing to the rows
92
+ #
93
+ def to_xpath()
94
+ @xpath
95
+ end
73
96
 
97
+ alias rows_xpath to_xpath
98
+
99
+
100
+ end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xml_row_finder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,28 +35,28 @@ cert_chain:
35
35
  a/pAsvo0jT6QTnSB7xzsx8LSFDT5tfHKR9Dcn1Y3R06fsh02JvwxaSAMgDBM2aFb
36
36
  2A7/BQ1hD7SU82VTxB1gFIHl
37
37
  -----END CERTIFICATE-----
38
- date: 2022-01-21 00:00:00.000000000 Z
38
+ date: 2022-01-28 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
- name: rexle
41
+ name: nokorexi
42
42
  requirement: !ruby/object:Gem::Requirement
43
43
  requirements:
44
- - - "~>"
45
- - !ruby/object:Gem::Version
46
- version: '1.5'
47
44
  - - ">="
48
45
  - !ruby/object:Gem::Version
49
- version: 1.5.14
46
+ version: 0.6.0
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: '0.6'
50
50
  type: :runtime
51
51
  prerelease: false
52
52
  version_requirements: !ruby/object:Gem::Requirement
53
53
  requirements:
54
- - - "~>"
55
- - !ruby/object:Gem::Version
56
- version: '1.5'
57
54
  - - ">="
58
55
  - !ruby/object:Gem::Version
59
- version: 1.5.14
56
+ version: 0.6.0
57
+ - - "~>"
58
+ - !ruby/object:Gem::Version
59
+ version: '0.6'
60
60
  description:
61
61
  email: digital.robertson@gmail.com
62
62
  executables: []
@@ -83,7 +83,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
83
83
  - !ruby/object:Gem::Version
84
84
  version: '0'
85
85
  requirements: []
86
- rubygems_version: 3.2.22
86
+ rubyforge_project:
87
+ rubygems_version: 2.7.10
87
88
  signing_key:
88
89
  specification_version: 4
89
90
  summary: Attempts to find repeating rows in XHTML and returns the associated xpath.
metadata.gz.sig CHANGED
Binary file