xml_col_finder 0.1.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/xml_col_finder.rb +135 -4
- data.tar.gz.sig +0 -0
- metadata +2 -2
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4229c7009179e903163ef10f64f0917a7f67e29d588172d184bed94f716123cf
|
4
|
+
data.tar.gz: f543fcbacf5ca517995d28fc6050671b48f18c8ae9df539aad6133486d7b74f5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 22cd4abb94f055f100d0f56993de883fa87d1bcb2e59ef9edaf0f3f46846f49aa2c73f7e0a38f4779d40314ad5a93bfe91b22738d347bf5866afc1b2ed58c105
|
7
|
+
data.tar.gz: 10c48be3a91d355aa826ebaf1e5b6329e2f3ec7acabc795e8800eb4758fe0abed45cfaee761a91190f72b4407e7b0a0a9accc19b563ca770057a98aa7dfb148b
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/xml_col_finder.rb
CHANGED
@@ -12,10 +12,10 @@ class XMLColFinder
|
|
12
12
|
def initialize(s, debug: false)
|
13
13
|
|
14
14
|
@debug = debug
|
15
|
-
doc = Rexle.new(s)
|
15
|
+
@doc = Rexle.new(s)
|
16
16
|
|
17
17
|
a = []
|
18
|
-
doc.root.each_recursive do |node|
|
18
|
+
@doc.root.each_recursive do |node|
|
19
19
|
|
20
20
|
if node.text then
|
21
21
|
a << [BacktrackXPath.new(node, ignore_id: true).to_xpath.split('/'),
|
@@ -30,8 +30,83 @@ class XMLColFinder
|
|
30
30
|
|
31
31
|
end
|
32
32
|
|
33
|
+
# Generates Ruby source code that navigates the document using the grouped
# XPath structure held in @to_a: the first entry becomes the root element
# lookup on `doc`, the remaining entries are expanded by scan.
#
# nametip - stored in @nametip, which formatline reads to decide whether a
#           descriptive comment is placed above element lines that carry
#           text (default: true).
#
# Returns the generated code as a String. Every code line ending in a
# `.text` call is prefixed with `puts `.
def to_code(nametip: true)
  @nametip = nametip
  @tags = {} # getid starts from an empty id registry on every run

  xpath, remaining = @to_a

  eid = getid(xpath)
  linex = formatline('doc', eid, xpath)
  a = scan(remaining, eid)

  lines = a.flatten.compact.prepend linex

  lines.join("\n").lines.map do |line|
    # The dot must be literal: with a bare `.` a comment tip ending in
    # " text" (any character followed by "text") was treated as a `.text`
    # call. Comment lines are never executable, so they are skipped
    # outright, even when the sample itself ends in ".text".
    if !line.start_with?('#') && line =~ /\.text$/
      'puts ' + line
    else
      line
    end
  end.join
end
|
49
|
+
|
50
|
+
# Returns the document object stored in @doc (the Rexle document created
# by the constructor).
def to_doc
  @doc
end
|
53
|
+
|
33
54
|
private
|
34
55
|
|
56
|
+
# Builds one line of generated code, optionally preceded by a comment tip.
#
# pid   - name of the variable the accessor is called on (the parent)
# eid   - name of the variable the element is assigned to; when nil only a
#         "<pid>.text" or "<pid>[index].text" accessor is produced
# key   - XPath handed to element(); it is not modified
# tail  - when a String, ".text" is appended to the lookup and, if @nametip
#         is set, a sample of the string is shown in a comment above it
# index - position used for the "<pid>[index].text" form
#
# Returns the generated line(s) as a String.
def formatline(pid, eid=nil, key=nil, tail=nil, index: nil)
  unless eid
    return index ? ("%s[%d].text" % [pid, index]) : ("%s.text" % pid)
  end

  has_text = tail.is_a?(String)
  line = ''

  if @nametip && has_text
    # Keep the sample on one physical line, otherwise the remainder of a
    # multi-line text would spill out of the comment into the code.
    sample = tail.gsub(/\s*[\r\n]+\s*/, ' ')
    sample = sample[0..46] + '...' unless sample.length < 50

    # Use the last class attribute in the XPath, up to its first hyphen,
    # as a human readable name (camelCase is split into words).
    klass = key.scan(/@class=['"]([^'"]+)/).last
    name = klass && klass[0][/^[^\-]+/] # nil when the class starts with '-'

    line = if name
      "\n# %s (e.g. %s)\n" % [name.gsub(/(?=[A-Z])/, ' ').downcase, sample]
    else
      "\n# e.g. %s\n" % [sample]
    end
  end

  # Rexle XPath bug solution: drop empty class predicates. A copy is used
  # so the caller's string is left untouched.
  xpath = key.gsub("[@class='']", '')

  line += "%s = %s.element(\"%s\")" % [eid, pid, xpath]
  line += '.text' if has_text
  line
end
|
86
|
+
|
87
|
+
|
88
|
+
# Returns a variable name for the element addressed by the last step of
# rawtag, unique per tag within the current @tags registry.
#
# rawtag - an XPath; only the leading word of its last '/'-separated step
#          is used. `a` is named 'link' and `p` is named 'para'.
#
# The first request for a tag returns the bare name, later requests return
# the name followed by a running number (link, link1, link2, ...). @tags
# maps each name to the id most recently handed out for it.
def getid(rawtag)
  rawtagx = rawtag.split('/').last[/\w+/]

  tag = case rawtagx.to_sym
  when :a then 'link'
  when :p then 'para'
  else rawtagx
  end

  @tags[tag] = if @tags.key?(tag)
    # Read the counter back from the suffix and build a new string.
    # String#succ! is avoided on purpose: it carries into letters
    # ("link9" -> "linl0"), turns "h1" into "h2" (clashing with a real h2
    # tag) and mutates the string previously returned to the caller.
    count = @tags[tag][tag.length..-1].to_i
    "#{tag}#{count + 1}"
  else
    tag
  end
end
|
108
|
+
|
109
|
+
|
35
110
|
# Groups xpath by matching branches
|
36
111
|
#
|
37
112
|
def group_by_xpath(a)
|
@@ -73,8 +148,6 @@ class XMLColFinder
|
|
73
148
|
|
74
149
|
else
|
75
150
|
|
76
|
-
puts "path.join('/'): " + path.join('/').inspect
|
77
|
-
puts 'txt:' + txt.inspect
|
78
151
|
|
79
152
|
h2[stickypath.sub(/^\//,'')] ||= []
|
80
153
|
h2[stickypath.sub(/^\//,'')] << [path.join('/'), txt]
|
@@ -99,6 +172,64 @@ class XMLColFinder
|
|
99
172
|
|
100
173
|
end
|
101
174
|
|
175
|
+
# Recursively converts grouped XPath rows into lines of generated code.
#
# a   - Array of rows; each row is destructured into head and tail
# eid - variable name of the current element (default 'doc')
# pid - variable name new lookups are made on (default: a copy of eid)
#
# For each row:
#   * head is an Array           -> the row itself is scanned recursively
#   * head starts with '/'       -> a new id is taken from getid and either
#                                   an xpath() line (tail is an Array of
#                                   Strings) or a formatline element line
#                                   is produced
#   * any other non-nil head     -> a "<eid>.text" line
#   * tail holds nested Arrays   -> scanned recursively under eid
#   * tail is an Array of Strings not starting with '/'
#                                -> one "<eid>[i].text" line per entry
#
# Returns an Array of [hline, tline] pairs (entries may be nil or nested).
def scan(a, eid='doc', pid=eid.clone)
  a.map do |row|
    head, tail = row

    hline = if head.is_a? Array
      scan(row, eid, pid)
    elsif head
      if head[0] == '/'
        key = head[1..-1]
        puts 'key: ' + key.inspect if @debug

        eid = getid(key)

        if tail.is_a?(Array) && tail.all? { |x| x.is_a? String }
          @prev_xpath = true
          "%s = %s.xpath(\"%s\")" % [eid, pid, key]
        else
          @prev_xpath = false
          formatline(pid, eid, key, tail)
        end
      else
        # NOTE(review): pid is re-pointed at eid here and stays that way
        # for the rows processed later in this call (this was previously
        # hidden inside the argument list) -- confirm this is intended.
        pid = eid
        formatline(pid)
      end
    end

    tline = nil

    if tail.is_a? Array
      if tail.compact[0].is_a? Array
        puts 'tail: ' + tail.inspect if @debug
        tline = scan(tail, eid)

      # first.to_s keeps an empty tail from raising on nil
      elsif tail.all? { |x| x.is_a? String } && !tail.first.to_s.start_with?('/')
        puts '_tail: ' + tail.inspect if @debug
        tline = tail.each_index.map do |i|
          pid = eid # same persistent re-pointing as above
          formatline(pid, index: i)
        end
      end
    end

    [hline, tline]
  end
end
|
231
|
+
|
232
|
+
|
102
233
|
def truncate_xpath(records, offset=0)
|
103
234
|
|
104
235
|
records.map do |record|
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: xml_col_finder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
GSlZ9ilAfm8srTjbZ2cWQyNGGxH+zHQ3Z02c4ZEtgPv/wHjptd1VeBm0P1aemsRA
|
36
36
|
ShsxXxzmzIrRENmpBp3tyR3k
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-01-
|
38
|
+
date: 2022-01-24 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: rexle
|
metadata.gz.sig
CHANGED
Binary file
|