rexleparser 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a6291ec758a59d1737240656c502a96250c5fe6cdd230a452c220ba34c17f091
4
- data.tar.gz: c9edbdc47b4e728598407ddd23da8d5622ed699069061742497d3017477e25bc
3
+ metadata.gz: a98589dfd93cc2c5a4946f9dc5a2bc5418bd30cbfc4662330d7698bf13b1e107
4
+ data.tar.gz: 92a35568634aa47b9032aeb26cc193830adee7ba0c3d48dc2a53458119671791
5
5
  SHA512:
6
- metadata.gz: 2665fb624f0b3fee4e4304c304c093900685e6708bfc62209fa54a0f0108e7e195844285fa7a677a91d83f0ee72c306ca8f785052cd7d6c51dba3ab1ea3950b9
7
- data.tar.gz: 8c7d22a214d798f867d670c300d2b61c2235a17c437823ac7f5978dcb3c80015d3887dfc2f7f22b421d58d6b3b321eaa327ac931cb0448c06c1d1f71f70640ae
6
+ metadata.gz: 1132a312974ad5eaef5b1267bf18f35e0228138a3247a867e6e9de4f366f8c654b6f24220e68922cca90698520c07bf45cdc52d44390fedf50e8c97df3ac0666
7
+ data.tar.gz: 211dddd0cd0411e4311541dd88e8742dbb1f32a2d5cdd6a25b48d90ef25ffd4a4bab7525180c5bdfe7a0b8afb1610e4b532e2fefec4980acae2d22dc160b0602
checksums.yaml.gz.sig CHANGED
Binary file
data/lib/rexleparser.rb CHANGED
@@ -62,222 +62,183 @@ class Attributes < Hash
62
62
  end
63
63
  end
64
64
 
65
-
66
65
  class RexleParserException < Exception
67
66
  end
68
67
 
69
68
  class RexleParser
70
69
 
71
- attr_reader :stack
70
+ attr_reader :instructions, :doctype, :to_a
72
71
 
73
- def initialize(raws, debug: false)
72
+ def initialize(raw_s)
74
73
 
75
- s = raws.strip
76
- @debug = debug
77
- @a = []
78
- @stack = []
74
+ super()
75
+ s = raw_s.clone.strip
76
+ return if s.empty?
79
77
 
80
78
  raw_xml, raw_instrctns = if s.lines.first =~ /<?xml/ then
81
- s.split(/(?<=\?>)/,2).reverse
79
+ s.split(/(?=\?>\s*<\w)/,2).reverse
82
80
  else
83
81
  s
84
82
  end
85
- puts 'raw_xml: ' + raw_xml.inspect if @debug
86
83
  @instructions = raw_instrctns ? \
87
84
  raw_instrctns.scan(/<\?([\w-]+) ([^\?]+)/) : []
88
85
  @doctype = s.slice!(/<!DOCTYPE html>\n?/) if s.lines.first =~ /<\!DOCTYPE/
89
-
90
- # scancom is run twice because we 1st check for comment tags and then cdata tags
91
- @a = parse(scancom(scancom(raw_xml), type: :cdata)).flatten(1)
92
-
93
- end
86
+ @to_a = reverse(parse_node(raw_xml.strip.reverse))
94
87
 
95
- def to_a()
96
- @a
97
88
  end
98
-
89
+
99
90
  private
91
+
100
92
 
101
- def ehead(raws)
93
+ def scan_next(r, tagname)
102
94
 
103
- s = raws.lstrip
104
- puts '_s: ' + s.inspect if @debug
105
- # fetch the element head
106
- tag = s =~ /<[^>]+\/?>/
107
- s2 = s[tag+1..-1]
108
- tagb = s2 =~ />/
109
- return unless tag
95
+ j = tagname
110
96
 
111
- len = tagb+1-tag
112
-
113
- if @debug then
114
- puts 'ehead()/detail: ' + [tag, tagb, len, s[tag,len+1]].inspect
115
- end
97
+ if r[0] == '>' then
116
98
 
117
- [s[tag,len+1], s[len+1..-1]]
99
+ # end tag match
100
+ tag = r[/^>[^<]+</]
101
+
102
+ if tag[1][/[ \w"']/] and tag[-2] != '/' then
118
103
 
119
- end
104
+ # is it the end tag to match the start tag?
105
+ tag = r.slice!(/^>[^<]+</)
106
+ end_tag = tag[/^>[^>]*#{j}<$/]
120
107
 
121
- def get_attributes(raw_attributes)
122
-
123
- r1 = /([\w\-:\(\)]+\='[^']*)'/
124
- r2 = /([\w\-:\(\)]+\="[^"]*)"/
125
-
126
- r = raw_attributes.scan(/#{r1}|#{r2}/).map(&:compact)\
127
- .flatten.inject(Attributes.new) do |r, x|
128
- attr_name, raw_val = x.split(/=/,2)
129
- val = attr_name != 'class' ? raw_val[1..-1] : raw_val[1..-1].split
130
- r.merge(attr_name.to_sym => val)
131
- end
108
+ if end_tag then
109
+
110
+ j = nil
111
+ return [:end_tag, end_tag]
112
+
113
+ elsif tag[/^>[^>]*\w+<$/] then
114
+ # broken tag found
115
+ broken_tag = tag
116
+ return [:child, [nil, [], broken_tag]] if broken_tag
117
+ else
118
+
119
+ text, newtag = tag.sub('>',';tg&').split(/>/,2)
120
+
121
+ if newtag then
122
+ tag = newtag
123
+ r.prepend '>' + tag
124
+ end
125
+
126
+ return [:child, text]
127
+ end
128
+ elsif r[0,3] == '>--' then # comment tag found
129
+
130
+ r.slice!(0,3)
131
+ i = r =~ /(\-\-!<)/
132
+ s = r.slice!(0,i)
133
+ r.slice!(0,4)
132
134
 
133
- return r
134
- end
135
+ tagname, content = ['-!',s]
136
+
137
+ return [:child, [">#{tagname}<", [content], ">#{tagname}/<"]]
138
+
139
+ elsif r[0,3] == '>]]' then # CDATA tag found
135
140
 
136
- def parse(raws, a=[], cur=nil)
141
+ r.slice!(0,3)
142
+ i = r =~ /(\[ATADC\[!<)/
143
+ s = r.slice!(0,i)
144
+ r.slice!(0,9)
137
145
 
138
- s = raws #.lstrip
139
-
140
- if @debug then
141
- puts '.parse() s: ' + s.inspect[0..600]
142
- puts '.parse() a: ' + a.inspect[0..699]
143
- puts '.parse() cur: ' + cur.inspect[0..799]
144
- end
146
+ tagname, content = ['[!',s]
145
147
 
146
- # if self-closing tag
147
- if s =~ /^<[^<]+\/>/ then
148
+ return [:child, [">#{tagname}<", [content], ">#{tagname}/<"]]
149
+
150
+ elsif tag[/>\/|\/<$/] or tag[/^>.*[\w!]+\/<$/] then
151
+
152
+ return [:newnode]
153
+
154
+ else
155
+
156
+ r.sub!('>',';tg&')
157
+ i = r =~ />(?:[\-\/"'\w]|\]\])/ # collect until a tag is found or a CDATA element
158
+ text = r.slice!(0,i)
148
159
 
149
- tag = s[/^<[^<]+\/>/]
150
- puts 'parse() self-closing/tag: ' + tag.inspect if @debug
151
- tail = $'
152
-
153
- if @debug then
154
- puts 'parse() self-closing tag found'
155
- puts 'parse()/tail: ' + tail.inspect
156
- end
157
-
158
- a2 = parsetag(tag)
159
- puts '_a: ' + a.inspect if @debug
160
- cur ? a.last << a2 : a << a2
161
-
162
- parse(tail, a, cur)
160
+ return [:child, text] if text
163
161
 
164
- # is it the head?
165
- elsif (s =~ /^<[^\/>]+>/) == 0 then
162
+ end # end of tag match
166
163
 
167
- puts 'parse()/head found' if @debug
164
+ else
168
165
 
169
- tag, tail = ehead(s)
170
-
171
- if @debug then
172
- puts 'parse() tag: ' + tag.inspect
173
- puts 'parse() tail: ' + tail.inspect
174
- end
175
- # add it to the list
176
- a2 = parsetag(tag)
166
+ # it's a text value
167
+ i = r =~ />(?:[\-\/"'\w]|\]\])/ # collect until a tag is found or a CDATA element
168
+ text = r.slice!(0,i)
177
169
 
178
- puts '_cur: ' + cur.inspect if @debug
179
- if cur then
180
- cur << a2
181
- cur2 = cur.last
182
- else
183
- a << a2
184
- cur2 = a.last
185
- end
170
+ return [:child, text] if text
171
+ end
172
+ end
173
+
174
+ def parse_node(r, j=nil)
175
+
176
+ return unless r.length > 0
177
+ tag = r.slice!(/^>[^<]+</) if (r =~ /^>[^<]+</) == 0
178
+ tagname = tag[/([\w!:]+)\/?<$/,1]
186
179
 
187
- puts '_a: ' + a.inspect if @debug
180
+ # self closing tag?
181
+ if tag[/^>\/.*#{tagname}<$/m] then
182
+ return [">/#{tagname}<", [], "#{tag.sub(/>\//,'>')}"]
183
+ end
188
184
 
189
- # add it to the stack
190
- @stack.push cur2
185
+ start_tag, children, end_tag = tag, [], nil
191
186
 
192
- parse(tail, a, cur2)
193
-
194
- elsif (s =~ /^[^<]/) == 0
187
+ unless start_tag[1..-3][/\w+$/] then
188
+ raise RexleParserException, 'invalid closing tag found ' + \
189
+ start_tag.reverse + '; context: ' + r[0..120].reverse.inspect
190
+ end
195
191
 
196
- puts 'parse() we have text!' if @debug
197
- text = raws[/[^<]+/m] #
198
- remaining = $'
192
+ until end_tag do
199
193
 
200
- if @debug then
201
- puts 'parse() text: ' + text.inspect
202
- puts 'cur tag: ' + cur[0].inspect
203
- end
194
+ key, res = scan_next r, tagname
204
195
 
205
- cur << if cur[0][0] == '!' then
206
- text.gsub('&lt;','<').gsub('&gt;','>').gsub('&amp;','&')
196
+ case key
197
+ when :end_tag
198
+ end_tag = res
199
+ r2 = [start_tag, children, end_tag]
200
+ end_tag = nil
201
+
202
+ return r2
203
+ when :child
204
+ children << res
205
+ when :newnode
206
+ children << parse_node(r, tagname)
207
207
  else
208
- text.gsub(/>/,'&gt;').gsub(/</, '&lt;')
208
+ break
209
209
  end
210
-
211
- puts 'remaining: ' + remaining.inspect if @debug
212
- parse(remaining, a, cur) if remaining.length > 0
213
-
214
-
215
- # is it a closing tag?
216
- elsif s =~ /^\s?<\/\w+>/m
217
-
218
- tail = s[/^\s*<\/\w+>(.*)/m,1]
219
-
220
- if @debug then
221
- puts 'parse()/closing tag ' + s[/^\s*<\/\w+>/].inspect
222
- puts '>a: ' + a.inspect
223
- end
224
-
225
- @stack.pop
226
- #a << []
227
- parse(tail, a, @stack.last)
228
-
229
- elsif s.empty? and @stack.length > 0
230
-
231
- puts 'parse() no end tag!' if @debug
232
-
233
210
  end
234
211
 
235
- return a
236
-
212
+ [start_tag, children, end_tag]
237
213
  end
238
214
 
239
- # we parse the tag because it contains more than just the name it often contains attributes
240
- #
241
- def parsetag(s)
215
+ def get_attributes(raw_attributes)
242
216
 
243
- puts 'parsetag:' + s.inspect if @debug
244
- rawtagname, rawattr = s[1..-2].sub(/\/$/,'').match(/^(\w+) *(.*)/)\
245
- .values_at(1,2)
246
-
247
- tagname = case rawtagname.to_sym
248
- when :_comment
249
- '!-'
250
- when :_cdata
251
- '!['
252
- else
253
- rawtagname
217
+ r1 = /([\w\-:\(\)]+\='[^']*)'/
218
+ r2 = /([\w\-:\(\)]+\="[^"]*)"/
219
+
220
+ r = raw_attributes.scan(/#{r1}|#{r2}/).map(&:compact)\
221
+ .flatten.inject(Attributes.new) do |r, x|
222
+ attr_name, raw_val = x.split(/=/,2)
223
+ val = attr_name != 'class' ? raw_val[1..-1] : raw_val[1..-1].split
224
+ r.merge(attr_name.to_sym => val)
254
225
  end
255
226
 
256
- [tagname, get_attributes(rawattr)]
227
+ return r
257
228
  end
258
229
 
259
- def scancom(s, type=:comment)
260
-
261
- tag1 = ['<!--', '-->', 'comment', '<!--']
262
- tag2 = ['<![CDATA[', '\]\]>', 'cdata', '\<!\[CDATA\[']
263
- tag = type == :comment ? tag1 : tag2
264
-
265
- #puts 'tag: ' + tag.inspect
266
- istart = s =~ /#{tag[3]}/
267
- return s unless istart
268
-
269
- iend = s =~ /#{tag[1]}/
270
- comment ="<_%s>%s</_%s>" % [tag[2], s[istart+tag[0].length.. iend-1].gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;'), tag[2]]
271
-
272
- if @debug then
273
- puts 'comment: ' + comment.inspect
274
- # construct the new string
275
- puts 'istart: ' + istart.inspect
276
- end
277
-
278
- s3 = s[0,istart].to_s + comment + s[iend+3..-1]
279
- scancom(s3, type)
230
+ def reverse(raw_obj)
231
+
232
+ return unless raw_obj
233
+ obj = raw_obj.clone
234
+ return obj.reverse! if obj.is_a? String
280
235
 
281
- end
236
+ tag = obj.pop.reverse
237
+
238
+ children = obj[-1]
282
239
 
240
+ r = children.reverse.map {|x| reverse(x)}
241
+
242
+ return [tag[/[!\-\w:\[]+/], get_attributes(tag), *r]
243
+ end
283
244
  end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rexleparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -64,7 +64,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
64
64
  - !ruby/object:Gem::Version
65
65
  version: '0'
66
66
  requirements: []
67
- rubygems_version: 3.4.6
67
+ rubygems_version: 3.4.4
68
68
  signing_key:
69
69
  specification_version: 4
70
70
  summary: Rexleparser is an XML parser used by the Rexle gem
metadata.gz.sig CHANGED
Binary file