rexleparser 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a6291ec758a59d1737240656c502a96250c5fe6cdd230a452c220ba34c17f091
4
- data.tar.gz: c9edbdc47b4e728598407ddd23da8d5622ed699069061742497d3017477e25bc
3
+ metadata.gz: a98589dfd93cc2c5a4946f9dc5a2bc5418bd30cbfc4662330d7698bf13b1e107
4
+ data.tar.gz: 92a35568634aa47b9032aeb26cc193830adee7ba0c3d48dc2a53458119671791
5
5
  SHA512:
6
- metadata.gz: 2665fb624f0b3fee4e4304c304c093900685e6708bfc62209fa54a0f0108e7e195844285fa7a677a91d83f0ee72c306ca8f785052cd7d6c51dba3ab1ea3950b9
7
- data.tar.gz: 8c7d22a214d798f867d670c300d2b61c2235a17c437823ac7f5978dcb3c80015d3887dfc2f7f22b421d58d6b3b321eaa327ac931cb0448c06c1d1f71f70640ae
6
+ metadata.gz: 1132a312974ad5eaef5b1267bf18f35e0228138a3247a867e6e9de4f366f8c654b6f24220e68922cca90698520c07bf45cdc52d44390fedf50e8c97df3ac0666
7
+ data.tar.gz: 211dddd0cd0411e4311541dd88e8742dbb1f32a2d5cdd6a25b48d90ef25ffd4a4bab7525180c5bdfe7a0b8afb1610e4b532e2fefec4980acae2d22dc160b0602
checksums.yaml.gz.sig CHANGED
Binary file
data/lib/rexleparser.rb CHANGED
@@ -62,222 +62,183 @@ class Attributes < Hash
62
62
  end
63
63
  end
64
64
 
65
-
66
65
  class RexleParserException < Exception
67
66
  end
68
67
 
69
68
  class RexleParser
70
69
 
71
- attr_reader :stack
70
+ attr_reader :instructions, :doctype, :to_a
72
71
 
73
- def initialize(raws, debug: false)
72
+ def initialize(raw_s)
74
73
 
75
- s = raws.strip
76
- @debug = debug
77
- @a = []
78
- @stack = []
74
+ super()
75
+ s = raw_s.clone.strip
76
+ return if s.empty?
79
77
 
80
78
  raw_xml, raw_instrctns = if s.lines.first =~ /<?xml/ then
81
- s.split(/(?<=\?>)/,2).reverse
79
+ s.split(/(?=\?>\s*<\w)/,2).reverse
82
80
  else
83
81
  s
84
82
  end
85
- puts 'raw_xml: ' + raw_xml.inspect if @debug
86
83
  @instructions = raw_instrctns ? \
87
84
  raw_instrctns.scan(/<\?([\w-]+) ([^\?]+)/) : []
88
85
  @doctype = s.slice!(/<!DOCTYPE html>\n?/) if s.lines.first =~ /<\!DOCTYPE/
89
-
90
- # scancom is run twice because we 1st check for comment tags and then cdata tags
91
- @a = parse(scancom(scancom(raw_xml), type: :cdata)).flatten(1)
92
-
93
- end
86
+ @to_a = reverse(parse_node(raw_xml.strip.reverse))
94
87
 
95
- def to_a()
96
- @a
97
88
  end
98
-
89
+
99
90
  private
91
+
100
92
 
101
- def ehead(raws)
93
+ def scan_next(r, tagname)
102
94
 
103
- s = raws.lstrip
104
- puts '_s: ' + s.inspect if @debug
105
- # fetch the element head
106
- tag = s =~ /<[^>]+\/?>/
107
- s2 = s[tag+1..-1]
108
- tagb = s2 =~ />/
109
- return unless tag
95
+ j = tagname
110
96
 
111
- len = tagb+1-tag
112
-
113
- if @debug then
114
- puts 'ehead()/detail: ' + [tag, tagb, len, s[tag,len+1]].inspect
115
- end
97
+ if r[0] == '>' then
116
98
 
117
- [s[tag,len+1], s[len+1..-1]]
99
+ # end tag match
100
+ tag = r[/^>[^<]+</]
101
+
102
+ if tag[1][/[ \w"']/] and tag[-2] != '/' then
118
103
 
119
- end
104
+ # is it the end tag to match the start tag?
105
+ tag = r.slice!(/^>[^<]+</)
106
+ end_tag = tag[/^>[^>]*#{j}<$/]
120
107
 
121
- def get_attributes(raw_attributes)
122
-
123
- r1 = /([\w\-:\(\)]+\='[^']*)'/
124
- r2 = /([\w\-:\(\)]+\="[^"]*)"/
125
-
126
- r = raw_attributes.scan(/#{r1}|#{r2}/).map(&:compact)\
127
- .flatten.inject(Attributes.new) do |r, x|
128
- attr_name, raw_val = x.split(/=/,2)
129
- val = attr_name != 'class' ? raw_val[1..-1] : raw_val[1..-1].split
130
- r.merge(attr_name.to_sym => val)
131
- end
108
+ if end_tag then
109
+
110
+ j = nil
111
+ return [:end_tag, end_tag]
112
+
113
+ elsif tag[/^>[^>]*\w+<$/] then
114
+ # broken tag found
115
+ broken_tag = tag
116
+ return [:child, [nil, [], broken_tag]] if broken_tag
117
+ else
118
+
119
+ text, newtag = tag.sub('>',';tg&').split(/>/,2)
120
+
121
+ if newtag then
122
+ tag = newtag
123
+ r.prepend '>' + tag
124
+ end
125
+
126
+ return [:child, text]
127
+ end
128
+ elsif r[0,3] == '>--' then # comment tag found
129
+
130
+ r.slice!(0,3)
131
+ i = r =~ /(\-\-!<)/
132
+ s = r.slice!(0,i)
133
+ r.slice!(0,4)
132
134
 
133
- return r
134
- end
135
+ tagname, content = ['-!',s]
136
+
137
+ return [:child, [">#{tagname}<", [content], ">#{tagname}/<"]]
138
+
139
+ elsif r[0,3] == '>]]' then # CDATA tag found
135
140
 
136
- def parse(raws, a=[], cur=nil)
141
+ r.slice!(0,3)
142
+ i = r =~ /(\[ATADC\[!<)/
143
+ s = r.slice!(0,i)
144
+ r.slice!(0,9)
137
145
 
138
- s = raws #.lstrip
139
-
140
- if @debug then
141
- puts '.parse() s: ' + s.inspect[0..600]
142
- puts '.parse() a: ' + a.inspect[0..699]
143
- puts '.parse() cur: ' + cur.inspect[0..799]
144
- end
146
+ tagname, content = ['[!',s]
145
147
 
146
- # if self-closing tag
147
- if s =~ /^<[^<]+\/>/ then
148
+ return [:child, [">#{tagname}<", [content], ">#{tagname}/<"]]
149
+
150
+ elsif tag[/>\/|\/<$/] or tag[/^>.*[\w!]+\/<$/] then
151
+
152
+ return [:newnode]
153
+
154
+ else
155
+
156
+ r.sub!('>',';tg&')
157
+ i = r =~ />(?:[\-\/"'\w]|\]\])/ # collect until a tag is found or a CDATA element
158
+ text = r.slice!(0,i)
148
159
 
149
- tag = s[/^<[^<]+\/>/]
150
- puts 'parse() self-closing/tag: ' + tag.inspect if @debug
151
- tail = $'
152
-
153
- if @debug then
154
- puts 'parse() self-closing tag found'
155
- puts 'parse()/tail: ' + tail.inspect
156
- end
157
-
158
- a2 = parsetag(tag)
159
- puts '_a: ' + a.inspect if @debug
160
- cur ? a.last << a2 : a << a2
161
-
162
- parse(tail, a, cur)
160
+ return [:child, text] if text
163
161
 
164
- # is it the head?
165
- elsif (s =~ /^<[^\/>]+>/) == 0 then
162
+ end # end of tag match
166
163
 
167
- puts 'parse()/head found' if @debug
164
+ else
168
165
 
169
- tag, tail = ehead(s)
170
-
171
- if @debug then
172
- puts 'parse() tag: ' + tag.inspect
173
- puts 'parse() tail: ' + tail.inspect
174
- end
175
- # add it to the list
176
- a2 = parsetag(tag)
166
+ # it's a text value
167
+ i = r =~ />(?:[\-\/"'\w]|\]\])/ # collect until a tag is found or a CDATA element
168
+ text = r.slice!(0,i)
177
169
 
178
- puts '_cur: ' + cur.inspect if @debug
179
- if cur then
180
- cur << a2
181
- cur2 = cur.last
182
- else
183
- a << a2
184
- cur2 = a.last
185
- end
170
+ return [:child, text] if text
171
+ end
172
+ end
173
+
174
+ def parse_node(r, j=nil)
175
+
176
+ return unless r.length > 0
177
+ tag = r.slice!(/^>[^<]+</) if (r =~ /^>[^<]+</) == 0
178
+ tagname = tag[/([\w!:]+)\/?<$/,1]
186
179
 
187
- puts '_a: ' + a.inspect if @debug
180
+ # self closing tag?
181
+ if tag[/^>\/.*#{tagname}<$/m] then
182
+ return [">/#{tagname}<", [], "#{tag.sub(/>\//,'>')}"]
183
+ end
188
184
 
189
- # add it to the stack
190
- @stack.push cur2
185
+ start_tag, children, end_tag = tag, [], nil
191
186
 
192
- parse(tail, a, cur2)
193
-
194
- elsif (s =~ /^[^<]/) == 0
187
+ unless start_tag[1..-3][/\w+$/] then
188
+ raise RexleParserException, 'invalid closing tag found ' + \
189
+ start_tag.reverse + '; context: ' + r[0..120].reverse.inspect
190
+ end
195
191
 
196
- puts 'parse() we have text!' if @debug
197
- text = raws[/[^<]+/m] #
198
- remaining = $'
192
+ until end_tag do
199
193
 
200
- if @debug then
201
- puts 'parse() text: ' + text.inspect
202
- puts 'cur tag: ' + cur[0].inspect
203
- end
194
+ key, res = scan_next r, tagname
204
195
 
205
- cur << if cur[0][0] == '!' then
206
- text.gsub('&lt;','<').gsub('&gt;','>').gsub('&amp;','&')
196
+ case key
197
+ when :end_tag
198
+ end_tag = res
199
+ r2 = [start_tag, children, end_tag]
200
+ end_tag = nil
201
+
202
+ return r2
203
+ when :child
204
+ children << res
205
+ when :newnode
206
+ children << parse_node(r, tagname)
207
207
  else
208
- text.gsub(/>/,'&gt;').gsub(/</, '&lt;')
208
+ break
209
209
  end
210
-
211
- puts 'remaining: ' + remaining.inspect if @debug
212
- parse(remaining, a, cur) if remaining.length > 0
213
-
214
-
215
- # is it a closing tag?
216
- elsif s =~ /^\s?<\/\w+>/m
217
-
218
- tail = s[/^\s*<\/\w+>(.*)/m,1]
219
-
220
- if @debug then
221
- puts 'parse()/closing tag ' + s[/^\s*<\/\w+>/].inspect
222
- puts '>a: ' + a.inspect
223
- end
224
-
225
- @stack.pop
226
- #a << []
227
- parse(tail, a, @stack.last)
228
-
229
- elsif s.empty? and @stack.length > 0
230
-
231
- puts 'parse() no end tag!' if @debug
232
-
233
210
  end
234
211
 
235
- return a
236
-
212
+ [start_tag, children, end_tag]
237
213
  end
238
214
 
239
- # we parse the tag because it contains more than just the name it often contains attributes
240
- #
241
- def parsetag(s)
215
+ def get_attributes(raw_attributes)
242
216
 
243
- puts 'parsetag:' + s.inspect if @debug
244
- rawtagname, rawattr = s[1..-2].sub(/\/$/,'').match(/^(\w+) *(.*)/)\
245
- .values_at(1,2)
246
-
247
- tagname = case rawtagname.to_sym
248
- when :_comment
249
- '!-'
250
- when :_cdata
251
- '!['
252
- else
253
- rawtagname
217
+ r1 = /([\w\-:\(\)]+\='[^']*)'/
218
+ r2 = /([\w\-:\(\)]+\="[^"]*)"/
219
+
220
+ r = raw_attributes.scan(/#{r1}|#{r2}/).map(&:compact)\
221
+ .flatten.inject(Attributes.new) do |r, x|
222
+ attr_name, raw_val = x.split(/=/,2)
223
+ val = attr_name != 'class' ? raw_val[1..-1] : raw_val[1..-1].split
224
+ r.merge(attr_name.to_sym => val)
254
225
  end
255
226
 
256
- [tagname, get_attributes(rawattr)]
227
+ return r
257
228
  end
258
229
 
259
- def scancom(s, type=:comment)
260
-
261
- tag1 = ['<!--', '-->', 'comment', '<!--']
262
- tag2 = ['<![CDATA[', '\]\]>', 'cdata', '\<!\[CDATA\[']
263
- tag = type == :comment ? tag1 : tag2
264
-
265
- #puts 'tag: ' + tag.inspect
266
- istart = s =~ /#{tag[3]}/
267
- return s unless istart
268
-
269
- iend = s =~ /#{tag[1]}/
270
- comment ="<_%s>%s</_%s>" % [tag[2], s[istart+tag[0].length.. iend-1].gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;'), tag[2]]
271
-
272
- if @debug then
273
- puts 'comment: ' + comment.inspect
274
- # construct the new string
275
- puts 'istart: ' + istart.inspect
276
- end
277
-
278
- s3 = s[0,istart].to_s + comment + s[iend+3..-1]
279
- scancom(s3, type)
230
+ def reverse(raw_obj)
231
+
232
+ return unless raw_obj
233
+ obj = raw_obj.clone
234
+ return obj.reverse! if obj.is_a? String
280
235
 
281
- end
236
+ tag = obj.pop.reverse
237
+
238
+ children = obj[-1]
282
239
 
240
+ r = children.reverse.map {|x| reverse(x)}
241
+
242
+ return [tag[/[!\-\w:\[]+/], get_attributes(tag), *r]
243
+ end
283
244
  end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rexleparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -64,7 +64,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
64
64
  - !ruby/object:Gem::Version
65
65
  version: '0'
66
66
  requirements: []
67
- rubygems_version: 3.4.6
67
+ rubygems_version: 3.4.4
68
68
  signing_key:
69
69
  specification_version: 4
70
70
  summary: Rexleparser is an XML parser used by the Rexle gem
metadata.gz.sig CHANGED
Binary file