rexml 3.2.6 → 3.3.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -47,6 +47,10 @@ module REXML
47
47
  @listeners << listener
48
48
  end
49
49
 
50
+ def entity_expansion_count
51
+ @parser.entity_expansion_count
52
+ end
53
+
50
54
  def each
51
55
  while has_next?
52
56
  yield self.pull
@@ -22,6 +22,10 @@ module REXML
22
22
  @parser.source
23
23
  end
24
24
 
25
+ def entity_expansion_count
26
+ @parser.entity_expansion_count
27
+ end
28
+
25
29
  def add_listener( listener )
26
30
  @parser.add_listener( listener )
27
31
  end
@@ -157,25 +161,8 @@ module REXML
157
161
  end
158
162
  end
159
163
  when :text
160
- #normalized = @parser.normalize( event[1] )
161
- #handle( :characters, normalized )
162
- copy = event[1].clone
163
-
164
- esub = proc { |match|
165
- if @entities.has_key?($1)
166
- @entities[$1].gsub(Text::REFERENCE, &esub)
167
- else
168
- match
169
- end
170
- }
171
-
172
- copy.gsub!( Text::REFERENCE, &esub )
173
- copy.gsub!( Text::NUMERICENTITY ) {|m|
174
- m=$1
175
- m = "0#{m}" if m[0] == ?x
176
- [Integer(m)].pack('U*')
177
- }
178
- handle( :characters, copy )
164
+ unnormalized = @parser.unnormalize( event[1], @entities )
165
+ handle( :characters, unnormalized )
179
166
  when :entitydecl
180
167
  handle_entitydecl( event )
181
168
  when :processing_instruction, :comment, :attlistdecl,
@@ -7,37 +7,34 @@ module REXML
7
7
  def initialize source, listener
8
8
  @listener = listener
9
9
  @parser = BaseParser.new( source )
10
- @tag_stack = []
10
+ @entities = {}
11
11
  end
12
12
 
13
13
  def add_listener( listener )
14
14
  @parser.add_listener( listener )
15
15
  end
16
16
 
17
+ def entity_expansion_count
18
+ @parser.entity_expansion_count
19
+ end
20
+
17
21
  def parse
18
22
  # entity string
19
23
  while true
20
24
  event = @parser.pull
21
25
  case event[0]
22
26
  when :end_document
23
- unless @tag_stack.empty?
24
- tag_path = "/" + @tag_stack.join("/")
25
- raise ParseException.new("Missing end tag for '#{tag_path}'",
26
- @parser.source)
27
- end
28
27
  return
29
28
  when :start_element
30
- @tag_stack << event[1]
31
29
  attrs = event[2].each do |n, v|
32
30
  event[2][n] = @parser.unnormalize( v )
33
31
  end
34
32
  @listener.tag_start( event[1], attrs )
35
33
  when :end_element
36
34
  @listener.tag_end( event[1] )
37
- @tag_stack.pop
38
35
  when :text
39
- normalized = @parser.unnormalize( event[1] )
40
- @listener.text( normalized )
36
+ unnormalized = @parser.unnormalize( event[1], @entities )
37
+ @listener.text( unnormalized )
41
38
  when :processing_instruction
42
39
  @listener.instruction( *event[1,2] )
43
40
  when :start_doctype
@@ -48,6 +45,7 @@ module REXML
48
45
  when :comment, :attlistdecl, :cdata, :xmldecl, :elementdecl
49
46
  @listener.send( event[0].to_s, *event[1..-1] )
50
47
  when :entitydecl, :notationdecl
48
+ @entities[ event[1] ] = event[2] if event.size == 3
51
49
  @listener.send( event[0].to_s, event[1..-1] )
52
50
  when :externalentity
53
51
  entity_reference = event[1]
@@ -15,8 +15,6 @@ module REXML
15
15
  end
16
16
 
17
17
  def parse
18
- tag_stack = []
19
- in_doctype = false
20
18
  entities = nil
21
19
  begin
22
20
  while true
@@ -24,32 +22,24 @@ module REXML
24
22
  #STDERR.puts "TREEPARSER GOT #{event.inspect}"
25
23
  case event[0]
26
24
  when :end_document
27
- unless tag_stack.empty?
28
- raise ParseException.new("No close tag for #{@build_context.xpath}",
29
- @parser.source, @parser)
30
- end
31
25
  return
32
26
  when :start_element
33
- tag_stack.push(event[1])
34
27
  el = @build_context = @build_context.add_element( event[1] )
35
28
  event[2].each do |key, value|
36
29
  el.attributes[key]=Attribute.new(key,value,self)
37
30
  end
38
31
  when :end_element
39
- tag_stack.pop
40
32
  @build_context = @build_context.parent
41
33
  when :text
42
- if not in_doctype
43
- if @build_context[-1].instance_of? Text
44
- @build_context[-1] << event[1]
45
- else
46
- @build_context.add(
47
- Text.new(event[1], @build_context.whitespace, nil, true)
48
- ) unless (
49
- @build_context.ignore_whitespace_nodes and
50
- event[1].strip.size==0
51
- )
52
- end
34
+ if @build_context[-1].instance_of? Text
35
+ @build_context[-1] << event[1]
36
+ else
37
+ @build_context.add(
38
+ Text.new(event[1], @build_context.whitespace, nil, true)
39
+ ) unless (
40
+ @build_context.ignore_whitespace_nodes and
41
+ event[1].strip.size==0
42
+ )
53
43
  end
54
44
  when :comment
55
45
  c = Comment.new( event[1] )
@@ -60,14 +50,12 @@ module REXML
60
50
  when :processing_instruction
61
51
  @build_context.add( Instruction.new( event[1], event[2] ) )
62
52
  when :end_doctype
63
- in_doctype = false
64
53
  entities.each { |k,v| entities[k] = @build_context.entities[k].value }
65
54
  @build_context = @build_context.parent
66
55
  when :start_doctype
67
56
  doctype = DocType.new( event[1..-1], @build_context )
68
57
  @build_context = doctype
69
58
  entities = {}
70
- in_doctype = true
71
59
  when :attlistdecl
72
60
  n = AttlistDecl.new( event[1..-1] )
73
61
  @build_context.add( n )
data/lib/rexml/rexml.rb CHANGED
@@ -31,7 +31,7 @@
31
31
  module REXML
32
32
  COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
33
33
  DATE = "2008/019"
34
- VERSION = "3.2.6"
34
+ VERSION = "3.3.6"
35
35
  REVISION = ""
36
36
 
37
37
  Copyright = COPYRIGHT
data/lib/rexml/source.rb CHANGED
@@ -1,8 +1,28 @@
1
1
  # coding: US-ASCII
2
2
  # frozen_string_literal: false
3
+
4
+ require "strscan"
5
+
3
6
  require_relative 'encoding'
4
7
 
5
8
  module REXML
9
+ if StringScanner::Version < "1.0.0"
10
+ module StringScannerCheckScanString
11
+ refine StringScanner do
12
+ def check(pattern)
13
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
14
+ super(pattern)
15
+ end
16
+
17
+ def scan(pattern)
18
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
19
+ super(pattern)
20
+ end
21
+ end
22
+ end
23
+ using StringScannerCheckScanString
24
+ end
25
+
6
26
  # Generates Source-s. USE THIS CLASS.
7
27
  class SourceFactory
8
28
  # Generates a Source object
@@ -30,18 +50,27 @@ module REXML
30
50
  # objects and provides consumption of text
31
51
  class Source
32
52
  include Encoding
33
- # The current buffer (what we're going to read next)
34
- attr_reader :buffer
35
53
  # The line number of the last consumed text
36
54
  attr_reader :line
37
55
  attr_reader :encoding
38
56
 
57
+ module Private
58
+ SCANNER_RESET_SIZE = 100000
59
+ PRE_DEFINED_TERM_PATTERNS = {}
60
+ pre_defined_terms = ["'", '"', "<"]
61
+ pre_defined_terms.each do |term|
62
+ PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
63
+ end
64
+ end
65
+ private_constant :Private
66
+
39
67
  # Constructor
40
68
  # @param arg must be a String, and should be a valid XML document
41
69
  # @param encoding if non-null, sets the encoding of the source to this
42
70
  # value, overriding all encoding detection
43
71
  def initialize(arg, encoding=nil)
44
- @orig = @buffer = arg
72
+ @orig = arg
73
+ @scanner = StringScanner.new(@orig)
45
74
  if encoding
46
75
  self.encoding = encoding
47
76
  else
@@ -50,6 +79,20 @@ module REXML
50
79
  @line = 0
51
80
  end
52
81
 
82
+ # The current buffer (what we're going to read next)
83
+ def buffer
84
+ @scanner.rest
85
+ end
86
+
87
+ def drop_parsed_content
88
+ if @scanner.pos > Private::SCANNER_RESET_SIZE
89
+ @scanner.string = @scanner.rest
90
+ end
91
+ end
92
+
93
+ def buffer_encoding=(encoding)
94
+ @scanner.string.force_encoding(encoding)
95
+ end
53
96
 
54
97
  # Inherited from Encoding
55
98
  # Overridden to support optimized en/decoding
@@ -58,98 +101,78 @@ module REXML
58
101
  encoding_updated
59
102
  end
60
103
 
61
- # Scans the source for a given pattern. Note, that this is not your
62
- # usual scan() method. For one thing, the pattern argument has some
63
- # requirements; for another, the source can be consumed. You can easily
64
- # confuse this method. Originally, the patterns were easier
65
- # to construct and this method more robust, because this method
66
- # generated search regexps on the fly; however, this was
67
- # computationally expensive and slowed down the entire REXML package
68
- # considerably, since this is by far the most commonly called method.
69
- # @param pattern must be a Regexp, and must be in the form of
70
- # /^\s*(#{your pattern, with no groups})(.*)/. The first group
71
- # will be returned; the second group is used if the consume flag is
72
- # set.
73
- # @param consume if true, the pattern returned will be consumed, leaving
74
- # everything after it in the Source.
75
- # @return the pattern, if found, or nil if the Source is empty or the
76
- # pattern is not found.
77
- def scan(pattern, cons=false)
78
- return nil if @buffer.nil?
79
- rv = @buffer.scan(pattern)
80
- @buffer = $' if cons and rv.size>0
81
- rv
104
+ def read(term = nil)
82
105
  end
83
106
 
84
- def read
107
+ def read_until(term)
108
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
109
+ data = @scanner.scan_until(pattern)
110
+ unless data
111
+ data = @scanner.rest
112
+ @scanner.pos = @scanner.string.bytesize
113
+ end
114
+ data
85
115
  end
86
116
 
87
- def consume( pattern )
88
- @buffer = $' if pattern.match( @buffer )
117
+ def ensure_buffer
89
118
  end
90
119
 
91
- def match_to( char, pattern )
92
- return pattern.match(@buffer)
120
+ def match(pattern, cons=false)
121
+ if cons
122
+ @scanner.scan(pattern).nil? ? nil : @scanner
123
+ else
124
+ @scanner.check(pattern).nil? ? nil : @scanner
125
+ end
93
126
  end
94
127
 
95
- def match_to_consume( char, pattern )
96
- md = pattern.match(@buffer)
97
- @buffer = $'
98
- return md
128
+ def position
129
+ @scanner.pos
99
130
  end
100
131
 
101
- def match(pattern, cons=false)
102
- md = pattern.match(@buffer)
103
- @buffer = $' if cons and md
104
- return md
132
+ def position=(pos)
133
+ @scanner.pos = pos
105
134
  end
106
135
 
107
136
  # @return true if the Source is exhausted
108
137
  def empty?
109
- @buffer == ""
110
- end
111
-
112
- def position
113
- @orig.index( @buffer )
138
+ @scanner.eos?
114
139
  end
115
140
 
116
141
  # @return the current line in the source
117
142
  def current_line
118
143
  lines = @orig.split
119
- res = lines.grep @buffer[0..30]
144
+ res = lines.grep @scanner.rest[0..30]
120
145
  res = res[-1] if res.kind_of? Array
121
146
  lines.index( res ) if res
122
147
  end
123
148
 
124
149
  private
150
+
125
151
  def detect_encoding
126
- buffer_encoding = @buffer.encoding
152
+ scanner_encoding = @scanner.rest.encoding
127
153
  detected_encoding = "UTF-8"
128
154
  begin
129
- @buffer.force_encoding("ASCII-8BIT")
130
- if @buffer[0, 2] == "\xfe\xff"
131
- @buffer[0, 2] = ""
155
+ @scanner.string.force_encoding("ASCII-8BIT")
156
+ if @scanner.scan(/\xfe\xff/n)
132
157
  detected_encoding = "UTF-16BE"
133
- elsif @buffer[0, 2] == "\xff\xfe"
134
- @buffer[0, 2] = ""
158
+ elsif @scanner.scan(/\xff\xfe/n)
135
159
  detected_encoding = "UTF-16LE"
136
- elsif @buffer[0, 3] == "\xef\xbb\xbf"
137
- @buffer[0, 3] = ""
160
+ elsif @scanner.scan(/\xef\xbb\xbf/n)
138
161
  detected_encoding = "UTF-8"
139
162
  end
140
163
  ensure
141
- @buffer.force_encoding(buffer_encoding)
164
+ @scanner.string.force_encoding(scanner_encoding)
142
165
  end
143
166
  self.encoding = detected_encoding
144
167
  end
145
168
 
146
169
  def encoding_updated
147
170
  if @encoding != 'UTF-8'
148
- @buffer = decode(@buffer)
171
+ @scanner.string = decode(@scanner.rest)
149
172
  @to_utf = true
150
173
  else
151
174
  @to_utf = false
152
- @buffer.force_encoding ::Encoding::UTF_8
175
+ @scanner.string.force_encoding(::Encoding::UTF_8)
153
176
  end
154
177
  end
155
178
  end
@@ -172,7 +195,7 @@ module REXML
172
195
  end
173
196
 
174
197
  if !@to_utf and
175
- @buffer.respond_to?(:force_encoding) and
198
+ @orig.respond_to?(:force_encoding) and
176
199
  @source.respond_to?(:external_encoding) and
177
200
  @source.external_encoding != ::Encoding::UTF_8
178
201
  @force_utf8 = true
@@ -181,65 +204,72 @@ module REXML
181
204
  end
182
205
  end
183
206
 
184
- def scan(pattern, cons=false)
185
- rv = super
186
- # You'll notice that this next section is very similar to the same
187
- # section in match(), but just a liiittle different. This is
188
- # because it is a touch faster to do it this way with scan()
189
- # than the way match() does it; enough faster to warrant duplicating
190
- # some code
191
- if rv.size == 0
192
- until @buffer =~ pattern or @source.nil?
193
- begin
194
- @buffer << readline
195
- rescue Iconv::IllegalSequence
196
- raise
197
- rescue
198
- @source = nil
207
+ def read(term = nil, min_bytes = 1)
208
+ term = encode(term) if term
209
+ begin
210
+ str = readline(term)
211
+ @scanner << str
212
+ read_bytes = str.bytesize
213
+ begin
214
+ while read_bytes < min_bytes
215
+ str = readline(term)
216
+ @scanner << str
217
+ read_bytes += str.bytesize
199
218
  end
219
+ rescue IOError
200
220
  end
201
- rv = super
221
+ true
222
+ rescue Exception, NameError
223
+ @source = nil
224
+ false
202
225
  end
203
- rv.taint if RUBY_VERSION < '2.7'
204
- rv
205
226
  end
206
227
 
207
- def read
208
- begin
209
- @buffer << readline
210
- rescue Exception, NameError
211
- @source = nil
228
+ def read_until(term)
229
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
230
+ term = encode(term)
231
+ until str = @scanner.scan_until(pattern)
232
+ break if @source.nil?
233
+ break if @source.eof?
234
+ @scanner << readline(term)
235
+ end
236
+ if str
237
+ read if @scanner.eos? and !@source.eof?
238
+ str
239
+ else
240
+ rest = @scanner.rest
241
+ @scanner.pos = @scanner.string.bytesize
242
+ rest
212
243
  end
213
244
  end
214
245
 
215
- def consume( pattern )
216
- match( pattern, true )
246
+ def ensure_buffer
247
+ read if @scanner.eos? && @source
217
248
  end
218
249
 
219
250
  def match( pattern, cons=false )
220
- rv = pattern.match(@buffer)
221
- @buffer = $' if cons and rv
222
- while !rv and @source
223
- begin
224
- @buffer << readline
225
- rv = pattern.match(@buffer)
226
- @buffer = $' if cons and rv
227
- rescue
228
- @source = nil
251
+ # To avoid performance issue, we need to increase bytes to read per scan
252
+ min_bytes = 1
253
+ while true
254
+ if cons
255
+ md = @scanner.scan(pattern)
256
+ else
257
+ md = @scanner.check(pattern)
229
258
  end
259
+ break if md
260
+ return nil if pattern.is_a?(String)
261
+ return nil if @source.nil?
262
+ return nil unless read(nil, min_bytes)
263
+ min_bytes *= 2
230
264
  end
231
- rv.taint if RUBY_VERSION < '2.7'
232
- rv
265
+
266
+ md.nil? ? nil : @scanner
233
267
  end
234
268
 
235
269
  def empty?
236
270
  super and ( @source.nil? || @source.eof? )
237
271
  end
238
272
 
239
- def position
240
- @er_source.pos rescue 0
241
- end
242
-
243
273
  # @return the current line in the source
244
274
  def current_line
245
275
  begin
@@ -263,8 +293,8 @@ module REXML
263
293
  end
264
294
 
265
295
  private
266
- def readline
267
- str = @source.readline(@line_break)
296
+ def readline(term = nil)
297
+ str = @source.readline(term || @line_break)
268
298
  if @pending_buffer
269
299
  if str.nil?
270
300
  str = @pending_buffer
@@ -290,7 +320,7 @@ module REXML
290
320
  @source.set_encoding(@encoding, @encoding)
291
321
  end
292
322
  @line_break = encode(">")
293
- @pending_buffer, @buffer = @buffer, ""
323
+ @pending_buffer, @scanner.string = @scanner.rest, ""
294
324
  @pending_buffer.force_encoding(@encoding)
295
325
  super
296
326
  end
data/lib/rexml/text.rb CHANGED
@@ -151,25 +151,45 @@ module REXML
151
151
  end
152
152
  end
153
153
 
154
- # context sensitive
155
- string.scan(pattern) do
156
- if $1[-1] != ?;
157
- raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
158
- elsif $1[0] == ?&
159
- if $5 and $5[0] == ?#
160
- case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
161
- when *VALID_CHAR
154
+ pos = 0
155
+ while (index = string.index(/<|&/, pos))
156
+ if string[index] == "<"
157
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
158
+ end
159
+
160
+ unless (end_index = string.index(/[^\s];/, index + 1))
161
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
162
+ end
163
+
164
+ value = string[(index + 1)..end_index]
165
+ if /\s/.match?(value)
166
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
167
+ end
168
+
169
+ if value[0] == "#"
170
+ character_reference = value[1..-1]
171
+
172
+ unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference))
173
+ if character_reference[0] == "x" || character_reference[-1] == "x"
174
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
162
175
  else
163
- raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
176
+ raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
164
177
  end
165
- # FIXME: below can't work but this needs API change.
166
- # elsif @parent and $3 and !SUBSTITUTES.include?($1)
167
- # if !doctype or !doctype.entities.has_key?($3)
168
- # raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
169
- # end
170
178
  end
179
+
180
+ case (character_reference[0] == "x" ? character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i)
181
+ when *VALID_CHAR
182
+ else
183
+ raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
184
+ end
185
+ elsif !(/\A#{Entity::NAME}\z/um.match?(value))
186
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
171
187
  end
188
+
189
+ pos = end_index + 1
172
190
  end
191
+
192
+ string
173
193
  end
174
194
 
175
195
  def node_type
@@ -590,6 +590,7 @@ module REXML
590
590
 
591
591
  def evaluate_predicate(expression, nodesets)
592
592
  enter(:predicate, expression, nodesets) if @debug
593
+ new_nodeset_count = 0
593
594
  new_nodesets = nodesets.collect do |nodeset|
594
595
  new_nodeset = []
595
596
  subcontext = { :size => nodeset.size }
@@ -606,17 +607,20 @@ module REXML
606
607
  result = result[0] if result.kind_of? Array and result.length == 1
607
608
  if result.kind_of? Numeric
608
609
  if result == node.position
609
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
610
+ new_nodeset_count += 1
611
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
610
612
  end
611
613
  elsif result.instance_of? Array
612
614
  if result.size > 0 and result.inject(false) {|k,s| s or k}
613
615
  if result.size > 0
614
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
616
+ new_nodeset_count += 1
617
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
615
618
  end
616
619
  end
617
620
  else
618
621
  if result
619
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
622
+ new_nodeset_count += 1
623
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
620
624
  end
621
625
  end
622
626
  end