rexml 3.2.5 → 3.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/rexml/source.rb CHANGED
@@ -1,8 +1,39 @@
1
1
  # coding: US-ASCII
2
2
  # frozen_string_literal: false
3
+
4
+ require "stringio"
5
+ require "strscan"
6
+
3
7
  require_relative 'encoding'
4
8
 
5
9
  module REXML
10
+ if StringScanner::Version < "1.0.0"
11
+ module StringScannerCheckScanString
12
+ refine StringScanner do
13
+ def check(pattern)
14
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
15
+ super(pattern)
16
+ end
17
+
18
+ def scan(pattern)
19
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
20
+ super(pattern)
21
+ end
22
+
23
+ def match?(pattern)
24
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
25
+ super(pattern)
26
+ end
27
+
28
+ def skip(pattern)
29
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
30
+ super(pattern)
31
+ end
32
+ end
33
+ end
34
+ using StringScannerCheckScanString
35
+ end
36
+
6
37
  # Generates Source-s. USE THIS CLASS.
7
38
  class SourceFactory
8
39
  # Generates a Source object
@@ -15,7 +46,6 @@ module REXML
15
46
  arg.respond_to? :eof?
16
47
  IOSource.new(arg)
17
48
  elsif arg.respond_to? :to_str
18
- require 'stringio'
19
49
  IOSource.new(StringIO.new(arg))
20
50
  elsif arg.kind_of? Source
21
51
  arg
@@ -30,26 +60,56 @@ module REXML
30
60
  # objects and provides consumption of text
31
61
  class Source
32
62
  include Encoding
33
- # The current buffer (what we're going to read next)
34
- attr_reader :buffer
35
63
  # The line number of the last consumed text
36
64
  attr_reader :line
37
65
  attr_reader :encoding
38
66
 
67
+ module Private
68
+ SCANNER_RESET_SIZE = 100000
69
+ PRE_DEFINED_TERM_PATTERNS = {}
70
+ pre_defined_terms = ["'", '"', "<"]
71
+ if StringScanner::Version < "3.1.1"
72
+ pre_defined_terms.each do |term|
73
+ PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
74
+ end
75
+ else
76
+ pre_defined_terms.each do |term|
77
+ PRE_DEFINED_TERM_PATTERNS[term] = term
78
+ end
79
+ end
80
+ end
81
+ private_constant :Private
82
+
39
83
  # Constructor
40
84
  # @param arg must be a String, and should be a valid XML document
41
85
  # @param encoding if non-null, sets the encoding of the source to this
42
86
  # value, overriding all encoding detection
43
87
  def initialize(arg, encoding=nil)
44
- @orig = @buffer = arg
88
+ @orig = arg
89
+ @scanner = StringScanner.new(@orig)
45
90
  if encoding
46
91
  self.encoding = encoding
47
92
  else
48
93
  detect_encoding
49
94
  end
50
95
  @line = 0
96
+ @encoded_terms = {}
97
+ end
98
+
99
+ # The current buffer (what we're going to read next)
100
+ def buffer
101
+ @scanner.rest
51
102
  end
52
103
 
104
+ def drop_parsed_content
105
+ if @scanner.pos > Private::SCANNER_RESET_SIZE
106
+ @scanner.string = @scanner.rest
107
+ end
108
+ end
109
+
110
+ def buffer_encoding=(encoding)
111
+ @scanner.string.force_encoding(encoding)
112
+ end
53
113
 
54
114
  # Inherited from Encoding
55
115
  # Overridden to support optimized en/decoding
@@ -58,98 +118,94 @@ module REXML
58
118
  encoding_updated
59
119
  end
60
120
 
61
- # Scans the source for a given pattern. Note, that this is not your
62
- # usual scan() method. For one thing, the pattern argument has some
63
- # requirements; for another, the source can be consumed. You can easily
64
- # confuse this method. Originally, the patterns were easier
65
- # to construct and this method more robust, because this method
66
- # generated search regexps on the fly; however, this was
67
- # computationally expensive and slowed down the entire REXML package
68
- # considerably, since this is by far the most commonly called method.
69
- # @param pattern must be a Regexp, and must be in the form of
70
- # /^\s*(#{your pattern, with no groups})(.*)/. The first group
71
- # will be returned; the second group is used if the consume flag is
72
- # set.
73
- # @param consume if true, the pattern returned will be consumed, leaving
74
- # everything after it in the Source.
75
- # @return the pattern, if found, or nil if the Source is empty or the
76
- # pattern is not found.
77
- def scan(pattern, cons=false)
78
- return nil if @buffer.nil?
79
- rv = @buffer.scan(pattern)
80
- @buffer = $' if cons and rv.size>0
81
- rv
121
+ def read(term = nil)
82
122
  end
83
123
 
84
- def read
124
+ def read_until(term)
125
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
126
+ data = @scanner.scan_until(pattern)
127
+ unless data
128
+ data = @scanner.rest
129
+ @scanner.pos = @scanner.string.bytesize
130
+ end
131
+ data
85
132
  end
86
133
 
87
- def consume( pattern )
88
- @buffer = $' if pattern.match( @buffer )
134
+ def ensure_buffer
89
135
  end
90
136
 
91
- def match_to( char, pattern )
92
- return pattern.match(@buffer)
137
+ def match(pattern, cons=false)
138
+ if cons
139
+ @scanner.scan(pattern).nil? ? nil : @scanner
140
+ else
141
+ @scanner.check(pattern).nil? ? nil : @scanner
142
+ end
93
143
  end
94
144
 
95
- def match_to_consume( char, pattern )
96
- md = pattern.match(@buffer)
97
- @buffer = $'
98
- return md
145
+ def match?(pattern, cons=false)
146
+ if cons
147
+ !@scanner.skip(pattern).nil?
148
+ else
149
+ !@scanner.match?(pattern).nil?
150
+ end
99
151
  end
100
152
 
101
- def match(pattern, cons=false)
102
- md = pattern.match(@buffer)
103
- @buffer = $' if cons and md
104
- return md
153
+ def position
154
+ @scanner.pos
105
155
  end
106
156
 
107
- # @return true if the Source is exhausted
108
- def empty?
109
- @buffer == ""
157
+ def position=(pos)
158
+ @scanner.pos = pos
110
159
  end
111
160
 
112
- def position
113
- @orig.index( @buffer )
161
+ def peek_byte
162
+ @scanner.peek_byte
163
+ end
164
+
165
+ def scan_byte
166
+ @scanner.scan_byte
167
+ end
168
+
169
+ # @return true if the Source is exhausted
170
+ def empty?
171
+ @scanner.eos?
114
172
  end
115
173
 
116
174
  # @return the current line in the source
117
175
  def current_line
118
176
  lines = @orig.split
119
- res = lines.grep @buffer[0..30]
177
+ res = lines.grep @scanner.rest[0..30]
120
178
  res = res[-1] if res.kind_of? Array
121
179
  lines.index( res ) if res
122
180
  end
123
181
 
124
182
  private
183
+
125
184
  def detect_encoding
126
- buffer_encoding = @buffer.encoding
185
+ scanner_encoding = @scanner.rest.encoding
127
186
  detected_encoding = "UTF-8"
128
187
  begin
129
- @buffer.force_encoding("ASCII-8BIT")
130
- if @buffer[0, 2] == "\xfe\xff"
131
- @buffer[0, 2] = ""
188
+ @scanner.string.force_encoding("ASCII-8BIT")
189
+ if @scanner.scan(/\xfe\xff/n)
132
190
  detected_encoding = "UTF-16BE"
133
- elsif @buffer[0, 2] == "\xff\xfe"
134
- @buffer[0, 2] = ""
191
+ elsif @scanner.scan(/\xff\xfe/n)
135
192
  detected_encoding = "UTF-16LE"
136
- elsif @buffer[0, 3] == "\xef\xbb\xbf"
137
- @buffer[0, 3] = ""
193
+ elsif @scanner.scan(/\xef\xbb\xbf/n)
138
194
  detected_encoding = "UTF-8"
139
195
  end
140
196
  ensure
141
- @buffer.force_encoding(buffer_encoding)
197
+ @scanner.string.force_encoding(scanner_encoding)
142
198
  end
143
199
  self.encoding = detected_encoding
144
200
  end
145
201
 
146
202
  def encoding_updated
147
203
  if @encoding != 'UTF-8'
148
- @buffer = decode(@buffer)
204
+ @scanner.string = decode(@scanner.rest)
149
205
  @to_utf = true
150
206
  else
151
207
  @to_utf = false
152
- @buffer.force_encoding ::Encoding::UTF_8
208
+ @scanner.string.force_encoding(::Encoding::UTF_8)
153
209
  end
154
210
  end
155
211
  end
@@ -172,7 +228,7 @@ module REXML
172
228
  end
173
229
 
174
230
  if !@to_utf and
175
- @buffer.respond_to?(:force_encoding) and
231
+ @orig.respond_to?(:force_encoding) and
176
232
  @source.respond_to?(:external_encoding) and
177
233
  @source.external_encoding != ::Encoding::UTF_8
178
234
  @force_utf8 = true
@@ -181,63 +237,87 @@ module REXML
181
237
  end
182
238
  end
183
239
 
184
- def scan(pattern, cons=false)
185
- rv = super
186
- # You'll notice that this next section is very similar to the same
187
- # section in match(), but just a liiittle different. This is
188
- # because it is a touch faster to do it this way with scan()
189
- # than the way match() does it; enough faster to warrant duplicating
190
- # some code
191
- if rv.size == 0
192
- until @buffer =~ pattern or @source.nil?
193
- begin
194
- @buffer << readline
195
- rescue Iconv::IllegalSequence
196
- raise
197
- rescue
198
- @source = nil
240
+ def read(term = nil, min_bytes = 1)
241
+ term = encode(term) if term
242
+ begin
243
+ str = readline(term)
244
+ @scanner << str
245
+ read_bytes = str.bytesize
246
+ begin
247
+ while read_bytes < min_bytes
248
+ str = readline(term)
249
+ @scanner << str
250
+ read_bytes += str.bytesize
199
251
  end
252
+ rescue IOError
200
253
  end
201
- rv = super
254
+ true
255
+ rescue Exception, NameError
256
+ @source = nil
257
+ false
202
258
  end
203
- rv.taint if RUBY_VERSION < '2.7'
204
- rv
205
259
  end
206
260
 
207
- def read
208
- begin
209
- @buffer << readline
210
- rescue Exception, NameError
211
- @source = nil
261
+ def read_until(term)
262
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
263
+ term = @encoded_terms[term] ||= encode(term)
264
+ until str = @scanner.scan_until(pattern)
265
+ break if @source.nil?
266
+ break if @source.eof?
267
+ @scanner << readline(term)
268
+ end
269
+ if str
270
+ read if @scanner.eos? and !@source.eof?
271
+ str
272
+ else
273
+ rest = @scanner.rest
274
+ @scanner.pos = @scanner.string.bytesize
275
+ rest
212
276
  end
213
277
  end
214
278
 
215
- def consume( pattern )
216
- match( pattern, true )
279
+ def ensure_buffer
280
+ read if @scanner.eos? && @source
217
281
  end
218
282
 
219
283
  def match( pattern, cons=false )
220
- rv = pattern.match(@buffer)
221
- @buffer = $' if cons and rv
222
- while !rv and @source
223
- begin
224
- @buffer << readline
225
- rv = pattern.match(@buffer)
226
- @buffer = $' if cons and rv
227
- rescue
228
- @source = nil
284
+ # To avoid performance issue, we need to increase bytes to read per scan
285
+ min_bytes = 1
286
+ while true
287
+ if cons
288
+ md = @scanner.scan(pattern)
289
+ else
290
+ md = @scanner.check(pattern)
229
291
  end
292
+ break if md
293
+ return nil if pattern.is_a?(String)
294
+ return nil if @source.nil?
295
+ return nil unless read(nil, min_bytes)
296
+ min_bytes *= 2
230
297
  end
231
- rv.taint if RUBY_VERSION < '2.7'
232
- rv
298
+
299
+ md.nil? ? nil : @scanner
233
300
  end
234
301
 
235
- def empty?
236
- super and ( @source.nil? || @source.eof? )
302
+ def match?( pattern, cons=false )
303
+ # To avoid performance issue, we need to increase bytes to read per scan
304
+ min_bytes = 1
305
+ while true
306
+ if cons
307
+ n_matched_bytes = @scanner.skip(pattern)
308
+ else
309
+ n_matched_bytes = @scanner.match?(pattern)
310
+ end
311
+ return true if n_matched_bytes
312
+ return false if pattern.is_a?(String)
313
+ return false if @source.nil?
314
+ return false unless read(nil, min_bytes)
315
+ min_bytes *= 2
316
+ end
237
317
  end
238
318
 
239
- def position
240
- @er_source.pos rescue 0
319
+ def empty?
320
+ super and ( @source.nil? || @source.eof? )
241
321
  end
242
322
 
243
323
  # @return the current line in the source
@@ -255,7 +335,7 @@ module REXML
255
335
  rescue
256
336
  end
257
337
  @er_source.seek(pos)
258
- rescue IOError
338
+ rescue IOError, SystemCallError
259
339
  pos = -1
260
340
  line = -1
261
341
  end
@@ -263,15 +343,20 @@ module REXML
263
343
  end
264
344
 
265
345
  private
266
- def readline
267
- str = @source.readline(@line_break)
346
+ def readline(term = nil)
268
347
  if @pending_buffer
348
+ begin
349
+ str = @source.readline(term || @line_break)
350
+ rescue IOError
351
+ end
269
352
  if str.nil?
270
353
  str = @pending_buffer
271
354
  else
272
355
  str = @pending_buffer + str
273
356
  end
274
357
  @pending_buffer = nil
358
+ else
359
+ str = @source.readline(term || @line_break)
275
360
  end
276
361
  return nil if str.nil?
277
362
 
@@ -290,7 +375,7 @@ module REXML
290
375
  @source.set_encoding(@encoding, @encoding)
291
376
  end
292
377
  @line_break = encode(">")
293
- @pending_buffer, @buffer = @buffer, ""
378
+ @pending_buffer, @scanner.string = @scanner.rest, ""
294
379
  @pending_buffer.force_encoding(@encoding)
295
380
  super
296
381
  end
data/lib/rexml/text.rb CHANGED
@@ -1,4 +1,4 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative 'security'
3
3
  require_relative 'entity'
4
4
  require_relative 'doctype'
@@ -29,31 +29,16 @@ module REXML
29
29
  (0x10000..0x10FFFF)
30
30
  ]
31
31
 
32
- if String.method_defined? :encode
33
- VALID_XML_CHARS = Regexp.new('^['+
34
- VALID_CHAR.map { |item|
35
- case item
36
- when Integer
37
- [item].pack('U').force_encoding('utf-8')
38
- when Range
39
- [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
40
- end
41
- }.join +
42
- ']*$')
43
- else
44
- VALID_XML_CHARS = /^(
45
- [\x09\x0A\x0D\x20-\x7E] # ASCII
46
- | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
47
- | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
48
- | [\xE1-\xEC\xEE][\x80-\xBF]{2} # straight 3-byte
49
- | \xEF[\x80-\xBE]{2} #
50
- | \xEF\xBF[\x80-\xBD] # excluding U+fffe and U+ffff
51
- | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
52
- | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
53
- | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
54
- | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
55
- )*$/nx;
56
- end
32
+ VALID_XML_CHARS = Regexp.new('^['+
33
+ VALID_CHAR.map { |item|
34
+ case item
35
+ when Integer
36
+ [item].pack('U').force_encoding('utf-8')
37
+ when Range
38
+ [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
39
+ end
40
+ }.join +
41
+ ']*$')
57
42
 
58
43
  # Constructor
59
44
  # +arg+ if a String, the content is set to the String. If a Text,
@@ -131,45 +116,55 @@ module REXML
131
116
  def Text.check string, pattern, doctype
132
117
 
133
118
  # illegal anywhere
134
- if string !~ VALID_XML_CHARS
135
- if String.method_defined? :encode
136
- string.chars.each do |c|
137
- case c.ord
138
- when *VALID_CHAR
139
- else
140
- raise "Illegal character #{c.inspect} in raw string #{string.inspect}"
141
- end
142
- end
143
- else
144
- string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/n) do |c|
145
- case c.unpack('U')
146
- when *VALID_CHAR
147
- else
148
- raise "Illegal character #{c.inspect} in raw string #{string.inspect}"
149
- end
119
+ if !string.match?(VALID_XML_CHARS)
120
+ string.chars.each do |c|
121
+ case c.ord
122
+ when *VALID_CHAR
123
+ else
124
+ raise "Illegal character #{c.inspect} in raw string #{string.inspect}"
150
125
  end
151
126
  end
152
127
  end
153
128
 
154
- # context sensitive
155
- string.scan(pattern) do
156
- if $1[-1] != ?;
157
- raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
158
- elsif $1[0] == ?&
159
- if $5 and $5[0] == ?#
160
- case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
161
- when *VALID_CHAR
129
+ pos = 0
130
+ while (index = string.index(/<|&/, pos))
131
+ if string[index] == "<"
132
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
133
+ end
134
+
135
+ unless (end_index = string.index(/[^\s];/, index + 1))
136
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
137
+ end
138
+
139
+ value = string[(index + 1)..end_index]
140
+ if /\s/.match?(value)
141
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
142
+ end
143
+
144
+ if value[0] == "#"
145
+ character_reference = value[1..-1]
146
+
147
+ unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference))
148
+ if character_reference[0] == "x" || character_reference[-1] == "x"
149
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
162
150
  else
163
- raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
151
+ raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
164
152
  end
165
- # FIXME: below can't work but this needs API change.
166
- # elsif @parent and $3 and !SUBSTITUTES.include?($1)
167
- # if !doctype or !doctype.entities.has_key?($3)
168
- # raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
169
- # end
170
153
  end
154
+
155
+ case (character_reference[0] == "x" ? character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i)
156
+ when *VALID_CHAR
157
+ else
158
+ raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
159
+ end
160
+ elsif !(/\A#{Entity::NAME}\z/um.match?(value))
161
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
171
162
  end
163
+
164
+ pos = end_index + 1
172
165
  end
166
+
167
+ string
173
168
  end
174
169
 
175
170
  def node_type
@@ -248,7 +243,8 @@ module REXML
248
243
  # u = Text.new( "sean russell", false, nil, true )
249
244
  # u.value #-> "sean russell"
250
245
  def value
251
- @unnormalized ||= Text::unnormalize( @string, doctype )
246
+ @unnormalized ||= Text::unnormalize(@string, doctype,
247
+ entity_expansion_text_limit: document&.entity_expansion_text_limit)
252
248
  end
253
249
 
254
250
  # Sets the contents of this text node. This expects the text to be
@@ -371,7 +367,7 @@ module REXML
371
367
  copy = input.to_s
372
368
  # Doing it like this rather than in a loop improves the speed
373
369
  #copy = copy.gsub( EREFERENCE, '&amp;' )
374
- copy = copy.gsub( "&", "&amp;" )
370
+ copy = copy.gsub( "&", "&amp;" ) if copy.include?("&")
375
371
  if doctype
376
372
  # Replace all ampersands that aren't part of an entity
377
373
  doctype.entities.each_value do |entity|
@@ -382,18 +378,21 @@ module REXML
382
378
  else
383
379
  # Replace all ampersands that aren't part of an entity
384
380
  DocType::DEFAULT_ENTITIES.each_value do |entity|
385
- copy = copy.gsub(entity.value, "&#{entity.name};" )
381
+ if copy.include?(entity.value)
382
+ copy = copy.gsub(entity.value, "&#{entity.name};" )
383
+ end
386
384
  end
387
385
  end
388
386
  copy
389
387
  end
390
388
 
391
389
  # Unescapes all possible entities
392
- def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
390
+ def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil )
391
+ entity_expansion_text_limit ||= Security.entity_expansion_text_limit
393
392
  sum = 0
394
393
  string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) {
395
394
  s = Text.expand($&, doctype, filter)
396
- if sum + s.bytesize > Security.entity_expansion_text_limit
395
+ if sum + s.bytesize > entity_expansion_text_limit
397
396
  raise "entity expansion has grown too large"
398
397
  else
399
398
  sum += s.bytesize
@@ -590,6 +590,7 @@ module REXML
590
590
 
591
591
  def evaluate_predicate(expression, nodesets)
592
592
  enter(:predicate, expression, nodesets) if @debug
593
+ new_nodeset_count = 0
593
594
  new_nodesets = nodesets.collect do |nodeset|
594
595
  new_nodeset = []
595
596
  subcontext = { :size => nodeset.size }
@@ -606,17 +607,20 @@ module REXML
606
607
  result = result[0] if result.kind_of? Array and result.length == 1
607
608
  if result.kind_of? Numeric
608
609
  if result == node.position
609
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
610
+ new_nodeset_count += 1
611
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
610
612
  end
611
613
  elsif result.instance_of? Array
612
614
  if result.size > 0 and result.inject(false) {|k,s| s or k}
613
615
  if result.size > 0
614
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
616
+ new_nodeset_count += 1
617
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
615
618
  end
616
619
  end
617
620
  else
618
621
  if result
619
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
622
+ new_nodeset_count += 1
623
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
620
624
  end
621
625
  end
622
626
  end