rexml 3.2.6 → 3.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -47,6 +47,18 @@ module REXML
47
47
  @listeners << listener
48
48
  end
49
49
 
50
+ def entity_expansion_count
51
+ @parser.entity_expansion_count
52
+ end
53
+
54
+ def entity_expansion_limit=( limit )
55
+ @parser.entity_expansion_limit = limit
56
+ end
57
+
58
+ def entity_expansion_text_limit=( limit )
59
+ @parser.entity_expansion_text_limit = limit
60
+ end
61
+
50
62
  def each
51
63
  while has_next?
52
64
  yield self.pull
@@ -22,6 +22,18 @@ module REXML
22
22
  @parser.source
23
23
  end
24
24
 
25
+ def entity_expansion_count
26
+ @parser.entity_expansion_count
27
+ end
28
+
29
+ def entity_expansion_limit=( limit )
30
+ @parser.entity_expansion_limit = limit
31
+ end
32
+
33
+ def entity_expansion_text_limit=( limit )
34
+ @parser.entity_expansion_text_limit = limit
35
+ end
36
+
25
37
  def add_listener( listener )
26
38
  @parser.add_listener( listener )
27
39
  end
@@ -157,25 +169,8 @@ module REXML
157
169
  end
158
170
  end
159
171
  when :text
160
- #normalized = @parser.normalize( event[1] )
161
- #handle( :characters, normalized )
162
- copy = event[1].clone
163
-
164
- esub = proc { |match|
165
- if @entities.has_key?($1)
166
- @entities[$1].gsub(Text::REFERENCE, &esub)
167
- else
168
- match
169
- end
170
- }
171
-
172
- copy.gsub!( Text::REFERENCE, &esub )
173
- copy.gsub!( Text::NUMERICENTITY ) {|m|
174
- m=$1
175
- m = "0#{m}" if m[0] == ?x
176
- [Integer(m)].pack('U*')
177
- }
178
- handle( :characters, copy )
172
+ unnormalized = @parser.unnormalize( event[1], @entities )
173
+ handle( :characters, unnormalized )
179
174
  when :entitydecl
180
175
  handle_entitydecl( event )
181
176
  when :processing_instruction, :comment, :attlistdecl,
@@ -264,6 +259,8 @@ module REXML
264
259
  end
265
260
 
266
261
  def get_namespace( prefix )
262
+ return nil if @namespace_stack.empty?
263
+
267
264
  uris = (@namespace_stack.find_all { |ns| not ns[prefix].nil? }) ||
268
265
  (@namespace_stack.find { |ns| not ns[nil].nil? })
269
266
  uris[-1][prefix] unless uris.nil? or 0 == uris.size
@@ -7,37 +7,42 @@ module REXML
7
7
  def initialize source, listener
8
8
  @listener = listener
9
9
  @parser = BaseParser.new( source )
10
- @tag_stack = []
10
+ @entities = {}
11
11
  end
12
12
 
13
13
  def add_listener( listener )
14
14
  @parser.add_listener( listener )
15
15
  end
16
16
 
17
+ def entity_expansion_count
18
+ @parser.entity_expansion_count
19
+ end
20
+
21
+ def entity_expansion_limit=( limit )
22
+ @parser.entity_expansion_limit = limit
23
+ end
24
+
25
+ def entity_expansion_text_limit=( limit )
26
+ @parser.entity_expansion_text_limit = limit
27
+ end
28
+
17
29
  def parse
18
30
  # entity string
19
31
  while true
20
32
  event = @parser.pull
21
33
  case event[0]
22
34
  when :end_document
23
- unless @tag_stack.empty?
24
- tag_path = "/" + @tag_stack.join("/")
25
- raise ParseException.new("Missing end tag for '#{tag_path}'",
26
- @parser.source)
27
- end
28
35
  return
29
36
  when :start_element
30
- @tag_stack << event[1]
31
37
  attrs = event[2].each do |n, v|
32
38
  event[2][n] = @parser.unnormalize( v )
33
39
  end
34
40
  @listener.tag_start( event[1], attrs )
35
41
  when :end_element
36
42
  @listener.tag_end( event[1] )
37
- @tag_stack.pop
38
43
  when :text
39
- normalized = @parser.unnormalize( event[1] )
40
- @listener.text( normalized )
44
+ unnormalized = @parser.unnormalize( event[1], @entities )
45
+ @listener.text( unnormalized )
41
46
  when :processing_instruction
42
47
  @listener.instruction( *event[1,2] )
43
48
  when :start_doctype
@@ -48,6 +53,7 @@ module REXML
48
53
  when :comment, :attlistdecl, :cdata, :xmldecl, :elementdecl
49
54
  @listener.send( event[0].to_s, *event[1..-1] )
50
55
  when :entitydecl, :notationdecl
56
+ @entities[ event[1] ] = event[2] if event.size == 3
51
57
  @listener.send( event[0].to_s, event[1..-1] )
52
58
  when :externalentity
53
59
  entity_reference = event[1]
@@ -15,8 +15,6 @@ module REXML
15
15
  end
16
16
 
17
17
  def parse
18
- tag_stack = []
19
- in_doctype = false
20
18
  entities = nil
21
19
  begin
22
20
  while true
@@ -24,32 +22,24 @@ module REXML
24
22
  #STDERR.puts "TREEPARSER GOT #{event.inspect}"
25
23
  case event[0]
26
24
  when :end_document
27
- unless tag_stack.empty?
28
- raise ParseException.new("No close tag for #{@build_context.xpath}",
29
- @parser.source, @parser)
30
- end
31
25
  return
32
26
  when :start_element
33
- tag_stack.push(event[1])
34
27
  el = @build_context = @build_context.add_element( event[1] )
35
28
  event[2].each do |key, value|
36
29
  el.attributes[key]=Attribute.new(key,value,self)
37
30
  end
38
31
  when :end_element
39
- tag_stack.pop
40
32
  @build_context = @build_context.parent
41
33
  when :text
42
- if not in_doctype
43
- if @build_context[-1].instance_of? Text
44
- @build_context[-1] << event[1]
45
- else
46
- @build_context.add(
47
- Text.new(event[1], @build_context.whitespace, nil, true)
48
- ) unless (
49
- @build_context.ignore_whitespace_nodes and
50
- event[1].strip.size==0
51
- )
52
- end
34
+ if @build_context[-1].instance_of? Text
35
+ @build_context[-1] << event[1]
36
+ else
37
+ @build_context.add(
38
+ Text.new(event[1], @build_context.whitespace, nil, true)
39
+ ) unless (
40
+ @build_context.ignore_whitespace_nodes and
41
+ event[1].strip.size==0
42
+ )
53
43
  end
54
44
  when :comment
55
45
  c = Comment.new( event[1] )
@@ -60,14 +50,12 @@ module REXML
60
50
  when :processing_instruction
61
51
  @build_context.add( Instruction.new( event[1], event[2] ) )
62
52
  when :end_doctype
63
- in_doctype = false
64
53
  entities.each { |k,v| entities[k] = @build_context.entities[k].value }
65
54
  @build_context = @build_context.parent
66
55
  when :start_doctype
67
56
  doctype = DocType.new( event[1..-1], @build_context )
68
57
  @build_context = doctype
69
58
  entities = {}
70
- in_doctype = true
71
59
  when :attlistdecl
72
60
  n = AttlistDecl.new( event[1..-1] )
73
61
  @build_context.add( n )
data/lib/rexml/rexml.rb CHANGED
@@ -31,7 +31,7 @@
31
31
  module REXML
32
32
  COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
33
33
  DATE = "2008/019"
34
- VERSION = "3.2.6"
34
+ VERSION = "3.3.9"
35
35
  REVISION = ""
36
36
 
37
37
  Copyright = COPYRIGHT
data/lib/rexml/source.rb CHANGED
@@ -1,8 +1,28 @@
1
1
  # coding: US-ASCII
2
2
  # frozen_string_literal: false
3
+
4
+ require "strscan"
5
+
3
6
  require_relative 'encoding'
4
7
 
5
8
  module REXML
9
+ if StringScanner::Version < "1.0.0"
10
+ module StringScannerCheckScanString
11
+ refine StringScanner do
12
+ def check(pattern)
13
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
14
+ super(pattern)
15
+ end
16
+
17
+ def scan(pattern)
18
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
19
+ super(pattern)
20
+ end
21
+ end
22
+ end
23
+ using StringScannerCheckScanString
24
+ end
25
+
6
26
  # Generates Source-s. USE THIS CLASS.
7
27
  class SourceFactory
8
28
  # Generates a Source object
@@ -30,26 +50,50 @@ module REXML
30
50
  # objects and provides consumption of text
31
51
  class Source
32
52
  include Encoding
33
- # The current buffer (what we're going to read next)
34
- attr_reader :buffer
35
53
  # The line number of the last consumed text
36
54
  attr_reader :line
37
55
  attr_reader :encoding
38
56
 
57
+ module Private
58
+ SCANNER_RESET_SIZE = 100000
59
+ PRE_DEFINED_TERM_PATTERNS = {}
60
+ pre_defined_terms = ["'", '"', "<"]
61
+ pre_defined_terms.each do |term|
62
+ PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
63
+ end
64
+ end
65
+ private_constant :Private
66
+
39
67
  # Constructor
40
68
  # @param arg must be a String, and should be a valid XML document
41
69
  # @param encoding if non-null, sets the encoding of the source to this
42
70
  # value, overriding all encoding detection
43
71
  def initialize(arg, encoding=nil)
44
- @orig = @buffer = arg
72
+ @orig = arg
73
+ @scanner = StringScanner.new(@orig)
45
74
  if encoding
46
75
  self.encoding = encoding
47
76
  else
48
77
  detect_encoding
49
78
  end
50
79
  @line = 0
80
+ @term_encord = {}
51
81
  end
52
82
 
83
+ # The current buffer (what we're going to read next)
84
+ def buffer
85
+ @scanner.rest
86
+ end
87
+
88
+ def drop_parsed_content
89
+ if @scanner.pos > Private::SCANNER_RESET_SIZE
90
+ @scanner.string = @scanner.rest
91
+ end
92
+ end
93
+
94
+ def buffer_encoding=(encoding)
95
+ @scanner.string.force_encoding(encoding)
96
+ end
53
97
 
54
98
  # Inherited from Encoding
55
99
  # Overridden to support optimized en/decoding
@@ -58,98 +102,78 @@ module REXML
58
102
  encoding_updated
59
103
  end
60
104
 
61
- # Scans the source for a given pattern. Note, that this is not your
62
- # usual scan() method. For one thing, the pattern argument has some
63
- # requirements; for another, the source can be consumed. You can easily
64
- # confuse this method. Originally, the patterns were easier
65
- # to construct and this method more robust, because this method
66
- # generated search regexps on the fly; however, this was
67
- # computationally expensive and slowed down the entire REXML package
68
- # considerably, since this is by far the most commonly called method.
69
- # @param pattern must be a Regexp, and must be in the form of
70
- # /^\s*(#{your pattern, with no groups})(.*)/. The first group
71
- # will be returned; the second group is used if the consume flag is
72
- # set.
73
- # @param consume if true, the pattern returned will be consumed, leaving
74
- # everything after it in the Source.
75
- # @return the pattern, if found, or nil if the Source is empty or the
76
- # pattern is not found.
77
- def scan(pattern, cons=false)
78
- return nil if @buffer.nil?
79
- rv = @buffer.scan(pattern)
80
- @buffer = $' if cons and rv.size>0
81
- rv
105
+ def read(term = nil)
82
106
  end
83
107
 
84
- def read
108
+ def read_until(term)
109
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
110
+ data = @scanner.scan_until(pattern)
111
+ unless data
112
+ data = @scanner.rest
113
+ @scanner.pos = @scanner.string.bytesize
114
+ end
115
+ data
85
116
  end
86
117
 
87
- def consume( pattern )
88
- @buffer = $' if pattern.match( @buffer )
118
+ def ensure_buffer
89
119
  end
90
120
 
91
- def match_to( char, pattern )
92
- return pattern.match(@buffer)
121
+ def match(pattern, cons=false)
122
+ if cons
123
+ @scanner.scan(pattern).nil? ? nil : @scanner
124
+ else
125
+ @scanner.check(pattern).nil? ? nil : @scanner
126
+ end
93
127
  end
94
128
 
95
- def match_to_consume( char, pattern )
96
- md = pattern.match(@buffer)
97
- @buffer = $'
98
- return md
129
+ def position
130
+ @scanner.pos
99
131
  end
100
132
 
101
- def match(pattern, cons=false)
102
- md = pattern.match(@buffer)
103
- @buffer = $' if cons and md
104
- return md
133
+ def position=(pos)
134
+ @scanner.pos = pos
105
135
  end
106
136
 
107
137
  # @return true if the Source is exhausted
108
138
  def empty?
109
- @buffer == ""
110
- end
111
-
112
- def position
113
- @orig.index( @buffer )
139
+ @scanner.eos?
114
140
  end
115
141
 
116
142
  # @return the current line in the source
117
143
  def current_line
118
144
  lines = @orig.split
119
- res = lines.grep @buffer[0..30]
145
+ res = lines.grep @scanner.rest[0..30]
120
146
  res = res[-1] if res.kind_of? Array
121
147
  lines.index( res ) if res
122
148
  end
123
149
 
124
150
  private
151
+
125
152
  def detect_encoding
126
- buffer_encoding = @buffer.encoding
153
+ scanner_encoding = @scanner.rest.encoding
127
154
  detected_encoding = "UTF-8"
128
155
  begin
129
- @buffer.force_encoding("ASCII-8BIT")
130
- if @buffer[0, 2] == "\xfe\xff"
131
- @buffer[0, 2] = ""
156
+ @scanner.string.force_encoding("ASCII-8BIT")
157
+ if @scanner.scan(/\xfe\xff/n)
132
158
  detected_encoding = "UTF-16BE"
133
- elsif @buffer[0, 2] == "\xff\xfe"
134
- @buffer[0, 2] = ""
159
+ elsif @scanner.scan(/\xff\xfe/n)
135
160
  detected_encoding = "UTF-16LE"
136
- elsif @buffer[0, 3] == "\xef\xbb\xbf"
137
- @buffer[0, 3] = ""
161
+ elsif @scanner.scan(/\xef\xbb\xbf/n)
138
162
  detected_encoding = "UTF-8"
139
163
  end
140
164
  ensure
141
- @buffer.force_encoding(buffer_encoding)
165
+ @scanner.string.force_encoding(scanner_encoding)
142
166
  end
143
167
  self.encoding = detected_encoding
144
168
  end
145
169
 
146
170
  def encoding_updated
147
171
  if @encoding != 'UTF-8'
148
- @buffer = decode(@buffer)
172
+ @scanner.string = decode(@scanner.rest)
149
173
  @to_utf = true
150
174
  else
151
175
  @to_utf = false
152
- @buffer.force_encoding ::Encoding::UTF_8
176
+ @scanner.string.force_encoding(::Encoding::UTF_8)
153
177
  end
154
178
  end
155
179
  end
@@ -172,7 +196,7 @@ module REXML
172
196
  end
173
197
 
174
198
  if !@to_utf and
175
- @buffer.respond_to?(:force_encoding) and
199
+ @orig.respond_to?(:force_encoding) and
176
200
  @source.respond_to?(:external_encoding) and
177
201
  @source.external_encoding != ::Encoding::UTF_8
178
202
  @force_utf8 = true
@@ -181,65 +205,72 @@ module REXML
181
205
  end
182
206
  end
183
207
 
184
- def scan(pattern, cons=false)
185
- rv = super
186
- # You'll notice that this next section is very similar to the same
187
- # section in match(), but just a liiittle different. This is
188
- # because it is a touch faster to do it this way with scan()
189
- # than the way match() does it; enough faster to warrant duplicating
190
- # some code
191
- if rv.size == 0
192
- until @buffer =~ pattern or @source.nil?
193
- begin
194
- @buffer << readline
195
- rescue Iconv::IllegalSequence
196
- raise
197
- rescue
198
- @source = nil
208
+ def read(term = nil, min_bytes = 1)
209
+ term = encode(term) if term
210
+ begin
211
+ str = readline(term)
212
+ @scanner << str
213
+ read_bytes = str.bytesize
214
+ begin
215
+ while read_bytes < min_bytes
216
+ str = readline(term)
217
+ @scanner << str
218
+ read_bytes += str.bytesize
199
219
  end
220
+ rescue IOError
200
221
  end
201
- rv = super
222
+ true
223
+ rescue Exception, NameError
224
+ @source = nil
225
+ false
202
226
  end
203
- rv.taint if RUBY_VERSION < '2.7'
204
- rv
205
227
  end
206
228
 
207
- def read
208
- begin
209
- @buffer << readline
210
- rescue Exception, NameError
211
- @source = nil
229
+ def read_until(term)
230
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
231
+ term = @term_encord[term] ||= encode(term)
232
+ until str = @scanner.scan_until(pattern)
233
+ break if @source.nil?
234
+ break if @source.eof?
235
+ @scanner << readline(term)
236
+ end
237
+ if str
238
+ read if @scanner.eos? and !@source.eof?
239
+ str
240
+ else
241
+ rest = @scanner.rest
242
+ @scanner.pos = @scanner.string.bytesize
243
+ rest
212
244
  end
213
245
  end
214
246
 
215
- def consume( pattern )
216
- match( pattern, true )
247
+ def ensure_buffer
248
+ read if @scanner.eos? && @source
217
249
  end
218
250
 
219
251
  def match( pattern, cons=false )
220
- rv = pattern.match(@buffer)
221
- @buffer = $' if cons and rv
222
- while !rv and @source
223
- begin
224
- @buffer << readline
225
- rv = pattern.match(@buffer)
226
- @buffer = $' if cons and rv
227
- rescue
228
- @source = nil
252
+ # To avoid performance issue, we need to increase bytes to read per scan
253
+ min_bytes = 1
254
+ while true
255
+ if cons
256
+ md = @scanner.scan(pattern)
257
+ else
258
+ md = @scanner.check(pattern)
229
259
  end
260
+ break if md
261
+ return nil if pattern.is_a?(String)
262
+ return nil if @source.nil?
263
+ return nil unless read(nil, min_bytes)
264
+ min_bytes *= 2
230
265
  end
231
- rv.taint if RUBY_VERSION < '2.7'
232
- rv
266
+
267
+ md.nil? ? nil : @scanner
233
268
  end
234
269
 
235
270
  def empty?
236
271
  super and ( @source.nil? || @source.eof? )
237
272
  end
238
273
 
239
- def position
240
- @er_source.pos rescue 0
241
- end
242
-
243
274
  # @return the current line in the source
244
275
  def current_line
245
276
  begin
@@ -263,15 +294,20 @@ module REXML
263
294
  end
264
295
 
265
296
  private
266
- def readline
267
- str = @source.readline(@line_break)
297
+ def readline(term = nil)
268
298
  if @pending_buffer
299
+ begin
300
+ str = @source.readline(term || @line_break)
301
+ rescue IOError
302
+ end
269
303
  if str.nil?
270
304
  str = @pending_buffer
271
305
  else
272
306
  str = @pending_buffer + str
273
307
  end
274
308
  @pending_buffer = nil
309
+ else
310
+ str = @source.readline(term || @line_break)
275
311
  end
276
312
  return nil if str.nil?
277
313
 
@@ -290,7 +326,7 @@ module REXML
290
326
  @source.set_encoding(@encoding, @encoding)
291
327
  end
292
328
  @line_break = encode(">")
293
- @pending_buffer, @buffer = @buffer, ""
329
+ @pending_buffer, @scanner.string = @scanner.rest, ""
294
330
  @pending_buffer.force_encoding(@encoding)
295
331
  super
296
332
  end
data/lib/rexml/text.rb CHANGED
@@ -151,25 +151,45 @@ module REXML
151
151
  end
152
152
  end
153
153
 
154
- # context sensitive
155
- string.scan(pattern) do
156
- if $1[-1] != ?;
157
- raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
158
- elsif $1[0] == ?&
159
- if $5 and $5[0] == ?#
160
- case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
161
- when *VALID_CHAR
154
+ pos = 0
155
+ while (index = string.index(/<|&/, pos))
156
+ if string[index] == "<"
157
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
158
+ end
159
+
160
+ unless (end_index = string.index(/[^\s];/, index + 1))
161
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
162
+ end
163
+
164
+ value = string[(index + 1)..end_index]
165
+ if /\s/.match?(value)
166
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
167
+ end
168
+
169
+ if value[0] == "#"
170
+ character_reference = value[1..-1]
171
+
172
+ unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference))
173
+ if character_reference[0] == "x" || character_reference[-1] == "x"
174
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
162
175
  else
163
- raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
176
+ raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
164
177
  end
165
- # FIXME: below can't work but this needs API change.
166
- # elsif @parent and $3 and !SUBSTITUTES.include?($1)
167
- # if !doctype or !doctype.entities.has_key?($3)
168
- # raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
169
- # end
170
178
  end
179
+
180
+ case (character_reference[0] == "x" ? character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i)
181
+ when *VALID_CHAR
182
+ else
183
+ raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
184
+ end
185
+ elsif !(/\A#{Entity::NAME}\z/um.match?(value))
186
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
171
187
  end
188
+
189
+ pos = end_index + 1
172
190
  end
191
+
192
+ string
173
193
  end
174
194
 
175
195
  def node_type
@@ -248,7 +268,8 @@ module REXML
248
268
  # u = Text.new( "sean russell", false, nil, true )
249
269
  # u.value #-> "sean russell"
250
270
  def value
251
- @unnormalized ||= Text::unnormalize( @string, doctype )
271
+ @unnormalized ||= Text::unnormalize(@string, doctype,
272
+ entity_expansion_text_limit: document&.entity_expansion_text_limit)
252
273
  end
253
274
 
254
275
  # Sets the contents of this text node. This expects the text to be
@@ -391,11 +412,12 @@ module REXML
391
412
  end
392
413
 
393
414
  # Unescapes all possible entities
394
- def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
415
+ def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil )
416
+ entity_expansion_text_limit ||= Security.entity_expansion_text_limit
395
417
  sum = 0
396
418
  string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) {
397
419
  s = Text.expand($&, doctype, filter)
398
- if sum + s.bytesize > Security.entity_expansion_text_limit
420
+ if sum + s.bytesize > entity_expansion_text_limit
399
421
  raise "entity expansion has grown too large"
400
422
  else
401
423
  sum += s.bytesize