rexml 3.2.6 → 3.3.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -47,6 +47,18 @@ module REXML
47
47
  @listeners << listener
48
48
  end
49
49
 
50
+ def entity_expansion_count
51
+ @parser.entity_expansion_count
52
+ end
53
+
54
+ def entity_expansion_limit=( limit )
55
+ @parser.entity_expansion_limit = limit
56
+ end
57
+
58
+ def entity_expansion_text_limit=( limit )
59
+ @parser.entity_expansion_text_limit = limit
60
+ end
61
+
50
62
  def each
51
63
  while has_next?
52
64
  yield self.pull
@@ -22,6 +22,18 @@ module REXML
22
22
  @parser.source
23
23
  end
24
24
 
25
+ def entity_expansion_count
26
+ @parser.entity_expansion_count
27
+ end
28
+
29
+ def entity_expansion_limit=( limit )
30
+ @parser.entity_expansion_limit = limit
31
+ end
32
+
33
+ def entity_expansion_text_limit=( limit )
34
+ @parser.entity_expansion_text_limit = limit
35
+ end
36
+
25
37
  def add_listener( listener )
26
38
  @parser.add_listener( listener )
27
39
  end
@@ -157,25 +169,8 @@ module REXML
157
169
  end
158
170
  end
159
171
  when :text
160
- #normalized = @parser.normalize( event[1] )
161
- #handle( :characters, normalized )
162
- copy = event[1].clone
163
-
164
- esub = proc { |match|
165
- if @entities.has_key?($1)
166
- @entities[$1].gsub(Text::REFERENCE, &esub)
167
- else
168
- match
169
- end
170
- }
171
-
172
- copy.gsub!( Text::REFERENCE, &esub )
173
- copy.gsub!( Text::NUMERICENTITY ) {|m|
174
- m=$1
175
- m = "0#{m}" if m[0] == ?x
176
- [Integer(m)].pack('U*')
177
- }
178
- handle( :characters, copy )
172
+ unnormalized = @parser.unnormalize( event[1], @entities )
173
+ handle( :characters, unnormalized )
179
174
  when :entitydecl
180
175
  handle_entitydecl( event )
181
176
  when :processing_instruction, :comment, :attlistdecl,
@@ -264,6 +259,8 @@ module REXML
264
259
  end
265
260
 
266
261
  def get_namespace( prefix )
262
+ return nil if @namespace_stack.empty?
263
+
267
264
  uris = (@namespace_stack.find_all { |ns| not ns[prefix].nil? }) ||
268
265
  (@namespace_stack.find { |ns| not ns[nil].nil? })
269
266
  uris[-1][prefix] unless uris.nil? or 0 == uris.size
@@ -7,37 +7,42 @@ module REXML
7
7
  def initialize source, listener
8
8
  @listener = listener
9
9
  @parser = BaseParser.new( source )
10
- @tag_stack = []
10
+ @entities = {}
11
11
  end
12
12
 
13
13
  def add_listener( listener )
14
14
  @parser.add_listener( listener )
15
15
  end
16
16
 
17
+ def entity_expansion_count
18
+ @parser.entity_expansion_count
19
+ end
20
+
21
+ def entity_expansion_limit=( limit )
22
+ @parser.entity_expansion_limit = limit
23
+ end
24
+
25
+ def entity_expansion_text_limit=( limit )
26
+ @parser.entity_expansion_text_limit = limit
27
+ end
28
+
17
29
  def parse
18
30
  # entity string
19
31
  while true
20
32
  event = @parser.pull
21
33
  case event[0]
22
34
  when :end_document
23
- unless @tag_stack.empty?
24
- tag_path = "/" + @tag_stack.join("/")
25
- raise ParseException.new("Missing end tag for '#{tag_path}'",
26
- @parser.source)
27
- end
28
35
  return
29
36
  when :start_element
30
- @tag_stack << event[1]
31
37
  attrs = event[2].each do |n, v|
32
38
  event[2][n] = @parser.unnormalize( v )
33
39
  end
34
40
  @listener.tag_start( event[1], attrs )
35
41
  when :end_element
36
42
  @listener.tag_end( event[1] )
37
- @tag_stack.pop
38
43
  when :text
39
- normalized = @parser.unnormalize( event[1] )
40
- @listener.text( normalized )
44
+ unnormalized = @parser.unnormalize( event[1], @entities )
45
+ @listener.text( unnormalized )
41
46
  when :processing_instruction
42
47
  @listener.instruction( *event[1,2] )
43
48
  when :start_doctype
@@ -48,6 +53,7 @@ module REXML
48
53
  when :comment, :attlistdecl, :cdata, :xmldecl, :elementdecl
49
54
  @listener.send( event[0].to_s, *event[1..-1] )
50
55
  when :entitydecl, :notationdecl
56
+ @entities[ event[1] ] = event[2] if event.size == 3
51
57
  @listener.send( event[0].to_s, event[1..-1] )
52
58
  when :externalentity
53
59
  entity_reference = event[1]
@@ -15,8 +15,6 @@ module REXML
15
15
  end
16
16
 
17
17
  def parse
18
- tag_stack = []
19
- in_doctype = false
20
18
  entities = nil
21
19
  begin
22
20
  while true
@@ -24,32 +22,24 @@ module REXML
24
22
  #STDERR.puts "TREEPARSER GOT #{event.inspect}"
25
23
  case event[0]
26
24
  when :end_document
27
- unless tag_stack.empty?
28
- raise ParseException.new("No close tag for #{@build_context.xpath}",
29
- @parser.source, @parser)
30
- end
31
25
  return
32
26
  when :start_element
33
- tag_stack.push(event[1])
34
27
  el = @build_context = @build_context.add_element( event[1] )
35
28
  event[2].each do |key, value|
36
29
  el.attributes[key]=Attribute.new(key,value,self)
37
30
  end
38
31
  when :end_element
39
- tag_stack.pop
40
32
  @build_context = @build_context.parent
41
33
  when :text
42
- if not in_doctype
43
- if @build_context[-1].instance_of? Text
44
- @build_context[-1] << event[1]
45
- else
46
- @build_context.add(
47
- Text.new(event[1], @build_context.whitespace, nil, true)
48
- ) unless (
49
- @build_context.ignore_whitespace_nodes and
50
- event[1].strip.size==0
51
- )
52
- end
34
+ if @build_context[-1].instance_of? Text
35
+ @build_context[-1] << event[1]
36
+ else
37
+ @build_context.add(
38
+ Text.new(event[1], @build_context.whitespace, nil, true)
39
+ ) unless (
40
+ @build_context.ignore_whitespace_nodes and
41
+ event[1].strip.size==0
42
+ )
53
43
  end
54
44
  when :comment
55
45
  c = Comment.new( event[1] )
@@ -60,14 +50,12 @@ module REXML
60
50
  when :processing_instruction
61
51
  @build_context.add( Instruction.new( event[1], event[2] ) )
62
52
  when :end_doctype
63
- in_doctype = false
64
53
  entities.each { |k,v| entities[k] = @build_context.entities[k].value }
65
54
  @build_context = @build_context.parent
66
55
  when :start_doctype
67
56
  doctype = DocType.new( event[1..-1], @build_context )
68
57
  @build_context = doctype
69
58
  entities = {}
70
- in_doctype = true
71
59
  when :attlistdecl
72
60
  n = AttlistDecl.new( event[1..-1] )
73
61
  @build_context.add( n )
data/lib/rexml/rexml.rb CHANGED
@@ -31,7 +31,7 @@
31
31
  module REXML
32
32
  COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
33
33
  DATE = "2008/019"
34
- VERSION = "3.2.6"
34
+ VERSION = "3.3.9"
35
35
  REVISION = ""
36
36
 
37
37
  Copyright = COPYRIGHT
data/lib/rexml/source.rb CHANGED
@@ -1,8 +1,28 @@
1
1
  # coding: US-ASCII
2
2
  # frozen_string_literal: false
3
+
4
+ require "strscan"
5
+
3
6
  require_relative 'encoding'
4
7
 
5
8
  module REXML
9
+ if StringScanner::Version < "1.0.0"
10
+ module StringScannerCheckScanString
11
+ refine StringScanner do
12
+ def check(pattern)
13
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
14
+ super(pattern)
15
+ end
16
+
17
+ def scan(pattern)
18
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
19
+ super(pattern)
20
+ end
21
+ end
22
+ end
23
+ using StringScannerCheckScanString
24
+ end
25
+
6
26
  # Generates Source-s. USE THIS CLASS.
7
27
  class SourceFactory
8
28
  # Generates a Source object
@@ -30,26 +50,50 @@ module REXML
30
50
  # objects and provides consumption of text
31
51
  class Source
32
52
  include Encoding
33
- # The current buffer (what we're going to read next)
34
- attr_reader :buffer
35
53
  # The line number of the last consumed text
36
54
  attr_reader :line
37
55
  attr_reader :encoding
38
56
 
57
+ module Private
58
+ SCANNER_RESET_SIZE = 100000
59
+ PRE_DEFINED_TERM_PATTERNS = {}
60
+ pre_defined_terms = ["'", '"', "<"]
61
+ pre_defined_terms.each do |term|
62
+ PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
63
+ end
64
+ end
65
+ private_constant :Private
66
+
39
67
  # Constructor
40
68
  # @param arg must be a String, and should be a valid XML document
41
69
  # @param encoding if non-null, sets the encoding of the source to this
42
70
  # value, overriding all encoding detection
43
71
  def initialize(arg, encoding=nil)
44
- @orig = @buffer = arg
72
+ @orig = arg
73
+ @scanner = StringScanner.new(@orig)
45
74
  if encoding
46
75
  self.encoding = encoding
47
76
  else
48
77
  detect_encoding
49
78
  end
50
79
  @line = 0
80
+ @term_encord = {}
51
81
  end
52
82
 
83
+ # The current buffer (what we're going to read next)
84
+ def buffer
85
+ @scanner.rest
86
+ end
87
+
88
+ def drop_parsed_content
89
+ if @scanner.pos > Private::SCANNER_RESET_SIZE
90
+ @scanner.string = @scanner.rest
91
+ end
92
+ end
93
+
94
+ def buffer_encoding=(encoding)
95
+ @scanner.string.force_encoding(encoding)
96
+ end
53
97
 
54
98
  # Inherited from Encoding
55
99
  # Overridden to support optimized en/decoding
@@ -58,98 +102,78 @@ module REXML
58
102
  encoding_updated
59
103
  end
60
104
 
61
- # Scans the source for a given pattern. Note, that this is not your
62
- # usual scan() method. For one thing, the pattern argument has some
63
- # requirements; for another, the source can be consumed. You can easily
64
- # confuse this method. Originally, the patterns were easier
65
- # to construct and this method more robust, because this method
66
- # generated search regexps on the fly; however, this was
67
- # computationally expensive and slowed down the entire REXML package
68
- # considerably, since this is by far the most commonly called method.
69
- # @param pattern must be a Regexp, and must be in the form of
70
- # /^\s*(#{your pattern, with no groups})(.*)/. The first group
71
- # will be returned; the second group is used if the consume flag is
72
- # set.
73
- # @param consume if true, the pattern returned will be consumed, leaving
74
- # everything after it in the Source.
75
- # @return the pattern, if found, or nil if the Source is empty or the
76
- # pattern is not found.
77
- def scan(pattern, cons=false)
78
- return nil if @buffer.nil?
79
- rv = @buffer.scan(pattern)
80
- @buffer = $' if cons and rv.size>0
81
- rv
105
+ def read(term = nil)
82
106
  end
83
107
 
84
- def read
108
+ def read_until(term)
109
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
110
+ data = @scanner.scan_until(pattern)
111
+ unless data
112
+ data = @scanner.rest
113
+ @scanner.pos = @scanner.string.bytesize
114
+ end
115
+ data
85
116
  end
86
117
 
87
- def consume( pattern )
88
- @buffer = $' if pattern.match( @buffer )
118
+ def ensure_buffer
89
119
  end
90
120
 
91
- def match_to( char, pattern )
92
- return pattern.match(@buffer)
121
+ def match(pattern, cons=false)
122
+ if cons
123
+ @scanner.scan(pattern).nil? ? nil : @scanner
124
+ else
125
+ @scanner.check(pattern).nil? ? nil : @scanner
126
+ end
93
127
  end
94
128
 
95
- def match_to_consume( char, pattern )
96
- md = pattern.match(@buffer)
97
- @buffer = $'
98
- return md
129
+ def position
130
+ @scanner.pos
99
131
  end
100
132
 
101
- def match(pattern, cons=false)
102
- md = pattern.match(@buffer)
103
- @buffer = $' if cons and md
104
- return md
133
+ def position=(pos)
134
+ @scanner.pos = pos
105
135
  end
106
136
 
107
137
  # @return true if the Source is exhausted
108
138
  def empty?
109
- @buffer == ""
110
- end
111
-
112
- def position
113
- @orig.index( @buffer )
139
+ @scanner.eos?
114
140
  end
115
141
 
116
142
  # @return the current line in the source
117
143
  def current_line
118
144
  lines = @orig.split
119
- res = lines.grep @buffer[0..30]
145
+ res = lines.grep @scanner.rest[0..30]
120
146
  res = res[-1] if res.kind_of? Array
121
147
  lines.index( res ) if res
122
148
  end
123
149
 
124
150
  private
151
+
125
152
  def detect_encoding
126
- buffer_encoding = @buffer.encoding
153
+ scanner_encoding = @scanner.rest.encoding
127
154
  detected_encoding = "UTF-8"
128
155
  begin
129
- @buffer.force_encoding("ASCII-8BIT")
130
- if @buffer[0, 2] == "\xfe\xff"
131
- @buffer[0, 2] = ""
156
+ @scanner.string.force_encoding("ASCII-8BIT")
157
+ if @scanner.scan(/\xfe\xff/n)
132
158
  detected_encoding = "UTF-16BE"
133
- elsif @buffer[0, 2] == "\xff\xfe"
134
- @buffer[0, 2] = ""
159
+ elsif @scanner.scan(/\xff\xfe/n)
135
160
  detected_encoding = "UTF-16LE"
136
- elsif @buffer[0, 3] == "\xef\xbb\xbf"
137
- @buffer[0, 3] = ""
161
+ elsif @scanner.scan(/\xef\xbb\xbf/n)
138
162
  detected_encoding = "UTF-8"
139
163
  end
140
164
  ensure
141
- @buffer.force_encoding(buffer_encoding)
165
+ @scanner.string.force_encoding(scanner_encoding)
142
166
  end
143
167
  self.encoding = detected_encoding
144
168
  end
145
169
 
146
170
  def encoding_updated
147
171
  if @encoding != 'UTF-8'
148
- @buffer = decode(@buffer)
172
+ @scanner.string = decode(@scanner.rest)
149
173
  @to_utf = true
150
174
  else
151
175
  @to_utf = false
152
- @buffer.force_encoding ::Encoding::UTF_8
176
+ @scanner.string.force_encoding(::Encoding::UTF_8)
153
177
  end
154
178
  end
155
179
  end
@@ -172,7 +196,7 @@ module REXML
172
196
  end
173
197
 
174
198
  if !@to_utf and
175
- @buffer.respond_to?(:force_encoding) and
199
+ @orig.respond_to?(:force_encoding) and
176
200
  @source.respond_to?(:external_encoding) and
177
201
  @source.external_encoding != ::Encoding::UTF_8
178
202
  @force_utf8 = true
@@ -181,65 +205,72 @@ module REXML
181
205
  end
182
206
  end
183
207
 
184
- def scan(pattern, cons=false)
185
- rv = super
186
- # You'll notice that this next section is very similar to the same
187
- # section in match(), but just a liiittle different. This is
188
- # because it is a touch faster to do it this way with scan()
189
- # than the way match() does it; enough faster to warrant duplicating
190
- # some code
191
- if rv.size == 0
192
- until @buffer =~ pattern or @source.nil?
193
- begin
194
- @buffer << readline
195
- rescue Iconv::IllegalSequence
196
- raise
197
- rescue
198
- @source = nil
208
+ def read(term = nil, min_bytes = 1)
209
+ term = encode(term) if term
210
+ begin
211
+ str = readline(term)
212
+ @scanner << str
213
+ read_bytes = str.bytesize
214
+ begin
215
+ while read_bytes < min_bytes
216
+ str = readline(term)
217
+ @scanner << str
218
+ read_bytes += str.bytesize
199
219
  end
220
+ rescue IOError
200
221
  end
201
- rv = super
222
+ true
223
+ rescue Exception, NameError
224
+ @source = nil
225
+ false
202
226
  end
203
- rv.taint if RUBY_VERSION < '2.7'
204
- rv
205
227
  end
206
228
 
207
- def read
208
- begin
209
- @buffer << readline
210
- rescue Exception, NameError
211
- @source = nil
229
+ def read_until(term)
230
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
231
+ term = @term_encord[term] ||= encode(term)
232
+ until str = @scanner.scan_until(pattern)
233
+ break if @source.nil?
234
+ break if @source.eof?
235
+ @scanner << readline(term)
236
+ end
237
+ if str
238
+ read if @scanner.eos? and !@source.eof?
239
+ str
240
+ else
241
+ rest = @scanner.rest
242
+ @scanner.pos = @scanner.string.bytesize
243
+ rest
212
244
  end
213
245
  end
214
246
 
215
- def consume( pattern )
216
- match( pattern, true )
247
+ def ensure_buffer
248
+ read if @scanner.eos? && @source
217
249
  end
218
250
 
219
251
  def match( pattern, cons=false )
220
- rv = pattern.match(@buffer)
221
- @buffer = $' if cons and rv
222
- while !rv and @source
223
- begin
224
- @buffer << readline
225
- rv = pattern.match(@buffer)
226
- @buffer = $' if cons and rv
227
- rescue
228
- @source = nil
252
+ # To avoid performance issue, we need to increase bytes to read per scan
253
+ min_bytes = 1
254
+ while true
255
+ if cons
256
+ md = @scanner.scan(pattern)
257
+ else
258
+ md = @scanner.check(pattern)
229
259
  end
260
+ break if md
261
+ return nil if pattern.is_a?(String)
262
+ return nil if @source.nil?
263
+ return nil unless read(nil, min_bytes)
264
+ min_bytes *= 2
230
265
  end
231
- rv.taint if RUBY_VERSION < '2.7'
232
- rv
266
+
267
+ md.nil? ? nil : @scanner
233
268
  end
234
269
 
235
270
  def empty?
236
271
  super and ( @source.nil? || @source.eof? )
237
272
  end
238
273
 
239
- def position
240
- @er_source.pos rescue 0
241
- end
242
-
243
274
  # @return the current line in the source
244
275
  def current_line
245
276
  begin
@@ -263,15 +294,20 @@ module REXML
263
294
  end
264
295
 
265
296
  private
266
- def readline
267
- str = @source.readline(@line_break)
297
+ def readline(term = nil)
268
298
  if @pending_buffer
299
+ begin
300
+ str = @source.readline(term || @line_break)
301
+ rescue IOError
302
+ end
269
303
  if str.nil?
270
304
  str = @pending_buffer
271
305
  else
272
306
  str = @pending_buffer + str
273
307
  end
274
308
  @pending_buffer = nil
309
+ else
310
+ str = @source.readline(term || @line_break)
275
311
  end
276
312
  return nil if str.nil?
277
313
 
@@ -290,7 +326,7 @@ module REXML
290
326
  @source.set_encoding(@encoding, @encoding)
291
327
  end
292
328
  @line_break = encode(">")
293
- @pending_buffer, @buffer = @buffer, ""
329
+ @pending_buffer, @scanner.string = @scanner.rest, ""
294
330
  @pending_buffer.force_encoding(@encoding)
295
331
  super
296
332
  end
data/lib/rexml/text.rb CHANGED
@@ -151,25 +151,45 @@ module REXML
151
151
  end
152
152
  end
153
153
 
154
- # context sensitive
155
- string.scan(pattern) do
156
- if $1[-1] != ?;
157
- raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
158
- elsif $1[0] == ?&
159
- if $5 and $5[0] == ?#
160
- case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
161
- when *VALID_CHAR
154
+ pos = 0
155
+ while (index = string.index(/<|&/, pos))
156
+ if string[index] == "<"
157
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
158
+ end
159
+
160
+ unless (end_index = string.index(/[^\s];/, index + 1))
161
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
162
+ end
163
+
164
+ value = string[(index + 1)..end_index]
165
+ if /\s/.match?(value)
166
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
167
+ end
168
+
169
+ if value[0] == "#"
170
+ character_reference = value[1..-1]
171
+
172
+ unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference))
173
+ if character_reference[0] == "x" || character_reference[-1] == "x"
174
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
162
175
  else
163
- raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
176
+ raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
164
177
  end
165
- # FIXME: below can't work but this needs API change.
166
- # elsif @parent and $3 and !SUBSTITUTES.include?($1)
167
- # if !doctype or !doctype.entities.has_key?($3)
168
- # raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
169
- # end
170
178
  end
179
+
180
+ case (character_reference[0] == "x" ? character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i)
181
+ when *VALID_CHAR
182
+ else
183
+ raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
184
+ end
185
+ elsif !(/\A#{Entity::NAME}\z/um.match?(value))
186
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
171
187
  end
188
+
189
+ pos = end_index + 1
172
190
  end
191
+
192
+ string
173
193
  end
174
194
 
175
195
  def node_type
@@ -248,7 +268,8 @@ module REXML
248
268
  # u = Text.new( "sean russell", false, nil, true )
249
269
  # u.value #-> "sean russell"
250
270
  def value
251
- @unnormalized ||= Text::unnormalize( @string, doctype )
271
+ @unnormalized ||= Text::unnormalize(@string, doctype,
272
+ entity_expansion_text_limit: document&.entity_expansion_text_limit)
252
273
  end
253
274
 
254
275
  # Sets the contents of this text node. This expects the text to be
@@ -391,11 +412,12 @@ module REXML
391
412
  end
392
413
 
393
414
  # Unescapes all possible entities
394
- def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
415
+ def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil )
416
+ entity_expansion_text_limit ||= Security.entity_expansion_text_limit
395
417
  sum = 0
396
418
  string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) {
397
419
  s = Text.expand($&, doctype, filter)
398
- if sum + s.bytesize > Security.entity_expansion_text_limit
420
+ if sum + s.bytesize > entity_expansion_text_limit
399
421
  raise "entity expansion has grown too large"
400
422
  else
401
423
  sum += s.bytesize