rexml 3.2.6 → 3.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -47,6 +47,18 @@ module REXML
47
47
  @listeners << listener
48
48
  end
49
49
 
50
+ def entity_expansion_count
51
+ @parser.entity_expansion_count
52
+ end
53
+
54
+ def entity_expansion_limit=( limit )
55
+ @parser.entity_expansion_limit = limit
56
+ end
57
+
58
+ def entity_expansion_text_limit=( limit )
59
+ @parser.entity_expansion_text_limit = limit
60
+ end
61
+
50
62
  def each
51
63
  while has_next?
52
64
  yield self.pull
@@ -81,6 +93,10 @@ module REXML
81
93
  def unshift token
82
94
  @my_stack.unshift token
83
95
  end
96
+
97
+ def reset
98
+ @parser.reset
99
+ end
84
100
  end
85
101
 
86
102
  # A parsing event. The contents of the event are accessed as an +Array?,
@@ -22,6 +22,18 @@ module REXML
22
22
  @parser.source
23
23
  end
24
24
 
25
+ def entity_expansion_count
26
+ @parser.entity_expansion_count
27
+ end
28
+
29
+ def entity_expansion_limit=( limit )
30
+ @parser.entity_expansion_limit = limit
31
+ end
32
+
33
+ def entity_expansion_text_limit=( limit )
34
+ @parser.entity_expansion_text_limit = limit
35
+ end
36
+
25
37
  def add_listener( listener )
26
38
  @parser.add_listener( listener )
27
39
  end
@@ -157,25 +169,8 @@ module REXML
157
169
  end
158
170
  end
159
171
  when :text
160
- #normalized = @parser.normalize( event[1] )
161
- #handle( :characters, normalized )
162
- copy = event[1].clone
163
-
164
- esub = proc { |match|
165
- if @entities.has_key?($1)
166
- @entities[$1].gsub(Text::REFERENCE, &esub)
167
- else
168
- match
169
- end
170
- }
171
-
172
- copy.gsub!( Text::REFERENCE, &esub )
173
- copy.gsub!( Text::NUMERICENTITY ) {|m|
174
- m=$1
175
- m = "0#{m}" if m[0] == ?x
176
- [Integer(m)].pack('U*')
177
- }
178
- handle( :characters, copy )
172
+ unnormalized = @parser.unnormalize( event[1], @entities )
173
+ handle( :characters, unnormalized )
179
174
  when :entitydecl
180
175
  handle_entitydecl( event )
181
176
  when :processing_instruction, :comment, :attlistdecl,
@@ -264,6 +259,8 @@ module REXML
264
259
  end
265
260
 
266
261
  def get_namespace( prefix )
262
+ return nil if @namespace_stack.empty?
263
+
267
264
  uris = (@namespace_stack.find_all { |ns| not ns[prefix].nil? }) ||
268
265
  (@namespace_stack.find { |ns| not ns[nil].nil? })
269
266
  uris[-1][prefix] unless uris.nil? or 0 == uris.size
@@ -7,37 +7,42 @@ module REXML
7
7
  def initialize source, listener
8
8
  @listener = listener
9
9
  @parser = BaseParser.new( source )
10
- @tag_stack = []
10
+ @entities = {}
11
11
  end
12
12
 
13
13
  def add_listener( listener )
14
14
  @parser.add_listener( listener )
15
15
  end
16
16
 
17
+ def entity_expansion_count
18
+ @parser.entity_expansion_count
19
+ end
20
+
21
+ def entity_expansion_limit=( limit )
22
+ @parser.entity_expansion_limit = limit
23
+ end
24
+
25
+ def entity_expansion_text_limit=( limit )
26
+ @parser.entity_expansion_text_limit = limit
27
+ end
28
+
17
29
  def parse
18
30
  # entity string
19
31
  while true
20
32
  event = @parser.pull
21
33
  case event[0]
22
34
  when :end_document
23
- unless @tag_stack.empty?
24
- tag_path = "/" + @tag_stack.join("/")
25
- raise ParseException.new("Missing end tag for '#{tag_path}'",
26
- @parser.source)
27
- end
28
35
  return
29
36
  when :start_element
30
- @tag_stack << event[1]
31
37
  attrs = event[2].each do |n, v|
32
38
  event[2][n] = @parser.unnormalize( v )
33
39
  end
34
40
  @listener.tag_start( event[1], attrs )
35
41
  when :end_element
36
42
  @listener.tag_end( event[1] )
37
- @tag_stack.pop
38
43
  when :text
39
- normalized = @parser.unnormalize( event[1] )
40
- @listener.text( normalized )
44
+ unnormalized = @parser.unnormalize( event[1], @entities )
45
+ @listener.text( unnormalized )
41
46
  when :processing_instruction
42
47
  @listener.instruction( *event[1,2] )
43
48
  when :start_doctype
@@ -48,6 +53,7 @@ module REXML
48
53
  when :comment, :attlistdecl, :cdata, :xmldecl, :elementdecl
49
54
  @listener.send( event[0].to_s, *event[1..-1] )
50
55
  when :entitydecl, :notationdecl
56
+ @entities[ event[1] ] = event[2] if event.size == 3
51
57
  @listener.send( event[0].to_s, event[1..-1] )
52
58
  when :externalentity
53
59
  entity_reference = event[1]
@@ -15,8 +15,6 @@ module REXML
15
15
  end
16
16
 
17
17
  def parse
18
- tag_stack = []
19
- in_doctype = false
20
18
  entities = nil
21
19
  begin
22
20
  while true
@@ -24,32 +22,24 @@ module REXML
24
22
  #STDERR.puts "TREEPARSER GOT #{event.inspect}"
25
23
  case event[0]
26
24
  when :end_document
27
- unless tag_stack.empty?
28
- raise ParseException.new("No close tag for #{@build_context.xpath}",
29
- @parser.source, @parser)
30
- end
31
25
  return
32
26
  when :start_element
33
- tag_stack.push(event[1])
34
27
  el = @build_context = @build_context.add_element( event[1] )
35
28
  event[2].each do |key, value|
36
29
  el.attributes[key]=Attribute.new(key,value,self)
37
30
  end
38
31
  when :end_element
39
- tag_stack.pop
40
32
  @build_context = @build_context.parent
41
33
  when :text
42
- if not in_doctype
43
- if @build_context[-1].instance_of? Text
44
- @build_context[-1] << event[1]
45
- else
46
- @build_context.add(
47
- Text.new(event[1], @build_context.whitespace, nil, true)
48
- ) unless (
49
- @build_context.ignore_whitespace_nodes and
50
- event[1].strip.size==0
51
- )
52
- end
34
+ if @build_context[-1].instance_of? Text
35
+ @build_context[-1] << event[1]
36
+ else
37
+ @build_context.add(
38
+ Text.new(event[1], @build_context.whitespace, nil, true)
39
+ ) unless (
40
+ @build_context.ignore_whitespace_nodes and
41
+ event[1].strip.size==0
42
+ )
53
43
  end
54
44
  when :comment
55
45
  c = Comment.new( event[1] )
@@ -60,14 +50,12 @@ module REXML
60
50
  when :processing_instruction
61
51
  @build_context.add( Instruction.new( event[1], event[2] ) )
62
52
  when :end_doctype
63
- in_doctype = false
64
53
  entities.each { |k,v| entities[k] = @build_context.entities[k].value }
65
54
  @build_context = @build_context.parent
66
55
  when :start_doctype
67
56
  doctype = DocType.new( event[1..-1], @build_context )
68
57
  @build_context = doctype
69
58
  entities = {}
70
- in_doctype = true
71
59
  when :attlistdecl
72
60
  n = AttlistDecl.new( event[1..-1] )
73
61
  @build_context.add( n )
data/lib/rexml/rexml.rb CHANGED
@@ -31,7 +31,7 @@
31
31
  module REXML
32
32
  COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
33
33
  DATE = "2008/019"
34
- VERSION = "3.2.6"
34
+ VERSION = "3.4.0"
35
35
  REVISION = ""
36
36
 
37
37
  Copyright = COPYRIGHT
data/lib/rexml/source.rb CHANGED
@@ -1,8 +1,39 @@
1
1
  # coding: US-ASCII
2
2
  # frozen_string_literal: false
3
+
4
+ require "stringio"
5
+ require "strscan"
6
+
3
7
  require_relative 'encoding'
4
8
 
5
9
  module REXML
10
+ if StringScanner::Version < "1.0.0"
11
+ module StringScannerCheckScanString
12
+ refine StringScanner do
13
+ def check(pattern)
14
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
15
+ super(pattern)
16
+ end
17
+
18
+ def scan(pattern)
19
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
20
+ super(pattern)
21
+ end
22
+
23
+ def match?(pattern)
24
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
25
+ super(pattern)
26
+ end
27
+
28
+ def skip(pattern)
29
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
30
+ super(pattern)
31
+ end
32
+ end
33
+ end
34
+ using StringScannerCheckScanString
35
+ end
36
+
6
37
  # Generates Source-s. USE THIS CLASS.
7
38
  class SourceFactory
8
39
  # Generates a Source object
@@ -15,7 +46,6 @@ module REXML
15
46
  arg.respond_to? :eof?
16
47
  IOSource.new(arg)
17
48
  elsif arg.respond_to? :to_str
18
- require 'stringio'
19
49
  IOSource.new(StringIO.new(arg))
20
50
  elsif arg.kind_of? Source
21
51
  arg
@@ -30,26 +60,50 @@ module REXML
30
60
  # objects and provides consumption of text
31
61
  class Source
32
62
  include Encoding
33
- # The current buffer (what we're going to read next)
34
- attr_reader :buffer
35
63
  # The line number of the last consumed text
36
64
  attr_reader :line
37
65
  attr_reader :encoding
38
66
 
67
+ module Private
68
+ SCANNER_RESET_SIZE = 100000
69
+ PRE_DEFINED_TERM_PATTERNS = {}
70
+ pre_defined_terms = ["'", '"', "<"]
71
+ pre_defined_terms.each do |term|
72
+ PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
73
+ end
74
+ end
75
+ private_constant :Private
76
+
39
77
  # Constructor
40
78
  # @param arg must be a String, and should be a valid XML document
41
79
  # @param encoding if non-null, sets the encoding of the source to this
42
80
  # value, overriding all encoding detection
43
81
  def initialize(arg, encoding=nil)
44
- @orig = @buffer = arg
82
+ @orig = arg
83
+ @scanner = StringScanner.new(@orig)
45
84
  if encoding
46
85
  self.encoding = encoding
47
86
  else
48
87
  detect_encoding
49
88
  end
50
89
  @line = 0
90
+ @encoded_terms = {}
91
+ end
92
+
93
+ # The current buffer (what we're going to read next)
94
+ def buffer
95
+ @scanner.rest
51
96
  end
52
97
 
98
+ def drop_parsed_content
99
+ if @scanner.pos > Private::SCANNER_RESET_SIZE
100
+ @scanner.string = @scanner.rest
101
+ end
102
+ end
103
+
104
+ def buffer_encoding=(encoding)
105
+ @scanner.string.force_encoding(encoding)
106
+ end
53
107
 
54
108
  # Inherited from Encoding
55
109
  # Overridden to support optimized en/decoding
@@ -58,98 +112,86 @@ module REXML
58
112
  encoding_updated
59
113
  end
60
114
 
61
- # Scans the source for a given pattern. Note, that this is not your
62
- # usual scan() method. For one thing, the pattern argument has some
63
- # requirements; for another, the source can be consumed. You can easily
64
- # confuse this method. Originally, the patterns were easier
65
- # to construct and this method more robust, because this method
66
- # generated search regexps on the fly; however, this was
67
- # computationally expensive and slowed down the entire REXML package
68
- # considerably, since this is by far the most commonly called method.
69
- # @param pattern must be a Regexp, and must be in the form of
70
- # /^\s*(#{your pattern, with no groups})(.*)/. The first group
71
- # will be returned; the second group is used if the consume flag is
72
- # set.
73
- # @param consume if true, the pattern returned will be consumed, leaving
74
- # everything after it in the Source.
75
- # @return the pattern, if found, or nil if the Source is empty or the
76
- # pattern is not found.
77
- def scan(pattern, cons=false)
78
- return nil if @buffer.nil?
79
- rv = @buffer.scan(pattern)
80
- @buffer = $' if cons and rv.size>0
81
- rv
115
+ def read(term = nil)
82
116
  end
83
117
 
84
- def read
118
+ def read_until(term)
119
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
120
+ data = @scanner.scan_until(pattern)
121
+ unless data
122
+ data = @scanner.rest
123
+ @scanner.pos = @scanner.string.bytesize
124
+ end
125
+ data
85
126
  end
86
127
 
87
- def consume( pattern )
88
- @buffer = $' if pattern.match( @buffer )
128
+ def ensure_buffer
89
129
  end
90
130
 
91
- def match_to( char, pattern )
92
- return pattern.match(@buffer)
131
+ def match(pattern, cons=false)
132
+ if cons
133
+ @scanner.scan(pattern).nil? ? nil : @scanner
134
+ else
135
+ @scanner.check(pattern).nil? ? nil : @scanner
136
+ end
93
137
  end
94
138
 
95
- def match_to_consume( char, pattern )
96
- md = pattern.match(@buffer)
97
- @buffer = $'
98
- return md
139
+ def match?(pattern, cons=false)
140
+ if cons
141
+ !@scanner.skip(pattern).nil?
142
+ else
143
+ !@scanner.match?(pattern).nil?
144
+ end
99
145
  end
100
146
 
101
- def match(pattern, cons=false)
102
- md = pattern.match(@buffer)
103
- @buffer = $' if cons and md
104
- return md
147
+ def position
148
+ @scanner.pos
105
149
  end
106
150
 
107
- # @return true if the Source is exhausted
108
- def empty?
109
- @buffer == ""
151
+ def position=(pos)
152
+ @scanner.pos = pos
110
153
  end
111
154
 
112
- def position
113
- @orig.index( @buffer )
155
+ # @return true if the Source is exhausted
156
+ def empty?
157
+ @scanner.eos?
114
158
  end
115
159
 
116
160
  # @return the current line in the source
117
161
  def current_line
118
162
  lines = @orig.split
119
- res = lines.grep @buffer[0..30]
163
+ res = lines.grep @scanner.rest[0..30]
120
164
  res = res[-1] if res.kind_of? Array
121
165
  lines.index( res ) if res
122
166
  end
123
167
 
124
168
  private
169
+
125
170
  def detect_encoding
126
- buffer_encoding = @buffer.encoding
171
+ scanner_encoding = @scanner.rest.encoding
127
172
  detected_encoding = "UTF-8"
128
173
  begin
129
- @buffer.force_encoding("ASCII-8BIT")
130
- if @buffer[0, 2] == "\xfe\xff"
131
- @buffer[0, 2] = ""
174
+ @scanner.string.force_encoding("ASCII-8BIT")
175
+ if @scanner.scan(/\xfe\xff/n)
132
176
  detected_encoding = "UTF-16BE"
133
- elsif @buffer[0, 2] == "\xff\xfe"
134
- @buffer[0, 2] = ""
177
+ elsif @scanner.scan(/\xff\xfe/n)
135
178
  detected_encoding = "UTF-16LE"
136
- elsif @buffer[0, 3] == "\xef\xbb\xbf"
137
- @buffer[0, 3] = ""
179
+ elsif @scanner.scan(/\xef\xbb\xbf/n)
138
180
  detected_encoding = "UTF-8"
139
181
  end
140
182
  ensure
141
- @buffer.force_encoding(buffer_encoding)
183
+ @scanner.string.force_encoding(scanner_encoding)
142
184
  end
143
185
  self.encoding = detected_encoding
144
186
  end
145
187
 
146
188
  def encoding_updated
147
189
  if @encoding != 'UTF-8'
148
- @buffer = decode(@buffer)
190
+ @scanner.string = decode(@scanner.rest)
149
191
  @to_utf = true
150
192
  else
151
193
  @to_utf = false
152
- @buffer.force_encoding ::Encoding::UTF_8
194
+ @scanner.string.force_encoding(::Encoding::UTF_8)
153
195
  end
154
196
  end
155
197
  end
@@ -172,7 +214,7 @@ module REXML
172
214
  end
173
215
 
174
216
  if !@to_utf and
175
- @buffer.respond_to?(:force_encoding) and
217
+ @orig.respond_to?(:force_encoding) and
176
218
  @source.respond_to?(:external_encoding) and
177
219
  @source.external_encoding != ::Encoding::UTF_8
178
220
  @force_utf8 = true
@@ -181,63 +223,87 @@ module REXML
181
223
  end
182
224
  end
183
225
 
184
- def scan(pattern, cons=false)
185
- rv = super
186
- # You'll notice that this next section is very similar to the same
187
- # section in match(), but just a liiittle different. This is
188
- # because it is a touch faster to do it this way with scan()
189
- # than the way match() does it; enough faster to warrant duplicating
190
- # some code
191
- if rv.size == 0
192
- until @buffer =~ pattern or @source.nil?
193
- begin
194
- @buffer << readline
195
- rescue Iconv::IllegalSequence
196
- raise
197
- rescue
198
- @source = nil
226
+ def read(term = nil, min_bytes = 1)
227
+ term = encode(term) if term
228
+ begin
229
+ str = readline(term)
230
+ @scanner << str
231
+ read_bytes = str.bytesize
232
+ begin
233
+ while read_bytes < min_bytes
234
+ str = readline(term)
235
+ @scanner << str
236
+ read_bytes += str.bytesize
199
237
  end
238
+ rescue IOError
200
239
  end
201
- rv = super
240
+ true
241
+ rescue Exception, NameError
242
+ @source = nil
243
+ false
202
244
  end
203
- rv.taint if RUBY_VERSION < '2.7'
204
- rv
205
245
  end
206
246
 
207
- def read
208
- begin
209
- @buffer << readline
210
- rescue Exception, NameError
211
- @source = nil
247
+ def read_until(term)
248
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
249
+ term = @encoded_terms[term] ||= encode(term)
250
+ until str = @scanner.scan_until(pattern)
251
+ break if @source.nil?
252
+ break if @source.eof?
253
+ @scanner << readline(term)
254
+ end
255
+ if str
256
+ read if @scanner.eos? and !@source.eof?
257
+ str
258
+ else
259
+ rest = @scanner.rest
260
+ @scanner.pos = @scanner.string.bytesize
261
+ rest
212
262
  end
213
263
  end
214
264
 
215
- def consume( pattern )
216
- match( pattern, true )
265
+ def ensure_buffer
266
+ read if @scanner.eos? && @source
217
267
  end
218
268
 
219
269
  def match( pattern, cons=false )
220
- rv = pattern.match(@buffer)
221
- @buffer = $' if cons and rv
222
- while !rv and @source
223
- begin
224
- @buffer << readline
225
- rv = pattern.match(@buffer)
226
- @buffer = $' if cons and rv
227
- rescue
228
- @source = nil
270
+ # To avoid performance issue, we need to increase bytes to read per scan
271
+ min_bytes = 1
272
+ while true
273
+ if cons
274
+ md = @scanner.scan(pattern)
275
+ else
276
+ md = @scanner.check(pattern)
229
277
  end
278
+ break if md
279
+ return nil if pattern.is_a?(String)
280
+ return nil if @source.nil?
281
+ return nil unless read(nil, min_bytes)
282
+ min_bytes *= 2
230
283
  end
231
- rv.taint if RUBY_VERSION < '2.7'
232
- rv
284
+
285
+ md.nil? ? nil : @scanner
233
286
  end
234
287
 
235
- def empty?
236
- super and ( @source.nil? || @source.eof? )
288
+ def match?( pattern, cons=false )
289
+ # To avoid performance issue, we need to increase bytes to read per scan
290
+ min_bytes = 1
291
+ while true
292
+ if cons
293
+ n_matched_bytes = @scanner.skip(pattern)
294
+ else
295
+ n_matched_bytes = @scanner.match?(pattern)
296
+ end
297
+ return true if n_matched_bytes
298
+ return false if pattern.is_a?(String)
299
+ return false if @source.nil?
300
+ return false unless read(nil, min_bytes)
301
+ min_bytes *= 2
302
+ end
237
303
  end
238
304
 
239
- def position
240
- @er_source.pos rescue 0
305
+ def empty?
306
+ super and ( @source.nil? || @source.eof? )
241
307
  end
242
308
 
243
309
  # @return the current line in the source
@@ -255,7 +321,7 @@ module REXML
255
321
  rescue
256
322
  end
257
323
  @er_source.seek(pos)
258
- rescue IOError
324
+ rescue IOError, SystemCallError
259
325
  pos = -1
260
326
  line = -1
261
327
  end
@@ -263,15 +329,20 @@ module REXML
263
329
  end
264
330
 
265
331
  private
266
- def readline
267
- str = @source.readline(@line_break)
332
+ def readline(term = nil)
268
333
  if @pending_buffer
334
+ begin
335
+ str = @source.readline(term || @line_break)
336
+ rescue IOError
337
+ end
269
338
  if str.nil?
270
339
  str = @pending_buffer
271
340
  else
272
341
  str = @pending_buffer + str
273
342
  end
274
343
  @pending_buffer = nil
344
+ else
345
+ str = @source.readline(term || @line_break)
275
346
  end
276
347
  return nil if str.nil?
277
348
 
@@ -290,7 +361,7 @@ module REXML
290
361
  @source.set_encoding(@encoding, @encoding)
291
362
  end
292
363
  @line_break = encode(">")
293
- @pending_buffer, @buffer = @buffer, ""
364
+ @pending_buffer, @scanner.string = @scanner.rest, ""
294
365
  @pending_buffer.force_encoding(@encoding)
295
366
  super
296
367
  end