rexml 3.2.6 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -47,6 +47,18 @@ module REXML
47
47
  @listeners << listener
48
48
  end
49
49
 
50
+ def entity_expansion_count
51
+ @parser.entity_expansion_count
52
+ end
53
+
54
+ def entity_expansion_limit=( limit )
55
+ @parser.entity_expansion_limit = limit
56
+ end
57
+
58
+ def entity_expansion_text_limit=( limit )
59
+ @parser.entity_expansion_text_limit = limit
60
+ end
61
+
50
62
  def each
51
63
  while has_next?
52
64
  yield self.pull
@@ -81,6 +93,10 @@ module REXML
81
93
  def unshift token
82
94
  @my_stack.unshift token
83
95
  end
96
+
97
+ def reset
98
+ @parser.reset
99
+ end
84
100
  end
85
101
 
86
102
  # A parsing event. The contents of the event are accessed as an +Array?,
@@ -22,6 +22,18 @@ module REXML
22
22
  @parser.source
23
23
  end
24
24
 
25
+ def entity_expansion_count
26
+ @parser.entity_expansion_count
27
+ end
28
+
29
+ def entity_expansion_limit=( limit )
30
+ @parser.entity_expansion_limit = limit
31
+ end
32
+
33
+ def entity_expansion_text_limit=( limit )
34
+ @parser.entity_expansion_text_limit = limit
35
+ end
36
+
25
37
  def add_listener( listener )
26
38
  @parser.add_listener( listener )
27
39
  end
@@ -157,25 +169,8 @@ module REXML
157
169
  end
158
170
  end
159
171
  when :text
160
- #normalized = @parser.normalize( event[1] )
161
- #handle( :characters, normalized )
162
- copy = event[1].clone
163
-
164
- esub = proc { |match|
165
- if @entities.has_key?($1)
166
- @entities[$1].gsub(Text::REFERENCE, &esub)
167
- else
168
- match
169
- end
170
- }
171
-
172
- copy.gsub!( Text::REFERENCE, &esub )
173
- copy.gsub!( Text::NUMERICENTITY ) {|m|
174
- m=$1
175
- m = "0#{m}" if m[0] == ?x
176
- [Integer(m)].pack('U*')
177
- }
178
- handle( :characters, copy )
172
+ unnormalized = @parser.unnormalize( event[1], @entities )
173
+ handle( :characters, unnormalized )
179
174
  when :entitydecl
180
175
  handle_entitydecl( event )
181
176
  when :processing_instruction, :comment, :attlistdecl,
@@ -264,6 +259,8 @@ module REXML
264
259
  end
265
260
 
266
261
  def get_namespace( prefix )
262
+ return nil if @namespace_stack.empty?
263
+
267
264
  uris = (@namespace_stack.find_all { |ns| not ns[prefix].nil? }) ||
268
265
  (@namespace_stack.find { |ns| not ns[nil].nil? })
269
266
  uris[-1][prefix] unless uris.nil? or 0 == uris.size
@@ -7,37 +7,42 @@ module REXML
7
7
  def initialize source, listener
8
8
  @listener = listener
9
9
  @parser = BaseParser.new( source )
10
- @tag_stack = []
10
+ @entities = {}
11
11
  end
12
12
 
13
13
  def add_listener( listener )
14
14
  @parser.add_listener( listener )
15
15
  end
16
16
 
17
+ def entity_expansion_count
18
+ @parser.entity_expansion_count
19
+ end
20
+
21
+ def entity_expansion_limit=( limit )
22
+ @parser.entity_expansion_limit = limit
23
+ end
24
+
25
+ def entity_expansion_text_limit=( limit )
26
+ @parser.entity_expansion_text_limit = limit
27
+ end
28
+
17
29
  def parse
18
30
  # entity string
19
31
  while true
20
32
  event = @parser.pull
21
33
  case event[0]
22
34
  when :end_document
23
- unless @tag_stack.empty?
24
- tag_path = "/" + @tag_stack.join("/")
25
- raise ParseException.new("Missing end tag for '#{tag_path}'",
26
- @parser.source)
27
- end
28
35
  return
29
36
  when :start_element
30
- @tag_stack << event[1]
31
37
  attrs = event[2].each do |n, v|
32
38
  event[2][n] = @parser.unnormalize( v )
33
39
  end
34
40
  @listener.tag_start( event[1], attrs )
35
41
  when :end_element
36
42
  @listener.tag_end( event[1] )
37
- @tag_stack.pop
38
43
  when :text
39
- normalized = @parser.unnormalize( event[1] )
40
- @listener.text( normalized )
44
+ unnormalized = @parser.unnormalize( event[1], @entities )
45
+ @listener.text( unnormalized )
41
46
  when :processing_instruction
42
47
  @listener.instruction( *event[1,2] )
43
48
  when :start_doctype
@@ -48,6 +53,7 @@ module REXML
48
53
  when :comment, :attlistdecl, :cdata, :xmldecl, :elementdecl
49
54
  @listener.send( event[0].to_s, *event[1..-1] )
50
55
  when :entitydecl, :notationdecl
56
+ @entities[ event[1] ] = event[2] if event.size == 3
51
57
  @listener.send( event[0].to_s, event[1..-1] )
52
58
  when :externalentity
53
59
  entity_reference = event[1]
@@ -15,8 +15,6 @@ module REXML
15
15
  end
16
16
 
17
17
  def parse
18
- tag_stack = []
19
- in_doctype = false
20
18
  entities = nil
21
19
  begin
22
20
  while true
@@ -24,32 +22,24 @@ module REXML
24
22
  #STDERR.puts "TREEPARSER GOT #{event.inspect}"
25
23
  case event[0]
26
24
  when :end_document
27
- unless tag_stack.empty?
28
- raise ParseException.new("No close tag for #{@build_context.xpath}",
29
- @parser.source, @parser)
30
- end
31
25
  return
32
26
  when :start_element
33
- tag_stack.push(event[1])
34
27
  el = @build_context = @build_context.add_element( event[1] )
35
28
  event[2].each do |key, value|
36
29
  el.attributes[key]=Attribute.new(key,value,self)
37
30
  end
38
31
  when :end_element
39
- tag_stack.pop
40
32
  @build_context = @build_context.parent
41
33
  when :text
42
- if not in_doctype
43
- if @build_context[-1].instance_of? Text
44
- @build_context[-1] << event[1]
45
- else
46
- @build_context.add(
47
- Text.new(event[1], @build_context.whitespace, nil, true)
48
- ) unless (
49
- @build_context.ignore_whitespace_nodes and
50
- event[1].strip.size==0
51
- )
52
- end
34
+ if @build_context[-1].instance_of? Text
35
+ @build_context[-1] << event[1]
36
+ else
37
+ @build_context.add(
38
+ Text.new(event[1], @build_context.whitespace, nil, true)
39
+ ) unless (
40
+ @build_context.ignore_whitespace_nodes and
41
+ event[1].strip.size==0
42
+ )
53
43
  end
54
44
  when :comment
55
45
  c = Comment.new( event[1] )
@@ -60,14 +50,12 @@ module REXML
60
50
  when :processing_instruction
61
51
  @build_context.add( Instruction.new( event[1], event[2] ) )
62
52
  when :end_doctype
63
- in_doctype = false
64
53
  entities.each { |k,v| entities[k] = @build_context.entities[k].value }
65
54
  @build_context = @build_context.parent
66
55
  when :start_doctype
67
56
  doctype = DocType.new( event[1..-1], @build_context )
68
57
  @build_context = doctype
69
58
  entities = {}
70
- in_doctype = true
71
59
  when :attlistdecl
72
60
  n = AttlistDecl.new( event[1..-1] )
73
61
  @build_context.add( n )
data/lib/rexml/rexml.rb CHANGED
@@ -31,7 +31,7 @@
31
31
  module REXML
32
32
  COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
33
33
  DATE = "2008/019"
34
- VERSION = "3.2.6"
34
+ VERSION = "3.4.0"
35
35
  REVISION = ""
36
36
 
37
37
  Copyright = COPYRIGHT
data/lib/rexml/source.rb CHANGED
@@ -1,8 +1,39 @@
1
1
  # coding: US-ASCII
2
2
  # frozen_string_literal: false
3
+
4
+ require "stringio"
5
+ require "strscan"
6
+
3
7
  require_relative 'encoding'
4
8
 
5
9
  module REXML
10
+ if StringScanner::Version < "1.0.0"
11
+ module StringScannerCheckScanString
12
+ refine StringScanner do
13
+ def check(pattern)
14
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
15
+ super(pattern)
16
+ end
17
+
18
+ def scan(pattern)
19
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
20
+ super(pattern)
21
+ end
22
+
23
+ def match?(pattern)
24
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
25
+ super(pattern)
26
+ end
27
+
28
+ def skip(pattern)
29
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
30
+ super(pattern)
31
+ end
32
+ end
33
+ end
34
+ using StringScannerCheckScanString
35
+ end
36
+
6
37
  # Generates Source-s. USE THIS CLASS.
7
38
  class SourceFactory
8
39
  # Generates a Source object
@@ -15,7 +46,6 @@ module REXML
15
46
  arg.respond_to? :eof?
16
47
  IOSource.new(arg)
17
48
  elsif arg.respond_to? :to_str
18
- require 'stringio'
19
49
  IOSource.new(StringIO.new(arg))
20
50
  elsif arg.kind_of? Source
21
51
  arg
@@ -30,26 +60,50 @@ module REXML
30
60
  # objects and provides consumption of text
31
61
  class Source
32
62
  include Encoding
33
- # The current buffer (what we're going to read next)
34
- attr_reader :buffer
35
63
  # The line number of the last consumed text
36
64
  attr_reader :line
37
65
  attr_reader :encoding
38
66
 
67
+ module Private
68
+ SCANNER_RESET_SIZE = 100000
69
+ PRE_DEFINED_TERM_PATTERNS = {}
70
+ pre_defined_terms = ["'", '"', "<"]
71
+ pre_defined_terms.each do |term|
72
+ PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
73
+ end
74
+ end
75
+ private_constant :Private
76
+
39
77
  # Constructor
40
78
  # @param arg must be a String, and should be a valid XML document
41
79
  # @param encoding if non-null, sets the encoding of the source to this
42
80
  # value, overriding all encoding detection
43
81
  def initialize(arg, encoding=nil)
44
- @orig = @buffer = arg
82
+ @orig = arg
83
+ @scanner = StringScanner.new(@orig)
45
84
  if encoding
46
85
  self.encoding = encoding
47
86
  else
48
87
  detect_encoding
49
88
  end
50
89
  @line = 0
90
+ @encoded_terms = {}
91
+ end
92
+
93
+ # The current buffer (what we're going to read next)
94
+ def buffer
95
+ @scanner.rest
51
96
  end
52
97
 
98
+ def drop_parsed_content
99
+ if @scanner.pos > Private::SCANNER_RESET_SIZE
100
+ @scanner.string = @scanner.rest
101
+ end
102
+ end
103
+
104
+ def buffer_encoding=(encoding)
105
+ @scanner.string.force_encoding(encoding)
106
+ end
53
107
 
54
108
  # Inherited from Encoding
55
109
  # Overridden to support optimized en/decoding
@@ -58,98 +112,86 @@ module REXML
58
112
  encoding_updated
59
113
  end
60
114
 
61
- # Scans the source for a given pattern. Note, that this is not your
62
- # usual scan() method. For one thing, the pattern argument has some
63
- # requirements; for another, the source can be consumed. You can easily
64
- # confuse this method. Originally, the patterns were easier
65
- # to construct and this method more robust, because this method
66
- # generated search regexps on the fly; however, this was
67
- # computationally expensive and slowed down the entire REXML package
68
- # considerably, since this is by far the most commonly called method.
69
- # @param pattern must be a Regexp, and must be in the form of
70
- # /^\s*(#{your pattern, with no groups})(.*)/. The first group
71
- # will be returned; the second group is used if the consume flag is
72
- # set.
73
- # @param consume if true, the pattern returned will be consumed, leaving
74
- # everything after it in the Source.
75
- # @return the pattern, if found, or nil if the Source is empty or the
76
- # pattern is not found.
77
- def scan(pattern, cons=false)
78
- return nil if @buffer.nil?
79
- rv = @buffer.scan(pattern)
80
- @buffer = $' if cons and rv.size>0
81
- rv
115
+ def read(term = nil)
82
116
  end
83
117
 
84
- def read
118
+ def read_until(term)
119
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
120
+ data = @scanner.scan_until(pattern)
121
+ unless data
122
+ data = @scanner.rest
123
+ @scanner.pos = @scanner.string.bytesize
124
+ end
125
+ data
85
126
  end
86
127
 
87
- def consume( pattern )
88
- @buffer = $' if pattern.match( @buffer )
128
+ def ensure_buffer
89
129
  end
90
130
 
91
- def match_to( char, pattern )
92
- return pattern.match(@buffer)
131
+ def match(pattern, cons=false)
132
+ if cons
133
+ @scanner.scan(pattern).nil? ? nil : @scanner
134
+ else
135
+ @scanner.check(pattern).nil? ? nil : @scanner
136
+ end
93
137
  end
94
138
 
95
- def match_to_consume( char, pattern )
96
- md = pattern.match(@buffer)
97
- @buffer = $'
98
- return md
139
+ def match?(pattern, cons=false)
140
+ if cons
141
+ !@scanner.skip(pattern).nil?
142
+ else
143
+ !@scanner.match?(pattern).nil?
144
+ end
99
145
  end
100
146
 
101
- def match(pattern, cons=false)
102
- md = pattern.match(@buffer)
103
- @buffer = $' if cons and md
104
- return md
147
+ def position
148
+ @scanner.pos
105
149
  end
106
150
 
107
- # @return true if the Source is exhausted
108
- def empty?
109
- @buffer == ""
151
+ def position=(pos)
152
+ @scanner.pos = pos
110
153
  end
111
154
 
112
- def position
113
- @orig.index( @buffer )
155
+ # @return true if the Source is exhausted
156
+ def empty?
157
+ @scanner.eos?
114
158
  end
115
159
 
116
160
  # @return the current line in the source
117
161
  def current_line
118
162
  lines = @orig.split
119
- res = lines.grep @buffer[0..30]
163
+ res = lines.grep @scanner.rest[0..30]
120
164
  res = res[-1] if res.kind_of? Array
121
165
  lines.index( res ) if res
122
166
  end
123
167
 
124
168
  private
169
+
125
170
  def detect_encoding
126
- buffer_encoding = @buffer.encoding
171
+ scanner_encoding = @scanner.rest.encoding
127
172
  detected_encoding = "UTF-8"
128
173
  begin
129
- @buffer.force_encoding("ASCII-8BIT")
130
- if @buffer[0, 2] == "\xfe\xff"
131
- @buffer[0, 2] = ""
174
+ @scanner.string.force_encoding("ASCII-8BIT")
175
+ if @scanner.scan(/\xfe\xff/n)
132
176
  detected_encoding = "UTF-16BE"
133
- elsif @buffer[0, 2] == "\xff\xfe"
134
- @buffer[0, 2] = ""
177
+ elsif @scanner.scan(/\xff\xfe/n)
135
178
  detected_encoding = "UTF-16LE"
136
- elsif @buffer[0, 3] == "\xef\xbb\xbf"
137
- @buffer[0, 3] = ""
179
+ elsif @scanner.scan(/\xef\xbb\xbf/n)
138
180
  detected_encoding = "UTF-8"
139
181
  end
140
182
  ensure
141
- @buffer.force_encoding(buffer_encoding)
183
+ @scanner.string.force_encoding(scanner_encoding)
142
184
  end
143
185
  self.encoding = detected_encoding
144
186
  end
145
187
 
146
188
  def encoding_updated
147
189
  if @encoding != 'UTF-8'
148
- @buffer = decode(@buffer)
190
+ @scanner.string = decode(@scanner.rest)
149
191
  @to_utf = true
150
192
  else
151
193
  @to_utf = false
152
- @buffer.force_encoding ::Encoding::UTF_8
194
+ @scanner.string.force_encoding(::Encoding::UTF_8)
153
195
  end
154
196
  end
155
197
  end
@@ -172,7 +214,7 @@ module REXML
172
214
  end
173
215
 
174
216
  if !@to_utf and
175
- @buffer.respond_to?(:force_encoding) and
217
+ @orig.respond_to?(:force_encoding) and
176
218
  @source.respond_to?(:external_encoding) and
177
219
  @source.external_encoding != ::Encoding::UTF_8
178
220
  @force_utf8 = true
@@ -181,63 +223,87 @@ module REXML
181
223
  end
182
224
  end
183
225
 
184
- def scan(pattern, cons=false)
185
- rv = super
186
- # You'll notice that this next section is very similar to the same
187
- # section in match(), but just a liiittle different. This is
188
- # because it is a touch faster to do it this way with scan()
189
- # than the way match() does it; enough faster to warrant duplicating
190
- # some code
191
- if rv.size == 0
192
- until @buffer =~ pattern or @source.nil?
193
- begin
194
- @buffer << readline
195
- rescue Iconv::IllegalSequence
196
- raise
197
- rescue
198
- @source = nil
226
+ def read(term = nil, min_bytes = 1)
227
+ term = encode(term) if term
228
+ begin
229
+ str = readline(term)
230
+ @scanner << str
231
+ read_bytes = str.bytesize
232
+ begin
233
+ while read_bytes < min_bytes
234
+ str = readline(term)
235
+ @scanner << str
236
+ read_bytes += str.bytesize
199
237
  end
238
+ rescue IOError
200
239
  end
201
- rv = super
240
+ true
241
+ rescue Exception, NameError
242
+ @source = nil
243
+ false
202
244
  end
203
- rv.taint if RUBY_VERSION < '2.7'
204
- rv
205
245
  end
206
246
 
207
- def read
208
- begin
209
- @buffer << readline
210
- rescue Exception, NameError
211
- @source = nil
247
+ def read_until(term)
248
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
249
+ term = @encoded_terms[term] ||= encode(term)
250
+ until str = @scanner.scan_until(pattern)
251
+ break if @source.nil?
252
+ break if @source.eof?
253
+ @scanner << readline(term)
254
+ end
255
+ if str
256
+ read if @scanner.eos? and !@source.eof?
257
+ str
258
+ else
259
+ rest = @scanner.rest
260
+ @scanner.pos = @scanner.string.bytesize
261
+ rest
212
262
  end
213
263
  end
214
264
 
215
- def consume( pattern )
216
- match( pattern, true )
265
+ def ensure_buffer
266
+ read if @scanner.eos? && @source
217
267
  end
218
268
 
219
269
  def match( pattern, cons=false )
220
- rv = pattern.match(@buffer)
221
- @buffer = $' if cons and rv
222
- while !rv and @source
223
- begin
224
- @buffer << readline
225
- rv = pattern.match(@buffer)
226
- @buffer = $' if cons and rv
227
- rescue
228
- @source = nil
270
+ # To avoid performance issue, we need to increase bytes to read per scan
271
+ min_bytes = 1
272
+ while true
273
+ if cons
274
+ md = @scanner.scan(pattern)
275
+ else
276
+ md = @scanner.check(pattern)
229
277
  end
278
+ break if md
279
+ return nil if pattern.is_a?(String)
280
+ return nil if @source.nil?
281
+ return nil unless read(nil, min_bytes)
282
+ min_bytes *= 2
230
283
  end
231
- rv.taint if RUBY_VERSION < '2.7'
232
- rv
284
+
285
+ md.nil? ? nil : @scanner
233
286
  end
234
287
 
235
- def empty?
236
- super and ( @source.nil? || @source.eof? )
288
+ def match?( pattern, cons=false )
289
+ # To avoid performance issue, we need to increase bytes to read per scan
290
+ min_bytes = 1
291
+ while true
292
+ if cons
293
+ n_matched_bytes = @scanner.skip(pattern)
294
+ else
295
+ n_matched_bytes = @scanner.match?(pattern)
296
+ end
297
+ return true if n_matched_bytes
298
+ return false if pattern.is_a?(String)
299
+ return false if @source.nil?
300
+ return false unless read(nil, min_bytes)
301
+ min_bytes *= 2
302
+ end
237
303
  end
238
304
 
239
- def position
240
- @er_source.pos rescue 0
305
+ def empty?
306
+ super and ( @source.nil? || @source.eof? )
241
307
  end
242
308
 
243
309
  # @return the current line in the source
@@ -255,7 +321,7 @@ module REXML
255
321
  rescue
256
322
  end
257
323
  @er_source.seek(pos)
258
- rescue IOError
324
+ rescue IOError, SystemCallError
259
325
  pos = -1
260
326
  line = -1
261
327
  end
@@ -263,15 +329,20 @@ module REXML
263
329
  end
264
330
 
265
331
  private
266
- def readline
267
- str = @source.readline(@line_break)
332
+ def readline(term = nil)
268
333
  if @pending_buffer
334
+ begin
335
+ str = @source.readline(term || @line_break)
336
+ rescue IOError
337
+ end
269
338
  if str.nil?
270
339
  str = @pending_buffer
271
340
  else
272
341
  str = @pending_buffer + str
273
342
  end
274
343
  @pending_buffer = nil
344
+ else
345
+ str = @source.readline(term || @line_break)
275
346
  end
276
347
  return nil if str.nil?
277
348
 
@@ -290,7 +361,7 @@ module REXML
290
361
  @source.set_encoding(@encoding, @encoding)
291
362
  end
292
363
  @line_break = encode(">")
293
- @pending_buffer, @buffer = @buffer, ""
364
+ @pending_buffer, @scanner.string = @scanner.rest, ""
294
365
  @pending_buffer.force_encoding(@encoding)
295
366
  super
296
367
  end