rexml 3.2.6 → 3.2.9

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of rexml might be problematic. Click here for more details.

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2583ae302aa5e698f0887a689c416e5debe0533ac472a9f96fce6a8912040fd8
4
- data.tar.gz: b0ffa6301fd899969a78e060ccaeafebfc2169e3c63ff499ebc6170468866475
3
+ metadata.gz: ed57404d6f519cb196d671fd629380f5b08a50cf649ae99a71432edceaf15014
4
+ data.tar.gz: 9097f235049d98aa743998da10724fd6ff5f6cd0cb4f230c59290af4a75e2134
5
5
  SHA512:
6
- metadata.gz: f63fb0b84ef51e790cc6310244f2106d8c47ec9a00687c58c743afda82b60be9986d503c6f56f947db06f6758707facccd03405c4d1009376e856080aa26d0e4
7
- data.tar.gz: db62bea7391837a7ab4cfc5cb5a412ed4deb8d232653ca66d93a323a5a76383eed520cd4ced5b20204f29b04e84678791cd6f807195868f5d4a5e519a73d2aaf
6
+ metadata.gz: 0d7a0be04c12fcd88c64dd2962d4db49f57ed02c08e3f4628e8e546aea135c672839a19a950423afa72d0cf25af0554558f2261b6b5f2b23637f4ca47d43bd73
7
+ data.tar.gz: 28479b3b11de58e84f57dc88057f185eb2365998f7fbc9f0df78d5899bb9b241c6bb93d26a57ca1b54d8a058b743daa78a8df52c4ddbfbb5f7c94aeb97724808
data/NEWS.md CHANGED
@@ -1,5 +1,97 @@
1
1
  # News
2
2
 
3
+ ## 3.2.9 - 2024-06-19 {#version-3-2-9}
4
+
5
+ ### Improvements
6
+
7
+ * Added support for old strscan.
8
+ * GH-132
9
+ * Reported by Adam.
10
+
11
+ * Improved attribute value parse performance.
12
+ * GH-135
13
+ * Patch by NAITOH Jun.
14
+
15
+ * Improved `REXML::Node#each_recursive` performance.
16
+ * GH-134
17
+ * GH-139
18
+ * Patch by Hiroya Fujinami.
19
+
20
+ * Improved text parse performance.
21
+ * Reported by mprogrammer.
22
+
23
+ ### Thanks
24
+
25
+ * Adam
26
+ * NAITOH Jun
27
+ * Hiroya Fujinami
28
+ * mprogrammer
29
+
30
+ ## 3.2.8 - 2024-05-16 {#version-3-2-8}
31
+
32
+ ### Fixes
33
+
34
+ * Suppressed a warning.
35
+
36
+ ## 3.2.7 - 2024-05-16 {#version-3-2-7}
37
+
38
+ ### Improvements
39
+
40
+ * Improved parse performance by using `StringScanner`.
41
+
42
+ * GH-106
43
+ * GH-107
44
+ * GH-108
45
+ * GH-109
46
+ * GH-112
47
+ * GH-113
48
+ * GH-114
49
+ * GH-115
50
+ * GH-116
51
+ * GH-117
52
+ * GH-118
53
+ * GH-119
54
+ * GH-121
55
+
56
+ * Patch by NAITOH Jun.
57
+
58
+ * Improved parse performance when an attribute has many `<`s.
59
+
60
+ * GH-126
61
+
62
+ ### Fixes
63
+
64
+ * XPath: Fixed a bug of `normalize_space(array)`.
65
+
66
+ * GH-110
67
+ * GH-111
68
+
69
+ * Patch by flatisland.
70
+
71
+ * XPath: Fixed a bug where the wrong position was used with a nested path.
72
+
73
+ * GH-110
74
+ * GH-122
75
+
76
+ * Reported by jcavalieri.
77
+ * Patch by NAITOH Jun.
78
+
79
+ * Fixed a bug where an exception message couldn't be generated for
80
+ XML with an invalid encoding.
81
+
82
+ * GH-29
83
+ * GH-123
84
+
85
+ * Reported by DuKewu.
86
+ * Patch by NAITOH Jun.
87
+
88
+ ### Thanks
89
+
90
+ * NAITOH Jun
91
+ * flatisland
92
+ * jcavalieri
93
+ * DuKewu
94
+
3
95
  ## 3.2.6 - 2023-07-27 {#version-3-2-6}
4
96
 
5
97
  ### Improvements
@@ -262,11 +262,10 @@ module REXML
262
262
  string(string).length
263
263
  end
264
264
 
265
- # UNTESTED
266
265
  def Functions::normalize_space( string=nil )
267
266
  string = string(@@context[:node]) if string.nil?
268
267
  if string.kind_of? Array
269
- string.collect{|x| string.to_s.strip.gsub(/\s+/um, ' ') if string}
268
+ string.collect{|x| x.to_s.strip.gsub(/\s+/um, ' ') if x}
270
269
  else
271
270
  string.to_s.strip.gsub(/\s+/um, ' ')
272
271
  end
data/lib/rexml/node.rb CHANGED
@@ -52,10 +52,14 @@ module REXML
52
52
 
53
53
  # Visit all subnodes of +self+ recursively
54
54
  def each_recursive(&block) # :yields: node
55
- self.elements.each {|node|
56
- block.call(node)
57
- node.each_recursive(&block)
58
- }
55
+ stack = []
56
+ each { |child| stack.unshift child if child.node_type == :element }
57
+ until stack.empty?
58
+ child = stack.pop
59
+ yield child
60
+ n = stack.size
61
+ child.each { |grandchild| stack.insert n, grandchild if grandchild.node_type == :element }
62
+ end
59
63
  end
60
64
 
61
65
  # Find (and return) first subnode (recursively) for which the block
@@ -29,6 +29,7 @@ module REXML
29
29
  err << "\nLine: #{line}\n"
30
30
  err << "Position: #{position}\n"
31
31
  err << "Last 80 unconsumed characters:\n"
32
+ err.force_encoding("ASCII-8BIT")
32
33
  err << @source.buffer[0..80].force_encoding("ASCII-8BIT").gsub(/\n/, ' ')
33
34
  end
34
35
 
@@ -1,4 +1,4 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
4
  require_relative '../source'
@@ -7,6 +7,17 @@ require "strscan"
7
7
 
8
8
  module REXML
9
9
  module Parsers
10
+ if StringScanner::Version < "3.0.8"
11
+ module StringScannerCaptures
12
+ refine StringScanner do
13
+ def captures
14
+ values_at(*(1...size))
15
+ end
16
+ end
17
+ end
18
+ using StringScannerCaptures
19
+ end
20
+
10
21
  # = Using the Pull Parser
11
22
  # <em>This API is experimental, and subject to change.</em>
12
23
  # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
@@ -96,7 +107,7 @@ module REXML
96
107
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
97
108
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
98
109
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
99
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
110
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
100
111
 
101
112
  NOTATIONDECL_START = /\A\s*<!NOTATION/um
102
113
  EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -112,6 +123,19 @@ module REXML
112
123
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
124
  }
114
125
 
126
+ module Private
127
+ INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
128
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
129
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
130
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
131
+ NAME_PATTERN = /\s*#{NAME}/um
132
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
133
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
134
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
135
+ end
136
+ private_constant :Private
137
+ include Private
138
+
115
139
  def initialize( source )
116
140
  self.stream = source
117
141
  @listeners = []
@@ -196,181 +220,184 @@ module REXML
196
220
  return @stack.shift if @stack.size > 0
197
221
  #STDERR.puts @source.encoding
198
222
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
223
+
224
+ @source.ensure_buffer
199
225
  if @document_status == nil
200
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
201
- word = word[1] unless word.nil?
202
- #STDERR.puts "WORD = #{word.inspect}"
203
- case word
204
- when COMMENT_START
205
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
206
- when XMLDECL_START
207
- #STDERR.puts "XMLDECL"
208
- results = @source.match( XMLDECL_PATTERN, true )[1]
209
- version = VERSION.match( results )
210
- version = version[1] unless version.nil?
211
- encoding = ENCODING.match(results)
212
- encoding = encoding[1] unless encoding.nil?
213
- if need_source_encoding_update?(encoding)
214
- @source.encoding = encoding
215
- end
216
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
217
- encoding = "UTF-16"
218
- end
219
- standalone = STANDALONE.match(results)
220
- standalone = standalone[1] unless standalone.nil?
221
- return [ :xmldecl, version, encoding, standalone ]
222
- when INSTRUCTION_START
223
- return process_instruction
224
- when DOCTYPE_START
225
- base_error_message = "Malformed DOCTYPE"
226
- @source.match(DOCTYPE_START, true)
227
- @nsstack.unshift(curr_ns=Set.new)
228
- name = parse_name(base_error_message)
229
- if @source.match(/\A\s*\[/um, true)
230
- id = [nil, nil, nil]
231
- @document_status = :in_doctype
232
- elsif @source.match(/\A\s*>/um, true)
233
- id = [nil, nil, nil]
234
- @document_status = :after_doctype
235
- else
236
- id = parse_id(base_error_message,
237
- accept_external_id: true,
238
- accept_public_id: false)
239
- if id[0] == "SYSTEM"
240
- # For backward compatibility
241
- id[1], id[2] = id[2], nil
226
+ start_position = @source.position
227
+ if @source.match("<?", true)
228
+ return process_instruction(start_position)
229
+ elsif @source.match("<!", true)
230
+ if @source.match("--", true)
231
+ return [ :comment, @source.match(/(.*?)-->/um, true)[1] ]
232
+ elsif @source.match("DOCTYPE", true)
233
+ base_error_message = "Malformed DOCTYPE"
234
+ unless @source.match(/\s+/um, true)
235
+ if @source.match(">")
236
+ message = "#{base_error_message}: name is missing"
237
+ else
238
+ message = "#{base_error_message}: invalid name"
239
+ end
240
+ @source.position = start_position
241
+ raise REXML::ParseException.new(message, @source)
242
242
  end
243
- if @source.match(/\A\s*\[/um, true)
243
+ @nsstack.unshift(curr_ns=Set.new)
244
+ name = parse_name(base_error_message)
245
+ if @source.match(/\s*\[/um, true)
246
+ id = [nil, nil, nil]
244
247
  @document_status = :in_doctype
245
- elsif @source.match(/\A\s*>/um, true)
248
+ elsif @source.match(/\s*>/um, true)
249
+ id = [nil, nil, nil]
246
250
  @document_status = :after_doctype
251
+ @source.ensure_buffer
247
252
  else
248
- message = "#{base_error_message}: garbage after external ID"
249
- raise REXML::ParseException.new(message, @source)
253
+ id = parse_id(base_error_message,
254
+ accept_external_id: true,
255
+ accept_public_id: false)
256
+ if id[0] == "SYSTEM"
257
+ # For backward compatibility
258
+ id[1], id[2] = id[2], nil
259
+ end
260
+ if @source.match(/\s*\[/um, true)
261
+ @document_status = :in_doctype
262
+ elsif @source.match(/\s*>/um, true)
263
+ @document_status = :after_doctype
264
+ @source.ensure_buffer
265
+ else
266
+ message = "#{base_error_message}: garbage after external ID"
267
+ raise REXML::ParseException.new(message, @source)
268
+ end
250
269
  end
251
- end
252
- args = [:start_doctype, name, *id]
253
- if @document_status == :after_doctype
254
- @source.match(/\A\s*/um, true)
255
- @stack << [ :end_doctype ]
256
- end
257
- return args
258
- when /\A\s+/
259
- else
260
- @document_status = :after_doctype
261
- if @source.encoding == "UTF-8"
262
- @source.buffer.force_encoding(::Encoding::UTF_8)
270
+ args = [:start_doctype, name, *id]
271
+ if @document_status == :after_doctype
272
+ @source.match(/\s*/um, true)
273
+ @stack << [ :end_doctype ]
274
+ end
275
+ return args
276
+ else
277
+ message = "Invalid XML"
278
+ raise REXML::ParseException.new(message, @source)
263
279
  end
264
280
  end
265
281
  end
266
282
  if @document_status == :in_doctype
267
- md = @source.match(/\A\s*(.*?>)/um)
268
- case md[1]
269
- when SYSTEMENTITY
270
- match = @source.match( SYSTEMENTITY, true )[1]
271
- return [ :externalentity, match ]
272
-
273
- when ELEMENTDECL_START
274
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
275
-
276
- when ENTITY_START
277
- match = @source.match( ENTITYDECL, true ).to_a.compact
278
- match[0] = :entitydecl
279
- ref = false
280
- if match[1] == '%'
281
- ref = true
282
- match.delete_at 1
283
- end
284
- # Now we have to sort out what kind of entity reference this is
285
- if match[2] == 'SYSTEM'
286
- # External reference
287
- match[3] = match[3][1..-2] # PUBID
288
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
289
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
290
- elsif match[2] == 'PUBLIC'
291
- # External reference
292
- match[3] = match[3][1..-2] # PUBID
293
- match[4] = match[4][1..-2] # HREF
294
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
295
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
296
- else
297
- match[2] = match[2][1..-2]
298
- match.pop if match.size == 4
299
- # match is [ :entity, name, value ]
300
- end
301
- match << '%' if ref
302
- return match
303
- when ATTLISTDECL_START
304
- md = @source.match( ATTLISTDECL_PATTERN, true )
305
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
306
- element = md[1]
307
- contents = md[0]
308
-
309
- pairs = {}
310
- values = md[0].scan( ATTDEF_RE )
311
- values.each do |attdef|
312
- unless attdef[3] == "#IMPLIED"
313
- attdef.compact!
314
- val = attdef[3]
315
- val = attdef[4] if val == "#FIXED "
316
- pairs[attdef[0]] = val
317
- if attdef[0] =~ /^xmlns:(.*)/
318
- @nsstack[0] << $1
319
- end
283
+ @source.match(/\s*/um, true) # skip spaces
284
+ start_position = @source.position
285
+ if @source.match("<!", true)
286
+ if @source.match("ELEMENT", true)
287
+ md = @source.match(/(.*?)>/um, true)
288
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
289
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
290
+ elsif @source.match("ENTITY", true)
291
+ match = [:entitydecl, *@source.match(ENTITYDECL_PATTERN, true).captures.compact]
292
+ ref = false
293
+ if match[1] == '%'
294
+ ref = true
295
+ match.delete_at 1
320
296
  end
321
- end
322
- return [ :attlistdecl, element, pairs, contents ]
323
- when NOTATIONDECL_START
324
- base_error_message = "Malformed notation declaration"
325
- unless @source.match(/\A\s*<!NOTATION\s+/um, true)
326
- if @source.match(/\A\s*<!NOTATION\s*>/um)
327
- message = "#{base_error_message}: name is missing"
297
+ # Now we have to sort out what kind of entity reference this is
298
+ if match[2] == 'SYSTEM'
299
+ # External reference
300
+ match[3] = match[3][1..-2] # PUBID
301
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
302
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
303
+ elsif match[2] == 'PUBLIC'
304
+ # External reference
305
+ match[3] = match[3][1..-2] # PUBID
306
+ match[4] = match[4][1..-2] # HREF
307
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
308
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
328
309
  else
329
- message = "#{base_error_message}: invalid declaration name"
310
+ match[2] = match[2][1..-2]
311
+ match.pop if match.size == 4
312
+ # match is [ :entity, name, value ]
330
313
  end
331
- raise REXML::ParseException.new(message, @source)
332
- end
333
- name = parse_name(base_error_message)
334
- id = parse_id(base_error_message,
335
- accept_external_id: true,
336
- accept_public_id: true)
337
- unless @source.match(/\A\s*>/um, true)
338
- message = "#{base_error_message}: garbage before end >"
339
- raise REXML::ParseException.new(message, @source)
314
+ match << '%' if ref
315
+ return match
316
+ elsif @source.match("ATTLIST", true)
317
+ md = @source.match(ATTLISTDECL_END, true)
318
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
319
+ element = md[1]
320
+ contents = md[0]
321
+
322
+ pairs = {}
323
+ values = md[0].scan( ATTDEF_RE )
324
+ values.each do |attdef|
325
+ unless attdef[3] == "#IMPLIED"
326
+ attdef.compact!
327
+ val = attdef[3]
328
+ val = attdef[4] if val == "#FIXED "
329
+ pairs[attdef[0]] = val
330
+ if attdef[0] =~ /^xmlns:(.*)/
331
+ @nsstack[0] << $1
332
+ end
333
+ end
334
+ end
335
+ return [ :attlistdecl, element, pairs, contents ]
336
+ elsif @source.match("NOTATION", true)
337
+ base_error_message = "Malformed notation declaration"
338
+ unless @source.match(/\s+/um, true)
339
+ if @source.match(">")
340
+ message = "#{base_error_message}: name is missing"
341
+ else
342
+ message = "#{base_error_message}: invalid name"
343
+ end
344
+ @source.position = start_position
345
+ raise REXML::ParseException.new(message, @source)
346
+ end
347
+ name = parse_name(base_error_message)
348
+ id = parse_id(base_error_message,
349
+ accept_external_id: true,
350
+ accept_public_id: true)
351
+ unless @source.match(/\s*>/um, true)
352
+ message = "#{base_error_message}: garbage before end >"
353
+ raise REXML::ParseException.new(message, @source)
354
+ end
355
+ return [:notationdecl, name, *id]
356
+ elsif md = @source.match(/--(.*?)-->/um, true)
357
+ case md[1]
358
+ when /--/, /-\z/
359
+ raise REXML::ParseException.new("Malformed comment", @source)
360
+ end
361
+ return [ :comment, md[1] ] if md
340
362
  end
341
- return [:notationdecl, name, *id]
342
- when DOCTYPE_END
363
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
364
+ return [ :externalentity, match[1] ]
365
+ elsif @source.match(/\]\s*>/um, true)
343
366
  @document_status = :after_doctype
344
- @source.match( DOCTYPE_END, true )
345
367
  return [ :end_doctype ]
346
368
  end
347
369
  end
348
370
  if @document_status == :after_doctype
349
- @source.match(/\A\s*/um, true)
371
+ @source.match(/\s*/um, true)
350
372
  end
351
373
  begin
352
- @source.read if @source.buffer.size<2
353
- if @source.buffer[0] == ?<
354
- if @source.buffer[1] == ?/
374
+ start_position = @source.position
375
+ if @source.match("<", true)
376
+ # :text's read_until may remain only "<" in buffer. In the
377
+ # case, buffer is empty here. So we need to fill buffer
378
+ # here explicitly.
379
+ @source.ensure_buffer
380
+ if @source.match("/", true)
355
381
  @nsstack.shift
356
382
  last_tag = @tags.pop
357
- md = @source.match( CLOSE_MATCH, true )
383
+ md = @source.match(CLOSE_PATTERN, true)
358
384
  if md and !last_tag
359
385
  message = "Unexpected top-level end tag (got '#{md[1]}')"
360
386
  raise REXML::ParseException.new(message, @source)
361
387
  end
362
388
  if md.nil? or last_tag != md[1]
363
389
  message = "Missing end tag for '#{last_tag}'"
364
- message << " (got '#{md[1]}')" if md
390
+ message += " (got '#{md[1]}')" if md
391
+ @source.position = start_position if md.nil?
365
392
  raise REXML::ParseException.new(message, @source)
366
393
  end
367
394
  return [ :end_element, last_tag ]
368
- elsif @source.buffer[1] == ?!
369
- md = @source.match(/\A(\s*[^>]*>)/um)
395
+ elsif @source.match("!", true)
396
+ md = @source.match(/([^>]*>)/um)
370
397
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
371
398
  raise REXML::ParseException.new("Malformed node", @source) unless md
372
- if md[0][2] == ?-
373
- md = @source.match( COMMENT_PATTERN, true )
399
+ if md[0][0] == ?-
400
+ md = @source.match(/--(.*?)-->/um, true)
374
401
 
375
402
  case md[1]
376
403
  when /--/, /-\z/
@@ -379,19 +406,21 @@ module REXML
379
406
 
380
407
  return [ :comment, md[1] ] if md
381
408
  else
382
- md = @source.match( CDATA_PATTERN, true )
409
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
383
410
  return [ :cdata, md[1] ] if md
384
411
  end
385
412
  raise REXML::ParseException.new( "Declarations can only occur "+
386
413
  "in the doctype declaration.", @source)
387
- elsif @source.buffer[1] == ??
388
- return process_instruction
414
+ elsif @source.match("?", true)
415
+ return process_instruction(start_position)
389
416
  else
390
417
  # Get the next tag
391
- md = @source.match(TAG_MATCH, true)
418
+ md = @source.match(TAG_PATTERN, true)
392
419
  unless md
420
+ @source.position = start_position
393
421
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
394
422
  end
423
+ tag = md[1]
395
424
  @document_status = :in_element
396
425
  prefixes = Set.new
397
426
  prefixes << md[2] if md[2]
@@ -405,23 +434,19 @@ module REXML
405
434
  end
406
435
 
407
436
  if closed
408
- @closed = md[1]
437
+ @closed = tag
409
438
  @nsstack.shift
410
439
  else
411
- @tags.push( md[1] )
440
+ @tags.push( tag )
412
441
  end
413
- return [ :start_element, md[1], attributes ]
442
+ return [ :start_element, tag, attributes ]
414
443
  end
415
444
  else
416
- md = @source.match( TEXT_PATTERN, true )
417
- if md[0].length == 0
418
- @source.match( /(\s+)/, true )
445
+ text = @source.read_until("<")
446
+ if text.chomp!("<")
447
+ @source.position -= "<".bytesize
419
448
  end
420
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
421
- #return [ :text, "" ] if md[0].length == 0
422
- # unnormalized = Text::unnormalize( md[1], self )
423
- # return PullEvent.new( :text, md[1], unnormalized )
424
- return [ :text, md[1] ]
449
+ return [ :text, text ]
425
450
  end
426
451
  rescue REXML::UndefinedNamespaceException
427
452
  raise
@@ -463,8 +488,7 @@ module REXML
463
488
 
464
489
  # Unescapes all possible entities
465
490
  def unnormalize( string, entities=nil, filter=nil )
466
- rv = string.clone
467
- rv.gsub!( /\r\n?/, "\n" )
491
+ rv = string.gsub( /\r\n?/, "\n" )
468
492
  matches = rv.scan( REFERENCE_RE )
469
493
  return rv if matches.size == 0
470
494
  rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
@@ -499,9 +523,9 @@ module REXML
499
523
  end
500
524
 
501
525
  def parse_name(base_error_message)
502
- md = @source.match(/\A\s*#{NAME}/um, true)
526
+ md = @source.match(NAME_PATTERN, true)
503
527
  unless md
504
- if @source.match(/\A\s*\S/um)
528
+ if @source.match(/\s*\S/um)
505
529
  message = "#{base_error_message}: invalid name"
506
530
  else
507
531
  message = "#{base_error_message}: name is missing"
@@ -577,97 +601,91 @@ module REXML
577
601
  end
578
602
  end
579
603
 
580
- def process_instruction
581
- match_data = @source.match(INSTRUCTION_PATTERN, true)
604
+ def process_instruction(start_position)
605
+ match_data = @source.match(INSTRUCTION_END, true)
582
606
  unless match_data
583
607
  message = "Invalid processing instruction node"
608
+ @source.position = start_position
584
609
  raise REXML::ParseException.new(message, @source)
585
610
  end
611
+ if @document_status.nil? and match_data[1] == "xml"
612
+ content = match_data[2]
613
+ version = VERSION.match(content)
614
+ version = version[1] unless version.nil?
615
+ encoding = ENCODING.match(content)
616
+ encoding = encoding[1] unless encoding.nil?
617
+ if need_source_encoding_update?(encoding)
618
+ @source.encoding = encoding
619
+ end
620
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
621
+ encoding = "UTF-16"
622
+ end
623
+ standalone = STANDALONE.match(content)
624
+ standalone = standalone[1] unless standalone.nil?
625
+ return [ :xmldecl, version, encoding, standalone ]
626
+ end
586
627
  [:processing_instruction, match_data[1], match_data[2]]
587
628
  end
588
629
 
589
630
  def parse_attributes(prefixes, curr_ns)
590
631
  attributes = {}
591
632
  closed = false
592
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
593
- if match_data.nil?
594
- message = "Start tag isn't ended"
595
- raise REXML::ParseException.new(message, @source)
596
- end
597
-
598
- raw_attributes = match_data[1]
599
- closed = !match_data[2].nil?
600
- return attributes, closed if raw_attributes.nil?
601
- return attributes, closed if raw_attributes.empty?
602
-
603
- scanner = StringScanner.new(raw_attributes)
604
- until scanner.eos?
605
- if scanner.scan(/\s+/)
606
- break if scanner.eos?
607
- end
608
-
609
- pos = scanner.pos
610
- loop do
611
- break if scanner.scan(ATTRIBUTE_PATTERN)
612
- unless scanner.scan(QNAME)
613
- message = "Invalid attribute name: <#{scanner.rest}>"
614
- raise REXML::ParseException.new(message, @source)
615
- end
616
- name = scanner[0]
617
- unless scanner.scan(/\s*=\s*/um)
633
+ while true
634
+ if @source.match(">", true)
635
+ return attributes, closed
636
+ elsif @source.match("/>", true)
637
+ closed = true
638
+ return attributes, closed
639
+ elsif match = @source.match(QNAME, true)
640
+ name = match[1]
641
+ prefix = match[2]
642
+ local_part = match[3]
643
+
644
+ unless @source.match(/\s*=\s*/um, true)
618
645
  message = "Missing attribute equal: <#{name}>"
619
646
  raise REXML::ParseException.new(message, @source)
620
647
  end
621
- quote = scanner.scan(/['"]/)
622
- unless quote
648
+ unless match = @source.match(/(['"])/, true)
623
649
  message = "Missing attribute value start quote: <#{name}>"
624
650
  raise REXML::ParseException.new(message, @source)
625
651
  end
626
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
627
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
628
- if match_data
629
- scanner << "/" if closed
630
- scanner << ">"
631
- scanner << match_data[1]
632
- scanner.pos = pos
633
- closed = !match_data[2].nil?
634
- next
635
- end
636
- message =
637
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
652
+ quote = match[1]
653
+ start_position = @source.position
654
+ value = @source.read_until(quote)
655
+ unless value.chomp!(quote)
656
+ @source.position = start_position
657
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
638
658
  raise REXML::ParseException.new(message, @source)
639
659
  end
640
- end
641
- name = scanner[1]
642
- prefix = scanner[2]
643
- local_part = scanner[3]
644
- # quote = scanner[4]
645
- value = scanner[5]
646
- if prefix == "xmlns"
647
- if local_part == "xml"
648
- if value != "http://www.w3.org/XML/1998/namespace"
649
- msg = "The 'xml' prefix must not be bound to any other namespace "+
660
+ @source.match(/\s*/um, true)
661
+ if prefix == "xmlns"
662
+ if local_part == "xml"
663
+ if value != "http://www.w3.org/XML/1998/namespace"
664
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
665
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
666
+ raise REXML::ParseException.new( msg, @source, self )
667
+ end
668
+ elsif local_part == "xmlns"
669
+ msg = "The 'xmlns' prefix must not be declared "+
650
670
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
651
- raise REXML::ParseException.new( msg, @source, self )
671
+ raise REXML::ParseException.new( msg, @source, self)
652
672
  end
653
- elsif local_part == "xmlns"
654
- msg = "The 'xmlns' prefix must not be declared "+
655
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
656
- raise REXML::ParseException.new( msg, @source, self)
673
+ curr_ns << local_part
674
+ elsif prefix
675
+ prefixes << prefix unless prefix == "xml"
657
676
  end
658
- curr_ns << local_part
659
- elsif prefix
660
- prefixes << prefix unless prefix == "xml"
661
- end
662
677
 
663
- if attributes.has_key?(name)
664
- msg = "Duplicate attribute #{name.inspect}"
665
- raise REXML::ParseException.new(msg, @source, self)
666
- end
678
+ if attributes[name]
679
+ msg = "Duplicate attribute #{name.inspect}"
680
+ raise REXML::ParseException.new(msg, @source, self)
681
+ end
667
682
 
668
- attributes[name] = value
683
+ attributes[name] = value
684
+ else
685
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
686
+ raise REXML::ParseException.new(message, @source)
687
+ end
669
688
  end
670
- return attributes, closed
671
689
  end
672
690
  end
673
691
  end
data/lib/rexml/rexml.rb CHANGED
@@ -31,7 +31,7 @@
31
31
  module REXML
32
32
  COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
33
33
  DATE = "2008/019"
34
- VERSION = "3.2.6"
34
+ VERSION = "3.2.9"
35
35
  REVISION = ""
36
36
 
37
37
  Copyright = COPYRIGHT
data/lib/rexml/source.rb CHANGED
@@ -30,18 +30,27 @@ module REXML
30
30
  # objects and provides consumption of text
31
31
  class Source
32
32
  include Encoding
33
- # The current buffer (what we're going to read next)
34
- attr_reader :buffer
35
33
  # The line number of the last consumed text
36
34
  attr_reader :line
37
35
  attr_reader :encoding
38
36
 
37
+ module Private
38
+ PRE_DEFINED_TERM_PATTERNS = {}
39
+ pre_defined_terms = ["'", '"', "<"]
40
+ pre_defined_terms.each do |term|
41
+ PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
42
+ end
43
+ end
44
+ private_constant :Private
45
+ include Private
46
+
39
47
  # Constructor
40
48
  # @param arg must be a String, and should be a valid XML document
41
49
  # @param encoding if non-null, sets the encoding of the source to this
42
50
  # value, overriding all encoding detection
43
51
  def initialize(arg, encoding=nil)
44
- @orig = @buffer = arg
52
+ @orig = arg
53
+ @scanner = StringScanner.new(@orig)
45
54
  if encoding
46
55
  self.encoding = encoding
47
56
  else
@@ -50,6 +59,14 @@ module REXML
50
59
  @line = 0
51
60
  end
52
61
 
62
+ # The current buffer (what we're going to read next)
63
+ def buffer
64
+ @scanner.rest
65
+ end
66
+
67
+ def buffer_encoding=(encoding)
68
+ @scanner.string.force_encoding(encoding)
69
+ end
53
70
 
54
71
  # Inherited from Encoding
55
72
  # Overridden to support optimized en/decoding
@@ -58,98 +75,78 @@ module REXML
58
75
  encoding_updated
59
76
  end
60
77
 
61
- # Scans the source for a given pattern. Note, that this is not your
62
- # usual scan() method. For one thing, the pattern argument has some
63
- # requirements; for another, the source can be consumed. You can easily
64
- # confuse this method. Originally, the patterns were easier
65
- # to construct and this method more robust, because this method
66
- # generated search regexps on the fly; however, this was
67
- # computationally expensive and slowed down the entire REXML package
68
- # considerably, since this is by far the most commonly called method.
69
- # @param pattern must be a Regexp, and must be in the form of
70
- # /^\s*(#{your pattern, with no groups})(.*)/. The first group
71
- # will be returned; the second group is used if the consume flag is
72
- # set.
73
- # @param consume if true, the pattern returned will be consumed, leaving
74
- # everything after it in the Source.
75
- # @return the pattern, if found, or nil if the Source is empty or the
76
- # pattern is not found.
77
- def scan(pattern, cons=false)
78
- return nil if @buffer.nil?
79
- rv = @buffer.scan(pattern)
80
- @buffer = $' if cons and rv.size>0
81
- rv
78
+ def read(term = nil)
82
79
  end
83
80
 
84
- def read
81
+ def read_until(term)
82
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
83
+ data = @scanner.scan_until(pattern)
84
+ unless data
85
+ data = @scanner.rest
86
+ @scanner.pos = @scanner.string.bytesize
87
+ end
88
+ data
85
89
  end
86
90
 
87
- def consume( pattern )
88
- @buffer = $' if pattern.match( @buffer )
91
+ def ensure_buffer
89
92
  end
90
93
 
91
- def match_to( char, pattern )
92
- return pattern.match(@buffer)
94
+ def match(pattern, cons=false)
95
+ if cons
96
+ @scanner.scan(pattern).nil? ? nil : @scanner
97
+ else
98
+ @scanner.check(pattern).nil? ? nil : @scanner
99
+ end
93
100
  end
94
101
 
95
- def match_to_consume( char, pattern )
96
- md = pattern.match(@buffer)
97
- @buffer = $'
98
- return md
102
+ def position
103
+ @scanner.pos
99
104
  end
100
105
 
101
- def match(pattern, cons=false)
102
- md = pattern.match(@buffer)
103
- @buffer = $' if cons and md
104
- return md
106
+ def position=(pos)
107
+ @scanner.pos = pos
105
108
  end
106
109
 
107
110
  # @return true if the Source is exhausted
108
111
  def empty?
109
- @buffer == ""
110
- end
111
-
112
- def position
113
- @orig.index( @buffer )
112
+ @scanner.eos?
114
113
  end
115
114
 
116
115
  # @return the current line in the source
117
116
  def current_line
118
117
  lines = @orig.split
119
- res = lines.grep @buffer[0..30]
118
+ res = lines.grep @scanner.rest[0..30]
120
119
  res = res[-1] if res.kind_of? Array
121
120
  lines.index( res ) if res
122
121
  end
123
122
 
124
123
  private
124
+
125
125
  def detect_encoding
126
- buffer_encoding = @buffer.encoding
126
+ scanner_encoding = @scanner.rest.encoding
127
127
  detected_encoding = "UTF-8"
128
128
  begin
129
- @buffer.force_encoding("ASCII-8BIT")
130
- if @buffer[0, 2] == "\xfe\xff"
131
- @buffer[0, 2] = ""
129
+ @scanner.string.force_encoding("ASCII-8BIT")
130
+ if @scanner.scan(/\xfe\xff/n)
132
131
  detected_encoding = "UTF-16BE"
133
- elsif @buffer[0, 2] == "\xff\xfe"
134
- @buffer[0, 2] = ""
132
+ elsif @scanner.scan(/\xff\xfe/n)
135
133
  detected_encoding = "UTF-16LE"
136
- elsif @buffer[0, 3] == "\xef\xbb\xbf"
137
- @buffer[0, 3] = ""
134
+ elsif @scanner.scan(/\xef\xbb\xbf/n)
138
135
  detected_encoding = "UTF-8"
139
136
  end
140
137
  ensure
141
- @buffer.force_encoding(buffer_encoding)
138
+ @scanner.string.force_encoding(scanner_encoding)
142
139
  end
143
140
  self.encoding = detected_encoding
144
141
  end
145
142
 
146
143
  def encoding_updated
147
144
  if @encoding != 'UTF-8'
148
- @buffer = decode(@buffer)
145
+ @scanner.string = decode(@scanner.rest)
149
146
  @to_utf = true
150
147
  else
151
148
  @to_utf = false
152
- @buffer.force_encoding ::Encoding::UTF_8
149
+ @scanner.string.force_encoding(::Encoding::UTF_8)
153
150
  end
154
151
  end
155
152
  end
@@ -172,7 +169,7 @@ module REXML
172
169
  end
173
170
 
174
171
  if !@to_utf and
175
- @buffer.respond_to?(:force_encoding) and
172
+ @orig.respond_to?(:force_encoding) and
176
173
  @source.respond_to?(:external_encoding) and
177
174
  @source.external_encoding != ::Encoding::UTF_8
178
175
  @force_utf8 = true
@@ -181,65 +178,62 @@ module REXML
181
178
  end
182
179
  end
183
180
 
184
- def scan(pattern, cons=false)
185
- rv = super
186
- # You'll notice that this next section is very similar to the same
187
- # section in match(), but just a liiittle different. This is
188
- # because it is a touch faster to do it this way with scan()
189
- # than the way match() does it; enough faster to warrant duplicating
190
- # some code
191
- if rv.size == 0
192
- until @buffer =~ pattern or @source.nil?
193
- begin
194
- @buffer << readline
195
- rescue Iconv::IllegalSequence
196
- raise
197
- rescue
198
- @source = nil
199
- end
200
- end
201
- rv = super
202
- end
203
- rv.taint if RUBY_VERSION < '2.7'
204
- rv
205
- end
206
-
207
- def read
181
+ def read(term = nil)
182
+ term = encode(term) if term
208
183
  begin
209
- @buffer << readline
184
+ @scanner << readline(term)
185
+ true
210
186
  rescue Exception, NameError
211
187
  @source = nil
188
+ false
189
+ end
190
+ end
191
+
192
+ def read_until(term)
193
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
194
+ term = encode(term)
195
+ until str = @scanner.scan_until(pattern)
196
+ break if @source.nil?
197
+ break if @source.eof?
198
+ @scanner << readline(term)
199
+ end
200
+ if str
201
+ read if @scanner.eos? and !@source.eof?
202
+ str
203
+ else
204
+ rest = @scanner.rest
205
+ @scanner.pos = @scanner.string.bytesize
206
+ rest
212
207
  end
213
208
  end
214
209
 
215
- def consume( pattern )
216
- match( pattern, true )
210
+ def ensure_buffer
211
+ read if @scanner.eos? && @source
217
212
  end
218
213
 
214
+ # Note: When specifying a string for 'pattern', it must not include '>' except in the following formats:
215
+ # - ">"
216
+ # - "XXX>" (X is any string excluding '>')
219
217
  def match( pattern, cons=false )
220
- rv = pattern.match(@buffer)
221
- @buffer = $' if cons and rv
222
- while !rv and @source
223
- begin
224
- @buffer << readline
225
- rv = pattern.match(@buffer)
226
- @buffer = $' if cons and rv
227
- rescue
228
- @source = nil
218
+ while true
219
+ if cons
220
+ md = @scanner.scan(pattern)
221
+ else
222
+ md = @scanner.check(pattern)
229
223
  end
224
+ break if md
225
+ return nil if pattern.is_a?(String)
226
+ return nil if @source.nil?
227
+ return nil unless read
230
228
  end
231
- rv.taint if RUBY_VERSION < '2.7'
232
- rv
229
+
230
+ md.nil? ? nil : @scanner
233
231
  end
234
232
 
235
233
  def empty?
236
234
  super and ( @source.nil? || @source.eof? )
237
235
  end
238
236
 
239
- def position
240
- @er_source.pos rescue 0
241
- end
242
-
243
237
  # @return the current line in the source
244
238
  def current_line
245
239
  begin
@@ -263,8 +257,8 @@ module REXML
263
257
  end
264
258
 
265
259
  private
266
- def readline
267
- str = @source.readline(@line_break)
260
+ def readline(term = nil)
261
+ str = @source.readline(term || @line_break)
268
262
  if @pending_buffer
269
263
  if str.nil?
270
264
  str = @pending_buffer
@@ -290,7 +284,7 @@ module REXML
290
284
  @source.set_encoding(@encoding, @encoding)
291
285
  end
292
286
  @line_break = encode(">")
293
- @pending_buffer, @buffer = @buffer, ""
287
+ @pending_buffer, @scanner.string = @scanner.rest, ""
294
288
  @pending_buffer.force_encoding(@encoding)
295
289
  super
296
290
  end
@@ -590,6 +590,7 @@ module REXML
590
590
 
591
591
  def evaluate_predicate(expression, nodesets)
592
592
  enter(:predicate, expression, nodesets) if @debug
593
+ new_nodeset_count = 0
593
594
  new_nodesets = nodesets.collect do |nodeset|
594
595
  new_nodeset = []
595
596
  subcontext = { :size => nodeset.size }
@@ -606,17 +607,20 @@ module REXML
606
607
  result = result[0] if result.kind_of? Array and result.length == 1
607
608
  if result.kind_of? Numeric
608
609
  if result == node.position
609
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
610
+ new_nodeset_count += 1
611
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
610
612
  end
611
613
  elsif result.instance_of? Array
612
614
  if result.size > 0 and result.inject(false) {|k,s| s or k}
613
615
  if result.size > 0
614
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
616
+ new_nodeset_count += 1
617
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
615
618
  end
616
619
  end
617
620
  else
618
621
  if result
619
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
622
+ new_nodeset_count += 1
623
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
620
624
  end
621
625
  end
622
626
  end
metadata CHANGED
@@ -1,51 +1,22 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rexml
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.2.6
4
+ version: 3.2.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kouhei Sutou
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2023-07-27 00:00:00.000000000 Z
10
+ date: 2024-06-09 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
- name: bundler
13
+ name: strscan
15
14
  requirement: !ruby/object:Gem::Requirement
16
15
  requirements:
17
16
  - - ">="
18
17
  - !ruby/object:Gem::Version
19
18
  version: '0'
20
- type: :development
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- version: '0'
27
- - !ruby/object:Gem::Dependency
28
- name: rake
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :development
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: test-unit
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ">="
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :development
19
+ type: :runtime
49
20
  prerelease: false
50
21
  version_requirements: !ruby/object:Gem::Requirement
51
22
  requirements:
@@ -145,7 +116,6 @@ homepage: https://github.com/ruby/rexml
145
116
  licenses:
146
117
  - BSD-2-Clause
147
118
  metadata: {}
148
- post_install_message:
149
119
  rdoc_options:
150
120
  - "--main"
151
121
  - README.md
@@ -162,8 +132,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
162
132
  - !ruby/object:Gem::Version
163
133
  version: '0'
164
134
  requirements: []
165
- rubygems_version: 3.5.0.dev
166
- signing_key:
135
+ rubygems_version: 3.6.0.dev
167
136
  specification_version: 4
168
137
  summary: An XML toolkit for Ruby
169
138
  test_files: []