rexml 3.2.6 → 3.2.7

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of rexml might be problematic. Click here for more details.

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2583ae302aa5e698f0887a689c416e5debe0533ac472a9f96fce6a8912040fd8
4
- data.tar.gz: b0ffa6301fd899969a78e060ccaeafebfc2169e3c63ff499ebc6170468866475
3
+ metadata.gz: 8c54a60c677a865a023fc0bf1fc403419b530cbc7b306bc7da18f1489e02cd79
4
+ data.tar.gz: 5dbbae05d90151d6d4ea9d8b5a4a3097e144ab79bb346c30e75c3d62cbc05dd7
5
5
  SHA512:
6
- metadata.gz: f63fb0b84ef51e790cc6310244f2106d8c47ec9a00687c58c743afda82b60be9986d503c6f56f947db06f6758707facccd03405c4d1009376e856080aa26d0e4
7
- data.tar.gz: db62bea7391837a7ab4cfc5cb5a412ed4deb8d232653ca66d93a323a5a76383eed520cd4ced5b20204f29b04e84678791cd6f807195868f5d4a5e519a73d2aaf
6
+ metadata.gz: 5579b5fe5f6a5488d78d0ed19cdad1498aeb44bbe0b72dca9895391d1a3d1aaaed353fa14e7366d3c08ab6f723e4bb11d6cdb7a667fd310d5cdcec954bb0e77e
7
+ data.tar.gz: 2db805399a3cf3c6cf5bced1157e3c84539c5f3d12d806db951c5c3fd6aaadb86b3a4feaa0ea60a2771432009f873df3be3a688947156be9a63039a5f9bf449c
data/NEWS.md CHANGED
@@ -1,6 +1,58 @@
1
1
  # News
2
2
 
3
- ## 3.2.6 - 2023-07-27 {#version-3-2-6}
3
+ ## 3.2.7 - 2024-05-16 {#version-3-2-7}
4
+
5
+ ### Improvements
6
+
7
+ * Improve parse performance by using `StringScanner`.
8
+
9
+ * GH-106
10
+ * GH-107
11
+ * GH-108
12
+ * GH-109
13
+ * GH-112
14
+ * GH-113
15
+ * GH-114
16
+ * GH-115
17
+ * GH-116
18
+ * GH-117
19
+ * GH-118
20
+ * GH-119
21
+ * GH-121
22
+
23
+ * Patch by NAITOH Jun.
24
+
25
+ * Improved parse performance when an attribute has many `<`s.
26
+
27
+ * GH-124
28
+
29
+ ### Fixes
30
+
31
+ * XPath: Fixed a bug of `normalize_space(array)`.
32
+
33
+ * GH-110
34
+ * GH-111
35
+
36
+ * Patch by flatisland.
37
+
38
+ * XPath: Fixed a bug that wrong position is used with nested path.
39
+
40
+ * GH-110
41
+ * GH-122
42
+
43
+ * Reported by jcavalieri.
44
+ * Patch by NAITOH Jun.
45
+
46
+ * Fixed a bug that an exception message can't be generated for
47
+ invalid encoding XML.
48
+
49
+ * GH-29
50
+ * GH-123
51
+
52
+ * Reported by DuKewu.
53
+ * Patch by NAITOH Jun.
54
+
55
+ ## 3.2.6 - 2023-07-27 {#version-3-2-6}
4
56
 
5
57
  ### Improvements
6
58
 
@@ -262,11 +262,10 @@ module REXML
262
262
  string(string).length
263
263
  end
264
264
 
265
- # UNTESTED
266
265
  def Functions::normalize_space( string=nil )
267
266
  string = string(@@context[:node]) if string.nil?
268
267
  if string.kind_of? Array
269
- string.collect{|x| string.to_s.strip.gsub(/\s+/um, ' ') if string}
268
+ string.collect{|x| x.to_s.strip.gsub(/\s+/um, ' ') if x}
270
269
  else
271
270
  string.to_s.strip.gsub(/\s+/um, ' ')
272
271
  end
@@ -29,6 +29,7 @@ module REXML
29
29
  err << "\nLine: #{line}\n"
30
30
  err << "Position: #{position}\n"
31
31
  err << "Last 80 unconsumed characters:\n"
32
+ err.force_encoding("ASCII-8BIT")
32
33
  err << @source.buffer[0..80].force_encoding("ASCII-8BIT").gsub(/\n/, ' ')
33
34
  end
34
35
 
@@ -1,4 +1,4 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative '../parseexception'
3
3
  require_relative '../undefinednamespaceexception'
4
4
  require_relative '../source'
@@ -96,7 +96,7 @@ module REXML
96
96
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
97
97
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
98
98
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
99
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
99
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
100
100
 
101
101
  NOTATIONDECL_START = /\A\s*<!NOTATION/um
102
102
  EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -112,6 +112,19 @@ module REXML
112
112
  "apos" => [/&apos;/, "&apos;", "'", /'/]
113
113
  }
114
114
 
115
+ module Private
116
+ INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
117
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
118
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
119
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
120
+ NAME_PATTERN = /\s*#{NAME}/um
121
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
122
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
123
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
124
+ end
125
+ private_constant :Private
126
+ include Private
127
+
115
128
  def initialize( source )
116
129
  self.stream = source
117
130
  @listeners = []
@@ -196,181 +209,180 @@ module REXML
196
209
  return @stack.shift if @stack.size > 0
197
210
  #STDERR.puts @source.encoding
198
211
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
212
+
213
+ @source.ensure_buffer
199
214
  if @document_status == nil
200
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
201
- word = word[1] unless word.nil?
202
- #STDERR.puts "WORD = #{word.inspect}"
203
- case word
204
- when COMMENT_START
205
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
206
- when XMLDECL_START
207
- #STDERR.puts "XMLDECL"
208
- results = @source.match( XMLDECL_PATTERN, true )[1]
209
- version = VERSION.match( results )
210
- version = version[1] unless version.nil?
211
- encoding = ENCODING.match(results)
212
- encoding = encoding[1] unless encoding.nil?
213
- if need_source_encoding_update?(encoding)
214
- @source.encoding = encoding
215
- end
216
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
217
- encoding = "UTF-16"
218
- end
219
- standalone = STANDALONE.match(results)
220
- standalone = standalone[1] unless standalone.nil?
221
- return [ :xmldecl, version, encoding, standalone ]
222
- when INSTRUCTION_START
223
- return process_instruction
224
- when DOCTYPE_START
225
- base_error_message = "Malformed DOCTYPE"
226
- @source.match(DOCTYPE_START, true)
227
- @nsstack.unshift(curr_ns=Set.new)
228
- name = parse_name(base_error_message)
229
- if @source.match(/\A\s*\[/um, true)
230
- id = [nil, nil, nil]
231
- @document_status = :in_doctype
232
- elsif @source.match(/\A\s*>/um, true)
233
- id = [nil, nil, nil]
234
- @document_status = :after_doctype
235
- else
236
- id = parse_id(base_error_message,
237
- accept_external_id: true,
238
- accept_public_id: false)
239
- if id[0] == "SYSTEM"
240
- # For backward compatibility
241
- id[1], id[2] = id[2], nil
215
+ start_position = @source.position
216
+ if @source.match("<?", true)
217
+ return process_instruction(start_position)
218
+ elsif @source.match("<!", true)
219
+ if @source.match("--", true)
220
+ return [ :comment, @source.match(/(.*?)-->/um, true)[1] ]
221
+ elsif @source.match("DOCTYPE", true)
222
+ base_error_message = "Malformed DOCTYPE"
223
+ unless @source.match(/\s+/um, true)
224
+ if @source.match(">")
225
+ message = "#{base_error_message}: name is missing"
226
+ else
227
+ message = "#{base_error_message}: invalid name"
228
+ end
229
+ @source.position = start_position
230
+ raise REXML::ParseException.new(message, @source)
242
231
  end
243
- if @source.match(/\A\s*\[/um, true)
232
+ @nsstack.unshift(curr_ns=Set.new)
233
+ name = parse_name(base_error_message)
234
+ if @source.match(/\s*\[/um, true)
235
+ id = [nil, nil, nil]
244
236
  @document_status = :in_doctype
245
- elsif @source.match(/\A\s*>/um, true)
237
+ elsif @source.match(/\s*>/um, true)
238
+ id = [nil, nil, nil]
246
239
  @document_status = :after_doctype
240
+ @source.ensure_buffer
247
241
  else
248
- message = "#{base_error_message}: garbage after external ID"
249
- raise REXML::ParseException.new(message, @source)
242
+ id = parse_id(base_error_message,
243
+ accept_external_id: true,
244
+ accept_public_id: false)
245
+ if id[0] == "SYSTEM"
246
+ # For backward compatibility
247
+ id[1], id[2] = id[2], nil
248
+ end
249
+ if @source.match(/\s*\[/um, true)
250
+ @document_status = :in_doctype
251
+ elsif @source.match(/\s*>/um, true)
252
+ @document_status = :after_doctype
253
+ @source.ensure_buffer
254
+ else
255
+ message = "#{base_error_message}: garbage after external ID"
256
+ raise REXML::ParseException.new(message, @source)
257
+ end
250
258
  end
251
- end
252
- args = [:start_doctype, name, *id]
253
- if @document_status == :after_doctype
254
- @source.match(/\A\s*/um, true)
255
- @stack << [ :end_doctype ]
256
- end
257
- return args
258
- when /\A\s+/
259
- else
260
- @document_status = :after_doctype
261
- if @source.encoding == "UTF-8"
262
- @source.buffer.force_encoding(::Encoding::UTF_8)
259
+ args = [:start_doctype, name, *id]
260
+ if @document_status == :after_doctype
261
+ @source.match(/\s*/um, true)
262
+ @stack << [ :end_doctype ]
263
+ end
264
+ return args
265
+ else
266
+ message = "Invalid XML"
267
+ raise REXML::ParseException.new(message, @source)
263
268
  end
264
269
  end
265
270
  end
266
271
  if @document_status == :in_doctype
267
- md = @source.match(/\A\s*(.*?>)/um)
268
- case md[1]
269
- when SYSTEMENTITY
270
- match = @source.match( SYSTEMENTITY, true )[1]
271
- return [ :externalentity, match ]
272
-
273
- when ELEMENTDECL_START
274
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
275
-
276
- when ENTITY_START
277
- match = @source.match( ENTITYDECL, true ).to_a.compact
278
- match[0] = :entitydecl
279
- ref = false
280
- if match[1] == '%'
281
- ref = true
282
- match.delete_at 1
283
- end
284
- # Now we have to sort out what kind of entity reference this is
285
- if match[2] == 'SYSTEM'
286
- # External reference
287
- match[3] = match[3][1..-2] # PUBID
288
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
289
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
290
- elsif match[2] == 'PUBLIC'
291
- # External reference
292
- match[3] = match[3][1..-2] # PUBID
293
- match[4] = match[4][1..-2] # HREF
294
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
295
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
296
- else
297
- match[2] = match[2][1..-2]
298
- match.pop if match.size == 4
299
- # match is [ :entity, name, value ]
300
- end
301
- match << '%' if ref
302
- return match
303
- when ATTLISTDECL_START
304
- md = @source.match( ATTLISTDECL_PATTERN, true )
305
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
306
- element = md[1]
307
- contents = md[0]
308
-
309
- pairs = {}
310
- values = md[0].scan( ATTDEF_RE )
311
- values.each do |attdef|
312
- unless attdef[3] == "#IMPLIED"
313
- attdef.compact!
314
- val = attdef[3]
315
- val = attdef[4] if val == "#FIXED "
316
- pairs[attdef[0]] = val
317
- if attdef[0] =~ /^xmlns:(.*)/
318
- @nsstack[0] << $1
319
- end
272
+ @source.match(/\s*/um, true) # skip spaces
273
+ start_position = @source.position
274
+ if @source.match("<!", true)
275
+ if @source.match("ELEMENT", true)
276
+ md = @source.match(/(.*?)>/um, true)
277
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
278
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
279
+ elsif @source.match("ENTITY", true)
280
+ match = [:entitydecl, *@source.match(ENTITYDECL_PATTERN, true).captures.compact]
281
+ ref = false
282
+ if match[1] == '%'
283
+ ref = true
284
+ match.delete_at 1
320
285
  end
321
- end
322
- return [ :attlistdecl, element, pairs, contents ]
323
- when NOTATIONDECL_START
324
- base_error_message = "Malformed notation declaration"
325
- unless @source.match(/\A\s*<!NOTATION\s+/um, true)
326
- if @source.match(/\A\s*<!NOTATION\s*>/um)
327
- message = "#{base_error_message}: name is missing"
286
+ # Now we have to sort out what kind of entity reference this is
287
+ if match[2] == 'SYSTEM'
288
+ # External reference
289
+ match[3] = match[3][1..-2] # PUBID
290
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
291
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
292
+ elsif match[2] == 'PUBLIC'
293
+ # External reference
294
+ match[3] = match[3][1..-2] # PUBID
295
+ match[4] = match[4][1..-2] # HREF
296
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
297
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
328
298
  else
329
- message = "#{base_error_message}: invalid declaration name"
299
+ match[2] = match[2][1..-2]
300
+ match.pop if match.size == 4
301
+ # match is [ :entity, name, value ]
330
302
  end
331
- raise REXML::ParseException.new(message, @source)
332
- end
333
- name = parse_name(base_error_message)
334
- id = parse_id(base_error_message,
335
- accept_external_id: true,
336
- accept_public_id: true)
337
- unless @source.match(/\A\s*>/um, true)
338
- message = "#{base_error_message}: garbage before end >"
339
- raise REXML::ParseException.new(message, @source)
303
+ match << '%' if ref
304
+ return match
305
+ elsif @source.match("ATTLIST", true)
306
+ md = @source.match(ATTLISTDECL_END, true)
307
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
308
+ element = md[1]
309
+ contents = md[0]
310
+
311
+ pairs = {}
312
+ values = md[0].scan( ATTDEF_RE )
313
+ values.each do |attdef|
314
+ unless attdef[3] == "#IMPLIED"
315
+ attdef.compact!
316
+ val = attdef[3]
317
+ val = attdef[4] if val == "#FIXED "
318
+ pairs[attdef[0]] = val
319
+ if attdef[0] =~ /^xmlns:(.*)/
320
+ @nsstack[0] << $1
321
+ end
322
+ end
323
+ end
324
+ return [ :attlistdecl, element, pairs, contents ]
325
+ elsif @source.match("NOTATION", true)
326
+ base_error_message = "Malformed notation declaration"
327
+ unless @source.match(/\s+/um, true)
328
+ if @source.match(">")
329
+ message = "#{base_error_message}: name is missing"
330
+ else
331
+ message = "#{base_error_message}: invalid name"
332
+ end
333
+ @source.position = start_position
334
+ raise REXML::ParseException.new(message, @source)
335
+ end
336
+ name = parse_name(base_error_message)
337
+ id = parse_id(base_error_message,
338
+ accept_external_id: true,
339
+ accept_public_id: true)
340
+ unless @source.match(/\s*>/um, true)
341
+ message = "#{base_error_message}: garbage before end >"
342
+ raise REXML::ParseException.new(message, @source)
343
+ end
344
+ return [:notationdecl, name, *id]
345
+ elsif md = @source.match(/--(.*?)-->/um, true)
346
+ case md[1]
347
+ when /--/, /-\z/
348
+ raise REXML::ParseException.new("Malformed comment", @source)
349
+ end
350
+ return [ :comment, md[1] ] if md
340
351
  end
341
- return [:notationdecl, name, *id]
342
- when DOCTYPE_END
352
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
353
+ return [ :externalentity, match[1] ]
354
+ elsif @source.match(/\]\s*>/um, true)
343
355
  @document_status = :after_doctype
344
- @source.match( DOCTYPE_END, true )
345
356
  return [ :end_doctype ]
346
357
  end
347
358
  end
348
359
  if @document_status == :after_doctype
349
- @source.match(/\A\s*/um, true)
360
+ @source.match(/\s*/um, true)
350
361
  end
351
362
  begin
352
- @source.read if @source.buffer.size<2
353
- if @source.buffer[0] == ?<
354
- if @source.buffer[1] == ?/
363
+ start_position = @source.position
364
+ if @source.match("<", true)
365
+ if @source.match("/", true)
355
366
  @nsstack.shift
356
367
  last_tag = @tags.pop
357
- md = @source.match( CLOSE_MATCH, true )
368
+ md = @source.match(CLOSE_PATTERN, true)
358
369
  if md and !last_tag
359
370
  message = "Unexpected top-level end tag (got '#{md[1]}')"
360
371
  raise REXML::ParseException.new(message, @source)
361
372
  end
362
373
  if md.nil? or last_tag != md[1]
363
374
  message = "Missing end tag for '#{last_tag}'"
364
- message << " (got '#{md[1]}')" if md
375
+ message += " (got '#{md[1]}')" if md
376
+ @source.position = start_position if md.nil?
365
377
  raise REXML::ParseException.new(message, @source)
366
378
  end
367
379
  return [ :end_element, last_tag ]
368
- elsif @source.buffer[1] == ?!
369
- md = @source.match(/\A(\s*[^>]*>)/um)
380
+ elsif @source.match("!", true)
381
+ md = @source.match(/([^>]*>)/um)
370
382
  #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
371
383
  raise REXML::ParseException.new("Malformed node", @source) unless md
372
- if md[0][2] == ?-
373
- md = @source.match( COMMENT_PATTERN, true )
384
+ if md[0][0] == ?-
385
+ md = @source.match(/--(.*?)-->/um, true)
374
386
 
375
387
  case md[1]
376
388
  when /--/, /-\z/
@@ -379,19 +391,21 @@ module REXML
379
391
 
380
392
  return [ :comment, md[1] ] if md
381
393
  else
382
- md = @source.match( CDATA_PATTERN, true )
394
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
383
395
  return [ :cdata, md[1] ] if md
384
396
  end
385
397
  raise REXML::ParseException.new( "Declarations can only occur "+
386
398
  "in the doctype declaration.", @source)
387
- elsif @source.buffer[1] == ??
388
- return process_instruction
399
+ elsif @source.match("?", true)
400
+ return process_instruction(start_position)
389
401
  else
390
402
  # Get the next tag
391
- md = @source.match(TAG_MATCH, true)
403
+ md = @source.match(TAG_PATTERN, true)
392
404
  unless md
405
+ @source.position = start_position
393
406
  raise REXML::ParseException.new("malformed XML: missing tag start", @source)
394
407
  end
408
+ tag = md[1]
395
409
  @document_status = :in_element
396
410
  prefixes = Set.new
397
411
  prefixes << md[2] if md[2]
@@ -405,23 +419,17 @@ module REXML
405
419
  end
406
420
 
407
421
  if closed
408
- @closed = md[1]
422
+ @closed = tag
409
423
  @nsstack.shift
410
424
  else
411
- @tags.push( md[1] )
425
+ @tags.push( tag )
412
426
  end
413
- return [ :start_element, md[1], attributes ]
427
+ return [ :start_element, tag, attributes ]
414
428
  end
415
429
  else
416
- md = @source.match( TEXT_PATTERN, true )
417
- if md[0].length == 0
418
- @source.match( /(\s+)/, true )
419
- end
420
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
421
- #return [ :text, "" ] if md[0].length == 0
422
- # unnormalized = Text::unnormalize( md[1], self )
423
- # return PullEvent.new( :text, md[1], unnormalized )
424
- return [ :text, md[1] ]
430
+ md = @source.match(/([^<]*)/um, true)
431
+ text = md[1]
432
+ return [ :text, text ]
425
433
  end
426
434
  rescue REXML::UndefinedNamespaceException
427
435
  raise
@@ -463,8 +471,7 @@ module REXML
463
471
 
464
472
  # Unescapes all possible entities
465
473
  def unnormalize( string, entities=nil, filter=nil )
466
- rv = string.clone
467
- rv.gsub!( /\r\n?/, "\n" )
474
+ rv = string.gsub( /\r\n?/, "\n" )
468
475
  matches = rv.scan( REFERENCE_RE )
469
476
  return rv if matches.size == 0
470
477
  rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
@@ -499,9 +506,9 @@ module REXML
499
506
  end
500
507
 
501
508
  def parse_name(base_error_message)
502
- md = @source.match(/\A\s*#{NAME}/um, true)
509
+ md = @source.match(NAME_PATTERN, true)
503
510
  unless md
504
- if @source.match(/\A\s*\S/um)
511
+ if @source.match(/\s*\S/um)
505
512
  message = "#{base_error_message}: invalid name"
506
513
  else
507
514
  message = "#{base_error_message}: name is missing"
@@ -577,97 +584,89 @@ module REXML
577
584
  end
578
585
  end
579
586
 
580
- def process_instruction
581
- match_data = @source.match(INSTRUCTION_PATTERN, true)
587
+ def process_instruction(start_position)
588
+ match_data = @source.match(INSTRUCTION_END, true)
582
589
  unless match_data
583
590
  message = "Invalid processing instruction node"
591
+ @source.position = start_position
584
592
  raise REXML::ParseException.new(message, @source)
585
593
  end
594
+ if @document_status.nil? and match_data[1] == "xml"
595
+ content = match_data[2]
596
+ version = VERSION.match(content)
597
+ version = version[1] unless version.nil?
598
+ encoding = ENCODING.match(content)
599
+ encoding = encoding[1] unless encoding.nil?
600
+ if need_source_encoding_update?(encoding)
601
+ @source.encoding = encoding
602
+ end
603
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
604
+ encoding = "UTF-16"
605
+ end
606
+ standalone = STANDALONE.match(content)
607
+ standalone = standalone[1] unless standalone.nil?
608
+ return [ :xmldecl, version, encoding, standalone ]
609
+ end
586
610
  [:processing_instruction, match_data[1], match_data[2]]
587
611
  end
588
612
 
589
613
  def parse_attributes(prefixes, curr_ns)
590
614
  attributes = {}
591
615
  closed = false
592
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
593
- if match_data.nil?
594
- message = "Start tag isn't ended"
595
- raise REXML::ParseException.new(message, @source)
596
- end
597
-
598
- raw_attributes = match_data[1]
599
- closed = !match_data[2].nil?
600
- return attributes, closed if raw_attributes.nil?
601
- return attributes, closed if raw_attributes.empty?
602
-
603
- scanner = StringScanner.new(raw_attributes)
604
- until scanner.eos?
605
- if scanner.scan(/\s+/)
606
- break if scanner.eos?
607
- end
608
-
609
- pos = scanner.pos
610
- loop do
611
- break if scanner.scan(ATTRIBUTE_PATTERN)
612
- unless scanner.scan(QNAME)
613
- message = "Invalid attribute name: <#{scanner.rest}>"
614
- raise REXML::ParseException.new(message, @source)
615
- end
616
- name = scanner[0]
617
- unless scanner.scan(/\s*=\s*/um)
616
+ while true
617
+ if @source.match(">", true)
618
+ return attributes, closed
619
+ elsif @source.match("/>", true)
620
+ closed = true
621
+ return attributes, closed
622
+ elsif match = @source.match(QNAME, true)
623
+ name = match[1]
624
+ prefix = match[2]
625
+ local_part = match[3]
626
+
627
+ unless @source.match(/\s*=\s*/um, true)
618
628
  message = "Missing attribute equal: <#{name}>"
619
629
  raise REXML::ParseException.new(message, @source)
620
630
  end
621
- quote = scanner.scan(/['"]/)
622
- unless quote
631
+ unless match = @source.match(/(['"])/, true)
623
632
  message = "Missing attribute value start quote: <#{name}>"
624
633
  raise REXML::ParseException.new(message, @source)
625
634
  end
626
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
627
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
628
- if match_data
629
- scanner << "/" if closed
630
- scanner << ">"
631
- scanner << match_data[1]
632
- scanner.pos = pos
633
- closed = !match_data[2].nil?
634
- next
635
- end
636
- message =
637
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
635
+ quote = match[1]
636
+ value = @source.read_until(quote)
637
+ unless value.chomp!(quote)
638
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
638
639
  raise REXML::ParseException.new(message, @source)
639
640
  end
640
- end
641
- name = scanner[1]
642
- prefix = scanner[2]
643
- local_part = scanner[3]
644
- # quote = scanner[4]
645
- value = scanner[5]
646
- if prefix == "xmlns"
647
- if local_part == "xml"
648
- if value != "http://www.w3.org/XML/1998/namespace"
649
- msg = "The 'xml' prefix must not be bound to any other namespace "+
641
+ @source.match(/\s*/um, true)
642
+ if prefix == "xmlns"
643
+ if local_part == "xml"
644
+ if value != "http://www.w3.org/XML/1998/namespace"
645
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
646
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
647
+ raise REXML::ParseException.new( msg, @source, self )
648
+ end
649
+ elsif local_part == "xmlns"
650
+ msg = "The 'xmlns' prefix must not be declared "+
650
651
  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
651
- raise REXML::ParseException.new( msg, @source, self )
652
+ raise REXML::ParseException.new( msg, @source, self)
652
653
  end
653
- elsif local_part == "xmlns"
654
- msg = "The 'xmlns' prefix must not be declared "+
655
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
656
- raise REXML::ParseException.new( msg, @source, self)
654
+ curr_ns << local_part
655
+ elsif prefix
656
+ prefixes << prefix unless prefix == "xml"
657
657
  end
658
- curr_ns << local_part
659
- elsif prefix
660
- prefixes << prefix unless prefix == "xml"
661
- end
662
658
 
663
- if attributes.has_key?(name)
664
- msg = "Duplicate attribute #{name.inspect}"
665
- raise REXML::ParseException.new(msg, @source, self)
666
- end
659
+ if attributes[name]
660
+ msg = "Duplicate attribute #{name.inspect}"
661
+ raise REXML::ParseException.new(msg, @source, self)
662
+ end
667
663
 
668
- attributes[name] = value
664
+ attributes[name] = value
665
+ else
666
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
667
+ raise REXML::ParseException.new(message, @source)
668
+ end
669
669
  end
670
- return attributes, closed
671
670
  end
672
671
  end
673
672
  end
data/lib/rexml/rexml.rb CHANGED
@@ -31,7 +31,7 @@
31
31
  module REXML
32
32
  COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
33
33
  DATE = "2008/019"
34
- VERSION = "3.2.6"
34
+ VERSION = "3.2.7"
35
35
  REVISION = ""
36
36
 
37
37
  Copyright = COPYRIGHT
data/lib/rexml/source.rb CHANGED
@@ -30,8 +30,6 @@ module REXML
30
30
  # objects and provides consumption of text
31
31
  class Source
32
32
  include Encoding
33
- # The current buffer (what we're going to read next)
34
- attr_reader :buffer
35
33
  # The line number of the last consumed text
36
34
  attr_reader :line
37
35
  attr_reader :encoding
@@ -41,7 +39,8 @@ module REXML
41
39
  # @param encoding if non-null, sets the encoding of the source to this
42
40
  # value, overriding all encoding detection
43
41
  def initialize(arg, encoding=nil)
44
- @orig = @buffer = arg
42
+ @orig = arg
43
+ @scanner = StringScanner.new(@orig)
45
44
  if encoding
46
45
  self.encoding = encoding
47
46
  else
@@ -50,6 +49,14 @@ module REXML
50
49
  @line = 0
51
50
  end
52
51
 
52
+ # The current buffer (what we're going to read next)
53
+ def buffer
54
+ @scanner.rest
55
+ end
56
+
57
+ def buffer_encoding=(encoding)
58
+ @scanner.string.force_encoding(encoding)
59
+ end
53
60
 
54
61
  # Inherited from Encoding
55
62
  # Overridden to support optimized en/decoding
@@ -58,98 +65,72 @@ module REXML
58
65
  encoding_updated
59
66
  end
60
67
 
61
- # Scans the source for a given pattern. Note, that this is not your
62
- # usual scan() method. For one thing, the pattern argument has some
63
- # requirements; for another, the source can be consumed. You can easily
64
- # confuse this method. Originally, the patterns were easier
65
- # to construct and this method more robust, because this method
66
- # generated search regexps on the fly; however, this was
67
- # computationally expensive and slowed down the entire REXML package
68
- # considerably, since this is by far the most commonly called method.
69
- # @param pattern must be a Regexp, and must be in the form of
70
- # /^\s*(#{your pattern, with no groups})(.*)/. The first group
71
- # will be returned; the second group is used if the consume flag is
72
- # set.
73
- # @param consume if true, the pattern returned will be consumed, leaving
74
- # everything after it in the Source.
75
- # @return the pattern, if found, or nil if the Source is empty or the
76
- # pattern is not found.
77
- def scan(pattern, cons=false)
78
- return nil if @buffer.nil?
79
- rv = @buffer.scan(pattern)
80
- @buffer = $' if cons and rv.size>0
81
- rv
68
+ def read(term = nil)
82
69
  end
83
70
 
84
- def read
71
+ def read_until(term)
72
+ @scanner.scan_until(Regexp.union(term)) or @scanner.rest
85
73
  end
86
74
 
87
- def consume( pattern )
88
- @buffer = $' if pattern.match( @buffer )
75
+ def ensure_buffer
89
76
  end
90
77
 
91
- def match_to( char, pattern )
92
- return pattern.match(@buffer)
78
+ def match(pattern, cons=false)
79
+ if cons
80
+ @scanner.scan(pattern).nil? ? nil : @scanner
81
+ else
82
+ @scanner.check(pattern).nil? ? nil : @scanner
83
+ end
93
84
  end
94
85
 
95
- def match_to_consume( char, pattern )
96
- md = pattern.match(@buffer)
97
- @buffer = $'
98
- return md
86
+ def position
87
+ @scanner.pos
99
88
  end
100
89
 
101
- def match(pattern, cons=false)
102
- md = pattern.match(@buffer)
103
- @buffer = $' if cons and md
104
- return md
90
+ def position=(pos)
91
+ @scanner.pos = pos
105
92
  end
106
93
 
107
94
  # @return true if the Source is exhausted
108
95
  def empty?
109
- @buffer == ""
110
- end
111
-
112
- def position
113
- @orig.index( @buffer )
96
+ @scanner.eos?
114
97
  end
115
98
 
116
99
  # @return the current line in the source
117
100
  def current_line
118
101
  lines = @orig.split
119
- res = lines.grep @buffer[0..30]
102
+ res = lines.grep @scanner.rest[0..30]
120
103
  res = res[-1] if res.kind_of? Array
121
104
  lines.index( res ) if res
122
105
  end
123
106
 
124
107
  private
108
+
125
109
  def detect_encoding
126
- buffer_encoding = @buffer.encoding
110
+ scanner_encoding = @scanner.rest.encoding
127
111
  detected_encoding = "UTF-8"
128
112
  begin
129
- @buffer.force_encoding("ASCII-8BIT")
130
- if @buffer[0, 2] == "\xfe\xff"
131
- @buffer[0, 2] = ""
113
+ @scanner.string.force_encoding("ASCII-8BIT")
114
+ if @scanner.scan(/\xfe\xff/n)
132
115
  detected_encoding = "UTF-16BE"
133
- elsif @buffer[0, 2] == "\xff\xfe"
134
- @buffer[0, 2] = ""
116
+ elsif @scanner.scan(/\xff\xfe/n)
135
117
  detected_encoding = "UTF-16LE"
136
- elsif @buffer[0, 3] == "\xef\xbb\xbf"
137
- @buffer[0, 3] = ""
118
+ elsif @scanner.scan(/\xef\xbb\xbf/n)
138
119
  detected_encoding = "UTF-8"
139
120
  end
140
121
  ensure
141
- @buffer.force_encoding(buffer_encoding)
122
+ @scanner.string.force_encoding(scanner_encoding)
142
123
  end
143
124
  self.encoding = detected_encoding
144
125
  end
145
126
 
146
127
  def encoding_updated
147
128
  if @encoding != 'UTF-8'
148
- @buffer = decode(@buffer)
129
+ @scanner.string = decode(@scanner.rest)
149
130
  @to_utf = true
150
131
  else
151
132
  @to_utf = false
152
- @buffer.force_encoding ::Encoding::UTF_8
133
+ @scanner.string.force_encoding(::Encoding::UTF_8)
153
134
  end
154
135
  end
155
136
  end
@@ -172,7 +153,7 @@ module REXML
172
153
  end
173
154
 
174
155
  if !@to_utf and
175
- @buffer.respond_to?(:force_encoding) and
156
+ @orig.respond_to?(:force_encoding) and
176
157
  @source.respond_to?(:external_encoding) and
177
158
  @source.external_encoding != ::Encoding::UTF_8
178
159
  @force_utf8 = true
@@ -181,65 +162,58 @@ module REXML
181
162
  end
182
163
  end
183
164
 
184
- def scan(pattern, cons=false)
185
- rv = super
186
- # You'll notice that this next section is very similar to the same
187
- # section in match(), but just a liiittle different. This is
188
- # because it is a touch faster to do it this way with scan()
189
- # than the way match() does it; enough faster to warrant duplicating
190
- # some code
191
- if rv.size == 0
192
- until @buffer =~ pattern or @source.nil?
193
- begin
194
- @buffer << readline
195
- rescue Iconv::IllegalSequence
196
- raise
197
- rescue
198
- @source = nil
199
- end
200
- end
201
- rv = super
165
+ def read(term = nil)
166
+ begin
167
+ @scanner << readline(term)
168
+ true
169
+ rescue Exception, NameError
170
+ @source = nil
171
+ false
202
172
  end
203
- rv.taint if RUBY_VERSION < '2.7'
204
- rv
205
173
  end
206
174
 
207
- def read
175
+ def read_until(term)
176
+ pattern = Regexp.union(term)
177
+ data = []
208
178
  begin
209
- @buffer << readline
210
- rescue Exception, NameError
211
- @source = nil
179
+ until str = @scanner.scan_until(pattern)
180
+ @scanner << readline(term)
181
+ end
182
+ rescue EOFError
183
+ @scanner.rest
184
+ else
185
+ read if @scanner.eos? and !@source.eof?
186
+ str
212
187
  end
213
188
  end
214
189
 
215
- def consume( pattern )
216
- match( pattern, true )
190
+ def ensure_buffer
191
+ read if @scanner.eos? && @source
217
192
  end
218
193
 
194
+ # Note: When specifying a string for 'pattern', it must not include '>' except in the following formats:
195
+ # - ">"
196
+ # - "XXX>" (X is any string excluding '>')
219
197
  def match( pattern, cons=false )
220
- rv = pattern.match(@buffer)
221
- @buffer = $' if cons and rv
222
- while !rv and @source
223
- begin
224
- @buffer << readline
225
- rv = pattern.match(@buffer)
226
- @buffer = $' if cons and rv
227
- rescue
228
- @source = nil
198
+ while true
199
+ if cons
200
+ md = @scanner.scan(pattern)
201
+ else
202
+ md = @scanner.check(pattern)
229
203
  end
204
+ break if md
205
+ return nil if pattern.is_a?(String)
206
+ return nil if @source.nil?
207
+ return nil unless read
230
208
  end
231
- rv.taint if RUBY_VERSION < '2.7'
232
- rv
209
+
210
+ md.nil? ? nil : @scanner
233
211
  end
234
212
 
235
213
  def empty?
236
214
  super and ( @source.nil? || @source.eof? )
237
215
  end
238
216
 
239
- def position
240
- @er_source.pos rescue 0
241
- end
242
-
243
217
  # @return the current line in the source
244
218
  def current_line
245
219
  begin
@@ -263,8 +237,8 @@ module REXML
263
237
  end
264
238
 
265
239
  private
266
- def readline
267
- str = @source.readline(@line_break)
240
+ def readline(term = nil)
241
+ str = @source.readline(term || @line_break)
268
242
  if @pending_buffer
269
243
  if str.nil?
270
244
  str = @pending_buffer
@@ -290,7 +264,7 @@ module REXML
290
264
  @source.set_encoding(@encoding, @encoding)
291
265
  end
292
266
  @line_break = encode(">")
293
- @pending_buffer, @buffer = @buffer, ""
267
+ @pending_buffer, @scanner.string = @scanner.rest, ""
294
268
  @pending_buffer.force_encoding(@encoding)
295
269
  super
296
270
  end
@@ -590,6 +590,7 @@ module REXML
590
590
 
591
591
  def evaluate_predicate(expression, nodesets)
592
592
  enter(:predicate, expression, nodesets) if @debug
593
+ new_nodeset_count = 0
593
594
  new_nodesets = nodesets.collect do |nodeset|
594
595
  new_nodeset = []
595
596
  subcontext = { :size => nodeset.size }
@@ -606,17 +607,20 @@ module REXML
606
607
  result = result[0] if result.kind_of? Array and result.length == 1
607
608
  if result.kind_of? Numeric
608
609
  if result == node.position
609
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
610
+ new_nodeset_count += 1
611
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
610
612
  end
611
613
  elsif result.instance_of? Array
612
614
  if result.size > 0 and result.inject(false) {|k,s| s or k}
613
615
  if result.size > 0
614
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
616
+ new_nodeset_count += 1
617
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
615
618
  end
616
619
  end
617
620
  else
618
621
  if result
619
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
622
+ new_nodeset_count += 1
623
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
620
624
  end
621
625
  end
622
626
  end
metadata CHANGED
@@ -1,57 +1,28 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rexml
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.2.6
4
+ version: 3.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kouhei Sutou
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2023-07-27 00:00:00.000000000 Z
10
+ date: 2024-05-16 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
- name: bundler
13
+ name: strscan
15
14
  requirement: !ruby/object:Gem::Requirement
16
15
  requirements:
17
16
  - - ">="
18
17
  - !ruby/object:Gem::Version
19
- version: '0'
20
- type: :development
18
+ version: 3.0.9
19
+ type: :runtime
21
20
  prerelease: false
22
21
  version_requirements: !ruby/object:Gem::Requirement
23
22
  requirements:
24
23
  - - ">="
25
24
  - !ruby/object:Gem::Version
26
- version: '0'
27
- - !ruby/object:Gem::Dependency
28
- name: rake
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :development
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: test-unit
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ">="
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- version: '0'
25
+ version: 3.0.9
55
26
  description: An XML toolkit for Ruby
56
27
  email:
57
28
  - kou@cozmixng.org
@@ -145,7 +116,6 @@ homepage: https://github.com/ruby/rexml
145
116
  licenses:
146
117
  - BSD-2-Clause
147
118
  metadata: {}
148
- post_install_message:
149
119
  rdoc_options:
150
120
  - "--main"
151
121
  - README.md
@@ -162,8 +132,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
162
132
  - !ruby/object:Gem::Version
163
133
  version: '0'
164
134
  requirements: []
165
- rubygems_version: 3.5.0.dev
166
- signing_key:
135
+ rubygems_version: 3.6.0.dev
167
136
  specification_version: 4
168
137
  summary: An XML toolkit for Ruby
169
138
  test_files: []