rexml 3.2.5 → 3.3.8

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of rexml might be problematic. Click here for more details.

data/lib/rexml/source.rb CHANGED
@@ -1,8 +1,28 @@
1
1
  # coding: US-ASCII
2
2
  # frozen_string_literal: false
3
+
4
+ require "strscan"
5
+
3
6
  require_relative 'encoding'
4
7
 
5
8
  module REXML
9
+ if StringScanner::Version < "1.0.0"
10
+ module StringScannerCheckScanString
11
+ refine StringScanner do
12
+ def check(pattern)
13
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
14
+ super(pattern)
15
+ end
16
+
17
+ def scan(pattern)
18
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
19
+ super(pattern)
20
+ end
21
+ end
22
+ end
23
+ using StringScannerCheckScanString
24
+ end
25
+
6
26
  # Generates Source-s. USE THIS CLASS.
7
27
  class SourceFactory
8
28
  # Generates a Source object
@@ -30,18 +50,27 @@ module REXML
30
50
  # objects and provides consumption of text
31
51
  class Source
32
52
  include Encoding
33
- # The current buffer (what we're going to read next)
34
- attr_reader :buffer
35
53
  # The line number of the last consumed text
36
54
  attr_reader :line
37
55
  attr_reader :encoding
38
56
 
57
+ module Private
58
+ SCANNER_RESET_SIZE = 100000
59
+ PRE_DEFINED_TERM_PATTERNS = {}
60
+ pre_defined_terms = ["'", '"', "<"]
61
+ pre_defined_terms.each do |term|
62
+ PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
63
+ end
64
+ end
65
+ private_constant :Private
66
+
39
67
  # Constructor
40
68
  # @param arg must be a String, and should be a valid XML document
41
69
  # @param encoding if non-null, sets the encoding of the source to this
42
70
  # value, overriding all encoding detection
43
71
  def initialize(arg, encoding=nil)
44
- @orig = @buffer = arg
72
+ @orig = arg
73
+ @scanner = StringScanner.new(@orig)
45
74
  if encoding
46
75
  self.encoding = encoding
47
76
  else
@@ -50,6 +79,20 @@ module REXML
50
79
  @line = 0
51
80
  end
52
81
 
82
+ # The current buffer (what we're going to read next)
83
+ def buffer
84
+ @scanner.rest
85
+ end
86
+
87
+ def drop_parsed_content
88
+ if @scanner.pos > Private::SCANNER_RESET_SIZE
89
+ @scanner.string = @scanner.rest
90
+ end
91
+ end
92
+
93
+ def buffer_encoding=(encoding)
94
+ @scanner.string.force_encoding(encoding)
95
+ end
53
96
 
54
97
  # Inherited from Encoding
55
98
  # Overridden to support optimized en/decoding
@@ -58,98 +101,78 @@ module REXML
58
101
  encoding_updated
59
102
  end
60
103
 
61
- # Scans the source for a given pattern. Note, that this is not your
62
- # usual scan() method. For one thing, the pattern argument has some
63
- # requirements; for another, the source can be consumed. You can easily
64
- # confuse this method. Originally, the patterns were easier
65
- # to construct and this method more robust, because this method
66
- # generated search regexps on the fly; however, this was
67
- # computationally expensive and slowed down the entire REXML package
68
- # considerably, since this is by far the most commonly called method.
69
- # @param pattern must be a Regexp, and must be in the form of
70
- # /^\s*(#{your pattern, with no groups})(.*)/. The first group
71
- # will be returned; the second group is used if the consume flag is
72
- # set.
73
- # @param consume if true, the pattern returned will be consumed, leaving
74
- # everything after it in the Source.
75
- # @return the pattern, if found, or nil if the Source is empty or the
76
- # pattern is not found.
77
- def scan(pattern, cons=false)
78
- return nil if @buffer.nil?
79
- rv = @buffer.scan(pattern)
80
- @buffer = $' if cons and rv.size>0
81
- rv
104
+ def read(term = nil)
82
105
  end
83
106
 
84
- def read
107
+ def read_until(term)
108
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
109
+ data = @scanner.scan_until(pattern)
110
+ unless data
111
+ data = @scanner.rest
112
+ @scanner.pos = @scanner.string.bytesize
113
+ end
114
+ data
85
115
  end
86
116
 
87
- def consume( pattern )
88
- @buffer = $' if pattern.match( @buffer )
117
+ def ensure_buffer
89
118
  end
90
119
 
91
- def match_to( char, pattern )
92
- return pattern.match(@buffer)
120
+ def match(pattern, cons=false)
121
+ if cons
122
+ @scanner.scan(pattern).nil? ? nil : @scanner
123
+ else
124
+ @scanner.check(pattern).nil? ? nil : @scanner
125
+ end
93
126
  end
94
127
 
95
- def match_to_consume( char, pattern )
96
- md = pattern.match(@buffer)
97
- @buffer = $'
98
- return md
128
+ def position
129
+ @scanner.pos
99
130
  end
100
131
 
101
- def match(pattern, cons=false)
102
- md = pattern.match(@buffer)
103
- @buffer = $' if cons and md
104
- return md
132
+ def position=(pos)
133
+ @scanner.pos = pos
105
134
  end
106
135
 
107
136
  # @return true if the Source is exhausted
108
137
  def empty?
109
- @buffer == ""
110
- end
111
-
112
- def position
113
- @orig.index( @buffer )
138
+ @scanner.eos?
114
139
  end
115
140
 
116
141
  # @return the current line in the source
117
142
  def current_line
118
143
  lines = @orig.split
119
- res = lines.grep @buffer[0..30]
144
+ res = lines.grep @scanner.rest[0..30]
120
145
  res = res[-1] if res.kind_of? Array
121
146
  lines.index( res ) if res
122
147
  end
123
148
 
124
149
  private
150
+
125
151
  def detect_encoding
126
- buffer_encoding = @buffer.encoding
152
+ scanner_encoding = @scanner.rest.encoding
127
153
  detected_encoding = "UTF-8"
128
154
  begin
129
- @buffer.force_encoding("ASCII-8BIT")
130
- if @buffer[0, 2] == "\xfe\xff"
131
- @buffer[0, 2] = ""
155
+ @scanner.string.force_encoding("ASCII-8BIT")
156
+ if @scanner.scan(/\xfe\xff/n)
132
157
  detected_encoding = "UTF-16BE"
133
- elsif @buffer[0, 2] == "\xff\xfe"
134
- @buffer[0, 2] = ""
158
+ elsif @scanner.scan(/\xff\xfe/n)
135
159
  detected_encoding = "UTF-16LE"
136
- elsif @buffer[0, 3] == "\xef\xbb\xbf"
137
- @buffer[0, 3] = ""
160
+ elsif @scanner.scan(/\xef\xbb\xbf/n)
138
161
  detected_encoding = "UTF-8"
139
162
  end
140
163
  ensure
141
- @buffer.force_encoding(buffer_encoding)
164
+ @scanner.string.force_encoding(scanner_encoding)
142
165
  end
143
166
  self.encoding = detected_encoding
144
167
  end
145
168
 
146
169
  def encoding_updated
147
170
  if @encoding != 'UTF-8'
148
- @buffer = decode(@buffer)
171
+ @scanner.string = decode(@scanner.rest)
149
172
  @to_utf = true
150
173
  else
151
174
  @to_utf = false
152
- @buffer.force_encoding ::Encoding::UTF_8
175
+ @scanner.string.force_encoding(::Encoding::UTF_8)
153
176
  end
154
177
  end
155
178
  end
@@ -172,7 +195,7 @@ module REXML
172
195
  end
173
196
 
174
197
  if !@to_utf and
175
- @buffer.respond_to?(:force_encoding) and
198
+ @orig.respond_to?(:force_encoding) and
176
199
  @source.respond_to?(:external_encoding) and
177
200
  @source.external_encoding != ::Encoding::UTF_8
178
201
  @force_utf8 = true
@@ -181,65 +204,72 @@ module REXML
181
204
  end
182
205
  end
183
206
 
184
- def scan(pattern, cons=false)
185
- rv = super
186
- # You'll notice that this next section is very similar to the same
187
- # section in match(), but just a liiittle different. This is
188
- # because it is a touch faster to do it this way with scan()
189
- # than the way match() does it; enough faster to warrant duplicating
190
- # some code
191
- if rv.size == 0
192
- until @buffer =~ pattern or @source.nil?
193
- begin
194
- @buffer << readline
195
- rescue Iconv::IllegalSequence
196
- raise
197
- rescue
198
- @source = nil
207
+ def read(term = nil, min_bytes = 1)
208
+ term = encode(term) if term
209
+ begin
210
+ str = readline(term)
211
+ @scanner << str
212
+ read_bytes = str.bytesize
213
+ begin
214
+ while read_bytes < min_bytes
215
+ str = readline(term)
216
+ @scanner << str
217
+ read_bytes += str.bytesize
199
218
  end
219
+ rescue IOError
200
220
  end
201
- rv = super
221
+ true
222
+ rescue Exception, NameError
223
+ @source = nil
224
+ false
202
225
  end
203
- rv.taint if RUBY_VERSION < '2.7'
204
- rv
205
226
  end
206
227
 
207
- def read
208
- begin
209
- @buffer << readline
210
- rescue Exception, NameError
211
- @source = nil
228
+ def read_until(term)
229
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
230
+ term = encode(term)
231
+ until str = @scanner.scan_until(pattern)
232
+ break if @source.nil?
233
+ break if @source.eof?
234
+ @scanner << readline(term)
235
+ end
236
+ if str
237
+ read if @scanner.eos? and !@source.eof?
238
+ str
239
+ else
240
+ rest = @scanner.rest
241
+ @scanner.pos = @scanner.string.bytesize
242
+ rest
212
243
  end
213
244
  end
214
245
 
215
- def consume( pattern )
216
- match( pattern, true )
246
+ def ensure_buffer
247
+ read if @scanner.eos? && @source
217
248
  end
218
249
 
219
250
  def match( pattern, cons=false )
220
- rv = pattern.match(@buffer)
221
- @buffer = $' if cons and rv
222
- while !rv and @source
223
- begin
224
- @buffer << readline
225
- rv = pattern.match(@buffer)
226
- @buffer = $' if cons and rv
227
- rescue
228
- @source = nil
251
+ # To avoid performance issue, we need to increase bytes to read per scan
252
+ min_bytes = 1
253
+ while true
254
+ if cons
255
+ md = @scanner.scan(pattern)
256
+ else
257
+ md = @scanner.check(pattern)
229
258
  end
259
+ break if md
260
+ return nil if pattern.is_a?(String)
261
+ return nil if @source.nil?
262
+ return nil unless read(nil, min_bytes)
263
+ min_bytes *= 2
230
264
  end
231
- rv.taint if RUBY_VERSION < '2.7'
232
- rv
265
+
266
+ md.nil? ? nil : @scanner
233
267
  end
234
268
 
235
269
  def empty?
236
270
  super and ( @source.nil? || @source.eof? )
237
271
  end
238
272
 
239
- def position
240
- @er_source.pos rescue 0
241
- end
242
-
243
273
  # @return the current line in the source
244
274
  def current_line
245
275
  begin
@@ -263,8 +293,8 @@ module REXML
263
293
  end
264
294
 
265
295
  private
266
- def readline
267
- str = @source.readline(@line_break)
296
+ def readline(term = nil)
297
+ str = @source.readline(term || @line_break)
268
298
  if @pending_buffer
269
299
  if str.nil?
270
300
  str = @pending_buffer
@@ -290,7 +320,7 @@ module REXML
290
320
  @source.set_encoding(@encoding, @encoding)
291
321
  end
292
322
  @line_break = encode(">")
293
- @pending_buffer, @buffer = @buffer, ""
323
+ @pending_buffer, @scanner.string = @scanner.rest, ""
294
324
  @pending_buffer.force_encoding(@encoding)
295
325
  super
296
326
  end
data/lib/rexml/text.rb CHANGED
@@ -1,4 +1,4 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative 'security'
3
3
  require_relative 'entity'
4
4
  require_relative 'doctype'
@@ -131,7 +131,7 @@ module REXML
131
131
  def Text.check string, pattern, doctype
132
132
 
133
133
  # illegal anywhere
134
- if string !~ VALID_XML_CHARS
134
+ if !string.match?(VALID_XML_CHARS)
135
135
  if String.method_defined? :encode
136
136
  string.chars.each do |c|
137
137
  case c.ord
@@ -151,25 +151,45 @@ module REXML
151
151
  end
152
152
  end
153
153
 
154
- # context sensitive
155
- string.scan(pattern) do
156
- if $1[-1] != ?;
157
- raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
158
- elsif $1[0] == ?&
159
- if $5 and $5[0] == ?#
160
- case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
161
- when *VALID_CHAR
154
+ pos = 0
155
+ while (index = string.index(/<|&/, pos))
156
+ if string[index] == "<"
157
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
158
+ end
159
+
160
+ unless (end_index = string.index(/[^\s];/, index + 1))
161
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
162
+ end
163
+
164
+ value = string[(index + 1)..end_index]
165
+ if /\s/.match?(value)
166
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
167
+ end
168
+
169
+ if value[0] == "#"
170
+ character_reference = value[1..-1]
171
+
172
+ unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference))
173
+ if character_reference[0] == "x" || character_reference[-1] == "x"
174
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
162
175
  else
163
- raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
176
+ raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
164
177
  end
165
- # FIXME: below can't work but this needs API change.
166
- # elsif @parent and $3 and !SUBSTITUTES.include?($1)
167
- # if !doctype or !doctype.entities.has_key?($3)
168
- # raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
169
- # end
170
178
  end
179
+
180
+ case (character_reference[0] == "x" ? character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i)
181
+ when *VALID_CHAR
182
+ else
183
+ raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
184
+ end
185
+ elsif !(/\A#{Entity::NAME}\z/um.match?(value))
186
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
171
187
  end
188
+
189
+ pos = end_index + 1
172
190
  end
191
+
192
+ string
173
193
  end
174
194
 
175
195
  def node_type
@@ -248,7 +268,8 @@ module REXML
248
268
  # u = Text.new( "sean russell", false, nil, true )
249
269
  # u.value #-> "sean russell"
250
270
  def value
251
- @unnormalized ||= Text::unnormalize( @string, doctype )
271
+ @unnormalized ||= Text::unnormalize(@string, doctype,
272
+ entity_expansion_text_limit: document&.entity_expansion_text_limit)
252
273
  end
253
274
 
254
275
  # Sets the contents of this text node. This expects the text to be
@@ -371,7 +392,7 @@ module REXML
371
392
  copy = input.to_s
372
393
  # Doing it like this rather than in a loop improves the speed
373
394
  #copy = copy.gsub( EREFERENCE, '&amp;' )
374
- copy = copy.gsub( "&", "&amp;" )
395
+ copy = copy.gsub( "&", "&amp;" ) if copy.include?("&")
375
396
  if doctype
376
397
  # Replace all ampersands that aren't part of an entity
377
398
  doctype.entities.each_value do |entity|
@@ -382,18 +403,21 @@ module REXML
382
403
  else
383
404
  # Replace all ampersands that aren't part of an entity
384
405
  DocType::DEFAULT_ENTITIES.each_value do |entity|
385
- copy = copy.gsub(entity.value, "&#{entity.name};" )
406
+ if copy.include?(entity.value)
407
+ copy = copy.gsub(entity.value, "&#{entity.name};" )
408
+ end
386
409
  end
387
410
  end
388
411
  copy
389
412
  end
390
413
 
391
414
  # Unescapes all possible entities
392
- def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
415
+ def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil )
416
+ entity_expansion_text_limit ||= Security.entity_expansion_text_limit
393
417
  sum = 0
394
418
  string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) {
395
419
  s = Text.expand($&, doctype, filter)
396
- if sum + s.bytesize > Security.entity_expansion_text_limit
420
+ if sum + s.bytesize > entity_expansion_text_limit
397
421
  raise "entity expansion has grown too large"
398
422
  else
399
423
  sum += s.bytesize
@@ -590,6 +590,7 @@ module REXML
590
590
 
591
591
  def evaluate_predicate(expression, nodesets)
592
592
  enter(:predicate, expression, nodesets) if @debug
593
+ new_nodeset_count = 0
593
594
  new_nodesets = nodesets.collect do |nodeset|
594
595
  new_nodeset = []
595
596
  subcontext = { :size => nodeset.size }
@@ -606,17 +607,20 @@ module REXML
606
607
  result = result[0] if result.kind_of? Array and result.length == 1
607
608
  if result.kind_of? Numeric
608
609
  if result == node.position
609
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
610
+ new_nodeset_count += 1
611
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
610
612
  end
611
613
  elsif result.instance_of? Array
612
614
  if result.size > 0 and result.inject(false) {|k,s| s or k}
613
615
  if result.size > 0
614
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
616
+ new_nodeset_count += 1
617
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
615
618
  end
616
619
  end
617
620
  else
618
621
  if result
619
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
622
+ new_nodeset_count += 1
623
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
620
624
  end
621
625
  end
622
626
  end
metadata CHANGED
@@ -1,57 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rexml
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.2.5
4
+ version: 3.3.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kouhei Sutou
8
- autorequire:
9
- bindir: exe
8
+ bindir: bin
10
9
  cert_chain: []
11
- date: 2021-04-05 00:00:00.000000000 Z
12
- dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: bundler
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ">="
18
- - !ruby/object:Gem::Version
19
- version: '0'
20
- type: :development
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- version: '0'
27
- - !ruby/object:Gem::Dependency
28
- name: rake
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :development
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: test-unit
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ">="
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- version: '0'
10
+ date: 2024-09-29 00:00:00.000000000 Z
11
+ dependencies: []
55
12
  description: An XML toolkit for Ruby
56
13
  email:
57
14
  - kou@cozmixng.org
@@ -73,6 +30,7 @@ extra_rdoc_files:
73
30
  - doc/rexml/tasks/tocs/master_toc.rdoc
74
31
  - doc/rexml/tasks/tocs/node_toc.rdoc
75
32
  - doc/rexml/tasks/tocs/parent_toc.rdoc
33
+ - doc/rexml/tutorial.rdoc
76
34
  files:
77
35
  - LICENSE.txt
78
36
  - NEWS.md
@@ -89,6 +47,7 @@ files:
89
47
  - doc/rexml/tasks/tocs/master_toc.rdoc
90
48
  - doc/rexml/tasks/tocs/node_toc.rdoc
91
49
  - doc/rexml/tasks/tocs/parent_toc.rdoc
50
+ - doc/rexml/tutorial.rdoc
92
51
  - lib/rexml.rb
93
52
  - lib/rexml/attlistdecl.rb
94
53
  - lib/rexml/attribute.rb
@@ -142,8 +101,8 @@ files:
142
101
  homepage: https://github.com/ruby/rexml
143
102
  licenses:
144
103
  - BSD-2-Clause
145
- metadata: {}
146
- post_install_message:
104
+ metadata:
105
+ changelog_uri: https://github.com/ruby/rexml/releases/tag/v3.3.8
147
106
  rdoc_options:
148
107
  - "--main"
149
108
  - README.md
@@ -153,15 +112,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
153
112
  requirements:
154
113
  - - ">="
155
114
  - !ruby/object:Gem::Version
156
- version: '0'
115
+ version: 2.5.0
157
116
  required_rubygems_version: !ruby/object:Gem::Requirement
158
117
  requirements:
159
118
  - - ">="
160
119
  - !ruby/object:Gem::Version
161
120
  version: '0'
162
121
  requirements: []
163
- rubygems_version: 3.2.3
164
- signing_key:
122
+ rubygems_version: 3.6.0.dev
165
123
  specification_version: 4
166
124
  summary: An XML toolkit for Ruby
167
125
  test_files: []