rexml 3.2.5 → 3.3.6

Sign up to get free protection for your applications and to get access to all the features.
data/lib/rexml/source.rb CHANGED
@@ -1,8 +1,28 @@
1
1
  # coding: US-ASCII
2
2
  # frozen_string_literal: false
3
+
4
+ require "strscan"
5
+
3
6
  require_relative 'encoding'
4
7
 
5
8
  module REXML
9
+ if StringScanner::Version < "1.0.0"
10
+ module StringScannerCheckScanString
11
+ refine StringScanner do
12
+ def check(pattern)
13
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
14
+ super(pattern)
15
+ end
16
+
17
+ def scan(pattern)
18
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
19
+ super(pattern)
20
+ end
21
+ end
22
+ end
23
+ using StringScannerCheckScanString
24
+ end
25
+
6
26
  # Generates Source-s. USE THIS CLASS.
7
27
  class SourceFactory
8
28
  # Generates a Source object
@@ -30,18 +50,27 @@ module REXML
30
50
  # objects and provides consumption of text
31
51
  class Source
32
52
  include Encoding
33
- # The current buffer (what we're going to read next)
34
- attr_reader :buffer
35
53
  # The line number of the last consumed text
36
54
  attr_reader :line
37
55
  attr_reader :encoding
38
56
 
57
+ module Private
58
+ SCANNER_RESET_SIZE = 100000
59
+ PRE_DEFINED_TERM_PATTERNS = {}
60
+ pre_defined_terms = ["'", '"', "<"]
61
+ pre_defined_terms.each do |term|
62
+ PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
63
+ end
64
+ end
65
+ private_constant :Private
66
+
39
67
  # Constructor
40
68
  # @param arg must be a String, and should be a valid XML document
41
69
  # @param encoding if non-null, sets the encoding of the source to this
42
70
  # value, overriding all encoding detection
43
71
  def initialize(arg, encoding=nil)
44
- @orig = @buffer = arg
72
+ @orig = arg
73
+ @scanner = StringScanner.new(@orig)
45
74
  if encoding
46
75
  self.encoding = encoding
47
76
  else
@@ -50,6 +79,20 @@ module REXML
50
79
  @line = 0
51
80
  end
52
81
 
82
+ # The current buffer (what we're going to read next)
83
+ def buffer
84
+ @scanner.rest
85
+ end
86
+
87
+ def drop_parsed_content
88
+ if @scanner.pos > Private::SCANNER_RESET_SIZE
89
+ @scanner.string = @scanner.rest
90
+ end
91
+ end
92
+
93
+ def buffer_encoding=(encoding)
94
+ @scanner.string.force_encoding(encoding)
95
+ end
53
96
 
54
97
  # Inherited from Encoding
55
98
  # Overridden to support optimized en/decoding
@@ -58,98 +101,78 @@ module REXML
58
101
  encoding_updated
59
102
  end
60
103
 
61
- # Scans the source for a given pattern. Note, that this is not your
62
- # usual scan() method. For one thing, the pattern argument has some
63
- # requirements; for another, the source can be consumed. You can easily
64
- # confuse this method. Originally, the patterns were easier
65
- # to construct and this method more robust, because this method
66
- # generated search regexps on the fly; however, this was
67
- # computationally expensive and slowed down the entire REXML package
68
- # considerably, since this is by far the most commonly called method.
69
- # @param pattern must be a Regexp, and must be in the form of
70
- # /^\s*(#{your pattern, with no groups})(.*)/. The first group
71
- # will be returned; the second group is used if the consume flag is
72
- # set.
73
- # @param consume if true, the pattern returned will be consumed, leaving
74
- # everything after it in the Source.
75
- # @return the pattern, if found, or nil if the Source is empty or the
76
- # pattern is not found.
77
- def scan(pattern, cons=false)
78
- return nil if @buffer.nil?
79
- rv = @buffer.scan(pattern)
80
- @buffer = $' if cons and rv.size>0
81
- rv
104
+ def read(term = nil)
82
105
  end
83
106
 
84
- def read
107
+ def read_until(term)
108
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
109
+ data = @scanner.scan_until(pattern)
110
+ unless data
111
+ data = @scanner.rest
112
+ @scanner.pos = @scanner.string.bytesize
113
+ end
114
+ data
85
115
  end
86
116
 
87
- def consume( pattern )
88
- @buffer = $' if pattern.match( @buffer )
117
+ def ensure_buffer
89
118
  end
90
119
 
91
- def match_to( char, pattern )
92
- return pattern.match(@buffer)
120
+ def match(pattern, cons=false)
121
+ if cons
122
+ @scanner.scan(pattern).nil? ? nil : @scanner
123
+ else
124
+ @scanner.check(pattern).nil? ? nil : @scanner
125
+ end
93
126
  end
94
127
 
95
- def match_to_consume( char, pattern )
96
- md = pattern.match(@buffer)
97
- @buffer = $'
98
- return md
128
+ def position
129
+ @scanner.pos
99
130
  end
100
131
 
101
- def match(pattern, cons=false)
102
- md = pattern.match(@buffer)
103
- @buffer = $' if cons and md
104
- return md
132
+ def position=(pos)
133
+ @scanner.pos = pos
105
134
  end
106
135
 
107
136
  # @return true if the Source is exhausted
108
137
  def empty?
109
- @buffer == ""
110
- end
111
-
112
- def position
113
- @orig.index( @buffer )
138
+ @scanner.eos?
114
139
  end
115
140
 
116
141
  # @return the current line in the source
117
142
  def current_line
118
143
  lines = @orig.split
119
- res = lines.grep @buffer[0..30]
144
+ res = lines.grep @scanner.rest[0..30]
120
145
  res = res[-1] if res.kind_of? Array
121
146
  lines.index( res ) if res
122
147
  end
123
148
 
124
149
  private
150
+
125
151
  def detect_encoding
126
- buffer_encoding = @buffer.encoding
152
+ scanner_encoding = @scanner.rest.encoding
127
153
  detected_encoding = "UTF-8"
128
154
  begin
129
- @buffer.force_encoding("ASCII-8BIT")
130
- if @buffer[0, 2] == "\xfe\xff"
131
- @buffer[0, 2] = ""
155
+ @scanner.string.force_encoding("ASCII-8BIT")
156
+ if @scanner.scan(/\xfe\xff/n)
132
157
  detected_encoding = "UTF-16BE"
133
- elsif @buffer[0, 2] == "\xff\xfe"
134
- @buffer[0, 2] = ""
158
+ elsif @scanner.scan(/\xff\xfe/n)
135
159
  detected_encoding = "UTF-16LE"
136
- elsif @buffer[0, 3] == "\xef\xbb\xbf"
137
- @buffer[0, 3] = ""
160
+ elsif @scanner.scan(/\xef\xbb\xbf/n)
138
161
  detected_encoding = "UTF-8"
139
162
  end
140
163
  ensure
141
- @buffer.force_encoding(buffer_encoding)
164
+ @scanner.string.force_encoding(scanner_encoding)
142
165
  end
143
166
  self.encoding = detected_encoding
144
167
  end
145
168
 
146
169
  def encoding_updated
147
170
  if @encoding != 'UTF-8'
148
- @buffer = decode(@buffer)
171
+ @scanner.string = decode(@scanner.rest)
149
172
  @to_utf = true
150
173
  else
151
174
  @to_utf = false
152
- @buffer.force_encoding ::Encoding::UTF_8
175
+ @scanner.string.force_encoding(::Encoding::UTF_8)
153
176
  end
154
177
  end
155
178
  end
@@ -172,7 +195,7 @@ module REXML
172
195
  end
173
196
 
174
197
  if !@to_utf and
175
- @buffer.respond_to?(:force_encoding) and
198
+ @orig.respond_to?(:force_encoding) and
176
199
  @source.respond_to?(:external_encoding) and
177
200
  @source.external_encoding != ::Encoding::UTF_8
178
201
  @force_utf8 = true
@@ -181,65 +204,72 @@ module REXML
181
204
  end
182
205
  end
183
206
 
184
- def scan(pattern, cons=false)
185
- rv = super
186
- # You'll notice that this next section is very similar to the same
187
- # section in match(), but just a liiittle different. This is
188
- # because it is a touch faster to do it this way with scan()
189
- # than the way match() does it; enough faster to warrant duplicating
190
- # some code
191
- if rv.size == 0
192
- until @buffer =~ pattern or @source.nil?
193
- begin
194
- @buffer << readline
195
- rescue Iconv::IllegalSequence
196
- raise
197
- rescue
198
- @source = nil
207
+ def read(term = nil, min_bytes = 1)
208
+ term = encode(term) if term
209
+ begin
210
+ str = readline(term)
211
+ @scanner << str
212
+ read_bytes = str.bytesize
213
+ begin
214
+ while read_bytes < min_bytes
215
+ str = readline(term)
216
+ @scanner << str
217
+ read_bytes += str.bytesize
199
218
  end
219
+ rescue IOError
200
220
  end
201
- rv = super
221
+ true
222
+ rescue Exception, NameError
223
+ @source = nil
224
+ false
202
225
  end
203
- rv.taint if RUBY_VERSION < '2.7'
204
- rv
205
226
  end
206
227
 
207
- def read
208
- begin
209
- @buffer << readline
210
- rescue Exception, NameError
211
- @source = nil
228
+ def read_until(term)
229
+ pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
230
+ term = encode(term)
231
+ until str = @scanner.scan_until(pattern)
232
+ break if @source.nil?
233
+ break if @source.eof?
234
+ @scanner << readline(term)
235
+ end
236
+ if str
237
+ read if @scanner.eos? and !@source.eof?
238
+ str
239
+ else
240
+ rest = @scanner.rest
241
+ @scanner.pos = @scanner.string.bytesize
242
+ rest
212
243
  end
213
244
  end
214
245
 
215
- def consume( pattern )
216
- match( pattern, true )
246
+ def ensure_buffer
247
+ read if @scanner.eos? && @source
217
248
  end
218
249
 
219
250
  def match( pattern, cons=false )
220
- rv = pattern.match(@buffer)
221
- @buffer = $' if cons and rv
222
- while !rv and @source
223
- begin
224
- @buffer << readline
225
- rv = pattern.match(@buffer)
226
- @buffer = $' if cons and rv
227
- rescue
228
- @source = nil
251
+ # To avoid performance issue, we need to increase bytes to read per scan
252
+ min_bytes = 1
253
+ while true
254
+ if cons
255
+ md = @scanner.scan(pattern)
256
+ else
257
+ md = @scanner.check(pattern)
229
258
  end
259
+ break if md
260
+ return nil if pattern.is_a?(String)
261
+ return nil if @source.nil?
262
+ return nil unless read(nil, min_bytes)
263
+ min_bytes *= 2
230
264
  end
231
- rv.taint if RUBY_VERSION < '2.7'
232
- rv
265
+
266
+ md.nil? ? nil : @scanner
233
267
  end
234
268
 
235
269
  def empty?
236
270
  super and ( @source.nil? || @source.eof? )
237
271
  end
238
272
 
239
- def position
240
- @er_source.pos rescue 0
241
- end
242
-
243
273
  # @return the current line in the source
244
274
  def current_line
245
275
  begin
@@ -263,8 +293,8 @@ module REXML
263
293
  end
264
294
 
265
295
  private
266
- def readline
267
- str = @source.readline(@line_break)
296
+ def readline(term = nil)
297
+ str = @source.readline(term || @line_break)
268
298
  if @pending_buffer
269
299
  if str.nil?
270
300
  str = @pending_buffer
@@ -290,7 +320,7 @@ module REXML
290
320
  @source.set_encoding(@encoding, @encoding)
291
321
  end
292
322
  @line_break = encode(">")
293
- @pending_buffer, @buffer = @buffer, ""
323
+ @pending_buffer, @scanner.string = @scanner.rest, ""
294
324
  @pending_buffer.force_encoding(@encoding)
295
325
  super
296
326
  end
data/lib/rexml/text.rb CHANGED
@@ -1,4 +1,4 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require_relative 'security'
3
3
  require_relative 'entity'
4
4
  require_relative 'doctype'
@@ -131,7 +131,7 @@ module REXML
131
131
  def Text.check string, pattern, doctype
132
132
 
133
133
  # illegal anywhere
134
- if string !~ VALID_XML_CHARS
134
+ if !string.match?(VALID_XML_CHARS)
135
135
  if String.method_defined? :encode
136
136
  string.chars.each do |c|
137
137
  case c.ord
@@ -151,25 +151,45 @@ module REXML
151
151
  end
152
152
  end
153
153
 
154
- # context sensitive
155
- string.scan(pattern) do
156
- if $1[-1] != ?;
157
- raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
158
- elsif $1[0] == ?&
159
- if $5 and $5[0] == ?#
160
- case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
161
- when *VALID_CHAR
154
+ pos = 0
155
+ while (index = string.index(/<|&/, pos))
156
+ if string[index] == "<"
157
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
158
+ end
159
+
160
+ unless (end_index = string.index(/[^\s];/, index + 1))
161
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
162
+ end
163
+
164
+ value = string[(index + 1)..end_index]
165
+ if /\s/.match?(value)
166
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
167
+ end
168
+
169
+ if value[0] == "#"
170
+ character_reference = value[1..-1]
171
+
172
+ unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference))
173
+ if character_reference[0] == "x" || character_reference[-1] == "x"
174
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
162
175
  else
163
- raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
176
+ raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
164
177
  end
165
- # FIXME: below can't work but this needs API change.
166
- # elsif @parent and $3 and !SUBSTITUTES.include?($1)
167
- # if !doctype or !doctype.entities.has_key?($3)
168
- # raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
169
- # end
170
178
  end
179
+
180
+ case (character_reference[0] == "x" ? character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i)
181
+ when *VALID_CHAR
182
+ else
183
+ raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
184
+ end
185
+ elsif !(/\A#{Entity::NAME}\z/um.match?(value))
186
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
171
187
  end
188
+
189
+ pos = end_index + 1
172
190
  end
191
+
192
+ string
173
193
  end
174
194
 
175
195
  def node_type
@@ -371,7 +391,7 @@ module REXML
371
391
  copy = input.to_s
372
392
  # Doing it like this rather than in a loop improves the speed
373
393
  #copy = copy.gsub( EREFERENCE, '&amp;' )
374
- copy = copy.gsub( "&", "&amp;" )
394
+ copy = copy.gsub( "&", "&amp;" ) if copy.include?("&")
375
395
  if doctype
376
396
  # Replace all ampersands that aren't part of an entity
377
397
  doctype.entities.each_value do |entity|
@@ -382,7 +402,9 @@ module REXML
382
402
  else
383
403
  # Replace all ampersands that aren't part of an entity
384
404
  DocType::DEFAULT_ENTITIES.each_value do |entity|
385
- copy = copy.gsub(entity.value, "&#{entity.name};" )
405
+ if copy.include?(entity.value)
406
+ copy = copy.gsub(entity.value, "&#{entity.name};" )
407
+ end
386
408
  end
387
409
  end
388
410
  copy
@@ -590,6 +590,7 @@ module REXML
590
590
 
591
591
  def evaluate_predicate(expression, nodesets)
592
592
  enter(:predicate, expression, nodesets) if @debug
593
+ new_nodeset_count = 0
593
594
  new_nodesets = nodesets.collect do |nodeset|
594
595
  new_nodeset = []
595
596
  subcontext = { :size => nodeset.size }
@@ -606,17 +607,20 @@ module REXML
606
607
  result = result[0] if result.kind_of? Array and result.length == 1
607
608
  if result.kind_of? Numeric
608
609
  if result == node.position
609
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
610
+ new_nodeset_count += 1
611
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
610
612
  end
611
613
  elsif result.instance_of? Array
612
614
  if result.size > 0 and result.inject(false) {|k,s| s or k}
613
615
  if result.size > 0
614
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
616
+ new_nodeset_count += 1
617
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
615
618
  end
616
619
  end
617
620
  else
618
621
  if result
619
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
622
+ new_nodeset_count += 1
623
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
620
624
  end
621
625
  end
622
626
  end
metadata CHANGED
@@ -1,51 +1,22 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rexml
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.2.5
4
+ version: 3.3.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kouhei Sutou
8
- autorequire:
9
- bindir: exe
8
+ bindir: bin
10
9
  cert_chain: []
11
- date: 2021-04-05 00:00:00.000000000 Z
10
+ date: 2024-08-22 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
- name: bundler
13
+ name: strscan
15
14
  requirement: !ruby/object:Gem::Requirement
16
15
  requirements:
17
16
  - - ">="
18
17
  - !ruby/object:Gem::Version
19
18
  version: '0'
20
- type: :development
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- version: '0'
27
- - !ruby/object:Gem::Dependency
28
- name: rake
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :development
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: test-unit
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ">="
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :development
19
+ type: :runtime
49
20
  prerelease: false
50
21
  version_requirements: !ruby/object:Gem::Requirement
51
22
  requirements:
@@ -73,6 +44,7 @@ extra_rdoc_files:
73
44
  - doc/rexml/tasks/tocs/master_toc.rdoc
74
45
  - doc/rexml/tasks/tocs/node_toc.rdoc
75
46
  - doc/rexml/tasks/tocs/parent_toc.rdoc
47
+ - doc/rexml/tutorial.rdoc
76
48
  files:
77
49
  - LICENSE.txt
78
50
  - NEWS.md
@@ -89,6 +61,7 @@ files:
89
61
  - doc/rexml/tasks/tocs/master_toc.rdoc
90
62
  - doc/rexml/tasks/tocs/node_toc.rdoc
91
63
  - doc/rexml/tasks/tocs/parent_toc.rdoc
64
+ - doc/rexml/tutorial.rdoc
92
65
  - lib/rexml.rb
93
66
  - lib/rexml/attlistdecl.rb
94
67
  - lib/rexml/attribute.rb
@@ -142,8 +115,8 @@ files:
142
115
  homepage: https://github.com/ruby/rexml
143
116
  licenses:
144
117
  - BSD-2-Clause
145
- metadata: {}
146
- post_install_message:
118
+ metadata:
119
+ changelog_uri: https://github.com/ruby/rexml/releases/tag/v3.3.6
147
120
  rdoc_options:
148
121
  - "--main"
149
122
  - README.md
@@ -153,15 +126,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
153
126
  requirements:
154
127
  - - ">="
155
128
  - !ruby/object:Gem::Version
156
- version: '0'
129
+ version: 2.5.0
157
130
  required_rubygems_version: !ruby/object:Gem::Requirement
158
131
  requirements:
159
132
  - - ">="
160
133
  - !ruby/object:Gem::Version
161
134
  version: '0'
162
135
  requirements: []
163
- rubygems_version: 3.2.3
164
- signing_key:
136
+ rubygems_version: 3.6.0.dev
165
137
  specification_version: 4
166
138
  summary: An XML toolkit for Ruby
167
139
  test_files: []