moxml 0.1.21 → 0.1.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/opal.yml +37 -0
  3. data/.rspec-opal +5 -0
  4. data/Gemfile +6 -0
  5. data/Rakefile +67 -0
  6. data/lib/compat/opal/rexml/namespace.rb +56 -0
  7. data/lib/compat/opal/rexml/parsers/baseparser.rb +952 -0
  8. data/lib/compat/opal/rexml/source.rb +213 -0
  9. data/lib/compat/opal/rexml/text.rb +418 -0
  10. data/lib/compat/opal/rexml/xmltokens.rb +45 -0
  11. data/lib/compat/opal/rexml_compat.rb +76 -0
  12. data/lib/moxml/adapter/customized_rexml/formatter.rb +11 -10
  13. data/lib/moxml/adapter/headed_ox.rb +2 -6
  14. data/lib/moxml/adapter/libxml.rb +5 -20
  15. data/lib/moxml/adapter/nokogiri.rb +7 -18
  16. data/lib/moxml/adapter/oga.rb +4 -22
  17. data/lib/moxml/adapter/ox.rb +8 -23
  18. data/lib/moxml/adapter/rexml.rb +29 -33
  19. data/lib/moxml/adapter.rb +38 -8
  20. data/lib/moxml/config.rb +1 -1
  21. data/lib/moxml/entity_registry.rb +36 -31
  22. data/lib/moxml/entity_registry_opal_data.rb +2137 -0
  23. data/lib/moxml/node.rb +19 -26
  24. data/lib/moxml/sax/namespace_splitter.rb +54 -0
  25. data/lib/moxml/version.rb +1 -1
  26. data/lib/moxml/xml_utils.rb +9 -1
  27. data/spec/consistency/adapter_parity_spec.rb +1 -1
  28. data/spec/integration/all_adapters_spec.rb +1 -1
  29. data/spec/integration/w3c_namespace_spec.rb +1 -1
  30. data/spec/moxml/adapter/ox_spec.rb +8 -0
  31. data/spec/moxml/adapter/platform_spec.rb +69 -0
  32. data/spec/moxml/adapter/shared_examples/adapter_contract.rb +0 -6
  33. data/spec/moxml/entity_registry_spec.rb +10 -0
  34. data/spec/moxml/native_attachment/opal_spec.rb +39 -2
  35. data/spec/moxml/node_type_map_spec.rb +43 -0
  36. data/spec/moxml/opal_rexml_adapter_spec.rb +14 -0
  37. data/spec/moxml/opal_smoke_spec.rb +61 -0
  38. data/spec/moxml/sax/namespace_splitter_spec.rb +67 -0
  39. data/spec/moxml/text_spec.rb +1 -1
  40. data/spec/spec_helper.rb +32 -13
  41. data/spec/support/opal.rb +16 -0
  42. metadata +17 -1
@@ -0,0 +1,213 @@
1
+ # frozen_string_literal: false
2
+
3
+ require "stringio"
4
+ require "strscan"
5
+
6
+ require 'rexml/encoding'
7
+
8
+ module REXML
9
# Compatibility shim, active only when StringScanner::Version is below 1.0.0:
# String patterns handed to check/scan/match?/skip are converted to an escaped
# Regexp before being forwarded to the stock implementation.
if defined?(StringScanner::Version) && StringScanner::Version < "1.0.0"
  module StringScannerCheckScanString
    refine StringScanner do
      def check(pattern)
        return super(pattern) unless pattern.is_a?(String)

        super(Regexp.new(Regexp.escape(pattern)))
      end

      def scan(pattern)
        return super(pattern) unless pattern.is_a?(String)

        super(Regexp.new(Regexp.escape(pattern)))
      end

      def match?(pattern)
        return super(pattern) unless pattern.is_a?(String)

        super(Regexp.new(Regexp.escape(pattern)))
      end

      def skip(pattern)
        return super(pattern) unless pattern.is_a?(String)

        super(Regexp.new(Regexp.escape(pattern)))
      end
    end
  end
  using StringScannerCheckScanString
end
35
+
36
# Picks the REXML source wrapper that fits the kind of input supplied.
class SourceFactory
  # Returns a source object for +arg+.
  #
  # * IO-like objects (responding to read, readline, nil? and eof?) and
  #   String-like objects (responding to to_str) are wrapped in an IOSource,
  #   or in a full-string Source when running under Opal.
  # * An existing Source is returned untouched.
  #
  # Raises RuntimeError for anything else.
  def self.create_from(arg)
    on_opal = RUBY_ENGINE == "opal"
    io_like = %i[read readline nil? eof?].all? { |name| arg.respond_to?(name) }

    if io_like
      # Opal's StringScanner lacks <<, so use Source (full-string) instead
      # of IOSource (streaming). Read everything upfront.
      on_opal ? Source.new(arg.read, nil) : IOSource.new(arg)
    elsif arg.respond_to?(:to_str)
      on_opal ? Source.new(arg, nil) : IOSource.new(StringIO.new(arg))
    elsif arg.kind_of?(Source)
      arg
    else
      raise "#{arg.class} is not a valid input stream. It must walk \n" +
            "like either a String, an IO, or a Source."
    end
  end
end
63
+
64
+ class Source
65
+ include Encoding
66
+ attr_reader :line
67
+ attr_reader :encoding
68
+
69
# Internal scanner constants: the whitespace pattern, the scanner position
# threshold used by drop_parsed_content, and precompiled patterns for the
# most common terminators.
module Private
  SPACES_PATTERN = /\s+/
  SCANNER_RESET_SIZE = 100_000
  # Opal's StringScanner requires RegExp objects, not strings, so each
  # terminator is compiled to an escaped Regexp once, keyed by its literal.
  PRE_DEFINED_TERM_PATTERNS =
    ["'", '"', "<", "]]>", "?>"].each_with_object({}) do |term, patterns|
      patterns[term] = /#{Regexp.escape(term)}/
    end
end
79
+ private_constant :Private
80
+
81
+ def initialize(arg, encoding=nil)
82
+ @orig = arg
83
+ @scanner = StringScanner.new(@orig)
84
+ if encoding
85
+ self.encoding = encoding
86
+ else
87
+ detect_encoding
88
+ end
89
+ @line = 0
90
+ @encoded_terms = {}
91
+ end
92
+
93
+ def buffer
94
+ @scanner.rest
95
+ end
96
+
97
+ def drop_parsed_content
98
+ if @scanner.pos > Private::SCANNER_RESET_SIZE
99
+ @scanner = StringScanner.new(@scanner.rest)
100
+ end
101
+ end
102
+
103
+ def buffer_encoding=(encoding)
104
+ # no-op under Opal (no Encoding support)
105
+ end
106
+
107
+ def encoding=(enc)
108
+ return unless super
109
+ encoding_updated
110
+ end
111
+
112
+ def read(term = nil)
113
+ end
114
+
115
# Consumes and returns everything up to and including the next occurrence
# of +term+.
#
# When +term+ does not occur in the remaining input, the whole remainder is
# returned and the scanner is moved to the end of its string.
def read_until(term)
  pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
  data = @scanner.scan_until(pattern)
  return data if data

  remainder = @scanner.rest
  # Use terminate rather than `pos = string.bytesize`: bytesize is a UTF-8
  # byte count, which only equals the end position when scanner positions are
  # byte offsets. NOTE(review): presumably Opal's scanner positions are
  # character offsets, making bytesize overshoot on non-ASCII input - confirm.
  @scanner.terminate
  remainder
end
124
+
125
+ def ensure_buffer
126
+ end
127
+
128
# Tries +pattern+ at the current scanner position. String patterns are
# matched literally. When +cons+ is true a successful match is consumed,
# otherwise the position is left untouched.
#
# Returns the underlying scanner on success, nil otherwise.
def match(pattern, cons=false)
  regexp = pattern.is_a?(String) ? Regexp.new(Regexp.escape(pattern)) : pattern
  hit = cons ? @scanner.scan(regexp) : @scanner.check(regexp)
  @scanner unless hit.nil?
end
136
+
137
# Reports whether +pattern+ matches at the current scanner position.
# String patterns are matched literally. Only the next 4096 characters
# returned by the scanner's peek are examined. When +cons+ is true the
# scanner position is advanced past the matched text.
#
# NOTE(review): the advance uses the match's character length; this assumes
# scanner positions are character-based - confirm for the target engine.
def match?(pattern, cons=false)
  regexp = pattern.is_a?(String) ? Regexp.new(Regexp.escape(pattern)) : pattern
  lookahead = @scanner.peek(4096)
  return false if lookahead.empty?

  found = regexp.match(lookahead)
  # A hit that starts later in the window is not a match at this position.
  return false if found.nil? || found.begin(0) != 0

  @scanner.pos += found[0].length if cons
  true
end
146
+
147
+ def skip_spaces
148
+ @scanner.skip(Private::SPACES_PATTERN) ? true : false
149
+ end
150
+
151
+ def position
152
+ @scanner.pos
153
+ end
154
+
155
+ def position=(pos)
156
+ @scanner.pos = pos
157
+ end
158
+
159
+ def peek_byte
160
+ @scanner.peek_byte
161
+ end
162
+
163
+ def scan_byte
164
+ @scanner.scan_byte
165
+ end
166
+
167
+ def empty?
168
+ @scanner.eos?
169
+ end
170
+
171
# Returns the index, within the whitespace-separated tokens of the original
# input, of the last token that equals the first 31 characters of the
# unparsed remainder; nil when no token matches.
#
# NOTE(review): despite the name, the input is split on whitespace rather
# than on line breaks - confirm this is what callers expect.
def current_line
  tokens = @orig.split
  upcoming = @scanner.rest[0..30]
  hit = tokens.grep(upcoming).last
  tokens.index(hit) if hit
end
177
+
178
+ private
179
+
180
if RUBY_ENGINE == "opal"
  # Under Opal the encoding is always set to UTF-8 without inspecting input.
  def detect_encoding
    self.encoding = "UTF-8"
  end
else
  # Looks for a byte-order mark at the scanner position, consumes it when
  # present, and assigns the matching encoding (UTF-8 when none is found).
  def detect_encoding
    original_encoding = @scanner.rest.encoding
    bom_match = nil
    begin
      # Compare raw bytes; the string's own encoding is restored afterwards.
      @scanner.string.force_encoding("ASCII-8BIT")
      bom_match = [
        [/\xfe\xff/n, "UTF-16BE"],
        [/\xff\xfe/n, "UTF-16LE"],
        [/\xef\xbb\xbf/n, "UTF-8"],
      ].find { |bom, _name| @scanner.scan(bom) }
    ensure
      @scanner.string.force_encoding(original_encoding)
    end
    self.encoding = bom_match ? bom_match.last : "UTF-8"
  end
end
203
+
204
# Reacts to a change of @encoding: for anything other than UTF-8 the unparsed
# remainder is passed through decode and scanning restarts on the result.
# @to_utf records whether that conversion took place.
def encoding_updated
  needs_decoding = @encoding != 'UTF-8'
  @scanner = StringScanner.new(decode(@scanner.rest)) if needs_decoding
  @to_utf = needs_decoding
end
212
+ end
213
+ end
@@ -0,0 +1,418 @@
1
+ # frozen_string_literal: true
2
+ require 'rexml/security'
3
+ require 'rexml/entity'
4
+ require 'rexml/doctype'
5
+ require 'rexml/child'
6
+ require 'rexml/parseexception'
7
+
8
+ module REXML
9
+ # Represents text nodes in an XML document
10
+ class Text < Child
11
+ include Comparable
12
+ # The order in which the substitutions occur
13
+ SPECIALS = [ /&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u ]
14
+ SUBSTITUTES = ['&amp;', '&lt;', '&gt;', '&quot;', '&apos;', '&#13;']
15
+ # Characters which are substituted in written strings
16
+ SLAICEPS = [ '<', '>', '"', "'", '&' ]
17
+ SETUTITSBUS = [ /&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u ]
18
+
19
+ # If +raw+ is true, then REXML leaves the value alone
20
+ attr_accessor :raw
21
+
22
+ NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
23
+ NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
24
+ # BMP-only: above-BMP ranges removed for JavaScript regex compatibility.
25
+ VALID_CHAR = [
26
+ 0x9, 0xA, 0xD,
27
+ (0x20..0xD7FF),
28
+ (0xE000..0xFFFD),
29
+ ]
30
+
31
+ VALID_XML_CHARS = Regexp.new('^['+
32
+ VALID_CHAR.map { |item|
33
+ case item
34
+ when Integer
35
+ [item].pack('U')
36
+ when Range
37
+ [item.first, '-'.ord, item.last].pack('UUU')
38
+ end
39
+ }.join +
40
+ ']*$')
41
+
42
+ # Constructor
43
+ # +arg+ if a String, the content is set to the String. If a Text,
44
+ # the object is shallowly cloned.
45
+ #
46
+ # +respect_whitespace+ (boolean, false) if true, whitespace is
47
+ # respected
48
+ #
49
+ # +parent+ (nil) if this is a Parent object, the parent
50
+ # will be set to this.
51
+ #
52
+ # +raw+ (nil) This argument can be given three values.
53
+ # If true, then the value of used to construct this object is expected to
54
+ # contain no unescaped XML markup, and REXML will not change the text. If
55
+ # this value is false, the string may contain any characters, and REXML will
56
+ # escape any and all defined entities whose values are contained in the
57
+ # text. If this value is nil (the default), then the raw value of the
58
+ # parent will be used as the raw value for this node. If there is no raw
59
+ # value for the parent, and no value is supplied, the default is false.
60
+ # Use this field if you have entities defined for some text, and you don't
61
+ # want REXML to escape that text in output.
62
+ # Text.new( "<&", false, nil, false ) #-> "&lt;&amp;"
63
+ # Text.new( "&lt;&amp;", false, nil, false ) #-> "&amp;lt;&amp;amp;"
64
+ # Text.new( "<&", false, nil, true ) #-> Parse exception
65
+ # Text.new( "&lt;&amp;", false, nil, true ) #-> "&lt;&amp;"
66
+ # # Assume that the entity "s" is defined to be "sean"
67
+ # # and that the entity "r" is defined to be "russell"
68
+ # Text.new( "sean russell" ) #-> "&s; &r;"
69
+ # Text.new( "sean russell", false, nil, true ) #-> "sean russell"
70
+ #
71
+ # +entity_filter+ (nil) This can be an array of entities to match in the
72
+ # supplied text. This argument is only useful if +raw+ is set to false.
73
+ # Text.new( "sean russell", false, nil, false, ["s"] ) #-> "&s; russell"
74
+ # Text.new( "sean russell", false, nil, true, ["s"] ) #-> "sean russell"
75
+ # In the last example, the +entity_filter+ argument is ignored.
76
+ #
77
+ # +illegal+ INTERNAL USE ONLY
78
def initialize(arg, respect_whitespace=false, parent=nil, raw=nil,
               entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK )

  @raw = false
  @parent = nil
  @entity_filter = nil

  # A parent, when given, supplies the default raw mode.
  if parent
    super( parent )
    @raw = parent.raw
  end

  if arg.kind_of? String
    @string = arg.dup
  elsif arg.kind_of? Text
    # Copy the content, raw flag and entity filter of the other Text.
    @string = arg.instance_variable_get(:@string).dup
    @raw = arg.raw
    @entity_filter = arg.instance_variable_get(:@entity_filter)
  else
    # Report the class via #class: Object#type no longer exists, so calling it
    # raised NoMethodError and hid this message.
    raise "Illegal argument of type #{arg.class} for Text constructor (#{arg})"
  end

  # Non-mutating forms are used so the node never alters a shared string.
  @string = @string.squeeze(" \n\t") unless respect_whitespace
  @string = @string.gsub(/\r\n?/, "\n")
  # Explicit arguments win over values inherited from parent or source Text.
  @raw = raw unless raw.nil?
  @entity_filter = entity_filter if entity_filter
  clear_cache

  # Raw text is emitted verbatim, so it must already be valid XML text.
  Text.check(@string, illegal) if @raw
end
108
+
109
+ def parent= parent
110
+ super(parent)
111
+ Text.check(@string, NEEDS_A_SECOND_CHECK) if @raw and @parent
112
+ end
113
+
114
# Checks that +string+ is acceptable as raw (already escaped) XML text and
# returns it unchanged.
#
# Raises RuntimeError when the string contains a character outside the XML
# character ranges, a literal "<", or an "&" that does not start a
# well-formed entity or character reference.
#
# +pattern+ and +doctype+ are accepted but not consulted by this
# implementation.
def self.check(string, pattern, doctype = nil)
  # illegal anywhere - avoid VALID_XML_CHARS regex on uncontrolled data.
  # The supplementary planes (U+10000..U+10FFFF) are valid XML characters and
  # must be accepted as well.
  # NOTE(review): on an engine whose each_char yields UTF-16 code units,
  # supplementary characters would still arrive as surrogates - confirm.
  string.each_char do |c|
    code = c.ord
    next if code == 0x9 || code == 0xA || code == 0xD ||
            (code >= 0x20 && code <= 0xD7FF) ||
            (code >= 0xE000 && code <= 0xFFFD) ||
            (code >= 0x10000 && code <= 0x10FFFF)

    raise "Illegal character #{c.inspect} in raw string #{string.inspect}"
  end

  pos = 0
  while (index = string.index(/<|&/, pos))
    if string[index] == "<"
      raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
    end

    # end_index is the last character before the terminating ";".
    unless (end_index = string.index(/[^\s];/, index + 1))
      raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
    end

    value = string[(index + 1)..end_index]
    if /\s/.match?(value)
      raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
    end

    if value[0] == "#"
      character_reference = value[1..-1]

      unless (/^(\d+|x[0-9a-fA-F]+)$/.match?(character_reference))
        if character_reference[0] == "x" || character_reference[-1] == "x"
          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
        else
          raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
        end
      end

      code_point = if character_reference[0] == "x"
                     character_reference[1..-1].to_i(16)
                   else
                     character_reference.to_i
                   end
      # VALID_CHAR only lists BMP ranges; references into the supplementary
      # planes are valid too.
      referenced_ok = VALID_CHAR.any? { |allowed| allowed === code_point } ||
                      (0x10000..0x10FFFF).cover?(code_point)
      unless referenced_ok
        raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
      end
    elsif !(/^#{Entity::NAME}$/um.match?(value))
      raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
    end

    pos = end_index + 1
  end

  string
end
167
+
168
+ def node_type
169
+ :text
170
+ end
171
+
172
+ def empty?
173
+ @string.size==0
174
+ end
175
+
176
+
177
+ def clone
178
+ Text.new(self, true)
179
+ end
180
+
181
+
182
# Appends text to this text node. The text is appended in the +raw+ mode
# of this text node. Carriage returns in +to_append+ are normalized to "\n".
#
# +returns+ the text itself to enable method chain like
# 'text << "XXX" << "YYY"'.
def <<( to_append )
  # Build a new String instead of mutating @string in place: in-place
  # appends fail on frozen strings (and Opal strings are immutable), and the
  # constructor already uses non-mutating operations for the same reason.
  @string = @string + to_append.gsub( /\r\n?/, "\n" )
  clear_cache
  self
end
192
+
193
+
194
+ # +other+ a String or a Text
195
+ # +returns+ the result of (to_s <=> arg.to_s)
196
+ def <=>( other )
197
+ to_s() <=> other.to_s
198
+ end
199
+
200
+ def doctype
201
+ @parent&.document&.doctype
202
+ end
203
+
204
+ REFERENCE = /#{Entity::REFERENCE}/
205
+ # Returns the string value of this text node. This string is always
206
+ # escaped, meaning that it is a valid XML text node string, and all
207
+ # entities that can be escaped, have been inserted. This method respects
208
+ # the entity filter set in the constructor.
209
+ #
210
+ # # Assume that the entity "s" is defined to be "sean", and that the
211
+ # # entity "r" is defined to be "russell"
212
+ # t = Text.new( "< & sean russell", false, nil, false, ['s'] )
213
+ # t.to_s #-> "&lt; &amp; &s; russell"
214
+ # t = Text.new( "< & &s; russell", false, nil, false )
215
+ # t.to_s #-> "&lt; &amp; &s; russell"
216
+ # u = Text.new( "sean russell", false, nil, true )
217
+ # u.to_s #-> "sean russell"
218
+ def to_s
219
+ return @string if @raw
220
+ @normalized ||= Text::normalize( @string, doctype, @entity_filter )
221
+ end
222
+
223
+ def inspect
224
+ @string.inspect
225
+ end
226
+
227
+ # Returns the string value of this text. This is the text without
228
+ # entities, as it might be used programmatically, or printed to the
229
+ # console. This ignores the 'raw' attribute setting, and any
230
+ # entity_filter.
231
+ #
232
+ # # Assume that the entity "s" is defined to be "sean", and that the
233
+ # # entity "r" is defined to be "russell"
234
+ # t = Text.new( "< & sean russell", false, nil, false, ['s'] )
235
+ # t.value #-> "< & sean russell"
236
+ # t = Text.new( "< & &s; russell", false, nil, false )
237
+ # t.value #-> "< & sean russell"
238
+ # u = Text.new( "sean russell", false, nil, true )
239
+ # u.value #-> "sean russell"
240
+ def value
241
+ @unnormalized ||= Text::unnormalize(@string, doctype,
242
+ entity_expansion_text_limit: document&.entity_expansion_text_limit)
243
+ end
244
+
245
+ # Sets the contents of this text node. This expects the text to be
246
+ # unnormalized. It returns self.
247
+ #
248
+ # e = Element.new( "a" )
249
+ # e.add_text( "foo" ) # <a>foo</a>
250
+ # e[0].value = "bar" # <a>bar</a>
251
+ # e[0].value = "<a>" # <a>&lt;a&gt;</a>
252
+ def value=( val )
253
+ @string = val.gsub( /\r\n?/, "\n" )
254
+ clear_cache
255
+ @raw = false
256
+ end
257
+
258
# Recursively breaks +string+ into lines by replacing spaces with "\n" so
# that each line is at most +width+ characters where a space allows it.
# A word longer than +width+ is kept intact and the break happens at the
# next space after it; a string with no usable space is returned as is.
#
# When +addnewline+ is true the result is prefixed with "\n" (only for the
# outermost call).
def wrap(string, width, addnewline=false)
  return string if string.length <= width

  # Last space at or before the cutoff; otherwise the first one after it.
  # Without the fallback, rindex returned nil and slicing raised TypeError.
  place = string.rindex(' ', width) || string.index(' ', width)
  return string unless place

  wrapped = string[0,place] + "\n" + wrap(string[place+1..-1], width)
  addnewline ? "\n" + wrapped : wrapped
end
268
+
269
# Deprecated (see REXML::Formatters). Prefixes every line of +string+ with
# +style+ repeated +level+ times, strips trailing whitespace (including the
# line break) from each line and concatenates the results. When
# +indentfirstline+ is false, surrounding whitespace is stripped from the
# final result. Returns +string+ itself when +level+ is negative.
def indent_text(string, level=1, style="\t", indentfirstline=true)
  Kernel.warn("#{self.class.name}#indent_text is deprecated. See REXML::Formatters", uplevel: 1)
  return string if level < 0

  indent_string = style * level
  # Collect the pieces and join them rather than appending to / stripping a
  # String in place, so no mutable String is needed (Opal strings are
  # immutable).
  pieces = []
  string.each_line { |line| pieces << (indent_string + line).rstrip }
  indented = pieces.join
  indentfirstline ? indented : indented.strip
end
282
+
283
+ # == DEPRECATED
284
+ # See REXML::Formatters
285
+ #
286
+ def write( writer, indent=-1, transitive=false, ie_hack=false )
287
+ Kernel.warn("#{self.class.name}#write is deprecated. See REXML::Formatters", uplevel: 1)
288
+ formatter = if indent > -1
289
+ REXML::Formatters::Pretty.new( indent )
290
+ else
291
+ REXML::Formatters::Default.new
292
+ end
293
+ formatter.write( self, writer )
294
+ end
295
+
296
+ # FIXME
297
+ # This probably won't work properly
298
+ def xpath
299
+ @parent.xpath + "/text()"
300
+ end
301
+
302
# Writes out text, substituting special characters beforehand.
# +out+ A String, IO, or any other object supporting <<( String )
# +input+ the text to substitute and then write out
#
#   z=utf8.unpack("U*")
#   ascOut=""
#   z.each{|r|
#     if r < 0x100
#       ascOut.concat(r.chr)
#     else
#       ascOut.concat(sprintf("&#x%x;", r))
#     end
#   }
#   puts ascOut
def write_with_substitution out, input
  # Apply the substitutions in SPECIALS order with non-mutating gsub: the
  # previous clone + gsub! raised on frozen input, because clone keeps the
  # frozen state.
  escaped = input
  SPECIALS.zip(SUBSTITUTES) do |special, substitute|
    escaped = escaped.gsub( special, substitute )
  end
  out << escaped
end
327
+
328
+ private
329
+ def clear_cache
330
+ @normalized = nil
331
+ @unnormalized = nil
332
+ end
333
+
334
# Reads text, substituting entities: returns a copy of +input+ with line
# endings normalized to "\n", the five predefined entities replaced by
# their characters, and numeric character references replaced by the
# characters they denote.
#
# +illegal+, when given, is a pattern; a ParseException is raised if
# +input+ matches it.
def self.read_with_substitution( input, illegal=nil )
  if illegal && input =~ illegal
    raise ParseException.new( "malformed text: Illegal character #$& in \"#{input}\"" )
  end

  # Non-mutating gsub throughout: the previous clone + gsub! raised on
  # frozen input, because clone keeps the frozen state.
  copy = input.gsub( /\r\n?/, "\n" )
  return copy unless copy.include?("&")

  SETUTITSBUS.zip(SLAICEPS) do |entity, char|
    copy = copy.gsub( entity, char )
  end
  # Hex digits are accepted in either case, matching NUMERICENTITY.
  copy.gsub( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
    digits = $1
    digits = "0#{digits}" if digits.start_with?("x")
    [Integer(digits)].pack('U*')
  }
end
358
+
359
+ EREFERENCE = /&(?!#{Entity::NAME};)/
360
# Escapes +input+ for use as XML text. Every "&" becomes "&amp;", then each
# occurrence of an entity's value is replaced by a reference to that entity.
# The entities come from +doctype+ when one is given (skipping those named
# in +entity_filter+ and those without a value), otherwise from
# DocType::DEFAULT_ENTITIES.
def self.normalize( input, doctype=nil, entity_filter=nil )
  escaped = input.to_s
  escaped = escaped.gsub( "&", "&amp;" ) if escaped.include?("&")

  unless doctype
    DocType::DEFAULT_ENTITIES.each_value do |entity|
      next unless escaped.include?(entity.value)

      escaped = escaped.gsub(entity.value, "&#{entity.name};" )
    end
    return escaped
  end

  doctype.entities.each_value do |entity|
    next unless entity.value
    next if entity_filter and entity_filter.include?(entity.name)

    escaped = escaped.gsub( entity.value, "&#{entity.name};" )
  end
  escaped
end
383
+
384
# Unescapes +string+: line endings are normalized to "\n" and every
# reference matched by REFERENCE is replaced with the result of Text.expand.
#
# The combined byte size of all expansions is limited by
# +entity_expansion_text_limit+ (defaulting to
# Security.entity_expansion_text_limit); exceeding it raises RuntimeError.
# +illegal+ is accepted but not consulted by this implementation.
def self.unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil )
  limit = entity_expansion_text_limit || Security.entity_expansion_text_limit
  expanded_bytes = 0
  string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) do
    replacement = Text.expand($&, doctype, filter)
    expanded_bytes += replacement.bytesize
    raise "entity expansion has grown too large" if expanded_bytes > limit

    replacement
  end
end
398
+
399
# Resolves a single reference +ref+ (including its leading "&" and trailing
# ";") to its replacement text.
#
# * Numeric references (decimal or "x"-prefixed hex) yield the character.
# * "&amp;" yields "&".
# * Names listed in +filter+ are returned unexpanded.
# * Otherwise the name is looked up in +doctype+ when given, else in
#   DocType::DEFAULT_ENTITIES; unknown names are returned unexpanded.
def self.expand(ref, doctype, filter)
  if ref[1] == ?#
    digits, base = ref[2] == ?x ? [ref[3...-1], 16] : [ref[2...-1], 10]
    return [digits.to_i(base)].pack('U*')
  end
  return '&' if ref == '&amp;'

  name = ref[1...-1]
  return ref if filter and filter.include?( name )
  return (doctype.entity( name ) || ref) if doctype

  default = DocType::DEFAULT_ENTITIES[ name ]
  default ? default.value : ref
end
417
+ end
418
+ end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: false
2
+
3
module REXML
  # Regexp source fragments (plain Strings) for XML names, name tokens and
  # references. Character ranges are written as \uXXXX escapes and cover the
  # Basic Multilingual Plane only.
  module XMLTokens
    # Members of the character class for the first character of a name.
    name_start_chars = [
      ':',
      'A-Z',
      '_',
      'a-z',
      '\u00C0-\u00D6',
      '\u00D8-\u00F6',
      '\u00F8-\u02FF',
      '\u0370-\u037D',
      '\u037F-\u1FFF',
      '\u200C-\u200D',
      '\u2070-\u218F',
      '\u2C00-\u2FEF',
      '\u3001-\uD7FF',
      '\uF900-\uFDCF',
      '\uFDF0-\uFFFD',
    ]

    # Members allowed after the first character, in addition to the above.
    name_chars = name_start_chars + [
      '\-',
      '\.',
      '0-9',
      '\u00B7',
      '\u0300-\u036F',
      '\u203F-\u2040',
    ]
    NAME_START_CHAR = "[#{name_start_chars.join}]"
    NAME_CHAR = "[#{name_chars.join}]"
    NAMECHAR = NAME_CHAR

    # Non-colonized names use the same classes minus the colon.
    ncname_start_chars = name_start_chars.reject { |member| member == ':' }
    ncname_chars = name_chars.reject { |member| member == ':' }
    NCNAME_STR = "[#{ncname_start_chars.join}][#{ncname_chars.join}]*"
    NAME_STR = "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"

    NAME = "(#{NAME_START_CHAR}#{NAME_CHAR}*)"
    NMTOKEN = "(?:#{NAME_CHAR})+"
    NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
    REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
  end
end