moxml 0.1.21 → 0.1.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/opal.yml +37 -0
  3. data/.gitignore +1 -0
  4. data/.rspec-opal +5 -0
  5. data/.rubocop.yml +1 -0
  6. data/.rubocop_todo.yml +680 -110
  7. data/Gemfile +6 -0
  8. data/Rakefile +70 -0
  9. data/lib/compat/opal/rexml/namespace.rb +59 -0
  10. data/lib/compat/opal/rexml/parsers/baseparser.rb +1016 -0
  11. data/lib/compat/opal/rexml/source.rb +214 -0
  12. data/lib/compat/opal/rexml/text.rb +426 -0
  13. data/lib/compat/opal/rexml/xmltokens.rb +45 -0
  14. data/lib/compat/opal/rexml_compat.rb +77 -0
  15. data/lib/moxml/adapter/customized_oga/xml_declaration.rb +8 -1
  16. data/lib/moxml/adapter/customized_rexml/formatter.rb +11 -10
  17. data/lib/moxml/adapter/headed_ox.rb +2 -6
  18. data/lib/moxml/adapter/libxml/entity_ref_registry.rb +4 -2
  19. data/lib/moxml/adapter/libxml/entity_restorer.rb +3 -1
  20. data/lib/moxml/adapter/libxml.rb +22 -24
  21. data/lib/moxml/adapter/nokogiri.rb +24 -33
  22. data/lib/moxml/adapter/oga.rb +47 -84
  23. data/lib/moxml/adapter/ox.rb +43 -41
  24. data/lib/moxml/adapter/rexml.rb +29 -33
  25. data/lib/moxml/adapter.rb +38 -8
  26. data/lib/moxml/config.rb +16 -3
  27. data/lib/moxml/document.rb +2 -8
  28. data/lib/moxml/entity_registry.rb +40 -31
  29. data/lib/moxml/entity_registry_opal_data.rb +2138 -0
  30. data/lib/moxml/node.rb +27 -26
  31. data/lib/moxml/sax/namespace_splitter.rb +54 -0
  32. data/lib/moxml/version.rb +1 -1
  33. data/lib/moxml/xml_utils.rb +10 -1
  34. data/lib/moxml.rb +7 -0
  35. data/spec/consistency/adapter_parity_spec.rb +1 -1
  36. data/spec/integration/all_adapters_spec.rb +2 -1
  37. data/spec/integration/shared_examples/line_ending_behavior.rb +56 -0
  38. data/spec/integration/w3c_namespace_spec.rb +1 -1
  39. data/spec/moxml/adapter/libxml_internals_spec.rb +4 -2
  40. data/spec/moxml/adapter/ox_spec.rb +8 -0
  41. data/spec/moxml/adapter/platform_spec.rb +70 -0
  42. data/spec/moxml/adapter/shared_examples/adapter_contract.rb +0 -6
  43. data/spec/moxml/config_spec.rb +33 -0
  44. data/spec/moxml/entity_registry_spec.rb +10 -0
  45. data/spec/moxml/native_attachment/opal_spec.rb +39 -2
  46. data/spec/moxml/node_type_map_spec.rb +43 -0
  47. data/spec/moxml/opal_rexml_adapter_spec.rb +14 -0
  48. data/spec/moxml/opal_smoke_spec.rb +61 -0
  49. data/spec/moxml/sax/namespace_splitter_spec.rb +67 -0
  50. data/spec/moxml/text_spec.rb +1 -1
  51. data/spec/spec_helper.rb +32 -13
  52. data/spec/support/opal.rb +16 -0
  53. metadata +19 -2
@@ -0,0 +1,214 @@
1
+ # frozen_string_literal: false
2
+
3
+ require "stringio"
4
+ require "strscan"
5
+
6
+ require "rexml/encoding"
7
+
8
+ module REXML
9
+ if defined?(StringScanner::Version) && StringScanner::Version < "1.0.0"
10
+ module StringScannerCheckScanString
11
+ refine StringScanner do
12
+ def check(pattern)
13
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
14
+ super
15
+ end
16
+
17
+ def scan(pattern)
18
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
19
+ super
20
+ end
21
+
22
+ def match?(pattern)
23
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
24
+ super
25
+ end
26
+
27
+ def skip(pattern)
28
+ pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
29
+ super
30
+ end
31
+ end
32
+ end
33
+ using StringScannerCheckScanString
34
+ end
35
+
36
# Picks the source wrapper that fits whatever the caller supplies.
class SourceFactory
  # Returns a source object for +arg+.
  #
  # * IO-like objects (answering read, readline, nil? and eof?) are read
  #   completely into a Source under Opal and wrapped in an IOSource elsewhere.
  # * String-like objects (answering to_str) become a Source under Opal and
  #   an IOSource over a StringIO elsewhere.
  # * An existing Source is returned untouched.
  #
  # Anything else raises a RuntimeError.
  def self.create_from(arg)
    on_opal = RUBY_ENGINE == "opal"
    io_like = %i[read readline nil? eof?].all? { |message| arg.respond_to?(message) }

    if io_like
      # Opal's StringScanner lacks <<, so the whole stream is read upfront
      # and handed to the full-string Source instead of the streaming IOSource.
      return Source.new(arg.read, nil) if on_opal

      return IOSource.new(arg)
    end

    if arg.respond_to?(:to_str)
      return Source.new(arg, nil) if on_opal

      return IOSource.new(StringIO.new(arg))
    end

    return arg if arg.is_a?(Source)

    raise "#{arg.class} is not a valid input stream. It must walk \nlike either a String, an IO, or a Source."
  end
end
62
+
63
+ class Source
64
+ include Encoding
65
+
66
+ attr_reader :line, :encoding
67
+
68
+ module Private
69
+ SPACES_PATTERN = /\s+/
70
+ SCANNER_RESET_SIZE = 100000
71
+ PRE_DEFINED_TERM_PATTERNS = {}
72
+ pre_defined_terms = ["'", '"', "<", "]]>", "?>"]
73
+ # Opal's StringScanner requires RegExp objects, not strings.
74
+ pre_defined_terms.each do |term|
75
+ PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
76
+ end
77
+ PRE_DEFINED_TERM_PATTERNS.freeze
78
+ end
79
+ private_constant :Private
80
+
81
+ def initialize(arg, encoding = nil)
82
+ @orig = arg
83
+ @scanner = StringScanner.new(@orig)
84
+ if encoding
85
+ self.encoding = encoding
86
+ else
87
+ detect_encoding
88
+ end
89
+ @line = 0
90
+ @encoded_terms = {}
91
+ end
92
+
93
+ def buffer
94
+ @scanner.rest
95
+ end
96
+
97
+ def drop_parsed_content
98
+ if @scanner.pos > Private::SCANNER_RESET_SIZE
99
+ @scanner = StringScanner.new(@scanner.rest)
100
+ end
101
+ end
102
+
103
+ def buffer_encoding=(encoding)
104
+ # no-op under Opal (no Encoding support)
105
+ end
106
+
107
+ def encoding=(enc)
108
+ return unless super
109
+
110
+ encoding_updated
111
+ end
112
+
113
+ def read(term = nil); end
114
+
115
# Consumes input from the current position through the first occurrence of
# the literal terminator +term+.
#
# Returns the consumed text, terminator included. When the terminator does
# not occur in the remaining input, returns everything that is left and
# leaves the scanner at end of input.
def read_until(term)
  pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
  data = @scanner.scan_until(pattern)
  return data if data

  # No terminator ahead: hand back the remainder and jump to the end.
  # `terminate` is used instead of assigning a byte count to `pos`, so the
  # result does not depend on whether the engine counts scanner positions in
  # bytes or in characters.
  data = @scanner.rest
  @scanner.terminate
  data
end
124
+
125
+ def ensure_buffer; end
126
+
127
# Tests +pattern+ at the current scanner position.
#
# A String pattern is matched literally. With +cons+ true the matched text
# is consumed; otherwise the position is left alone. Returns the scanner on
# a match and nil otherwise.
def match(pattern, cons = false)
  pattern = Regexp.new(Regexp.escape(pattern)) if pattern.is_a?(String)
  hit = cons ? @scanner.scan(pattern) : @scanner.check(pattern)
  hit.nil? ? nil : @scanner
end
135
+
136
# Reports whether +pattern+ matches at the current scanner position.
#
# A String pattern is matched literally. Only the next 4096 units returned
# by the scanner's `peek` are inspected, so a match is recognised (and, with
# +cons+ true, consumed) only within that window. Returns true or false.
def match?(pattern, cons = false)
  pattern = Regexp.new(Regexp.escape(pattern)) if pattern.is_a?(String)
  window = @scanner.peek(4096)
  return false if window.empty?

  m = pattern.match(window)
  # The match must start exactly at the current position.
  return false unless m && m.begin(0) == 0

  if cons
    matched = m[0]
    # Scanner positions are byte offsets on MRI, so advancing by the
    # character count would stop inside a multibyte match.
    # NOTE(review): assumes Opal's scanner counts positions in the same
    # units as String#length — confirm against Opal's strscan.
    @scanner.pos += RUBY_ENGINE == "opal" ? matched.length : matched.bytesize
  end
  true
end
147
+
148
+ def skip_spaces
149
+ @scanner.skip(Private::SPACES_PATTERN) ? true : false
150
+ end
151
+
152
+ def position
153
+ @scanner.pos
154
+ end
155
+
156
+ def position=(pos)
157
+ @scanner.pos = pos
158
+ end
159
+
160
+ def peek_byte
161
+ @scanner.peek_byte
162
+ end
163
+
164
+ def scan_byte
165
+ @scanner.scan_byte
166
+ end
167
+
168
+ def empty?
169
+ @scanner.eos?
170
+ end
171
+
172
+ def current_line
173
+ lines = @orig.split
174
+ res = lines.grep @scanner.rest[0..30]
175
+ res = res[-1] if res.is_a? Array
176
+ lines.index(res) if res
177
+ end
178
+
179
+ private
180
+
181
+ if RUBY_ENGINE == "opal"
182
+ def detect_encoding
183
+ self.encoding = "UTF-8"
184
+ end
185
+ else
186
+ def detect_encoding
187
+ scanner_encoding = @scanner.rest.encoding
188
+ detected_encoding = "UTF-8"
189
+ begin
190
+ @scanner.string.force_encoding("ASCII-8BIT")
191
+ if @scanner.scan(/\xfe\xff/n)
192
+ detected_encoding = "UTF-16BE"
193
+ elsif @scanner.scan(/\xff\xfe/n)
194
+ detected_encoding = "UTF-16LE"
195
+ elsif @scanner.scan(/\xef\xbb\xbf/n)
196
+ detected_encoding = "UTF-8"
197
+ end
198
+ ensure
199
+ @scanner.string.force_encoding(scanner_encoding)
200
+ end
201
+ self.encoding = detected_encoding
202
+ end
203
+ end
204
+
205
+ def encoding_updated
206
+ if @encoding == "UTF-8"
207
+ @to_utf = false
208
+ else
209
+ @scanner = StringScanner.new(decode(@scanner.rest))
210
+ @to_utf = true
211
+ end
212
+ end
213
+ end
214
+ end
@@ -0,0 +1,426 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "rexml/security"
4
+ require "rexml/entity"
5
+ require "rexml/doctype"
6
+ require "rexml/child"
7
+ require "rexml/parseexception"
8
+
9
+ module REXML
10
+ # Represents text nodes in an XML document
11
+ class Text < Child
12
+ include Comparable
13
+
14
+ # The order in which the substitutions occur
15
+ SPECIALS = [/&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u].freeze
16
+ SUBSTITUTES = ["&amp;", "&lt;", "&gt;", "&quot;", "&apos;", "&#13;"].freeze
17
+ # Characters which are substituted in written strings
18
+ SLAICEPS = ["<", ">", '"', "'", "&"].freeze
19
+ SETUTITSBUS = [/&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u].freeze
20
+
21
+ # If +raw+ is true, then REXML leaves the value alone
22
+ attr_accessor :raw
23
+
24
+ NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
25
+ NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
26
+ # BMP-only: above-BMP ranges removed for JavaScript regex compatibility.
27
+ VALID_CHAR = [
28
+ 0x9, 0xA, 0xD,
29
+ (0x20..0xD7FF),
30
+ (0xE000..0xFFFD)
31
+ ].freeze
32
+
33
+ VALID_XML_CHARS = Regexp.new("^[" +
34
+ VALID_CHAR.map { |item|
35
+ case item
36
+ when Integer
37
+ [item].pack("U")
38
+ when Range
39
+ [item.first, "-".ord, item.last].pack("UUU")
40
+ end
41
+ }.join +
42
+ "]*$")
43
+
44
+ # Constructor
45
+ # +arg+ if a String, the content is set to the String. If a Text,
46
+ # the object is shallowly cloned.
47
+ #
48
+ # +respect_whitespace+ (boolean, false) if true, whitespace is
49
+ # respected
50
+ #
51
+ # +parent+ (nil) if this is a Parent object, the parent
52
+ # will be set to this.
53
+ #
54
+ # +raw+ (nil) This argument can be given three values.
55
+ # If true, then the value used to construct this object is expected to
56
+ # contain no unescaped XML markup, and REXML will not change the text. If
57
+ # this value is false, the string may contain any characters, and REXML will
58
+ # escape any and all defined entities whose values are contained in the
59
+ # text. If this value is nil (the default), then the raw value of the
60
+ # parent will be used as the raw value for this node. If there is no raw
61
+ # value for the parent, and no value is supplied, the default is false.
62
+ # Use this field if you have entities defined for some text, and you don't
63
+ # want REXML to escape that text in output.
64
+ # Text.new( "<&", false, nil, false ) #-> "&lt;&amp;"
65
+ # Text.new( "&lt;&amp;", false, nil, false ) #-> "&amp;lt;&amp;amp;"
66
+ # Text.new( "<&", false, nil, true ) #-> Parse exception
67
+ # Text.new( "&lt;&amp;", false, nil, true ) #-> "&lt;&amp;"
68
+ # # Assume that the entity "s" is defined to be "sean"
69
+ # # and that the entity "r" is defined to be "russell"
70
+ # Text.new( "sean russell" ) #-> "&s; &r;"
71
+ # Text.new( "sean russell", false, nil, true ) #-> "sean russell"
72
+ #
73
+ # +entity_filter+ (nil) This can be an array of entities to match in the
74
+ # supplied text. This argument is only useful if +raw+ is set to false.
75
+ # Text.new( "sean russell", false, nil, false, ["s"] ) #-> "&s; russell"
76
+ # Text.new( "sean russell", false, nil, true, ["s"] ) #-> "sean russell"
77
+ # In the last example, the +entity_filter+ argument is ignored.
78
+ #
79
+ # +illegal+ INTERNAL USE ONLY
80
# Builds a text node from a String, or by copying another Text.
#
# +arg+:: a String, or a Text whose string, raw flag and entity filter are copied
# +respect_whitespace+:: when false, repeated spaces, newlines and tabs are squeezed
# +parent+:: optional parent node; when given, its raw setting is inherited
# +raw+:: explicit raw flag; nil keeps the inherited or copied value
# +entity_filter+:: optional entity filter; replaces a copied one when given
# +illegal+:: handed to Text.check when the node ends up raw
#
# Raises RuntimeError when +arg+ is neither a String nor a Text.
def initialize(arg, respect_whitespace = false, parent = nil, raw = nil,
               entity_filter = nil, illegal = NEEDS_A_SECOND_CHECK)
  @raw = false
  @parent = nil
  @entity_filter = nil

  if parent
    super(parent)
    @raw = parent.raw
  end

  if arg.is_a? String
    @string = arg.dup
  elsif arg.is_a? Text
    @string = arg.instance_variable_get(:@string).dup
    @raw = arg.raw
    @entity_filter = arg.instance_variable_get(:@entity_filter)
  else
    # Object#type no longer exists, so the previous `arg.type` raised
    # NoMethodError here and hid the intended message.
    raise "Illegal argument of type #{arg.class} for Text constructor (#{arg})"
  end

  @string = @string.squeeze(" \n\t") unless respect_whitespace
  # Normalise CRLF and lone CR line endings to LF.
  @string = @string.gsub(/\r\n?/, "\n")
  @raw = raw unless raw.nil?
  @entity_filter = entity_filter if entity_filter
  clear_cache

  Text.check(@string, illegal) if @raw
end
109
+
110
+ def parent=(parent)
111
+ super
112
+ Text.check(@string, NEEDS_A_SECOND_CHECK) if @raw && @parent
113
+ end
114
+
115
+ # check for illegal characters
116
+ def self.check(string, _pattern, _doctype = nil)
117
+ # illegal anywhere — avoid VALID_XML_CHARS regex on uncontrolled data
118
+ string.each_char do |c|
119
+ code = c.ord
120
+ unless code == 0x9 || code == 0xA || code == 0xD ||
121
+ code.between?(0x20, 0xD7FF) ||
122
+ code.between?(0xE000, 0xFFFD)
123
+ raise "Illegal character #{c.inspect} in raw string #{string.inspect}"
124
+ end
125
+ end
126
+
127
+ pos = 0
128
+ while (index = string.index(/<|&/, pos))
129
+ if string[index] == "<"
130
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
131
+ end
132
+
133
+ unless (end_index = string.index(/[^\s];/, index + 1))
134
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
135
+ end
136
+
137
+ value = string[(index + 1)..end_index]
138
+ if /\s/.match?(value)
139
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
140
+ end
141
+
142
+ if value[0] == "#"
143
+ character_reference = value[1..]
144
+
145
+ unless /^(\d+|x[0-9a-fA-F]+)$/.match?(character_reference)
146
+ if character_reference[0] == "x" || character_reference[-1] == "x"
147
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
148
+ else
149
+ raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
150
+ end
151
+ end
152
+
153
+ case (character_reference[0] == "x" ? character_reference[1..].to_i(16) : character_reference.to_i)
154
+ when *VALID_CHAR
155
+ else
156
+ raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
157
+ end
158
+ elsif !/^#{Entity::NAME}$/umo.match?(value)
159
+ raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
160
+ end
161
+
162
+ pos = end_index + 1
163
+ end
164
+
165
+ string
166
+ end
167
+
168
+ def node_type
169
+ :text
170
+ end
171
+
172
+ def empty?
173
+ @string.empty?
174
+ end
175
+
176
+ def clone
177
+ Text.new(self, true)
178
+ end
179
+
180
+ # Appends text to this text node. The text is appended in the +raw+ mode
181
+ # of this text node.
182
+ #
183
+ # +returns+ the text itself to enable method chain like
184
+ # 'text << "XXX" << "YYY"'.
185
+ def <<(to_append)
186
+ @string << to_append.gsub(/\r\n?/, "\n")
187
+ clear_cache
188
+ self
189
+ end
190
+
191
+ # +other+ a String or a Text
192
+ # +returns+ the result of (to_s <=> arg.to_s)
193
+ def <=>(other)
194
+ to_s <=> other.to_s
195
+ end
196
+
197
+ def doctype
198
+ @parent&.document&.doctype
199
+ end
200
+
201
+ REFERENCE = /#{Entity::REFERENCE}/
202
+ # Returns the string value of this text node. This string is always
203
+ # escaped, meaning that it is a valid XML text node string, and all
204
+ # entities that can be escaped, have been inserted. This method respects
205
+ # the entity filter set in the constructor.
206
+ #
207
+ # # Assume that the entity "s" is defined to be "sean", and that the
208
+ # # entity "r" is defined to be "russell"
209
+ # t = Text.new( "< & sean russell", false, nil, false, ['s'] )
210
+ # t.to_s #-> "&lt; &amp; &s; russell"
211
+ # t = Text.new( "< & &s; russell", false, nil, false )
212
+ # t.to_s #-> "&lt; &amp; &s; russell"
213
+ # u = Text.new( "sean russell", false, nil, true )
214
+ # u.to_s #-> "sean russell"
215
+ def to_s
216
+ return @string if @raw
217
+
218
+ @to_s ||= Text::normalize(@string, doctype, @entity_filter)
219
+ end
220
+
221
+ def inspect
222
+ @string.inspect
223
+ end
224
+
225
+ # Returns the string value of this text. This is the text without
226
+ # entities, as it might be used programmatically, or printed to the
227
+ # console. This ignores the 'raw' attribute setting, and any
228
+ # entity_filter.
229
+ #
230
+ # # Assume that the entity "s" is defined to be "sean", and that the
231
+ # # entity "r" is defined to be "russell"
232
+ # t = Text.new( "< & sean russell", false, nil, false, ['s'] )
233
+ # t.value #-> "< & sean russell"
234
+ # t = Text.new( "< & &s; russell", false, nil, false )
235
+ # t.value #-> "< & sean russell"
236
+ # u = Text.new( "sean russell", false, nil, true )
237
+ # u.value #-> "sean russell"
238
+ def value
239
+ @value ||= Text::unnormalize(@string, doctype,
240
+ entity_expansion_text_limit: document&.entity_expansion_text_limit)
241
+ end
242
+
243
+ # Sets the contents of this text node. This expects the text to be
244
+ # unnormalized. It returns self.
245
+ #
246
+ # e = Element.new( "a" )
247
+ # e.add_text( "foo" ) # <a>foo</a>
248
+ # e[0].value = "bar" # <a>bar</a>
249
+ # e[0].value = "<a>" # <a>&lt;a&gt;</a>
250
+ def value=(val)
251
+ @string = val.gsub(/\r\n?/, "\n")
252
+ clear_cache
253
+ @raw = false
254
+ end
255
+
256
# Breaks +string+ into lines of at most +width+ characters, splitting at
# spaces (the space at each break is dropped).
#
# A word longer than +width+ cannot be split; the break is then made at the
# first space after it, and a string with no usable space is returned as is
# (previously this raised TypeError).
#
# +addnewline+:: when true and at least one break was made, the result is
#                prefixed with a newline.
#
# Returns +string+ itself when no break is needed or possible.
def wrap(string, width, addnewline = false)
  lines = []
  rest = string

  while rest.length > width
    # Last space at or before the cutoff, else the first one after it.
    place = rest.rindex(" ", width) || rest.index(" ", width)
    break unless place

    lines << rest[0, place]
    rest = rest[(place + 1)..]
  end

  return string if lines.empty?

  wrapped = (lines << rest).join("\n")
  addnewline ? "\n#{wrapped}" : wrapped
end
267
+
268
+ def indent_text(string, level = 1, style = "\t", indentfirstline = true)
269
+ Kernel.warn(
270
+ "#{self.class.name}#indent_text is deprecated. See REXML::Formatters", uplevel: 1
271
+ )
272
+ return string if level.negative?
273
+
274
+ new_string = +""
275
+ string.each_line do |line|
276
+ indent_string = style * level
277
+ new_line = (indent_string + line).rstrip
278
+ new_string << new_line
279
+ end
280
+ new_string.strip! unless indentfirstline
281
+ new_string
282
+ end
283
+
284
+ # == DEPRECATED
285
+ # See REXML::Formatters
286
+ #
287
+ def write(writer, indent = -1, _transitive = false, _ie_hack = false)
288
+ Kernel.warn(
289
+ "#{self.class.name}#write is deprecated. See REXML::Formatters", uplevel: 1
290
+ )
291
+ formatter = if indent > -1
292
+ REXML::Formatters::Pretty.new(indent)
293
+ else
294
+ REXML::Formatters::Default.new
295
+ end
296
+ formatter.write(self, writer)
297
+ end
298
+
299
+ # FIXME
300
+ # This probably won't work properly
301
+ def xpath
302
+ "#{@parent.xpath}/text()"
303
+ end
304
+
305
+ # Writes out text, substituting special characters beforehand.
306
+ # +out+ A String, IO, or any other object supporting <<( String )
307
+ # +input+ the text to substitute and then write out
308
+ #
309
+ # z=utf8.unpack("U*")
310
+ # ascOut=""
311
+ # z.each{|r|
312
+ # if r < 0x100
313
+ # ascOut.concat(r.chr)
314
+ # else
315
+ # ascOut.concat(sprintf("&#x%x;", r))
316
+ # end
317
+ # }
318
+ # puts ascOut
319
# Appends +input+ to +out+ after replacing each SPECIALS pattern with the
# SUBSTITUTES entry at the same index, in declaration order.
#
# +out+:: any object supporting <<(String)
# +input+:: the text to escape; it is never modified, so frozen strings
#           are accepted (the previous clone + gsub! raised FrozenError)
#
# Returns the result of +out+ << escaped text.
def write_with_substitution(out, input)
  escaped = SPECIALS.each_with_index.reduce(input) do |text, (pattern, index)|
    text.gsub(pattern, SUBSTITUTES[index])
  end
  out << escaped
end
330
+
331
+ private
332
+
333
# Drops every memoized rendering of this node so the next call to to_s or
# value recomputes from the current string.
def clear_cache
  # to_s and value memoize in @to_s and @value; without resetting these,
  # value= and << kept returning the previous content.
  @to_s = nil
  @value = nil
  # Earlier cache names, still reset for anything that reads them.
  @normalized = nil
  @unnormalized = nil
end
337
+
338
+ # Reads text, substituting entities
339
# Characters for the five predefined entity names handled by
# read_with_substitution.
PREDEFINED_ENTITY_CHARS = {
  "lt" => "<", "gt" => ">", "quot" => '"', "apos" => "'", "amp" => "&"
}.freeze

# Returns a copy of +input+ with line endings normalised to LF and the
# predefined entities and numeric character references replaced by the
# characters they stand for.
#
# +input+:: the text to read; never modified, frozen strings are accepted
# +illegal+:: optional pattern; if it matches +input+ a ParseException is raised
#
# All references are replaced in one pass, so text produced by one
# replacement is never expanded again ("&amp;lt;" yields "&lt;", not "<").
# Hexadecimal references accept upper- and lower-case digits.
def self.read_with_substitution(input, illegal = nil)
  if illegal && (offending = input[illegal])
    raise ParseException.new("malformed text: Illegal character #{offending} in \"#{input}\"")
  end

  text = input.gsub(/\r\n?/, "\n")
  return text unless text.include?("&")

  text.gsub(/&(?:(lt|gt|quot|apos|amp)|#0*(\d+|x[0-9a-fA-F]+));/) do
    name = Regexp.last_match(1)
    next PREDEFINED_ENTITY_CHARS[name] if name

    digits = Regexp.last_match(2)
    code = digits.start_with?("x") ? digits[1..].to_i(16) : digits.to_i
    [code].pack("U*")
  end
end
362
+
363
+ EREFERENCE = /&(?!#{Entity::NAME};)/
364
+ # Escapes all possible entities
365
+ def self.normalize(input, doctype = nil, entity_filter = nil)
366
+ copy = input.to_s
367
+ # Doing it like this rather than in a loop improves the speed
368
+ # copy = copy.gsub( EREFERENCE, '&amp;' )
369
+ copy = copy.gsub("&", "&amp;") if copy.include?("&")
370
+ if doctype
371
+ # Replace all ampersands that aren't part of an entity
372
+ doctype.entities.each_value do |entity|
373
+ if entity.value &&
374
+ not(entity_filter && entity_filter.include?(entity.name))
375
+ copy = copy.gsub(entity.value,
376
+ "&#{entity.name};")
377
+ end
378
+ end
379
+ else
380
+ # Replace all ampersands that aren't part of an entity
381
+ DocType::DEFAULT_ENTITIES.each_value do |entity|
382
+ if copy.include?(entity.value)
383
+ copy = copy.gsub(entity.value, "&#{entity.name};")
384
+ end
385
+ end
386
+ end
387
+ copy
388
+ end
389
+
390
+ # Unescapes all possible entities
391
+ def self.unnormalize(string, doctype = nil, filter = nil, _illegal = nil,
392
+ entity_expansion_text_limit: nil)
393
+ entity_expansion_text_limit ||= Security.entity_expansion_text_limit
394
+ sum = 0
395
+ string.gsub(/\r\n?/, "\n").gsub(REFERENCE) do
396
+ s = Text.expand($&, doctype, filter)
397
+ if sum + s.bytesize > entity_expansion_text_limit
398
+ raise "entity expansion has grown too large"
399
+ else
400
+ sum += s.bytesize
401
+ end
402
+
403
+ s
404
+ end
405
+ end
406
+
407
# Resolves a single reference string such as "&#65;", "&#x41;" or "&name;".
#
# Numeric references become the character they encode and "&amp;" becomes
# "&". A name listed in +filter+ is returned unexpanded. Otherwise the name
# is looked up through +doctype+ when one is given, or in
# DocType::DEFAULT_ENTITIES; a name that is not found leaves +ref+ as is.
def self.expand(ref, doctype, filter)
  if ref[1] == "#"
    hexadecimal = ref[2] == "x"
    code = hexadecimal ? ref[3...-1].to_i(16) : ref[2...-1].to_i
    return [code].pack("U*")
  end

  return "&" if ref == "&amp;"

  name = ref[1...-1]
  return ref if filter&.include?(name)
  return doctype.entity(name) || ref if doctype

  predefined = DocType::DEFAULT_ENTITIES[name]
  predefined ? predefined.value : ref
end
425
+ end
426
+ end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: false
2
+
3
+ module REXML
4
# Regular-expression source fragments for XML names, name tokens and
# references. Every constant is a String meant to be interpolated into a
# Regexp; the character ranges stop at U+FFFD.
module XMLTokens
  # Character-class members allowed as the first character of a name.
  leading = [
    ":", "A-Z", "_", "a-z",
    '\u00C0-\u00D6', '\u00D8-\u00F6', '\u00F8-\u02FF',
    '\u0370-\u037D', '\u037F-\u1FFF', '\u200C-\u200D',
    '\u2070-\u218F', '\u2C00-\u2FEF', '\u3001-\uD7FF',
    '\uF900-\uFDCF', '\uFDF0-\uFFFD'
  ]

  # Members allowed after the first character: the leading set plus these.
  following = leading + [
    '\-', '\.', "0-9",
    '\u00B7', '\u0300-\u036F', '\u203F-\u2040'
  ]

  NAME_START_CHAR = "[#{leading.join}]".freeze
  NAME_CHAR = "[#{following.join}]".freeze
  NAMECHAR = NAME_CHAR

  # Colon-free variants used for namespace-aware (NCName) matching.
  ncname_leading = leading - [":"]
  ncname_following = following - [":"]
  NCNAME_STR = "[#{ncname_leading.join}][#{ncname_following.join}]*".freeze
  NAME_STR = "(?:#{NCNAME_STR}:)?#{NCNAME_STR}".freeze

  NAME = "(#{NAME_START_CHAR}#{NAME_CHAR}*)".freeze
  NMTOKEN = "(?:#{NAME_CHAR})+".freeze
  NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*".freeze
  REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)".freeze
end
45
+ end