moxml 0.1.21 → 0.1.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/opal.yml +37 -0
  3. data/.gitignore +1 -0
  4. data/.rspec-opal +5 -0
  5. data/.rubocop.yml +1 -0
  6. data/.rubocop_todo.yml +680 -110
  7. data/Gemfile +6 -0
  8. data/Rakefile +70 -0
  9. data/lib/compat/opal/rexml/namespace.rb +59 -0
  10. data/lib/compat/opal/rexml/parsers/baseparser.rb +1016 -0
  11. data/lib/compat/opal/rexml/source.rb +214 -0
  12. data/lib/compat/opal/rexml/text.rb +426 -0
  13. data/lib/compat/opal/rexml/xmltokens.rb +45 -0
  14. data/lib/compat/opal/rexml_compat.rb +77 -0
  15. data/lib/moxml/adapter/customized_oga/xml_declaration.rb +8 -1
  16. data/lib/moxml/adapter/customized_rexml/formatter.rb +11 -10
  17. data/lib/moxml/adapter/headed_ox.rb +2 -6
  18. data/lib/moxml/adapter/libxml/entity_ref_registry.rb +4 -2
  19. data/lib/moxml/adapter/libxml/entity_restorer.rb +3 -1
  20. data/lib/moxml/adapter/libxml.rb +22 -24
  21. data/lib/moxml/adapter/nokogiri.rb +24 -33
  22. data/lib/moxml/adapter/oga.rb +47 -84
  23. data/lib/moxml/adapter/ox.rb +43 -41
  24. data/lib/moxml/adapter/rexml.rb +29 -33
  25. data/lib/moxml/adapter.rb +38 -8
  26. data/lib/moxml/config.rb +16 -3
  27. data/lib/moxml/document.rb +2 -8
  28. data/lib/moxml/entity_registry.rb +40 -31
  29. data/lib/moxml/entity_registry_opal_data.rb +2138 -0
  30. data/lib/moxml/node.rb +27 -26
  31. data/lib/moxml/sax/namespace_splitter.rb +54 -0
  32. data/lib/moxml/version.rb +1 -1
  33. data/lib/moxml/xml_utils.rb +10 -1
  34. data/lib/moxml.rb +7 -0
  35. data/spec/consistency/adapter_parity_spec.rb +1 -1
  36. data/spec/integration/all_adapters_spec.rb +2 -1
  37. data/spec/integration/shared_examples/line_ending_behavior.rb +56 -0
  38. data/spec/integration/w3c_namespace_spec.rb +1 -1
  39. data/spec/moxml/adapter/libxml_internals_spec.rb +4 -2
  40. data/spec/moxml/adapter/ox_spec.rb +8 -0
  41. data/spec/moxml/adapter/platform_spec.rb +70 -0
  42. data/spec/moxml/adapter/shared_examples/adapter_contract.rb +0 -6
  43. data/spec/moxml/config_spec.rb +33 -0
  44. data/spec/moxml/entity_registry_spec.rb +10 -0
  45. data/spec/moxml/native_attachment/opal_spec.rb +39 -2
  46. data/spec/moxml/node_type_map_spec.rb +43 -0
  47. data/spec/moxml/opal_rexml_adapter_spec.rb +14 -0
  48. data/spec/moxml/opal_smoke_spec.rb +61 -0
  49. data/spec/moxml/sax/namespace_splitter_spec.rb +67 -0
  50. data/spec/moxml/text_spec.rb +1 -1
  51. data/spec/spec_helper.rb +32 -13
  52. data/spec/support/opal.rb +16 -0
  53. metadata +19 -2
@@ -0,0 +1,1016 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "rexml/parseexception"
4
+ require "rexml/undefinednamespaceexception"
5
+ require "rexml/security"
6
+ require "rexml/source"
7
+ require "set"
8
+ require "strscan"
9
+
10
+ module REXML
11
+ module Parsers
12
# Supplies Enumerable#tally through a refinement when the running Ruby does
# not already provide it, and activates that refinement for the rest of this
# scope.
unless [].respond_to?(:tally)
  module EnumerableTally
    refine Enumerable do
      # Returns a Hash mapping each distinct element to the number of times
      # it was yielded by #each.
      def tally
        each_with_object({}) do |item, counts|
          counts[item] = counts.fetch(item, 0) + 1
        end
      end
    end
  end
  using EnumerableTally
end
27
+
28
# Supplies StringScanner#captures through a refinement for strscan releases
# whose version string sorts before "3.0.8", and activates it for the rest
# of this scope.
if StringScanner::Version < "3.0.8"
  module StringScannerCaptures
    refine StringScanner do
      # Returns the capture groups (1 up to size - 1) of the last match.
      def captures
        (1...size).to_a.map { |index| self[index] }
      end
    end
  end
  using StringScannerCaptures
end
38
+
39
+ # = Using the Pull Parser
40
+ # <em>This API is experimental, and subject to change.</em>
41
+ # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
42
+ # while parser.has_next?
43
+ # res = parser.next
44
+ # puts res[1]['att'] if res.start_tag? and res[0] == 'b'
45
+ # end
46
+ # See the PullEvent class for information on the content of the results.
47
+ # The data is identical to the arguments passed for the various events to
48
+ # the StreamListener API.
49
+ #
50
+ # Notice that:
51
+ # parser = PullParser.new( "<a>BAD DOCUMENT" )
52
+ # while parser.has_next?
53
+ # res = parser.next
54
+ # raise res[1] if res.error?
55
+ # end
56
+ #
57
+ # Nat Price gave me some good ideas for the API.
58
+ class BaseParser
59
# Character-class fragments used to assemble the name patterns below.
LETTER = "A-Za-z"
DIGIT = "0-9"

COMBININGCHAR = "" # TODO
EXTENDER = "" # TODO

# NCNAME_STR: a name without a colon. QNAME_STR: optional "prefix:" followed
# by a local name; it captures the prefix and the local part. QNAME wraps the
# whole qualified name in an additional outer capture group.
NCNAME_STR = "[#{LETTER}_][-A-Za-z0-9._#{COMBININGCHAR}#{EXTENDER}]*".freeze
QNAME_STR = "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})".freeze
QNAME = /(#{QNAME_STR})/

# Just for backward compatibility. For example, kramdown uses this.
# It's not used in REXML.
UNAME_STR = "(?:#{NCNAME_STR}:)?#{NCNAME_STR}".freeze

# Looser name patterns (NAME contains one capture group) and the entity /
# character reference pattern built from them.
NAMECHAR = '[\-\w\.:]'
NAME = "([\\w:]#{NAMECHAR}*)".freeze
NMTOKEN = "(?:#{NAMECHAR})+".freeze
NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*".freeze
REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)".freeze
REFERENCE_RE = /#{REFERENCE}/

# Whole-construct patterns for markup nodes.
# NOTE(review): most of these are not referenced by the methods visible in
# this file; presumably kept for API compatibility - confirm before removing.
DOCTYPE_START = /^\s*<!DOCTYPE\s/um
DOCTYPE_END = /^\s*\]\s*>/um
ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
COMMENT_START = /^<!--/u
COMMENT_PATTERN = /<!--(.*?)-->/um
CDATA_START = /^<!\[CDATA\[/u
# NOTE(review): identical to DOCTYPE_END - confirm this is intended.
CDATA_END = /^\s*\]\s*>/um
CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
XMLDECL_START = /^<\?xml\s/u
XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
INSTRUCTION_START = /^<\?/u
INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um
TAG_MATCH = /^<((?:#{QNAME_STR}))/um
CLOSE_MATCH = /^\s*<\/(#{QNAME_STR})\s*>/um

# Pseudo-attributes of the XML declaration.
VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um

# DTD declaration patterns. ATTDEF_RE is used by #pull_event to split an
# ATTLIST declaration into its attribute definitions.
ENTITY_START = /^\s*<!ENTITY/
ELEMENTDECL_START = /^\s*<!ELEMENT/um
ELEMENTDECL_PATTERN = /^\s*(<!ELEMENT.*?)>/um
SYSTEMENTITY = /^\s*(%.*?;)\s*$/um
ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)".freeze
NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)".freeze
ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))".freeze
ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})".freeze
ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')".freeze
DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))".freeze
ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}".freeze
ATTDEF_RE = /#{ATTDEF}/
ATTLISTDECL_START = /^\s*<!ATTLIST/um
ATTLISTDECL_PATTERN = /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um

TEXT_PATTERN = /^([^<]*)/um

# Entity constants
# The literal patterns keep their surrounding quotes inside the capture
# group; callers strip them with [1..-2].
PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
SYSTEMLITERAL = %{((?:"[^"]*")|(?:'[^']*'))}
PUBIDLITERAL = %{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}.freeze
EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))".freeze
NDATADECL = "\\s+NDATA\\s+#{NAME}".freeze
PEREFERENCE = "%#{NAME};".freeze
ENTITYVALUE = %{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}.freeze
PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})".freeze
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))".freeze
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>".freeze
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>".freeze
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um

# External identifier patterns consumed by #parse_id.
NOTATIONDECL_START = /^\s*<!NOTATION/um
EXTERNAL_ID_PUBLIC = /^\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
EXTERNAL_ID_SYSTEM = /^\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um
PUBLIC_ID = /^\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um

# An "&" that does not start a named entity reference; #normalize escapes it.
EREFERENCE = /&(?!#{NAME};)/

# Per-entity tuple: [escaped pattern, escaped text, raw text, raw pattern].
# #normalize uses indexes 3 and 1; #unnormalize uses indexes 0 and 2.
DEFAULT_ENTITIES = {
  "gt" => [/&gt;/, "&gt;", ">", />/],
  "lt" => [/&lt;/, "&lt;", "<", /</],
  "quot" => [/&quot;/, "&quot;", '"', /"/],
  "apos" => [/&apos;/, "&apos;", "'", /'/],
}.freeze
143
+
144
# Patterns used only by the parser implementation itself. Most are
# unanchored so they can be applied at the source's current position.
module Private
  PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
  TAG_PATTERN = /((?:#{QNAME_STR}))\s*/um
  CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
  EQUAL_PATTERN = /\s*=\s*/um
  ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
  NAME_PATTERN = /#{NAME}/um
  GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>".freeze
  PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>".freeze
  ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
  CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
  CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/
  # Entity name => pattern matching its "&name;" reference.
  DEFAULT_ENTITIES_PATTERNS =
    ["gt", "lt", "quot", "apos", "amp"].each_with_object({}) do |term, patterns|
      patterns[term] = /&#{term};/
    end.freeze
  XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
end
164
+ private_constant :Private
165
+
166
# Builds a parser reading from +source+ (anything accepted by #stream=).
# Entity expansion limits are taken from Security at construction time.
def initialize(source)
  self.stream = source
  @version = nil
  @listeners = []
  @prefixes = Set.new
  @entity_expansion_count = 0
  @entity_expansion_limit = Security.entity_expansion_limit
  @entity_expansion_text_limit = Security.entity_expansion_text_limit
  @source.ensure_buffer
end
176
+
177
# Registers +listener+; #pull calls +receive+ on every registered listener
# for each event. Returns the listener list.
def add_listener(listener)
  @listeners.push(listener)
end
180
+
181
+ attr_reader :source, :entity_expansion_count
182
+ attr_writer :entity_expansion_limit, :entity_expansion_text_limit
183
+
184
# Replaces the input being parsed: +source+ is passed to
# SourceFactory.create_from and all per-document state is cleared via #reset.
def stream=(source)
  @source = SourceFactory.create_from(source)
  reset
end
188
+
189
# Clears all per-document state: the pending self-closing tag, document
# status, open-tag list, pushed-back events, entities and namespace
# bookkeeping. Only the "xml" prefix stays bound afterwards.
def reset
  @closed = @document_status = nil
  @have_root = false
  @tags = []
  @stack = []
  @entities = []
  @namespaces = { "xml" => Private::XML_PREFIXED_NAMESPACE }
  @namespaces_restore_stack = []
end
199
+
200
# Current position as reported by the source, or 0 when the source does not
# expose one.
def position
  # FIXME: 0 is only a placeholder for sources without #position.
  return 0 unless @source.respond_to?(:position)

  @source.position
end
208
+
209
# True when the source is exhausted and no pushed-back events remain.
def empty?
  @source.empty? && @stack.empty?
end
213
+
214
# True while the source still has input or pushed-back events remain; the
# logical negation of #empty?.
def has_next?
  !@source.empty? || !@stack.empty?
end
218
+
219
# Pushes +token+ back onto the head of the event stream, so it is the next
# event returned. There is no limit on how many events may be pushed back.
def unshift(token)
  @stack.insert(0, token)
end
224
+
225
# Looks ahead at the event +depth+ positions from the head of the stream
# (0 is the next event) without consuming it. A +depth+ of -1 parses to the
# end of the input and returns the final event. Everything parsed while
# peeking is buffered, so peeking far ahead holds that much of the document
# in memory. Raises for a +depth+ below -1.
def peek(depth = 0)
  raise %[Illegal argument "#{depth}"] if depth < -1

  pulled = []
  if depth == -1
    pulled << pull until empty?
  else
    pulled << pull while @stack.size + pulled.size < depth + 1
  end
  @stack += pulled unless pulled.empty?
  @stack[depth]
end
245
+
246
# Returns the next event after letting the source drop already-parsed
# content. Every registered listener receives the event before it is
# returned.
def pull
  @source.drop_parsed_content

  event = pull_event
  @listeners.each { |listener| listener.receive(event) }
  event
end
256
+
257
# Produces the next raw event as an Array whose first element is the event
# type (:start_element, :end_element, :text, :cdata, :comment,
# :processing_instruction / :xmldecl via #process_instruction,
# :start_doctype, :end_doctype, :elementdecl, :entitydecl, :attlistdecl,
# :notationdecl, :externalentity or :end_document).
#
# Parsing is driven by @document_status, which this method moves through
# nil (prolog), :in_doctype (internal subset), :after_doctype and
# :in_element. Malformed input raises ParseException; an undeclared prefix
# raises UndefinedNamespaceException.
def pull_event
  # A self-closing tag was reported as :start_element on the previous call;
  # report its matching :end_element now.
  if @closed
    x = @closed
    @closed = nil
    return [:end_element, x]
  end
  # Input exhausted: validate that the document is complete.
  if empty?
    if @document_status == :in_doctype
      raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
    end

    unless @tags.empty?
      path = "/#{@tags.join('/')}"
      raise ParseException.new("Missing end tag for '#{path}'", @source)
    end

    unless @document_status == :in_element
      raise ParseException.new("Malformed XML: No root element", @source)
    end

    return [:end_document]
  end
  # Events pushed back (or queued by this method) take priority.
  return @stack.shift if @stack.size.positive?

  # STDERR.puts @source.encoding
  # STDERR.puts "BUFFER = #{@source.buffer.inspect}"

  @source.ensure_buffer
  # Prolog: processing instructions, comments and the DOCTYPE header.
  if @document_status == nil
    start_position = @source.position
    if @source.match?("<?", true)
      return process_instruction
    elsif @source.match?("<!", true)
      if @source.match?("--", true)
        return [:comment, process_comment]
      elsif @source.match?("DOCTYPE", true)
        base_error_message = "Malformed DOCTYPE"
        unless @source.skip_spaces
          message = if @source.match?(">")
                      "#{base_error_message}: name is missing"
                    else
                      "#{base_error_message}: invalid name"
                    end
          @source.position = start_position
          raise REXML::ParseException.new(message, @source)
        end
        name = parse_name(base_error_message)
        @source.skip_spaces
        if @source.match?("[", true)
          id = [nil, nil, nil]
          @document_status = :in_doctype
        elsif @source.match?(">", true)
          id = [nil, nil, nil]
          @document_status = :after_doctype
          @source.ensure_buffer
        else
          id = parse_id(base_error_message,
                        accept_external_id: true,
                        accept_public_id: false)
          if id[0] == "SYSTEM"
            # For backward compatibility
            id[1] = id[2]
            id[2] = nil
          end
          @source.skip_spaces
          if @source.match?("[", true)
            @document_status = :in_doctype
          elsif @source.match?(">", true)
            @document_status = :after_doctype
            @source.ensure_buffer
          else
            message = "#{base_error_message}: garbage after external ID"
            raise REXML::ParseException.new(message, @source)
          end
        end
        args = [:start_doctype, name, *id]
        # A DOCTYPE without an internal subset ends immediately: queue the
        # matching :end_doctype so it is returned by the next call.
        if @document_status == :after_doctype
          @source.skip_spaces
          @stack << [:end_doctype]
        end
        return args
      else
        message = "Invalid XML"
        raise REXML::ParseException.new(message, @source)
      end
    end
  end
  # Internal DTD subset: one declaration per call until "]>" is seen.
  if @document_status == :in_doctype
    @source.skip_spaces
    start_position = @source.position
    if @source.match?("<!", true)
      if @source.match?("ELEMENT", true)
        md = @source.match(/(.*?)>/um, true)
        if md.nil?
          raise REXML::ParseException.new("Bad ELEMENT declaration!",
                                          @source)
        end

        return [:elementdecl, "<!ELEMENT#{md[1]}"]
      elsif @source.match?("ENTITY", true)
        match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
        unless match_data
          raise REXML::ParseException.new("Malformed entity declaration",
                                          @source)
        end

        match = [:entitydecl, *match_data.captures.compact]
        ref = false
        # A leading "%" marks a parameter entity; it is re-appended at the end.
        if match[1] == "%"
          ref = true
          match.delete_at 1
        end
        # Now we have to sort out what kind of entity reference this is
        case match[2]
        when "SYSTEM"
          # External reference
          match[3] = match[3][1..-2] # PUBID
          match.delete_at(4) if match.size > 4 # Chop out NDATA decl
          # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
        when "PUBLIC"
          # External reference
          match[3] = match[3][1..-2] # PUBID
          match[4] = match[4][1..-2] # HREF
          match.delete_at(5) if match.size > 5 # Chop out NDATA decl
          # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
        when Private::PEREFERENCE_PATTERN
          raise REXML::ParseException.new(
            "Parameter entity references forbidden in internal subset: #{match[2]}", @source
          )
        else
          # Internal entity: strip the surrounding quotes from the value.
          match[2] = match[2][1..-2]
          match.pop if match.size == 4
          # match is [ :entity, name, value ]
        end
        match << "%" if ref
        return match
      elsif @source.match?("ATTLIST", true)
        md = @source.match(Private::ATTLISTDECL_END, true)
        if md.nil?
          raise REXML::ParseException.new("Bad ATTLIST declaration!",
                                          @source)
        end

        element = md[1]
        contents = "<!ATTLIST#{md[0]}"

        # Collect attribute defaults; xmlns:* defaults also declare namespaces.
        pairs = {}
        values = md[0].strip.scan(ATTDEF_RE)
        values.each do |attdef|
          unless attdef[3] == "#IMPLIED"
            attdef.compact!
            val = attdef[3]
            val = attdef[4] if val == "#FIXED "
            pairs[attdef[0]] = val
            if attdef[0] =~ /^xmlns:(.*)/
              @namespaces[$1] = val
            end
          end
        end
        return [:attlistdecl, element, pairs, contents]
      elsif @source.match?("NOTATION", true)
        base_error_message = "Malformed notation declaration"
        unless @source.skip_spaces
          message = if @source.match?(">")
                      "#{base_error_message}: name is missing"
                    else
                      "#{base_error_message}: invalid name"
                    end
          @source.position = start_position
          raise REXML::ParseException.new(message, @source)
        end
        name = parse_name(base_error_message)
        id = parse_id(base_error_message,
                      accept_external_id: true,
                      accept_public_id: true)
        @source.skip_spaces
        unless @source.match?(">", true)
          message = "#{base_error_message}: garbage before end >"
          raise REXML::ParseException.new(message, @source)
        end
        return [:notationdecl, name, *id]
      elsif @source.match?("--", true)
        return [:comment, process_comment]
      else
        raise REXML::ParseException.new(
          "Malformed node: Started with '<!' but not a comment nor ELEMENT,ENTITY,ATTLIST,NOTATION", @source
        )
      end
    elsif match = @source.match(/(%.*?;)\s*/um, true)
      return [:externalentity, match[1]]
    elsif @source.match?(/\]\s*>/um, true)
      @document_status = :after_doctype
      return [:end_doctype]
    else
      raise ParseException.new("Malformed DOCTYPE: invalid declaration",
                               @source)
    end
  end
  if @document_status == :after_doctype
    @source.skip_spaces
  end
  # Element content: tags, comments, CDATA, processing instructions, text.
  begin
    start_position = @source.position
    if @source.match?("<", true)
      # :text's read_until may remain only "<" in buffer. In the
      # case, buffer is empty here. So we need to fill buffer
      # here explicitly.
      @source.ensure_buffer
      if @source.match?("/", true)
        # End tag: it must match the most recently opened element.
        @namespaces_restore_stack.pop
        last_tag = @tags.pop
        md = @source.match(Private::CLOSE_PATTERN, true)
        if md && !last_tag
          message = "Unexpected top-level end tag (got '#{md[1]}')"
          raise REXML::ParseException.new(message, @source)
        end
        if md.nil? || (last_tag != md[1])
          message = "Missing end tag for '#{last_tag}'"
          message += " (got '#{md[1]}')" if md
          @source.position = start_position if md.nil?
          raise REXML::ParseException.new(message, @source)
        end
        [:end_element, last_tag]
      elsif @source.match?("!", true)
        # STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
        if @source.match?("--", true)
          [:comment, process_comment]
        elsif @source.match?("[CDATA[", true)
          text = @source.read_until("]]>")
          unless text.end_with?("]]>")
            raise REXML::ParseException.new(
              "Malformed CDATA: Missing end ']]>'", @source
            )
          end

          text = text[0...-3]
          [:cdata, text]
        else
          raise REXML::ParseException.new(
            "Malformed node: Started with '<!' but not a comment nor CDATA", @source
          )
        end
      elsif @source.match?("?", true)
        process_instruction
      else
        # Get the next tag
        md = @source.match(Private::TAG_PATTERN, true)
        unless md
          @source.position = start_position
          raise REXML::ParseException.new(
            "malformed XML: missing tag start", @source
          )
        end
        tag = md[1]
        @document_status = :in_element
        @prefixes.clear
        @prefixes << md[2] if md[2]
        push_namespaces_restore
        attributes, closed = parse_attributes(@prefixes)
        # Verify that all of the prefixes have been defined
        @prefixes.each do |prefix|
          unless @namespaces.key?(prefix)
            raise UndefinedNamespaceException.new(prefix, @source, self)
          end
        end

        if closed
          # Self-closing tag: remember it so the next call emits :end_element.
          @closed = tag
          pop_namespaces_restore
        else
          if @tags.empty? && @have_root
            raise ParseException.new(
              "Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source
            )
          end

          @tags.push(tag)
        end
        @have_root = true
        [:start_element, tag, attributes]
      end
    else
      text = @source.read_until("<")
      # Give the "<" back to the source so the next call sees the tag start.
      if text.end_with?("<")
        text = text[0...-1]
        @source.position -= "<".bytesize
      end
      if @tags.empty?
        # Outside the root element only whitespace is allowed.
        unless /^\s*$/.match?(text)
          if @have_root
            raise ParseException.new(
              "Malformed XML: Extra content at the end of the document (got '#{text}')", @source
            )
          else
            raise ParseException.new(
              "Malformed XML: Content at the start of the document (got '#{text}')", @source
            )
          end
        end
        # Whitespace after the root element is skipped, not reported.
        return pull_event if @have_root
      end
      [:text, text]
    end
  rescue REXML::UndefinedNamespaceException
    raise
  rescue REXML::ParseException
    raise
  rescue StandardError => e
    # Any other failure is wrapped so callers only see parse errors.
    raise REXML::ParseException.new("Exception parsing",
                                    @source, self, e || $!)
  end
  # NOTE: Nothing follows the begin/rescue block above on purpose: its value
  # (the event array produced by the selected branch) is the method's
  # implicit return value.
end
571
+ private :pull_event
572
+
573
# Looks up +reference+ in +entities+ and returns its unnormalized value,
# recording one entity expansion. Returns nil when +entities+ is absent or
# has no value for +reference+.
def entity(reference, entities)
  value = entities[reference] if entities
  return if value.nil?

  record_entity_expansion
  unnormalize(value, entities)
end
582
+
583
# Escapes all possible entities in +input+ and returns the escaped copy;
# +input+ itself is never modified.
#
# input::         text to escape (may be frozen).
# entities::      optional Hash of entity name => replacement text; every
#                 occurrence of the text is replaced by "&name;".
# entity_filter:: optional collection of entity names that must NOT be
#                 substituted from +entities+.
#
# Bare "&" characters become "&amp;", and the characters covered by
# DEFAULT_ENTITIES (>, <, ", ') are replaced by their references.
def normalize(input, entities = nil, entity_filter = nil)
  # dup (not clone) so a frozen input yields a mutable working copy.
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  copy.gsub!(EREFERENCE, "&amp;")
  if entities
    entities.each do |key, value|
      # Filter on the entity name of the current pair.
      next if entity_filter && entity_filter.include?(key)

      copy.gsub!(value, "&#{key};")
    end
  end
  copy.gsub!(EREFERENCE, "&amp;")
  DEFAULT_ENTITIES.each_value do |value|
    copy.gsub!(value[3], value[1])
  end
  copy
end
601
+
602
# Unescapes all possible entities in +string+ and returns the result.
#
# Line endings ("\r\n" and "\r") are first normalized to "\n". Numeric
# character references are decoded, then each named reference is replaced by
# its value from +entities+ (via #entity) or, failing that, by the matching
# DEFAULT_ENTITIES character. "&amp;" is decoded last. Names listed in
# +filter+ are left untouched. Raises when the expanded text grows beyond
# @entity_expansion_text_limit.
def unnormalize(string, entities = nil, filter = nil)
  rv = string.include?("\r") ? string.gsub(Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n") : string.dup
  references = rv.scan(REFERENCE_RE)
  return rv if references.empty?

  rv.gsub!(Private::CHARACTER_REFERENCES) do
    digits = Regexp.last_match(1)
    code_point = digits.start_with?("x") ? Integer(digits[1..], 16) : Integer(digits, 10)
    [code_point].pack("U*")
  end

  # Numeric references yield a nil capture; only named ones remain.
  names = references.map { |captures| captures[0] }.compact
  names = names.reject { |name| filter.include?(name) } if filter
  return rv if names.empty?

  names.tally.each do |name, occurrences|
    count_before = @entity_expansion_count
    replacement = entity(name, entities)
    unless replacement
      builtin = DEFAULT_ENTITIES[name]
      rv.gsub!(builtin[0], builtin[2]) if builtin
      next
    end

    # Charge the expansions of one occurrence to each further occurrence.
    if occurrences > 1
      delta = @entity_expansion_count - count_before
      record_entity_expansion(delta * (occurrences - 1))
    end
    pattern = Private::DEFAULT_ENTITIES_PATTERNS[name] || /&#{name};/
    rv.gsub!(pattern, replacement)
    raise "entity expansion has grown too large" if rv.bytesize > @entity_expansion_text_limit
  end
  rv.gsub!(Private::DEFAULT_ENTITIES_PATTERNS["amp"], "&")
  rv
end
651
+
652
+ private
653
+
654
# Binds +prefix+ to +uri+ (or unbinds it when +uri+ is nil), first saving the
# previous binding in the innermost restore frame so that
# #pop_namespaces_restore can undo the change.
def add_namespace(prefix, uri)
  @namespaces_restore_stack.last[prefix] = @namespaces[prefix]
  return @namespaces.delete(prefix) if uri.nil?

  @namespaces[prefix] = uri
end
662
+
663
# Opens a new, empty restore frame for the element about to be parsed and
# returns it.
def push_namespaces_restore
  {}.tap { |frame| @namespaces_restore_stack.push(frame) }
end
668
+
669
# Closes the innermost restore frame and reinstates every binding recorded in
# it: a nil entry means the prefix was unbound before and is removed again.
# Returns the popped frame.
def pop_namespaces_restore
  @namespaces_restore_stack.pop.each do |prefix, previous_uri|
    next @namespaces.delete(prefix) if previous_uri.nil?

    @namespaces[prefix] = previous_uri
  end
end
679
+
680
# Adds +delta+ to the running entity expansion count and raises once the
# count exceeds @entity_expansion_limit. Returns nil otherwise.
def record_entity_expansion(delta = 1)
  @entity_expansion_count += delta
  return unless @entity_expansion_count > @entity_expansion_limit

  raise "number of entity expansions exceeded, processing aborted."
end
686
+
687
# True when the XML declaration names an encoding that should be applied to
# the source: any declared encoding except a plain "UTF-16" (matched case
# insensitively). False when no encoding was declared.
def need_source_encoding_update?(xml_declaration_encoding)
  !xml_declaration_encoding.nil? && !/^UTF-16$/i.match?(xml_declaration_encoding)
end
693
+
694
# Maps an endian-specific UTF-16 name ("UTF-16BE" / "UTF-16LE", any case) to
# "UTF-16"; returns nil for every other value.
def normalize_xml_declaration_encoding(xml_declaration_encoding)
  return "UTF-16" if /^UTF-16(?:BE|LE)$/i.match?(xml_declaration_encoding)

  nil
end
697
+
698
# Consumes a name at the current source position and returns it. When no name
# is present, raises REXML::ParseException with +base_error_message+ followed
# by "invalid name" (non-space input follows) or "name is missing".
def parse_name(base_error_message)
  md = @source.match(Private::NAME_PATTERN, true)
  return md[0] if md

  message = @source.match?(/\S/um) ? "#{base_error_message}: invalid name" : "#{base_error_message}: name is missing"
  raise REXML::ParseException.new(message, @source)
end
710
+
711
# Consumes an external identifier and returns [type, public_id, system_id]
# with the surrounding quotes removed from the literals.
#
# accept_external_id:: allow `PUBLIC "pubid" "system"` and `SYSTEM "system"`.
# accept_public_id::   allow a lone `PUBLIC "pubid"`.
#
# Raises REXML::ParseException with +base_error_message+ plus the details
# from #parse_id_invalid_details when nothing acceptable is found.
def parse_id(base_error_message,
             accept_external_id:,
             accept_public_id:)
  unquote = ->(literal) { literal[1..-2] if literal }

  if accept_external_id && (md = @source.match(EXTERNAL_ID_PUBLIC, true))
    return ["PUBLIC", unquote.call(md[1]), unquote.call(md[2])]
  end
  if accept_public_id && (md = @source.match(PUBLIC_ID, true))
    return ["PUBLIC", unquote.call(md[1]), nil]
  end
  if accept_external_id && (md = @source.match(EXTERNAL_ID_SYSTEM, true))
    return ["SYSTEM", nil, unquote.call(md[1])]
  end

  details = parse_id_invalid_details(accept_external_id: accept_external_id,
                                     accept_public_id: accept_public_id)
  raise REXML::ParseException.new("#{base_error_message}: #{details}", @source)
end
739
+
740
# Inspects (without consuming) the input at the current position and returns
# a short description of why no external identifier could be parsed there.
# The flags have the same meaning as in #parse_id.
def parse_id_invalid_details(accept_external_id:,
                             accept_public_id:)
  public_kw = /^\s*PUBLIC/um
  system_kw = /^\s*SYSTEM/um

  if (accept_external_id || accept_public_id) && @source.match?(/#{public_kw}/um)
    return "public ID literal is missing" if @source.match?(/#{public_kw}(?:\s+[^'"]|\s*[\[>])/um)
    return "invalid public ID literal" unless @source.match?(/#{public_kw}\s+#{PUBIDLITERAL}/um)
    return "garbage after public ID literal" unless accept_public_id
    return "system ID literal is missing" if @source.match?(/#{public_kw}\s+#{PUBIDLITERAL}\s+[^'"]/um)
    return "invalid system literal" unless @source.match?(/#{public_kw}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)

    return "garbage after system literal"
  end

  if accept_external_id && @source.match?(/#{system_kw}/um)
    return "system literal is missing" if @source.match?(/#{system_kw}(?:\s+[^'"]|\s*[\[>])/um)
    return "invalid system literal" unless @source.match?(/#{system_kw}\s+#{SYSTEMLITERAL}/um)

    return "garbage after system literal"
  end

  return "invalid ID type" unless @source.match?(/^\s*(?:PUBLIC|SYSTEM)\s/um)

  "ID type is missing"
end
781
+
782
# Reads the remainder of a comment (the opening "<!--" is already consumed)
# and returns its text without the closing "-->". Raises
# REXML::ParseException when the comment is unterminated, contains "--", or
# ends with "-".
def process_comment
  raw = @source.read_until("-->")
  unless raw.end_with?("-->")
    raise REXML::ParseException.new(
      "Unclosed comment: Missing end '-->'", @source
    )
  end

  body = raw[0...-3]
  malformed = body.include?("--") || body.end_with?("-")
  raise REXML::ParseException.new("Malformed comment", @source) if malformed

  body
end
798
+
799
# Parses a processing instruction whose "<?" has already been consumed.
# A target named "xml" is handed to #xml_declaration; any other target yields
# [:processing_instruction, name, content], where content is nil for the
# `<?name?>` form. Raises ParseException when "?>" is missing.
def process_instruction
  name = parse_name("Malformed XML: Invalid processing instruction node")
  return xml_declaration if name == "xml"

  # PITarget
  unclosed = "Malformed XML: Unclosed processing instruction: <#{name}>"
  unless @source.skip_spaces # e.g. <?name?>
    raise ParseException.new(unclosed, @source) unless @source.match?("?>", true)

    return [:processing_instruction, name, nil]
  end

  # e.g. <?name content?>
  start_position = @source.position
  content = @source.read_until("?>")
  unless content.end_with?("?>")
    @source.position = start_position
    raise ParseException.new(unclosed, @source)
  end
  [:processing_instruction, name, content[0...-2]]
end
825
+
826
# Parses the remainder of an XML declaration ("<?xml" already consumed) and
# returns [:xmldecl, version, encoding, standalone].
#
# Raises ParseException when the declaration is duplicated, does not come
# first, lacks the version, has a standalone value other than yes/no, or is
# not closed by "?>". A declared encoding is applied to the source when
# #need_source_encoding_update? says so; without a declared encoding the
# reported one is derived from the source's encoding.
def xml_declaration
  fail_with = ->(message) { raise ParseException.new(message, @source) }
  # Consumes "?>", applies the declared encoding and builds the event.
  emit = lambda do |encoding, standalone|
    fail_with.call("Malformed XML: Unclosed XML declaration") unless @source.match?("?>", true)
    @source.encoding = encoding if need_source_encoding_update?(encoding)
    encoding ||= normalize_xml_declaration_encoding(@source.encoding)
    [:xmldecl, @version, encoding, standalone]
  end

  fail_with.call("Malformed XML: XML declaration is duplicated") unless @version.nil?
  fail_with.call("Malformed XML: XML declaration is not at the start") if @document_status
  fail_with.call("Malformed XML: XML declaration misses spaces before version") unless @source.skip_spaces
  fail_with.call("Malformed XML: XML declaration misses version") unless @source.match?("version", true)

  @version = parse_attribute_value_with_equal("xml")
  return emit.call(nil, nil) unless @source.skip_spaces # e.g. <?xml version="1.0"?>

  encoding = nil
  if @source.match?("encoding", true)
    encoding = parse_attribute_value_with_equal("xml")
    # e.g. <?xml version="1.1" encoding="UTF-8"?>
    return emit.call(encoding, nil) unless @source.skip_spaces
  end

  standalone = nil
  if @source.match?("standalone", true)
    standalone = parse_attribute_value_with_equal("xml")
    unless ["yes", "no"].include?(standalone)
      fail_with.call("Malformed XML: XML declaration standalone is not yes or no : <#{standalone}>")
    end
  end
  @source.skip_spaces

  # e.g. <?xml version="1.0" ?>
  #      <?xml version="1.1" encoding="UTF-8" ?>
  #      <?xml version="1.1" standalone="yes"?>
  #      <?xml version="1.1" encoding="UTF-8" standalone="yes" ?>
  emit.call(encoding, standalone)
end
903
+
904
# scan_quote consumes a single or double quote at the current position and
# returns it as a one-character String; it returns nil (consuming nothing)
# when the next character is not a quote. Two implementations exist: a
# regexp-based one for strscan versions sorting before "3.1.1" and a
# byte-based one for later versions.
if StringScanner::Version < "3.1.1"
  def scan_quote
    md = @source.match(/(['"])/, true)
    md && md[1]
  end
else
  def scan_quote
    # 34 is '"'.ord and 39 is "'".ord
    quote = { 34 => '"', 39 => "'" }[@source.peek_byte]
    @source.scan_byte if quote
    quote
  end
end
920
+
921
# Consumes `= "value"` (either quote style, optional spaces around "=") and
# returns the value without its quotes. +name+ is only used in error
# messages. Raises REXML::ParseException when the "=", the opening quote or
# the closing quote is missing; in the last case the source is rewound to
# just after the opening quote.
def parse_attribute_value_with_equal(name)
  unless @source.match?(Private::EQUAL_PATTERN, true)
    raise REXML::ParseException.new("Missing attribute equal: <#{name}>", @source)
  end

  quote = scan_quote
  unless quote
    raise REXML::ParseException.new("Missing attribute value start quote: <#{name}>", @source)
  end

  start_position = @source.position
  value = @source.read_until(quote)
  return value[0...-1] if value.end_with?(quote)

  @source.position = start_position
  raise REXML::ParseException.new("Missing attribute value end quote: <#{name}>: <#{quote}>", @source)
end
939
+
940
# Parses the attributes of a start tag up to and including its ">" or "/>".
# Returns [attributes, closed]: a Hash of qualified name => raw value, and
# whether the tag was self-closing.
#
# xmlns:* attributes are registered through #add_namespace; any other prefix
# (except "xml") is added to +prefixes+ so the caller can verify it is
# declared. Raises REXML::ParseException for an invalid attribute name, an
# illegal xml/xmlns declaration, a duplicate attribute, or two attributes
# that resolve to the same namespace URI and local name.
def parse_attributes(prefixes)
  attributes = {}
  seen_expanded_names = {}
  loop do
    return attributes, false if @source.match?(">", true)
    return attributes, true if @source.match?("/>", true)

    match = @source.match(QNAME, true)
    unless match
      message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
      raise REXML::ParseException.new(message, @source)
    end

    # Read the captures before the source is matched again.
    name = match[1]
    prefix = match[2]
    local_part = match[3]
    value = parse_attribute_value_with_equal(name)
    @source.skip_spaces

    declares_namespace = prefix == "xmlns"
    if declares_namespace
      case local_part
      when "xml"
        unless value == Private::XML_PREFIXED_NAMESPACE
          msg = "The 'xml' prefix must not be bound to any other namespace " +
                "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
          raise REXML::ParseException.new(msg, @source, self)
        end
      when "xmlns"
        msg = "The 'xmlns' prefix must not be declared " +
              "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
        raise REXML::ParseException.new(msg, @source, self)
      end
      add_namespace(local_part, value)
    elsif prefix && prefix != "xml"
      prefixes << prefix
    end

    if attributes[name]
      msg = "Duplicate attribute #{name.inspect}"
      raise REXML::ParseException.new(msg, @source, self)
    end

    unless declares_namespace
      uri = @namespaces[prefix]
      expanded_name = [uri, local_part]
      existing_prefix = seen_expanded_names[expanded_name]
      if existing_prefix
        message = "Namespace conflict in adding attribute \"#{local_part}\": Prefix \"#{existing_prefix}\" = \"#{uri}\" and prefix \"#{prefix}\" = \"#{uri}\""
        raise REXML::ParseException.new(message, @source, self)
      end
      seen_expanded_names[expanded_name] = prefix
    end

    attributes[name] = value
  end
end
996
+ end
997
+ end
998
+ end
999
+
1000
+ # case event[0]
1001
+ # when :start_element
1002
+ # when :text
1003
+ # when :end_element
1004
+ # when :processing_instruction
1005
+ # when :cdata
1006
+ # when :comment
1007
+ # when :xmldecl
1008
+ # when :start_doctype
1009
+ # when :end_doctype
1010
+ # when :externalentity
1011
+ # when :elementdecl
1012
+ # when :entity
1013
+ # when :attlistdecl
1014
+ # when :notationdecl
1015
+ # when :end_doctype
1016
+ # end