moxml 0.1.21 → 0.1.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/opal.yml +37 -0
  3. data/.rspec-opal +5 -0
  4. data/Gemfile +6 -0
  5. data/Rakefile +67 -0
  6. data/lib/compat/opal/rexml/namespace.rb +56 -0
  7. data/lib/compat/opal/rexml/parsers/baseparser.rb +952 -0
  8. data/lib/compat/opal/rexml/source.rb +213 -0
  9. data/lib/compat/opal/rexml/text.rb +418 -0
  10. data/lib/compat/opal/rexml/xmltokens.rb +45 -0
  11. data/lib/compat/opal/rexml_compat.rb +76 -0
  12. data/lib/moxml/adapter/customized_rexml/formatter.rb +11 -10
  13. data/lib/moxml/adapter/headed_ox.rb +2 -6
  14. data/lib/moxml/adapter/libxml.rb +5 -20
  15. data/lib/moxml/adapter/nokogiri.rb +7 -18
  16. data/lib/moxml/adapter/oga.rb +4 -22
  17. data/lib/moxml/adapter/ox.rb +8 -23
  18. data/lib/moxml/adapter/rexml.rb +29 -33
  19. data/lib/moxml/adapter.rb +38 -8
  20. data/lib/moxml/config.rb +1 -1
  21. data/lib/moxml/entity_registry.rb +36 -31
  22. data/lib/moxml/entity_registry_opal_data.rb +2137 -0
  23. data/lib/moxml/node.rb +19 -26
  24. data/lib/moxml/sax/namespace_splitter.rb +54 -0
  25. data/lib/moxml/version.rb +1 -1
  26. data/lib/moxml/xml_utils.rb +9 -1
  27. data/spec/consistency/adapter_parity_spec.rb +1 -1
  28. data/spec/integration/all_adapters_spec.rb +1 -1
  29. data/spec/integration/w3c_namespace_spec.rb +1 -1
  30. data/spec/moxml/adapter/ox_spec.rb +8 -0
  31. data/spec/moxml/adapter/platform_spec.rb +69 -0
  32. data/spec/moxml/adapter/shared_examples/adapter_contract.rb +0 -6
  33. data/spec/moxml/entity_registry_spec.rb +10 -0
  34. data/spec/moxml/native_attachment/opal_spec.rb +39 -2
  35. data/spec/moxml/node_type_map_spec.rb +43 -0
  36. data/spec/moxml/opal_rexml_adapter_spec.rb +14 -0
  37. data/spec/moxml/opal_smoke_spec.rb +61 -0
  38. data/spec/moxml/sax/namespace_splitter_spec.rb +67 -0
  39. data/spec/moxml/text_spec.rb +1 -1
  40. data/spec/spec_helper.rb +32 -13
  41. data/spec/support/opal.rb +16 -0
  42. metadata +17 -1
@@ -0,0 +1,952 @@
1
+ # frozen_string_literal: true
2
+ require 'rexml/parseexception'
3
+ require 'rexml/undefinednamespaceexception'
4
+ require 'rexml/security'
5
+ require 'rexml/source'
6
+ require 'set'
7
+ require "strscan"
8
+
9
+ module REXML
10
+ module Parsers
11
+ unless [].respond_to?(:tally)
12
+ module EnumerableTally
13
+ refine Enumerable do
14
+ def tally
15
+ counts = {}
16
+ each do |item|
17
+ counts[item] ||= 0
18
+ counts[item] += 1
19
+ end
20
+ counts
21
+ end
22
+ end
23
+ end
24
+ using EnumerableTally
25
+ end
26
+
27
+ if StringScanner::Version < "3.0.8"
28
+ module StringScannerCaptures
29
+ refine StringScanner do
30
+ def captures
31
+ values_at(*(1...size))
32
+ end
33
+ end
34
+ end
35
+ using StringScannerCaptures
36
+ end
37
+
38
+ # = Using the Pull Parser
39
+ # <em>This API is experimental, and subject to change.</em>
40
+ # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
41
+ # while parser.has_next?
42
+ # res = parser.next
43
+ # puts res[1]['att'] if res.start_tag? and res[0] == 'b'
44
+ # end
45
+ # See the PullEvent class for information on the content of the results.
46
+ # The data is identical to the arguments passed for the various events to
47
+ # the StreamListener API.
48
+ #
49
+ # Notice that:
50
+ # parser = PullParser.new( "<a>BAD DOCUMENT" )
51
+ # while parser.has_next?
52
+ # res = parser.next
53
+ # raise res[1] if res.error?
54
+ # end
55
+ #
56
+ # Nat Price gave me some good ideas for the API.
57
+ class BaseParser
58
+ LETTER = 'A-Za-z'
59
+ DIGIT = '0-9'
60
+
61
+ COMBININGCHAR = '' # TODO
62
+ EXTENDER = '' # TODO
63
+
64
+ NCNAME_STR= "[#{LETTER}_][-A-Za-z0-9._#{COMBININGCHAR}#{EXTENDER}]*"
65
+ QNAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
66
+ QNAME = /(#{QNAME_STR})/
67
+
68
+ # Just for backward compatibility. For example, kramdown uses this.
69
+ # It's not used in REXML.
70
+ UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
71
+
72
+ NAMECHAR = '[\-\w\.:]'
73
+ NAME = "([\\w:]#{NAMECHAR}*)"
74
+ NMTOKEN = "(?:#{NAMECHAR})+"
75
+ NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
76
+ REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
77
+ REFERENCE_RE = /#{REFERENCE}/
78
+
79
+ DOCTYPE_START = /^\s*<!DOCTYPE\s/um
80
+ DOCTYPE_END = /^\s*\]\s*>/um
81
+ ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
82
+ COMMENT_START = /^<!--/u
83
+ COMMENT_PATTERN = /<!--(.*?)-->/um
84
+ CDATA_START = /^<!\[CDATA\[/u
85
+ CDATA_END = /^\s*\]\s*>/um
86
+ CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
87
+ XMLDECL_START = /^<\?xml\s/u;
88
+ XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
89
+ INSTRUCTION_START = /^<\?/u
90
+ INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um
91
+ TAG_MATCH = /^<((?:#{QNAME_STR}))/um
92
+ CLOSE_MATCH = /^\s*<\/(#{QNAME_STR})\s*>/um
93
+
94
+ VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
95
+ ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
96
+ STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um
97
+
98
+ ENTITY_START = /^\s*<!ENTITY/
99
+ ELEMENTDECL_START = /^\s*<!ELEMENT/um
100
+ ELEMENTDECL_PATTERN = /^\s*(<!ELEMENT.*?)>/um
101
+ SYSTEMENTITY = /^\s*(%.*?;)\s*$/um
102
+ ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
103
+ NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
104
+ ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
105
+ ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
106
+ ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
107
+ DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
108
+ ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
109
+ ATTDEF_RE = /#{ATTDEF}/
110
+ ATTLISTDECL_START = /^\s*<!ATTLIST/um
111
+ ATTLISTDECL_PATTERN = /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
112
+
113
+ TEXT_PATTERN = /^([^<]*)/um
114
+
115
+ # Entity constants
116
+ PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
117
+ SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
118
+ PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
119
+ EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
120
+ NDATADECL = "\\s+NDATA\\s+#{NAME}"
121
+ PEREFERENCE = "%#{NAME};"
122
+ ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
123
+ PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
124
+ ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
125
+ PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
126
+ GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
127
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
128
+
129
+ NOTATIONDECL_START = /^\s*<!NOTATION/um
130
+ EXTERNAL_ID_PUBLIC = /^\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
131
+ EXTERNAL_ID_SYSTEM = /^\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um
132
+ PUBLIC_ID = /^\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um
133
+
134
+ EREFERENCE = /&(?!#{NAME};)/
135
+
136
+ DEFAULT_ENTITIES = {
137
+ 'gt' => [/&gt;/, '&gt;', '>', />/],
138
+ 'lt' => [/&lt;/, '&lt;', '<', /</],
139
+ 'quot' => [/&quot;/, '&quot;', '"', /"/],
140
+ "apos" => [/&apos;/, "&apos;", "'", /'/]
141
+ }
142
+
143
+ module Private
144
+ PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
145
+ TAG_PATTERN = /((?:#{QNAME_STR}))\s*/um
146
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
147
+ EQUAL_PATTERN = /\s*=\s*/um
148
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
149
+ NAME_PATTERN = /#{NAME}/um
150
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
151
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
152
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
153
+ CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
154
+ CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/
155
+ DEFAULT_ENTITIES_PATTERNS = {}
156
+ default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
157
+ default_entities.each do |term|
158
+ DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
159
+ end
160
+ XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
161
+ end
162
+ private_constant :Private
163
+
164
# Builds a parser that reads from +source+.
#
# +source+ is handed to #stream=, which stores the wrapped source in
# @source and resets the per-document state. The entity expansion limits
# are copied from Security at construction time.
def initialize(source)
  self.stream = source
  @version = nil
  @listeners = []
  @prefixes = Set.new
  @entity_expansion_count = 0
  @entity_expansion_limit = Security.entity_expansion_limit
  @entity_expansion_text_limit = Security.entity_expansion_text_limit
  @source.ensure_buffer
end
174
+
175
# Registers +listener+; #pull hands every event it returns to
# +listener.receive+. Returns the list of registered listeners.
def add_listener(listener)
  @listeners.push(listener)
end
178
+
179
+ attr_reader :source
180
+ attr_reader :entity_expansion_count
181
+ attr_writer :entity_expansion_limit
182
+ attr_writer :entity_expansion_text_limit
183
+
184
# Replaces the input being parsed: wraps +source+ with
# SourceFactory.create_from and then clears all per-document state
# through #reset.
def stream=(source)
  @source = SourceFactory.create_from(source)
  reset
end
188
+
189
# Clears the per-document parsing state: pending self-closed tag, root
# flag, document status, open-tag list, event stack, entity list and the
# namespace bindings (only the predefined "xml" prefix stays bound).
def reset
  @closed = @document_status = nil
  @have_root = false
  @namespaces = { "xml" => Private::XML_PREFIXED_NAMESPACE }
  @tags = []
  @stack = []
  @entities = []
  @namespaces_restore_stack = []
end
199
+
200
# Returns the source's current position, or 0 when the source does not
# respond to +position+.
def position
  # FIXME: sources without #position always report 0.
  return 0 unless @source.respond_to?(:position)

  @source.position
end
208
+
209
+ # Returns true if there are no more events
210
# True when the source is exhausted and no events are waiting on the
# internal stack.
def empty?
  @source.empty? && @stack.empty?
end
213
+
214
+ # Returns true if there are more events. Synonymous with !empty?
215
# True while the source still has input or events are waiting on the
# internal stack; the logical negation of #empty?.
def has_next?
  !@source.empty? || !@stack.empty?
end
218
+
219
+ # Push an event back on the head of the stream. This method
220
+ # has (theoretically) infinite depth.
221
# Puts +token+ at the front of the pending-event stack so that it is the
# next event handed out. Returns the stack.
def unshift(token)
  @stack.insert(0, token)
end
224
+
225
+ # Peek at the +depth+ event in the stack. The first element on the stack
226
+ # is at depth 0. If +depth+ is -1, will parse to the end of the input
227
+ # stream and return the last event, which is always :end_document.
228
+ # Be aware that this causes the stream to be parsed up to the +depth+
229
+ # event, so you can effectively pre-parse the entire document (pull the
230
+ # entire thing into memory) using this method.
231
# Returns the event at index +depth+ of the pending-event stack without
# consuming it, pulling further events first when the stack is too short.
# With +depth+ == -1 events are pulled until #empty? is true and the last
# buffered event is returned.
#
# Raises RuntimeError when +depth+ is below -1.
def peek depth=0
  raise %Q[Illegal argument "#{depth}"] if depth < -1

  # Events are collected on the side and appended afterwards, so that
  # pulling never sees the events gathered during this call.
  pulled = []
  if depth == -1
    pulled << pull until empty?
  else
    wanted = depth + 1
    pulled << pull while @stack.size + pulled.size < wanted
  end
  @stack += pulled unless pulled.empty?
  @stack[depth]
end
244
+
245
+ # Returns the next event. This is a +PullEvent+ object.
246
# Produces the next event: lets the source drop content that has already
# been parsed, obtains the event from #pull_event, passes it to every
# registered listener's +receive+ and returns it.
def pull
  @source.drop_parsed_content

  event = pull_event
  @listeners.each { |listener| listener.receive(event) }
  event
end
255
+
256
+ def pull_event
257
+ if @closed
258
+ x, @closed = @closed, nil
259
+ return [ :end_element, x ]
260
+ end
261
+ if empty?
262
+ if @document_status == :in_doctype
263
+ raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
264
+ end
265
+ unless @tags.empty?
266
+ path = "/" + @tags.join("/")
267
+ raise ParseException.new("Missing end tag for '#{path}'", @source)
268
+ end
269
+
270
+ unless @document_status == :in_element
271
+ raise ParseException.new("Malformed XML: No root element", @source)
272
+ end
273
+
274
+ return [ :end_document ]
275
+ end
276
+ return @stack.shift if @stack.size > 0
277
+ #STDERR.puts @source.encoding
278
+ #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
279
+
280
+ @source.ensure_buffer
281
+ if @document_status == nil
282
+ start_position = @source.position
283
+ if @source.match?("<?", true)
284
+ return process_instruction
285
+ elsif @source.match?("<!", true)
286
+ if @source.match?("--", true)
287
+ return [ :comment, process_comment ]
288
+ elsif @source.match?("DOCTYPE", true)
289
+ base_error_message = "Malformed DOCTYPE"
290
+ unless @source.skip_spaces
291
+ if @source.match?(">")
292
+ message = "#{base_error_message}: name is missing"
293
+ else
294
+ message = "#{base_error_message}: invalid name"
295
+ end
296
+ @source.position = start_position
297
+ raise REXML::ParseException.new(message, @source)
298
+ end
299
+ name = parse_name(base_error_message)
300
+ @source.skip_spaces
301
+ if @source.match?("[", true)
302
+ id = [nil, nil, nil]
303
+ @document_status = :in_doctype
304
+ elsif @source.match?(">", true)
305
+ id = [nil, nil, nil]
306
+ @document_status = :after_doctype
307
+ @source.ensure_buffer
308
+ else
309
+ id = parse_id(base_error_message,
310
+ accept_external_id: true,
311
+ accept_public_id: false)
312
+ if id[0] == "SYSTEM"
313
+ # For backward compatibility
314
+ id[1], id[2] = id[2], nil
315
+ end
316
+ @source.skip_spaces
317
+ if @source.match?("[", true)
318
+ @document_status = :in_doctype
319
+ elsif @source.match?(">", true)
320
+ @document_status = :after_doctype
321
+ @source.ensure_buffer
322
+ else
323
+ message = "#{base_error_message}: garbage after external ID"
324
+ raise REXML::ParseException.new(message, @source)
325
+ end
326
+ end
327
+ args = [:start_doctype, name, *id]
328
+ if @document_status == :after_doctype
329
+ @source.skip_spaces
330
+ @stack << [ :end_doctype ]
331
+ end
332
+ return args
333
+ else
334
+ message = "Invalid XML"
335
+ raise REXML::ParseException.new(message, @source)
336
+ end
337
+ end
338
+ end
339
+ if @document_status == :in_doctype
340
+ @source.skip_spaces
341
+ start_position = @source.position
342
+ if @source.match?("<!", true)
343
+ if @source.match?("ELEMENT", true)
344
+ md = @source.match(/(.*?)>/um, true)
345
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
346
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
347
+ elsif @source.match?("ENTITY", true)
348
+ match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
349
+ unless match_data
350
+ raise REXML::ParseException.new("Malformed entity declaration", @source)
351
+ end
352
+ match = [:entitydecl, *match_data.captures.compact]
353
+ ref = false
354
+ if match[1] == '%'
355
+ ref = true
356
+ match.delete_at 1
357
+ end
358
+ # Now we have to sort out what kind of entity reference this is
359
+ if match[2] == 'SYSTEM'
360
+ # External reference
361
+ match[3] = match[3][1..-2] # PUBID
362
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
363
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
364
+ elsif match[2] == 'PUBLIC'
365
+ # External reference
366
+ match[3] = match[3][1..-2] # PUBID
367
+ match[4] = match[4][1..-2] # HREF
368
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
369
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
370
+ elsif Private::PEREFERENCE_PATTERN.match?(match[2])
371
+ raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
372
+ else
373
+ match[2] = match[2][1..-2]
374
+ match.pop if match.size == 4
375
+ # match is [ :entity, name, value ]
376
+ end
377
+ match << '%' if ref
378
+ return match
379
+ elsif @source.match?("ATTLIST", true)
380
+ md = @source.match(Private::ATTLISTDECL_END, true)
381
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
382
+ element = md[1]
383
+ contents = "<!ATTLIST" + md[0]
384
+
385
+ pairs = {}
386
+ values = md[0].strip.scan( ATTDEF_RE )
387
+ values.each do |attdef|
388
+ unless attdef[3] == "#IMPLIED"
389
+ attdef.compact!
390
+ val = attdef[3]
391
+ val = attdef[4] if val == "#FIXED "
392
+ pairs[attdef[0]] = val
393
+ if attdef[0] =~ /^xmlns:(.*)/
394
+ @namespaces[$1] = val
395
+ end
396
+ end
397
+ end
398
+ return [ :attlistdecl, element, pairs, contents ]
399
+ elsif @source.match?("NOTATION", true)
400
+ base_error_message = "Malformed notation declaration"
401
+ unless @source.skip_spaces
402
+ if @source.match?(">")
403
+ message = "#{base_error_message}: name is missing"
404
+ else
405
+ message = "#{base_error_message}: invalid name"
406
+ end
407
+ @source.position = start_position
408
+ raise REXML::ParseException.new(message, @source)
409
+ end
410
+ name = parse_name(base_error_message)
411
+ id = parse_id(base_error_message,
412
+ accept_external_id: true,
413
+ accept_public_id: true)
414
+ @source.skip_spaces
415
+ unless @source.match?(">", true)
416
+ message = "#{base_error_message}: garbage before end >"
417
+ raise REXML::ParseException.new(message, @source)
418
+ end
419
+ return [:notationdecl, name, *id]
420
+ elsif @source.match?("--", true)
421
+ return [ :comment, process_comment ]
422
+ else
423
+ raise REXML::ParseException.new("Malformed node: Started with '<!' but not a comment nor ELEMENT,ENTITY,ATTLIST,NOTATION", @source)
424
+ end
425
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
426
+ return [ :externalentity, match[1] ]
427
+ elsif @source.match?(/\]\s*>/um, true)
428
+ @document_status = :after_doctype
429
+ return [ :end_doctype ]
430
+ else
431
+ raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
432
+ end
433
+ end
434
+ if @document_status == :after_doctype
435
+ @source.skip_spaces
436
+ end
437
+ begin
438
+ start_position = @source.position
439
+ if @source.match?("<", true)
440
+ # :text's read_until may remain only "<" in buffer. In the
441
+ # case, buffer is empty here. So we need to fill buffer
442
+ # here explicitly.
443
+ @source.ensure_buffer
444
+ if @source.match?("/", true)
445
+ @namespaces_restore_stack.pop
446
+ last_tag = @tags.pop
447
+ md = @source.match(Private::CLOSE_PATTERN, true)
448
+ if md and !last_tag
449
+ message = "Unexpected top-level end tag (got '#{md[1]}')"
450
+ raise REXML::ParseException.new(message, @source)
451
+ end
452
+ if md.nil? or last_tag != md[1]
453
+ message = "Missing end tag for '#{last_tag}'"
454
+ message += " (got '#{md[1]}')" if md
455
+ @source.position = start_position if md.nil?
456
+ raise REXML::ParseException.new(message, @source)
457
+ end
458
+ return [ :end_element, last_tag ]
459
+ elsif @source.match?("!", true)
460
+ #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
461
+ if @source.match?("--", true)
462
+ return [ :comment, process_comment ]
463
+ elsif @source.match?("[CDATA[", true)
464
+ text = @source.read_until("]]>")
465
+ unless text.end_with?("]]>")
466
+ raise REXML::ParseException.new("Malformed CDATA: Missing end ']]>'", @source)
467
+ end
468
+ text = text[0...-3]
469
+ return [ :cdata, text ]
470
+ else
471
+ raise REXML::ParseException.new("Malformed node: Started with '<!' but not a comment nor CDATA", @source)
472
+ end
473
+ elsif @source.match?("?", true)
474
+ return process_instruction
475
+ else
476
+ # Get the next tag
477
+ md = @source.match(Private::TAG_PATTERN, true)
478
+ unless md
479
+ @source.position = start_position
480
+ raise REXML::ParseException.new("malformed XML: missing tag start", @source)
481
+ end
482
+ tag = md[1]
483
+ @document_status = :in_element
484
+ @prefixes.clear
485
+ @prefixes << md[2] if md[2]
486
+ push_namespaces_restore
487
+ attributes, closed = parse_attributes(@prefixes)
488
+ # Verify that all of the prefixes have been defined
489
+ for prefix in @prefixes
490
+ unless @namespaces.key?(prefix)
491
+ raise UndefinedNamespaceException.new(prefix,@source,self)
492
+ end
493
+ end
494
+
495
+ if closed
496
+ @closed = tag
497
+ pop_namespaces_restore
498
+ else
499
+ if @tags.empty? and @have_root
500
+ raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
501
+ end
502
+ @tags.push( tag )
503
+ end
504
+ @have_root = true
505
+ return [ :start_element, tag, attributes ]
506
+ end
507
+ else
508
+ text = @source.read_until("<")
509
+ if text.end_with?("<")
510
+ text = text[0...-1]
511
+ @source.position -= "<".bytesize
512
+ end
513
+ if @tags.empty?
514
+ unless /^\s*$/.match?(text)
515
+ if @have_root
516
+ raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
517
+ else
518
+ raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
519
+ end
520
+ end
521
+ return pull_event if @have_root
522
+ end
523
+ return [ :text, text ]
524
+ end
525
+ rescue REXML::UndefinedNamespaceException
526
+ raise
527
+ rescue REXML::ParseException
528
+ raise
529
+ rescue => error
530
+ raise REXML::ParseException.new( "Exception parsing",
531
+ @source, self, (error ? error : $!) )
532
+ end
533
+ # NOTE: The end of the method never runs, because it is unreachable.
534
+ # All branches of code above have explicit unconditional return or raise statements.
535
+ end
536
+ private :pull_event
537
+
538
# Resolves the entity named +reference+ from the +entities+ table.
#
# Returns nil when +entities+ is nil/false or holds no value for
# +reference+. Otherwise one expansion is recorded through
# #record_entity_expansion and the value is returned after being run
# through #unnormalize with the same table.
def entity(reference, entities)
  replacement = entities ? entities[reference] : nil
  return if replacement.nil?

  record_entity_expansion
  unnormalize(replacement, entities)
end
547
+
548
+ # Escapes all possible entities
549
# Escapes +input+ so that it can be written as XML text.
#
# input::         the String to escape; it is never modified (a frozen
#                 String is accepted as well).
# entities::      optional table responding to +each+ with key/value
#                 pairs; every occurrence of a value is replaced by
#                 "&key;".
# entity_filter:: optional collection of keys from +entities+ that must
#                 not be substituted (presumably entity names -- confirm
#                 against callers).
#
# Bare ampersands (those not starting a named reference) become "&amp;"
# both before and after the +entities+ substitution, and finally the
# characters covered by DEFAULT_ENTITIES are replaced by their
# references. Returns the escaped copy.
def normalize( input, entities=nil, entity_filter=nil )
  # dup (unlike clone) yields an unfrozen copy, so the in-place
  # substitutions below also work for frozen input.
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  copy.gsub!( EREFERENCE, '&amp;' )
  if entities
    entities.each do |key, value|
      # Filter on the key of the entry being processed.
      next if entity_filter && entity_filter.include?(key)
      copy.gsub!( value, "&#{key};" )
    end
  end
  copy.gsub!( EREFERENCE, '&amp;' )
  DEFAULT_ENTITIES.each_value do |patterns|
    # patterns[3] matches the raw character, patterns[1] is its reference.
    copy.gsub!( patterns[3], patterns[1] )
  end
  copy
end
563
+
564
+ # Unescapes all possible entities
565
# Replaces character and entity references in +string+ by their text.
#
# string::   the String to process; it is not modified.
# entities:: optional table used by #entity to resolve named references.
# filter::   optional collection of reference names that are skipped
#            during lookup.
#
# "\r\n" and lone "\r" are turned into "\n" first. Numeric references
# (&#NN; and &#xHH;) are decoded, then every named reference is resolved
# through #entity; names #entity cannot resolve fall back to
# DEFAULT_ENTITIES, and "&amp;" is expanded last.
#
# Entity values are inserted literally: backslashes in a value are not
# treated as back-references, and reference names are regex-escaped
# before matching.
#
# Raises RuntimeError when the expanded text exceeds
# @entity_expansion_text_limit bytes (and #record_entity_expansion raises
# when the expansion count limit is exceeded). Returns a new String.
def unnormalize( string, entities=nil, filter=nil )
  if string.include?("\r")
    rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
  else
    rv = string.dup
  end
  matches = rv.scan( REFERENCE_RE )
  return rv if matches.empty?

  rv.gsub!( Private::CHARACTER_REFERENCES ) {
    m = $1
    code_point = m.start_with?("x") ? Integer(m[1..-1], 16) : Integer(m, 10)
    [code_point].pack('U*')
  }
  # REFERENCE_RE captures only the name of a named reference; numeric
  # references yield nil and are dropped here.
  names = matches.map { |captures| captures[0] }.compact
  names.reject! { |name| filter.include?(name) } if filter
  return rv if names.empty?

  names.tally.each do |name, occurrences|
    count_before = @entity_expansion_count
    entity_value = entity( name, entities )
    if entity_value
      if occurrences > 1
        # #entity accounted for a single expansion of this name; charge
        # the remaining occurrences at the same cost.
        delta = @entity_expansion_count - count_before
        record_entity_expansion(delta * (occurrences - 1))
      end
      re = Private::DEFAULT_ENTITIES_PATTERNS[name] || /&#{Regexp.escape(name)};/
      # Block form: the value is inserted verbatim, "\0", "\1" or "\\" in
      # it are not interpreted as back-references.
      rv.gsub!( re ) { entity_value }
      if rv.bytesize > @entity_expansion_text_limit
        raise "entity expansion has grown too large"
      end
    else
      er = DEFAULT_ENTITIES[name]
      rv.gsub!( er[0], er[2] ) if er
    end
  end
  rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
  rv
end
612
+
613
+ private
614
# Binds +prefix+ to +uri+ (or unbinds it when +uri+ is nil). The binding
# that was in effect before is stored in the innermost restore frame so
# that #pop_namespaces_restore can bring it back.
def add_namespace(prefix, uri)
  restore_frame = @namespaces_restore_stack.last
  restore_frame[prefix] = @namespaces[prefix]
  return @namespaces.delete(prefix) if uri.nil?

  @namespaces[prefix] = uri
end
622
+
623
# Opens a new, empty restore frame on the namespace restore stack and
# returns it.
def push_namespaces_restore
  {}.tap { |frame| @namespaces_restore_stack.push(frame) }
end
628
+
629
# Removes the innermost restore frame and re-applies the bindings saved in
# it: a prefix saved as nil is unbound again, any other prefix gets its
# saved URI back. Returns the removed frame.
def pop_namespaces_restore
  @namespaces_restore_stack.pop.each_pair do |prefix, saved_uri|
    if saved_uri.nil?
      @namespaces.delete(prefix)
      next
    end
    @namespaces[prefix] = saved_uri
  end
end
639
+
640
# Adds +delta+ to the running entity expansion count and raises a
# RuntimeError once the count is above @entity_expansion_limit.
def record_entity_expansion(delta=1)
  @entity_expansion_count += delta
  return unless @entity_expansion_count > @entity_expansion_limit

  raise "number of entity expansions exceeded, processing aborted."
end
646
+
647
# Tells whether the source must be switched to the encoding named in the
# XML declaration: false when no encoding was declared or when it is
# "UTF-16" (any letter case), true otherwise.
def need_source_encoding_update?(xml_declaration_encoding)
  if xml_declaration_encoding.nil? || /^UTF-16$/i =~ xml_declaration_encoding
    false
  else
    true
  end
end
652
+
653
# Maps the endian-specific names "UTF-16BE" and "UTF-16LE" (any letter
# case) to "UTF-16"; returns nil for every other value.
def normalize_xml_declaration_encoding(xml_declaration_encoding)
  return "UTF-16" if /^UTF-16(?:BE|LE)$/i.match?(xml_declaration_encoding)

  nil
end
656
+
657
# Consumes a name from the source and returns it.
#
# Raises REXML::ParseException, prefixed with +base_error_message+, when
# no name can be matched: "invalid name" if non-space input is present,
# "name is missing" otherwise.
def parse_name(base_error_message)
  name_match = @source.match(Private::NAME_PATTERN, true)
  return name_match[0] if name_match

  message =
    if @source.match?(/\S/um)
      "#{base_error_message}: invalid name"
    else
      "#{base_error_message}: name is missing"
    end
  raise REXML::ParseException.new(message, @source)
end
669
+
670
# Consumes an external or public identifier from the source.
#
# accept_external_id:: allow 'PUBLIC "pubid" "system"' and 'SYSTEM "system"'.
# accept_public_id::   allow 'PUBLIC "pubid"' without a system literal.
#
# Returns [keyword, public id, system id] with the surrounding quotes
# removed and nil for parts that are absent. Raises REXML::ParseException,
# prefixed with +base_error_message+, when no accepted form matches.
def parse_id(base_error_message,
             accept_external_id:,
             accept_public_id:)
  # Strips the quote characters that delimit a matched literal.
  unquote = ->(literal) { literal ? literal[1..-2] : nil }

  if accept_external_id && (md = @source.match(EXTERNAL_ID_PUBLIC, true))
    return ["PUBLIC", unquote.call(md[1]), unquote.call(md[2])]
  end
  if accept_public_id && (md = @source.match(PUBLIC_ID, true))
    return ["PUBLIC", unquote.call(md[1]), nil]
  end
  if accept_external_id && (md = @source.match(EXTERNAL_ID_SYSTEM, true))
    return ["SYSTEM", nil, unquote.call(md[1])]
  end

  details = parse_id_invalid_details(accept_external_id: accept_external_id,
                                     accept_public_id: accept_public_id)
  message = "#{base_error_message}: #{details}"
  raise REXML::ParseException.new(message, @source)
end
697
+
698
# Works out why #parse_id could not match an identifier and returns a
# short description for the error message. Only looks at the source;
# nothing is consumed.
def parse_id_invalid_details(accept_external_id:,
                             accept_public_id:)
  public = /^\s*PUBLIC/um
  system = /^\s*SYSTEM/um

  if (accept_external_id || accept_public_id) && @source.match?(/#{public}/um)
    return "public ID literal is missing" if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
    return "invalid public ID literal" unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um)
    return "garbage after public ID literal" unless accept_public_id
    return "system ID literal is missing" if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
    return "invalid system literal" unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)

    return "garbage after system literal"
  end

  if accept_external_id && @source.match?(/#{system}/um)
    return "system literal is missing" if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
    return "invalid system literal" unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um)

    return "garbage after system literal"
  end

  return "invalid ID type" unless @source.match?(/^\s*(?:PUBLIC|SYSTEM)\s/um)

  "ID type is missing"
end
735
+
736
# Reads the rest of a comment (the opening "<!--" has already been
# consumed) and returns its text without the closing "-->".
#
# Raises REXML::ParseException when the terminator is missing, or when
# the text contains "--" or ends with "-".
def process_comment
  raw = @source.read_until("-->")
  unless raw.end_with?("-->")
    raise REXML::ParseException.new("Unclosed comment: Missing end '-->'", @source)
  end

  body = raw[0, raw.size - 3]
  malformed = body.include?("--") || body.end_with?("-")
  raise REXML::ParseException.new("Malformed comment", @source) if malformed

  body
end
748
+
749
# Parses what follows "<?". A target named "xml" is delegated to
# #xml_declaration; any other target yields
# [:processing_instruction, name, content], where content is nil when the
# target is followed directly by "?>".
#
# Raises ParseException when the closing "?>" is missing.
def process_instruction
  name = parse_name("Malformed XML: Invalid processing instruction node")
  return xml_declaration if name == "xml"

  # PITarget
  unless @source.skip_spaces # e.g. <?name?>
    return [:processing_instruction, name, nil] if @source.match?("?>", true)

    raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source)
  end

  # e.g. <?name content?>
  content_start = @source.position
  raw = @source.read_until("?>")
  unless raw.end_with?("?>")
    # Rewind so the error is reported from the start of the content.
    @source.position = content_start
    raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source)
  end
  [:processing_instruction, name, raw[0...-2]]
end
771
+
772
# Parses the remainder of an XML declaration (after "<?xml") and returns
# [:xmldecl, version, encoding, standalone].
#
# The version is mandatory; encoding and standalone are optional and nil
# when absent, except that the encoding falls back to what
# #normalize_xml_declaration_encoding derives from the source's encoding.
# A declared encoding is applied to the source when
# #need_source_encoding_update? says so.
#
# Raises ParseException for a duplicated or misplaced declaration, a
# missing version, an invalid standalone value or a missing "?>".
def xml_declaration
  raise ParseException.new("Malformed XML: XML declaration is duplicated", @source) unless @version.nil?
  raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) if @document_status
  unless @source.skip_spaces
    raise ParseException.new("Malformed XML: XML declaration misses spaces before version", @source)
  end
  unless @source.match?("version", true)
    raise ParseException.new("Malformed XML: XML declaration misses version", @source)
  end

  @version = parse_attribute_value_with_equal("xml")
  encoding = nil
  standalone = nil

  unless @source.skip_spaces
    # e.g. <?xml version="1.0"?>
    unless @source.match?("?>", true)
      raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
    end
    return [ :xmldecl, @version, normalize_xml_declaration_encoding(@source.encoding), nil ]
  end

  if @source.match?("encoding", true)
    encoding = parse_attribute_value_with_equal("xml")
    unless @source.skip_spaces
      # e.g. <?xml version="1.1" encoding="UTF-8"?>
      unless @source.match?("?>", true)
        raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
      end
      @source.encoding = encoding if need_source_encoding_update?(encoding)
      encoding ||= normalize_xml_declaration_encoding(@source.encoding)
      return [ :xmldecl, @version, encoding, nil ]
    end
  end

  if @source.match?("standalone", true)
    standalone = parse_attribute_value_with_equal("xml")
    unless ["yes", "no"].include?(standalone)
      raise ParseException.new("Malformed XML: XML declaration standalone is not yes or no : <#{standalone}>", @source)
    end
  end

  @source.skip_spaces
  unless @source.match?("?>", true)
    raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
  end

  @source.encoding = encoding if need_source_encoding_update?(encoding)
  encoding ||= normalize_xml_declaration_encoding(@source.encoding)

  # e.g. <?xml version="1.0" ?>
  #      <?xml version="1.1" encoding="UTF-8" ?>
  #      <?xml version="1.1" standalone="yes"?>
  #      <?xml version="1.1" encoding="UTF-8" standalone="yes" ?>
  [ :xmldecl, @version, encoding, standalone ]
end
832
+
833
# scan_quote consumes a single or double quote at the current position of
# the source and returns it as a one-character String, or returns nil
# (consuming nothing) when no quote is found. The byte-based variant is
# used when the loaded strscan reports version 3.1.1 or later.
if StringScanner::Version < "3.1.1"
  def scan_quote
    quote_match = @source.match(/(['"])/, true)
    quote_match ? quote_match[1] : nil
  end
else
  def scan_quote
    next_byte = @source.peek_byte
    # 34 is '"'.ord, 39 is "'".ord
    return nil unless next_byte == 34 || next_byte == 39

    @source.scan_byte
    next_byte == 34 ? '"' : "'"
  end
end
851
+
852
# Consumes "=", an opening quote, the value and the matching closing quote
# from the source and returns the value without its quotes. +name+ is
# only used in error messages.
#
# Raises REXML::ParseException when the "=", the opening quote or the
# closing quote is missing.
def parse_attribute_value_with_equal(name)
  unless @source.match?(Private::EQUAL_PATTERN, true)
    raise REXML::ParseException.new("Missing attribute equal: <#{name}>", @source)
  end

  quote = scan_quote
  unless quote
    raise REXML::ParseException.new("Missing attribute value start quote: <#{name}>", @source)
  end

  value_start = @source.position
  quoted = @source.read_until(quote)
  return quoted[0...-1] if quoted.end_with?(quote)

  # Rewind to the start of the value before reporting the error.
  @source.position = value_start
  raise REXML::ParseException.new("Missing attribute value end quote: <#{name}>: <#{quote}>", @source)
end
870
+
871
# Parses the attribute list of a start tag up to and including its closing
# ">" or "/>".
#
# prefixes:: collection that receives (via <<) every attribute prefix
#            other than "xmlns" and "xml".
#
# Returns [attributes, closed]: a Hash from qualified attribute name to
# value, and true when the tag ended with "/>". Namespace declarations
# (xmlns:*) are registered through #add_namespace and also appear in the
# Hash.
#
# Raises REXML::ParseException for an invalid attribute name, an illegal
# declaration of the "xml" or "xmlns" prefix, a duplicate attribute, or
# two attributes that share the same namespace URI and local name.
def parse_attributes(prefixes)
  attributes = {}
  expanded_names = {}
  while true
    return attributes, false if @source.match?(">", true)
    return attributes, true if @source.match?("/>", true)

    qname = @source.match(QNAME, true)
    unless qname
      message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
      raise REXML::ParseException.new(message, @source)
    end

    name = qname[1]
    prefix = qname[2]
    local_part = qname[3]
    value = parse_attribute_value_with_equal(name)
    @source.skip_spaces

    namespace_declaration = (prefix == "xmlns")
    if namespace_declaration
      if local_part == "xml" && value != Private::XML_PREFIXED_NAMESPACE
        msg = "The 'xml' prefix must not be bound to any other namespace "+
          "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
        raise REXML::ParseException.new( msg, @source, self )
      elsif local_part == "xmlns"
        msg = "The 'xmlns' prefix must not be declared "+
          "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
        raise REXML::ParseException.new( msg, @source, self)
      end
      add_namespace(local_part, value)
    elsif prefix
      prefixes << prefix unless prefix == "xml"
    end

    if attributes[name]
      msg = "Duplicate attribute #{name.inspect}"
      raise REXML::ParseException.new(msg, @source, self)
    end

    unless namespace_declaration
      # Two attributes clash when they expand to the same
      # [namespace URI, local name] pair, even under different prefixes.
      uri = @namespaces[prefix]
      expanded_name = [uri, local_part]
      existing_prefix = expanded_names[expanded_name]
      if existing_prefix
        message = "Namespace conflict in adding attribute " +
                  "\"#{local_part}\": " +
                  "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
                  "prefix \"#{prefix}\" = \"#{uri}\""
        raise REXML::ParseException.new(message, @source, self)
      end
      expanded_names[expanded_name] = prefix
    end

    attributes[name] = value
  end
end
930
+ end
931
+ end
932
+ end
933
+
934
+ =begin
935
+ case event[0]
936
+ when :start_element
937
+ when :text
938
+ when :end_element
939
+ when :processing_instruction
940
+ when :cdata
941
+ when :comment
942
+ when :xmldecl
943
+ when :start_doctype
944
+ when :end_doctype
945
+ when :externalentity
946
+ when :elementdecl
947
+ when :entity
948
+ when :attlistdecl
949
+ when :notationdecl
950
+ when :end_doctype
951
+ end
952
+ =end