moxml 0.1.20 → 0.1.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0633c51783a25d02190769345f0c6a4af79c29665b180e5b46fc28f0b6eeee31
4
- data.tar.gz: b3d9d706cc185c2d045ed8d3c9a62bb884196cc2cc520cd51fd17cecb07cd897
3
+ metadata.gz: cd873f36f1ee8d7799299cedbc4bfa7da00d588e311693be8b6e718b9e09fa8d
4
+ data.tar.gz: 2597df5af105dfcfdc84586b98473e99af95681fbf1ef24b0c2c6698280e6dbb
5
5
  SHA512:
6
- metadata.gz: 6626a13b9dda295113caa1e1d99085afd20776b1d3d4f156bbdf63efcefccef59cd1ea37d14439aa7487f3f617ed65ac1003ec775b22e77614a4b0082c832307
7
- data.tar.gz: 68b26d50fa35b206835f633dfd7b26f6c389fc3507a524a84784929b089e0c699d1361af8838d86e5fcec0e8031f81ca82e2e799b544737cc1616a3da5d15707
6
+ metadata.gz: b39f087ced4fc9ea76722e32c8b036ee7e4819798bec248af520f51bb20039cb76459837e1b5a65540f46bf7112ee5922eece22cac763ef25b3235a0c5ca60f6
7
+ data.tar.gz: 75847baca549e9cb0902203f2d8b9f8e4e764c048dc0cd57260d6995f1aa233c5db6ecdb6786344cdfb097b4ec73a088d1c12b4d79b2fcb5217c936ba4605ec1
@@ -46,6 +46,11 @@ module Moxml
46
46
  else
47
47
  xml.encode("UTF-8")
48
48
  end
49
+ # Fast path: no `&` means no entity references to mark — skip
50
+ # the regex scan and string allocation entirely. The vast
51
+ # majority of XML payloads contain no entity references.
52
+ return str unless str.include?("&")
53
+
49
54
  str.gsub(ENTITY_NAME_RE) do |match|
50
55
  STANDARD_ENTITIES.include?(::Regexp.last_match(1)) ? match : "#{ENTITY_MARKER}#{::Regexp.last_match(1)};"
51
56
  end
@@ -8,6 +8,9 @@ module Moxml
8
8
  # This wrapper hides LibXML's strict document ownership model,
9
9
  # allowing nodes to be moved between documents transparently.
10
10
  # Similar pattern to Ox adapter's customized classes.
11
+ #
12
+ # The Libxml adapter owns wrapper type mapping in one place so the
13
+ # wrapper classes do not duplicate node-type knowledge.
11
14
  class Node
12
15
  attr_reader :native
13
16
 
@@ -19,7 +19,12 @@ module Moxml
19
19
  # LibXML's .content already contains escaped text, but it over-escapes
20
20
  # quotes which don't need escaping in text nodes (only in attributes)
21
21
  def to_xml
22
- @native.content.gsub("&quot;", '"')
22
+ content = @native.content
23
+ # Skip the gsub allocation entirely when there's nothing to undo —
24
+ # the common case for parsed text without literal quotes.
25
+ return content unless content.include?("&quot;")
26
+
27
+ content.gsub("&quot;", '"')
23
28
  end
24
29
  end
25
30
  end
@@ -0,0 +1,105 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Moxml
4
+ module Adapter
5
+ class Libxml < Base
6
+ # Tracks entity-reference insertions that cannot live in LibXML's native
7
+ # node tree, plus the child sequence needed to serialize them in order.
8
+ class EntityRefRegistry
9
+ ENTITY_REFS_KEY = :_entity_ref_pairs
10
+ CHILD_SEQUENCE_KEY = :_child_seq_pairs
11
+ NON_WHITESPACE_RE = /\S/
12
+ private_constant :ENTITY_REFS_KEY, :CHILD_SEQUENCE_KEY, :NON_WHITESPACE_RE
13
+
14
+ def initialize(attachments, doc)
15
+ @attachments = attachments
16
+ @doc = doc
17
+ end
18
+
19
+ def active?
20
+ @doc ? @attachments.key?(@doc, ENTITY_REFS_KEY) : false
21
+ end
22
+
23
+ def register(element, ref)
24
+ return unless @doc && element
25
+
26
+ path = path_for(element)
27
+
28
+ refs_by_path = @attachments.get(@doc, ENTITY_REFS_KEY) || {}
29
+ (refs_by_path[path] ||= []) << ref
30
+ @attachments.set(@doc, ENTITY_REFS_KEY, refs_by_path)
31
+
32
+ seq_by_path = @attachments.get(@doc, CHILD_SEQUENCE_KEY) || {}
33
+ existing = seq_by_path[path]
34
+ if existing
35
+ existing << :eref
36
+ else
37
+ seq_by_path[path] = Array.new(count_native_children(element), :native)
38
+ seq_by_path[path] << :eref
39
+ @attachments.set(@doc, CHILD_SEQUENCE_KEY, seq_by_path)
40
+ end
41
+ end
42
+
43
+ def append_native(element)
44
+ return unless @doc && element
45
+
46
+ seq_by_path = @attachments.get(@doc, CHILD_SEQUENCE_KEY)
47
+ return unless seq_by_path
48
+
49
+ seq = seq_by_path[path_for(element)]
50
+ return unless seq
51
+
52
+ seq << :native
53
+ end
54
+
55
+ def refs_for(element)
56
+ return nil unless @doc && element
57
+
58
+ refs_by_path = @attachments.get(@doc, ENTITY_REFS_KEY)
59
+ refs_by_path && refs_by_path[path_for(element)]
60
+ end
61
+
62
+ def sequence_for(element)
63
+ return nil unless @doc && element
64
+
65
+ seq_by_path = @attachments.get(@doc, CHILD_SEQUENCE_KEY)
66
+ seq_by_path && seq_by_path[path_for(element)]
67
+ end
68
+
69
+ def serialization_for(element)
70
+ refs = refs_for(element)
71
+ return [nil, nil] unless refs && !refs.empty?
72
+
73
+ seq = sequence_for(element)
74
+ return [nil, nil] unless seq
75
+
76
+ [refs, seq]
77
+ end
78
+
79
+ private
80
+
81
+ def path_for(element)
82
+ element.path
83
+ end
84
+
85
+ def count_native_children(element)
86
+ return 0 unless element.is_a?(::LibXML::XML::Node) && element.children?
87
+
88
+ count = 0
89
+ element.each_child do |child|
90
+ count += 1 unless blank_text_node?(child)
91
+ end
92
+ count
93
+ end
94
+
95
+ def blank_text_node?(child)
96
+ child.text? && blank_content?(child.content)
97
+ end
98
+
99
+ def blank_content?(content)
100
+ content.nil? || !content.match?(NON_WHITESPACE_RE)
101
+ end
102
+ end
103
+ end
104
+ end
105
+ end
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Moxml
4
+ module Adapter
5
+ class Libxml < Base
6
+ # Restores configured character entities into explicit Moxml
7
+ # EntityReference nodes after LibXML has parsed the native tree.
8
+ class EntityRestorer
9
+ def initialize(doc, adapter: Libxml)
10
+ @doc = doc
11
+ @ctx = doc.context
12
+ @registry = @ctx.entity_registry
13
+ @config = @ctx.config
14
+ @adapter = adapter
15
+ end
16
+
17
+ def run
18
+ return unless @registry && @doc.root
19
+
20
+ walk(@doc.root)
21
+ end
22
+
23
+ private
24
+
25
+ def walk(element)
26
+ # Snapshot because we may add/remove siblings during the walk.
27
+ element.children.to_a.each do |child|
28
+ if child.is_a?(::Moxml::Text)
29
+ restore_text_node(child)
30
+ elsif child.is_a?(::Moxml::Element)
31
+ walk(child)
32
+ end
33
+ end
34
+ end
35
+
36
+ # Matches DocumentBuilder's previous behavior, including the libxml
37
+ # limitation that adjacent native text nodes get merged.
38
+ def restore_text_node(text_node)
39
+ content = text_node.content
40
+ return unless content
41
+
42
+ chunks = chunk_text(content)
43
+ return if chunks.size == 1 && chunks.first.first == :text
44
+
45
+ parent = text_node.parent
46
+ return unless parent
47
+
48
+ text_node.remove
49
+ chunks.each { |type, payload| append_chunk(parent, type, payload) }
50
+ end
51
+
52
+ def chunk_text(content)
53
+ chunks = []
54
+ buffer = +""
55
+ restorable = @registry.restorable_codepoints
56
+
57
+ content.each_char do |char|
58
+ cp = char.ord
59
+ if restorable.include?(cp) &&
60
+ (name = @registry.primary_name_for_codepoint(cp)) &&
61
+ @registry.should_restore?(cp, config: @config)
62
+ unless buffer.empty?
63
+ chunks << [:text, buffer.dup]
64
+ buffer.clear
65
+ end
66
+ chunks << [:eref, name]
67
+ else
68
+ buffer << char
69
+ end
70
+ end
71
+
72
+ chunks << [:text, buffer.dup] unless buffer.empty?
73
+ chunks
74
+ end
75
+
76
+ def append_chunk(parent, type, payload)
77
+ case type
78
+ when :text
79
+ parent.add_child(::Moxml::Text.new(@adapter.create_native_text(payload), @ctx))
80
+ when :eref
81
+ parent.add_child(
82
+ ::Moxml::EntityReference.new(
83
+ @adapter.create_native_entity_reference(payload),
84
+ @ctx,
85
+ ),
86
+ )
87
+ end
88
+ end
89
+ end
90
+ end
91
+ end
92
+ end