moxml 0.1.20 → 0.1.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/moxml/adapter/base.rb +5 -0
- data/lib/moxml/adapter/customized_libxml/node.rb +3 -0
- data/lib/moxml/adapter/customized_libxml/text.rb +6 -1
- data/lib/moxml/adapter/libxml/entity_ref_registry.rb +105 -0
- data/lib/moxml/adapter/libxml/entity_restorer.rb +92 -0
- data/lib/moxml/adapter/libxml.rb +381 -362
- data/lib/moxml/version.rb +1 -1
- data/spec/moxml/adapter/libxml_internals_spec.rb +167 -0
- data/spec/performance/benchmark_spec.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: cd873f36f1ee8d7799299cedbc4bfa7da00d588e311693be8b6e718b9e09fa8d
|
|
4
|
+
data.tar.gz: 2597df5af105dfcfdc84586b98473e99af95681fbf1ef24b0c2c6698280e6dbb
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b39f087ced4fc9ea76722e32c8b036ee7e4819798bec248af520f51bb20039cb76459837e1b5a65540f46bf7112ee5922eece22cac763ef25b3235a0c5ca60f6
|
|
7
|
+
data.tar.gz: 75847baca549e9cb0902203f2d8b9f8e4e764c048dc0cd57260d6995f1aa233c5db6ecdb6786344cdfb097b4ec73a088d1c12b4d79b2fcb5217c936ba4605ec1
|
data/lib/moxml/adapter/base.rb
CHANGED
|
@@ -46,6 +46,11 @@ module Moxml
|
|
|
46
46
|
else
|
|
47
47
|
xml.encode("UTF-8")
|
|
48
48
|
end
|
|
49
|
+
# Fast path: no `&` means no entity references to mark — skip
|
|
50
|
+
# the regex scan and string allocation entirely. The vast
|
|
51
|
+
# majority of XML payloads contain no entity references.
|
|
52
|
+
return str unless str.include?("&")
|
|
53
|
+
|
|
49
54
|
str.gsub(ENTITY_NAME_RE) do |match|
|
|
50
55
|
STANDARD_ENTITIES.include?(::Regexp.last_match(1)) ? match : "#{ENTITY_MARKER}#{::Regexp.last_match(1)};"
|
|
51
56
|
end
|
|
@@ -8,6 +8,9 @@ module Moxml
|
|
|
8
8
|
# This wrapper hides LibXML's strict document ownership model,
|
|
9
9
|
# allowing nodes to be moved between documents transparently.
|
|
10
10
|
# Similar pattern to Ox adapter's customized classes.
|
|
11
|
+
#
|
|
12
|
+
# The Libxml adapter owns wrapper type mapping in one place so the
|
|
13
|
+
# wrapper classes do not duplicate node-type knowledge.
|
|
11
14
|
class Node
|
|
12
15
|
attr_reader :native
|
|
13
16
|
|
|
@@ -19,7 +19,12 @@ module Moxml
|
|
|
19
19
|
# LibXML's .content already contains escaped text, but it over-escapes
|
|
20
20
|
# quotes which don't need escaping in text nodes (only in attributes)
|
|
21
21
|
def to_xml
|
|
22
|
-
@native.content
|
|
22
|
+
content = @native.content
|
|
23
|
+
# Skip the gsub allocation entirely when there's nothing to undo —
|
|
24
|
+
# the common case for parsed text without literal quotes.
|
|
25
|
+
return content unless content.include?("&quot;")
|
|
26
|
+
|
|
27
|
+
content.gsub("&quot;", '"')
|
|
23
28
|
end
|
|
24
29
|
end
|
|
25
30
|
end
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Moxml
|
|
4
|
+
module Adapter
|
|
5
|
+
class Libxml < Base
|
|
6
|
+
# Tracks entity-reference insertions that cannot live in LibXML's native
|
|
7
|
+
# node tree, plus the child sequence needed to serialize them in order.
|
|
8
|
+
class EntityRefRegistry
|
|
9
|
+
ENTITY_REFS_KEY = :_entity_ref_pairs
|
|
10
|
+
CHILD_SEQUENCE_KEY = :_child_seq_pairs
|
|
11
|
+
NON_WHITESPACE_RE = /\S/
|
|
12
|
+
private_constant :ENTITY_REFS_KEY, :CHILD_SEQUENCE_KEY, :NON_WHITESPACE_RE
|
|
13
|
+
|
|
14
|
+
def initialize(attachments, doc)
|
|
15
|
+
@attachments = attachments
|
|
16
|
+
@doc = doc
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def active?
|
|
20
|
+
@doc ? @attachments.key?(@doc, ENTITY_REFS_KEY) : false
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def register(element, ref)
|
|
24
|
+
return unless @doc && element
|
|
25
|
+
|
|
26
|
+
path = path_for(element)
|
|
27
|
+
|
|
28
|
+
refs_by_path = @attachments.get(@doc, ENTITY_REFS_KEY) || {}
|
|
29
|
+
(refs_by_path[path] ||= []) << ref
|
|
30
|
+
@attachments.set(@doc, ENTITY_REFS_KEY, refs_by_path)
|
|
31
|
+
|
|
32
|
+
seq_by_path = @attachments.get(@doc, CHILD_SEQUENCE_KEY) || {}
|
|
33
|
+
existing = seq_by_path[path]
|
|
34
|
+
if existing
|
|
35
|
+
existing << :eref
|
|
36
|
+
else
|
|
37
|
+
seq_by_path[path] = Array.new(count_native_children(element), :native)
|
|
38
|
+
seq_by_path[path] << :eref
|
|
39
|
+
@attachments.set(@doc, CHILD_SEQUENCE_KEY, seq_by_path)
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def append_native(element)
|
|
44
|
+
return unless @doc && element
|
|
45
|
+
|
|
46
|
+
seq_by_path = @attachments.get(@doc, CHILD_SEQUENCE_KEY)
|
|
47
|
+
return unless seq_by_path
|
|
48
|
+
|
|
49
|
+
seq = seq_by_path[path_for(element)]
|
|
50
|
+
return unless seq
|
|
51
|
+
|
|
52
|
+
seq << :native
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def refs_for(element)
|
|
56
|
+
return nil unless @doc && element
|
|
57
|
+
|
|
58
|
+
refs_by_path = @attachments.get(@doc, ENTITY_REFS_KEY)
|
|
59
|
+
refs_by_path && refs_by_path[path_for(element)]
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def sequence_for(element)
|
|
63
|
+
return nil unless @doc && element
|
|
64
|
+
|
|
65
|
+
seq_by_path = @attachments.get(@doc, CHILD_SEQUENCE_KEY)
|
|
66
|
+
seq_by_path && seq_by_path[path_for(element)]
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
def serialization_for(element)
|
|
70
|
+
refs = refs_for(element)
|
|
71
|
+
return [nil, nil] unless refs && !refs.empty?
|
|
72
|
+
|
|
73
|
+
seq = sequence_for(element)
|
|
74
|
+
return [nil, nil] unless seq
|
|
75
|
+
|
|
76
|
+
[refs, seq]
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
private
|
|
80
|
+
|
|
81
|
+
def path_for(element)
|
|
82
|
+
element.path
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def count_native_children(element)
|
|
86
|
+
return 0 unless element.is_a?(::LibXML::XML::Node) && element.children?
|
|
87
|
+
|
|
88
|
+
count = 0
|
|
89
|
+
element.each_child do |child|
|
|
90
|
+
count += 1 unless blank_text_node?(child)
|
|
91
|
+
end
|
|
92
|
+
count
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
def blank_text_node?(child)
|
|
96
|
+
child.text? && blank_content?(child.content)
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
def blank_content?(content)
|
|
100
|
+
content.nil? || !content.match?(NON_WHITESPACE_RE)
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
end
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Moxml
|
|
4
|
+
module Adapter
|
|
5
|
+
class Libxml < Base
|
|
6
|
+
# Restores configured character entities into explicit Moxml
|
|
7
|
+
# EntityReference nodes after LibXML has parsed the native tree.
|
|
8
|
+
class EntityRestorer
|
|
9
|
+
def initialize(doc, adapter: Libxml)
|
|
10
|
+
@doc = doc
|
|
11
|
+
@ctx = doc.context
|
|
12
|
+
@registry = @ctx.entity_registry
|
|
13
|
+
@config = @ctx.config
|
|
14
|
+
@adapter = adapter
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def run
|
|
18
|
+
return unless @registry && @doc.root
|
|
19
|
+
|
|
20
|
+
walk(@doc.root)
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
private
|
|
24
|
+
|
|
25
|
+
def walk(element)
|
|
26
|
+
# Snapshot because we may add/remove siblings during the walk.
|
|
27
|
+
element.children.to_a.each do |child|
|
|
28
|
+
if child.is_a?(::Moxml::Text)
|
|
29
|
+
restore_text_node(child)
|
|
30
|
+
elsif child.is_a?(::Moxml::Element)
|
|
31
|
+
walk(child)
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
# Matches DocumentBuilder's previous behavior, including the libxml
|
|
37
|
+
# limitation that adjacent native text nodes get merged.
|
|
38
|
+
def restore_text_node(text_node)
|
|
39
|
+
content = text_node.content
|
|
40
|
+
return unless content
|
|
41
|
+
|
|
42
|
+
chunks = chunk_text(content)
|
|
43
|
+
return if chunks.size == 1 && chunks.first.first == :text
|
|
44
|
+
|
|
45
|
+
parent = text_node.parent
|
|
46
|
+
return unless parent
|
|
47
|
+
|
|
48
|
+
text_node.remove
|
|
49
|
+
chunks.each { |type, payload| append_chunk(parent, type, payload) }
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
def chunk_text(content)
|
|
53
|
+
chunks = []
|
|
54
|
+
buffer = +""
|
|
55
|
+
restorable = @registry.restorable_codepoints
|
|
56
|
+
|
|
57
|
+
content.each_char do |char|
|
|
58
|
+
cp = char.ord
|
|
59
|
+
if restorable.include?(cp) &&
|
|
60
|
+
(name = @registry.primary_name_for_codepoint(cp)) &&
|
|
61
|
+
@registry.should_restore?(cp, config: @config)
|
|
62
|
+
unless buffer.empty?
|
|
63
|
+
chunks << [:text, buffer.dup]
|
|
64
|
+
buffer.clear
|
|
65
|
+
end
|
|
66
|
+
chunks << [:eref, name]
|
|
67
|
+
else
|
|
68
|
+
buffer << char
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
chunks << [:text, buffer.dup] unless buffer.empty?
|
|
73
|
+
chunks
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
def append_chunk(parent, type, payload)
|
|
77
|
+
case type
|
|
78
|
+
when :text
|
|
79
|
+
parent.add_child(::Moxml::Text.new(@adapter.create_native_text(payload), @ctx))
|
|
80
|
+
when :eref
|
|
81
|
+
parent.add_child(
|
|
82
|
+
::Moxml::EntityReference.new(
|
|
83
|
+
@adapter.create_native_entity_reference(payload),
|
|
84
|
+
@ctx,
|
|
85
|
+
),
|
|
86
|
+
)
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
end
|