moxml 0.1.16 → 0.1.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -0
  3. data/.rubocop_todo.yml +49 -133
  4. data/README.adoc +18 -0
  5. data/lib/moxml/adapter/base.rb +65 -8
  6. data/lib/moxml/adapter/headed_ox.rb +2 -1
  7. data/lib/moxml/adapter/libxml.rb +16 -3
  8. data/lib/moxml/adapter/nokogiri.rb +14 -4
  9. data/lib/moxml/adapter/oga.rb +26 -87
  10. data/lib/moxml/adapter/ox.rb +69 -19
  11. data/lib/moxml/adapter/rexml.rb +24 -3
  12. data/lib/moxml/attribute.rb +6 -0
  13. data/lib/moxml/element.rb +12 -8
  14. data/lib/moxml/node.rb +4 -1
  15. data/lib/moxml/text.rb +6 -0
  16. data/lib/moxml/version.rb +1 -1
  17. data/lib/moxml/xpath/compiler.rb +40 -21
  18. data/lib/moxml/xpath/parser.rb +12 -7
  19. data/spec/integration/all_adapters_spec.rb +1 -0
  20. data/spec/integration/shared_examples/edge_cases.rb +0 -6
  21. data/spec/integration/shared_examples/entity_reference_whitespace.rb +122 -0
  22. data/spec/integration/shared_examples/node_wrappers/cdata_behavior.rb +0 -7
  23. data/spec/integration/shared_examples/node_wrappers/namespace_behavior.rb +135 -0
  24. data/spec/integration/shared_examples/node_wrappers/node_behavior.rb +0 -3
  25. data/spec/moxml/adapter/entity_restoration_spec.rb +97 -0
  26. data/spec/moxml/builder_spec.rb +16 -1
  27. data/spec/moxml/entity_preservation_spec.rb +130 -0
  28. data/spec/moxml/entity_reference_spec.rb +114 -0
  29. data/spec/moxml/entity_registry_spec.rb +68 -0
  30. data/spec/moxml/xpath/axes_spec.rb +0 -1
  31. data/spec/moxml/xpath/compiler_spec.rb +0 -2
  32. metadata +6 -12
  33. data/TODO.remaining/1-entity-reference-adapter-support.md +0 -157
  34. data/TODO.remaining/2-entity-restoration-model-driven.md +0 -169
  35. data/TODO.remaining/3-entity-reference-test-coverage.md +0 -170
  36. data/TODO.remaining/4-lenient-entities-mode.md +0 -106
  37. data/TODO.remaining/5-fixture-integrity.md +0 -65
  38. data/TODO.remaining/6-ox-element-ordering-bug.md +0 -36
  39. data/TODO.remaining/7-headed-ox-limitations.md +0 -95
  40. data/TODO.remaining/8-xpath-predicate-gaps.md +0 -68
  41. data/TODO.remaining/9-cleanup-hygiene.md +0 -42
  42. data/TODO.remaining/README.md +0 -54
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 378f1400934e3a65fb230779fc4b1783aab059efb449912a6dc2d97c8d82903e
4
- data.tar.gz: 7cd2739dd2dc41c2edb69c129cc4ec175a7a6b8e455d4d63cfd56bd2a93e808f
3
+ metadata.gz: bbd69145e9a360635af848bf0bdda2883e35760b2763021f6bf6f1d6dca9827e
4
+ data.tar.gz: aa492e21514fd80a01f98709eddf8c3aa323b584210d56534ad5e2c2b467df18
5
5
  SHA512:
6
- metadata.gz: 311f4905dcf14fb3ec209491d9a5eae9b8fe460152f29c7f7b428db37b1c2adac09e538ce9c0a8a4eeff2b0af83a2e8b4a787adca59cb04d1c7f1b14b7fbf37d
7
- data.tar.gz: 36cc3ce0e2328547137f1716d7b7ef3de4e07cbca160b08d8fbe74ef126edd6e61fe4dc0ed1d8767ed19f573792fc8fdc52c41e332802698218584db559576e0
6
+ metadata.gz: 1cdb7d6c934f1ea788a40d81d987c97d4c1fc21ad71d22eaac73abf45d093680667f3303b35934378b8cce0d99e3fc9db47c85632678247426527d7fb3491bed
7
+ data.tar.gz: 79c352eb8df9b86831d554e17538abd4da8a6dfce61b4e566bc236334601e43bff7c670894ea05c72abac40bb1b3b90375ef8caaf6b488a9c43bc33fc70d6785
data/.gitignore CHANGED
@@ -28,6 +28,9 @@ libxml_*.txt
28
28
  # Generated benchmark reports (machine-specific)
29
29
  /benchmarks/PERFORMANCE_REPORT.md
30
30
 
31
+ # Local TODO tracking (kept locally, not committed)
32
+ TODO*
33
+
31
34
  # IDE and editor files
32
35
  .vscode/
33
36
  .idea/
@@ -49,3 +52,6 @@ libxml_*.txt
49
52
  /_site
50
53
  /docs/_site
51
54
  /docs/.jekyll-cache
55
+
56
+ # Utility scripts (local only)
57
+ /scripts/
data/.rubocop_todo.yml CHANGED
@@ -1,97 +1,46 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2026-04-22 01:41:34 UTC using RuboCop version 1.86.0.
3
+ # on 2026-04-23 07:48:23 UTC using RuboCop version 1.86.0.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
7
7
  # versions of RuboCop, may require this file to be generated again.
8
8
 
9
- # Offense count: 18
10
- # This cop supports safe autocorrection (--autocorrect).
11
- # Configuration parameters: EnforcedStyle, IndentationWidth.
12
- # SupportedStyles: with_first_argument, with_fixed_indentation
13
- Layout/ArgumentAlignment:
14
- Exclude:
15
- - 'spec/moxml/allocation_benchmark_spec.rb'
16
- - 'spec/moxml/allocation_guard_spec.rb'
17
-
18
- # Offense count: 14
9
+ # Offense count: 4
19
10
  # This cop supports safe autocorrection (--autocorrect).
20
- # Configuration parameters: EnforcedStyleAlignWith.
21
- # SupportedStylesAlignWith: either, start_of_block, start_of_line
22
- Layout/BlockAlignment:
11
+ Layout/EmptyLineAfterGuardClause:
23
12
  Exclude:
24
- - 'lib/moxml/adapter/ox.rb'
25
- - 'spec/moxml/allocation_benchmark_spec.rb'
26
- - 'spec/moxml/allocation_guard_spec.rb'
27
- - 'spec/moxml/lazy_parse_spec.rb'
28
- - 'spec/moxml/node_cache_spec.rb'
13
+ - 'lib/moxml/adapter/customized_rexml/formatter.rb'
14
+ - 'lib/moxml/adapter/libxml.rb'
15
+ - 'lib/moxml/entity_registry.rb'
29
16
 
30
- # Offense count: 7
17
+ # Offense count: 1
31
18
  # This cop supports safe autocorrection (--autocorrect).
32
- Layout/BlockEndNewline:
19
+ # Configuration parameters: EmptyLineBetweenMethodDefs, EmptyLineBetweenClassDefs, EmptyLineBetweenModuleDefs, DefLikeMacros, AllowAdjacentOneLineDefs, NumberOfEmptyLines.
20
+ Layout/EmptyLineBetweenDefs:
33
21
  Exclude:
34
22
  - 'lib/moxml/adapter/ox.rb'
35
- - 'spec/moxml/allocation_benchmark_spec.rb'
36
- - 'spec/moxml/allocation_guard_spec.rb'
37
- - 'spec/moxml/lazy_parse_spec.rb'
38
- - 'spec/moxml/node_cache_spec.rb'
39
23
 
40
- # Offense count: 3
41
- # This cop supports safe autocorrection (--autocorrect).
42
- Layout/ClosingParenthesisIndentation:
43
- Exclude:
44
- - 'spec/moxml/allocation_guard_spec.rb'
45
-
46
- # Offense count: 3
47
- # This cop supports safe autocorrection (--autocorrect).
48
- # Configuration parameters: EnforcedStyle, IndentationWidth.
49
- # SupportedStyles: consistent, consistent_relative_to_receiver, special_for_inner_method_call, special_for_inner_method_call_in_parentheses
50
- Layout/FirstArgumentIndentation:
51
- Exclude:
52
- - 'spec/moxml/allocation_guard_spec.rb'
53
-
54
- # Offense count: 13
24
+ # Offense count: 1
55
25
  # This cop supports safe autocorrection (--autocorrect).
56
- # Configuration parameters: Width, EnforcedStyleAlignWith, AllowedPatterns.
57
- # SupportedStylesAlignWith: start_of_line, relative_to_receiver
58
- Layout/IndentationWidth:
26
+ Layout/EmptyLines:
59
27
  Exclude:
60
28
  - 'lib/moxml/adapter/ox.rb'
61
- - 'spec/moxml/allocation_benchmark_spec.rb'
62
- - 'spec/moxml/allocation_guard_spec.rb'
63
- - 'spec/moxml/lazy_parse_spec.rb'
64
- - 'spec/moxml/node_cache_spec.rb'
65
29
 
66
- # Offense count: 307
30
+ # Offense count: 330
67
31
  # This cop supports safe autocorrection (--autocorrect).
68
32
  # Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
69
33
  # URISchemes: http, https
70
34
  Layout/LineLength:
71
35
  Enabled: false
72
36
 
73
- # Offense count: 3
74
- # This cop supports safe autocorrection (--autocorrect).
75
- Layout/MultilineBlockLayout:
76
- Exclude:
77
- - 'spec/moxml/allocation_benchmark_spec.rb'
78
- - 'spec/moxml/allocation_guard_spec.rb'
79
-
80
- # Offense count: 3
81
- # This cop supports safe autocorrection (--autocorrect).
82
- # Configuration parameters: EnforcedStyle.
83
- # SupportedStyles: symmetrical, new_line, same_line
84
- Layout/MultilineMethodCallBraceLayout:
85
- Exclude:
86
- - 'spec/moxml/allocation_guard_spec.rb'
87
-
88
- # Offense count: 3
37
+ # Offense count: 1
89
38
  # This cop supports safe autocorrection (--autocorrect).
90
- # Configuration parameters: AllowInHeredoc.
91
- Layout/TrailingWhitespace:
39
+ # Configuration parameters: EnforcedStyle, IndentationWidth.
40
+ # SupportedStyles: aligned, indented
41
+ Layout/MultilineOperationIndentation:
92
42
  Exclude:
93
- - 'spec/moxml/allocation_benchmark_spec.rb'
94
- - 'spec/moxml/allocation_guard_spec.rb'
43
+ - 'lib/moxml/adapter/ox.rb'
95
44
 
96
45
  # Offense count: 7
97
46
  # Configuration parameters: AllowedMethods.
@@ -101,7 +50,7 @@ Lint/ConstantDefinitionInBlock:
101
50
  - 'spec/moxml/declaration_preservation_spec.rb'
102
51
  - 'spec/moxml/sax_spec.rb'
103
52
 
104
- # Offense count: 8
53
+ # Offense count: 10
105
54
  # Configuration parameters: IgnoreLiteralBranches, IgnoreConstantBranches, IgnoreDuplicateElseBranch.
106
55
  Lint/DuplicateBranch:
107
56
  Exclude:
@@ -110,8 +59,9 @@ Lint/DuplicateBranch:
110
59
  - 'lib/moxml/adapter/libxml.rb'
111
60
  - 'lib/moxml/adapter/ox.rb'
112
61
  - 'lib/moxml/document.rb'
62
+ - 'lib/moxml/entity_registry.rb'
113
63
 
114
- # Offense count: 4
64
+ # Offense count: 5
115
65
  Lint/DuplicateMethods:
116
66
  Exclude:
117
67
  - 'lib/moxml/config.rb'
@@ -138,6 +88,11 @@ Lint/EmptyWhen:
138
88
  Exclude:
139
89
  - 'lib/moxml/xpath/compiler.rb'
140
90
 
91
+ # Offense count: 3
92
+ Lint/HashCompareByIdentity:
93
+ Exclude:
94
+ - 'lib/moxml/native_attachment.rb'
95
+
141
96
  # Offense count: 1
142
97
  Lint/IneffectiveAccessModifier:
143
98
  Exclude:
@@ -156,28 +111,28 @@ Lint/NoReturnInBeginEndBlocks:
156
111
  Exclude:
157
112
  - 'examples/api_client/api_client.rb'
158
113
 
159
- # Offense count: 100
114
+ # Offense count: 104
160
115
  # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
161
116
  Metrics/AbcSize:
162
117
  Enabled: false
163
118
 
164
- # Offense count: 7
119
+ # Offense count: 8
165
120
  # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns, inherit_mode.
166
121
  # AllowedMethods: refine
167
122
  Metrics/BlockLength:
168
123
  Max: 90
169
124
 
170
- # Offense count: 5
125
+ # Offense count: 7
171
126
  # Configuration parameters: CountBlocks, CountModifierForms.
172
127
  Metrics/BlockNesting:
173
128
  Max: 4
174
129
 
175
- # Offense count: 70
130
+ # Offense count: 76
176
131
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
177
132
  Metrics/CyclomaticComplexity:
178
133
  Enabled: false
179
134
 
180
- # Offense count: 182
135
+ # Offense count: 186
181
136
  # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
182
137
  Metrics/MethodLength:
183
138
  Max: 110
@@ -187,19 +142,11 @@ Metrics/MethodLength:
187
142
  Metrics/ParameterLists:
188
143
  Max: 7
189
144
 
190
- # Offense count: 47
145
+ # Offense count: 52
191
146
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
192
147
  Metrics/PerceivedComplexity:
193
148
  Enabled: false
194
149
 
195
- # Offense count: 2
196
- # This cop supports unsafe autocorrection (--autocorrect-all).
197
- # Configuration parameters: EnforcedStyleForLeadingUnderscores.
198
- # SupportedStylesForLeadingUnderscores: disallowed, required, optional
199
- Naming/MemoizedInstanceVariableName:
200
- Exclude:
201
- - 'lib/moxml/element.rb'
202
-
203
150
  # Offense count: 16
204
151
  # Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
205
152
  # AllowedNames: as, at, by, cc, db, id, if, in, io, ip, of, on, os, pp, to
@@ -233,12 +180,6 @@ Naming/VariableNumber:
233
180
  - 'spec/moxml/allocation_guard_spec.rb'
234
181
  - 'spec/support/allocation_helper.rb'
235
182
 
236
- # Offense count: 1
237
- # This cop supports unsafe autocorrection (--autocorrect-all).
238
- Performance/TimesMap:
239
- Exclude:
240
- - 'spec/support/allocation_helper.rb'
241
-
242
183
  # Offense count: 5
243
184
  RSpec/BeforeAfterAll:
244
185
  Exclude:
@@ -264,12 +205,12 @@ RSpec/ContextWording:
264
205
  - 'spec/moxml/xpath/parser_spec.rb'
265
206
  - 'spec/performance/benchmark_spec.rb'
266
207
 
267
- # Offense count: 23
208
+ # Offense count: 24
268
209
  # Configuration parameters: IgnoredMetadata.
269
210
  RSpec/DescribeClass:
270
211
  Enabled: false
271
212
 
272
- # Offense count: 271
213
+ # Offense count: 295
273
214
  # Configuration parameters: CountAsOne.
274
215
  RSpec/ExampleLength:
275
216
  Max: 64
@@ -305,7 +246,7 @@ RSpec/LeakyConstantDeclaration:
305
246
  RSpec/MessageSpies:
306
247
  EnforcedStyle: receive
307
248
 
308
- # Offense count: 356
249
+ # Offense count: 390
309
250
  RSpec/MultipleExpectations:
310
251
  Max: 10
311
252
 
@@ -326,12 +267,6 @@ RSpec/NoExpectationExample:
326
267
  Exclude:
327
268
  - 'spec/performance/xpath_benchmark_spec.rb'
328
269
 
329
- # Offense count: 6
330
- RSpec/PendingWithoutReason:
331
- Exclude:
332
- - 'spec/moxml/xpath/functions/position_functions_spec.rb'
333
- - 'spec/moxml/xpath/functions/special_functions_spec.rb'
334
-
335
270
  # Offense count: 4
336
271
  RSpec/RepeatedExample:
337
272
  Exclude:
@@ -371,21 +306,6 @@ Security/Eval:
371
306
  Exclude:
372
307
  - 'spec/moxml/xpath/ruby/generator_spec.rb'
373
308
 
374
- # Offense count: 11
375
- # This cop supports safe autocorrection (--autocorrect).
376
- # Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
377
- # SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
378
- # ProceduralMethods: benchmark, bm, bmbm, create, each_with_object, measure, new, realtime, tap, with_object
379
- # FunctionalMethods: let, let!, subject, watch
380
- # AllowedMethods: lambda, proc, it
381
- Style/BlockDelimiters:
382
- Exclude:
383
- - 'lib/moxml/adapter/ox.rb'
384
- - 'spec/moxml/allocation_benchmark_spec.rb'
385
- - 'spec/moxml/allocation_guard_spec.rb'
386
- - 'spec/moxml/lazy_parse_spec.rb'
387
- - 'spec/moxml/node_cache_spec.rb'
388
-
389
309
  # Offense count: 1
390
310
  Style/DocumentDynamicEvalDefinition:
391
311
  Exclude:
@@ -404,25 +324,17 @@ Style/HashLikeCase:
404
324
  - 'lib/moxml/adapter/customized_rexml/formatter.rb'
405
325
  - 'lib/moxml/adapter/ox.rb'
406
326
 
407
- # Offense count: 1
408
- # This cop supports unsafe autocorrection (--autocorrect-all).
409
- Style/MapToHash:
410
- Exclude:
411
- - 'spec/moxml/node_cache_spec.rb'
412
-
413
327
  # Offense count: 1
414
328
  Style/MissingRespondToMissing:
415
329
  Exclude:
416
330
  - 'lib/moxml/xpath/ruby/node.rb'
417
331
 
418
332
  # Offense count: 1
419
- # This cop supports unsafe autocorrection (--autocorrect-all).
420
- # Configuration parameters: EnforcedStyle, AllowedMethods, AllowedPatterns.
421
- # SupportedStyles: predicate, comparison
422
- Style/NumericPredicate:
333
+ # This cop supports safe autocorrection (--autocorrect).
334
+ # Configuration parameters: AllowMethodComparison, ComparisonsThreshold.
335
+ Style/MultipleComparison:
423
336
  Exclude:
424
- - 'spec/**/*'
425
- - 'lib/moxml/node_set.rb'
337
+ - 'lib/moxml/xpath/compiler.rb'
426
338
 
427
339
  # Offense count: 5
428
340
  # Configuration parameters: AllowedClasses.
@@ -440,16 +352,20 @@ Style/OptionalBooleanParameter:
440
352
  - 'lib/moxml/adapter/libxml.rb'
441
353
  - 'lib/moxml/xpath/compiler.rb'
442
354
 
443
- # Offense count: 1
444
- # This cop supports unsafe autocorrection (--autocorrect-all).
445
- Style/SelectByKind:
355
+ # Offense count: 2
356
+ # This cop supports safe autocorrection (--autocorrect).
357
+ Style/RedundantAssignment:
446
358
  Exclude:
447
359
  - 'lib/moxml/adapter/ox.rb'
448
360
 
449
361
  # Offense count: 1
450
362
  # This cop supports safe autocorrection (--autocorrect).
451
- # Configuration parameters: EnforcedStyle, ConsistentQuotesInMultiline.
452
- # SupportedStyles: single_quotes, double_quotes
453
- Style/StringLiterals:
363
+ Style/RedundantConstantBase:
454
364
  Exclude:
455
- - 'spec/moxml/lazy_parse_spec.rb'
365
+ - 'spec/moxml/adapter/headed_ox_spec.rb'
366
+
367
+ # Offense count: 1
368
+ # This cop supports unsafe autocorrection (--autocorrect-all).
369
+ Style/SelectByKind:
370
+ Exclude:
371
+ - 'spec/integration/shared_examples/node_wrappers/entity_reference_behavior.rb'
data/README.adoc CHANGED
@@ -887,6 +887,18 @@ The Ox adapter provides maximum parsing speed but has XPath limitations.
887
887
  doc.xpath("//book").find { |book| book["id"] == "123" }
888
888
  ----
889
889
 
890
+ **Upstream Ox gem limitations:**
891
+
892
+ These limitations exist in the Ox gem itself and cannot be worked around in Moxml
893
+ without changes to the Ox C extension:
894
+
895
+ * *Namespace introspection* — Ox stores `xmlns` attributes but does not expose
896
+ namespace accessors on `Ox::Element`. Methods like `node.namespace`,
897
+ `node.namespaces`, and namespace inheritance are unavailable.
898
+ * *Parent node reparenting* — Ox has no method to change a node's parent after
899
+ creation, preventing `node.parent=` functionality. Nodes are immutable with
900
+ respect to their parent relationship.
901
+
890
902
  For complete Ox adapter documentation including all limitations and workarounds,
891
903
  see link:docs/_pages/adapters/ox.adoc[Ox Adapter Guide].
892
904
 
@@ -912,6 +924,12 @@ comprehensive pure Ruby XPath 1.0 engine.
912
924
  * Prefer pure Ruby XPath for debugging
913
925
  * Basic namespace queries are sufficient
914
926
 
927
+ **Inherited Ox limitations:**
928
+
929
+ HeadedOx inherits the upstream Ox gem limitations described above (namespace
930
+ introspection and parent node reparenting). Additionally, some sibling axes
931
+ are not fully supported due to Ox's tree structure.
932
+
915
933
  [source,ruby]
916
934
  ----
917
935
  # Use HeadedOx adapter
@@ -8,9 +8,54 @@ module Moxml
8
8
  class Base
9
9
  # include XmlUtils
10
10
 
11
+ # Entity marker for adapters that resolve entities during parsing.
12
+ # U+FFFC (Object Replacement Character) + U+FEFF (BOM) is a two-character
13
+ # sentinel chosen because this exact sequence followed by a valid entity
14
+ # name pattern is vanishingly unlikely in real XML content.
15
+ # Non-standard entities like &copy; are converted to this marker before
16
+ # parsing, then restored during serialization.
17
+ # Standard XML entities (&amp; &lt; &gt; &quot; &apos;) are NOT converted.
18
+ ENTITY_MARKER = "\u{FFFC}\u{FEFF}"
19
+ ENTITY_NAME_PATTERN = "[a-zA-Z_][\\w.:-]*"
20
+ ENTITY_NAME_RE = /&(#{ENTITY_NAME_PATTERN});/
21
+ ENTITY_MARKER_RE = /\u{FFFC}\u{FEFF}(#{ENTITY_NAME_PATTERN});/
22
+ SERIALIZED_ENTITY_MARKER_RE = /(#{ENTITY_NAME_PATTERN});/
23
+ STANDARD_ENTITIES = %w[amp lt gt quot apos].freeze
24
+
11
25
  class << self
12
26
  include XmlUtils
13
27
 
28
+ # Replace non-standard entity references with markers before parsing.
29
+ # Always returns a UTF-8 encoded string.
30
+ def preprocess_entities(xml)
31
+ return "" if xml.nil?
32
+
33
+ str = if xml.encoding == Encoding::BINARY
34
+ # Binary strings are assumed to be UTF-8. If the bytes are
35
+ # not valid UTF-8, fall back to encoding as UTF-8 with
36
+ # replacement to avoid raising on gsub.
37
+ dup = xml.dup.force_encoding("UTF-8")
38
+ dup.valid_encoding? ? dup : xml.dup.encode("UTF-8", "ASCII-8BIT", invalid: :replace, undef: :replace)
39
+ elsif xml.encoding == Encoding::UTF_8
40
+ xml
41
+ else
42
+ xml.encode("UTF-8")
43
+ end
44
+ str.gsub(ENTITY_NAME_RE) do |match|
45
+ STANDARD_ENTITIES.include?(::Regexp.last_match(1)) ? match : "#{ENTITY_MARKER}#{::Regexp.last_match(1)};"
46
+ end
47
+ end
48
+
49
+ # Restore entity markers back to named entity references.
50
+ def restore_entities(text)
51
+ return text unless text.is_a?(String)
52
+
53
+ # Force UTF-8 encoding since markers are UTF-8 characters
54
+ str = text.encoding == Encoding::UTF_8 ? text : text.dup.force_encoding("UTF-8")
55
+ result = str.gsub(ENTITY_MARKER_RE, '&\1;')
56
+ result.gsub(SERIALIZED_ENTITY_MARKER_RE, '&\1;')
57
+ end
58
+
14
59
  def set_root(_doc, _element)
15
60
  raise Moxml::NotImplementedError.new(
16
61
  "set_root not implemented",
@@ -163,6 +208,26 @@ namespace_validation_mode: :strict)
163
208
  child_native
164
209
  end
165
210
 
211
+ # Returns all namespaces in scope for this element, including
212
+ # inherited from ancestors. Adapters with native support (Nokogiri)
213
+ # override this. Default walks the ancestor chain.
214
+ def in_scope_namespaces(element)
215
+ namespaces = {}
216
+ node = element
217
+
218
+ while node
219
+ break unless node_type(node) == :element
220
+
221
+ namespace_definitions(node).each do |ns|
222
+ prefix = namespace_prefix(ns)
223
+ namespaces[prefix] = ns unless namespaces.key?(prefix)
224
+ end
225
+ node = parent(node)
226
+ end
227
+
228
+ namespaces.values
229
+ end
230
+
166
231
  protected
167
232
 
168
233
  def create_native_element(_name, _owner_doc = nil)
@@ -236,14 +301,6 @@ namespace_validation_mode: :strict)
236
301
  adapter: name,
237
302
  )
238
303
  end
239
-
240
- def in_scope_namespaces(_element)
241
- raise Moxml::NotImplementedError.new(
242
- "in_scope_namespaces not implemented",
243
- feature: "in_scope_namespaces",
244
- adapter: name,
245
- )
246
- end
247
304
  end
248
305
  end
249
306
  end
@@ -30,8 +30,9 @@ module Moxml
30
30
  # ~176K allocations per 100-element parse). Lazy parse defers wrapper
31
31
  # creation until nodes are accessed, matching Ox adapter behavior.
32
32
  def parse(xml, options = {}, _context = nil)
33
+ processed_xml = preprocess_entities(xml)
33
34
  native_doc = begin
34
- result = ::Ox.parse(xml)
35
+ result = ::Ox.parse(processed_xml)
35
36
 
36
37
  # result can be either Document or Element
37
38
  if result.is_a?(::Ox::Document)
@@ -56,6 +56,11 @@ module Moxml
56
56
  xml.to_s
57
57
  end
58
58
 
59
+ # Preprocess entities before parsing.
60
+ # This converts the string to UTF-8; LibXML will use the encoding
61
+ # parameter or XML declaration for byte interpretation.
62
+ xml_string = preprocess_entities(xml_string)
63
+
59
64
  # Extract DOCTYPE before parsing
60
65
  doctype_match = xml_string.match(/<!DOCTYPE\s+(\S+)(?:\s+PUBLIC\s+"([^"]+)"\s+"([^"]+)"| \s+SYSTEM\s+"([^"]+)")?\s*>/i)
61
66
 
@@ -842,9 +847,17 @@ module Moxml
842
847
  return [] unless native_node
843
848
  return [] unless native_node.is_a?(::LibXML::XML::Node)
844
849
 
845
- native_node.namespaces.map do |ns|
846
- ns
847
- end
850
+ namespaces = native_node.namespaces
851
+ return [] unless namespaces
852
+
853
+ namespace_list =
854
+ if namespaces.respond_to?(:definitions)
855
+ namespaces.definitions
856
+ else
857
+ namespaces
858
+ end
859
+
860
+ namespace_list.to_a
848
861
  end
849
862
 
850
863
  # Doctype accessor methods
@@ -16,14 +16,18 @@ module Moxml
16
16
  end
17
17
 
18
18
  def parse(xml, options = {}, _context = nil)
19
+ processed_xml = preprocess_entities(xml)
20
+
21
+ # preprocess_entities always returns UTF-8, so tell Nokogiri to
22
+ # parse as UTF-8 regardless of any original encoding option.
19
23
  native_doc = begin
20
24
  if options[:fragment]
21
- ::Nokogiri::XML::DocumentFragment.parse(xml) do |config|
25
+ ::Nokogiri::XML::DocumentFragment.parse(processed_xml) do |config|
22
26
  config.strict.nonet
23
27
  config.recover unless options[:strict]
24
28
  end
25
29
  else
26
- ::Nokogiri::XML(xml, nil, options[:encoding]) do |config|
30
+ ::Nokogiri::XML(processed_xml, nil, "UTF-8") do |config|
27
31
  config.strict.nonet
28
32
  config.recover unless options[:strict]
29
33
  end
@@ -180,10 +184,16 @@ module Moxml
180
184
  def children(node)
181
185
  node.children.reject do |child|
182
186
  child.text? && child.content.strip.empty? &&
183
- !(child.previous_sibling.nil? && child.next_sibling.nil?)
187
+ !(child.previous_sibling.nil? && child.next_sibling.nil?) &&
188
+ !adjacent_to_entity_reference?(child)
184
189
  end
185
190
  end
186
191
 
192
+ def adjacent_to_entity_reference?(node)
193
+ node.previous_sibling.is_a?(::Nokogiri::XML::EntityReference) ||
194
+ node.next_sibling.is_a?(::Nokogiri::XML::EntityReference)
195
+ end
196
+
187
197
  def replace_children(node, new_children)
188
198
  node.children.unlink
189
199
  new_children.each { |child| add_child(node, child) }
@@ -288,7 +298,7 @@ module Moxml
288
298
  end
289
299
 
290
300
  def text_content(node)
291
- node.text
301
+ node.text.to_s
292
302
  end
293
303
 
294
304
  def inner_text(node)