canon 0.1.17 → 0.1.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +93 -7
- data/README.adoc +1 -0
- data/docs/features/match-options/index.adoc +60 -8
- data/docs/understanding/formats/xml.adoc +38 -0
- data/lib/canon/cache.rb +2 -1
- data/lib/canon/comparison/format_detector.rb +15 -1
- data/lib/canon/comparison/whitespace_sensitivity.rb +9 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +3 -0
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +130 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 34ab9a64b52d8598690536908941e38950bb4071fb6222b50d1cb584236e5286
|
|
4
|
+
data.tar.gz: 2dffcc8e29fcd1f75d78595ef73350208e7b369f2183c2df93672b94df7a6376
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 41d784c820a7bbafd9874bf369ef303376578e74f7171e8f00a7ed1ed0b8800576ca67f51272f1464f6f36cbaec35b6f1f1e9806916e7a8c59bc6c04e827ee79
|
|
7
|
+
data.tar.gz: d3e354615ed4b40447ae0be7de64bf6250c40650e25f897a5155f52a74f57a8601003efd37d539abf9f514d5cbe614298a2185db7d72b68b4ff95db6da1b5b99
|
data/.rubocop_todo.yml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# This configuration was generated by
|
|
2
2
|
# `rubocop --auto-gen-config`
|
|
3
|
-
# on 2026-03-24
|
|
3
|
+
# on 2026-03-24 08:58:24 UTC using RuboCop version 1.85.1.
|
|
4
4
|
# The point is for the user to remove these configuration records
|
|
5
5
|
# one by one as the offenses are removed from the code base.
|
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
|
@@ -11,13 +11,63 @@ Gemspec/RequiredRubyVersion:
|
|
|
11
11
|
Exclude:
|
|
12
12
|
- 'canon.gemspec'
|
|
13
13
|
|
|
14
|
-
# Offense count:
|
|
14
|
+
# Offense count: 10
|
|
15
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
16
|
+
# Configuration parameters: EnforcedStyle, IndentationWidth.
|
|
17
|
+
# SupportedStyles: with_first_argument, with_fixed_indentation
|
|
18
|
+
Layout/ArgumentAlignment:
|
|
19
|
+
Exclude:
|
|
20
|
+
- 'lib/canon/xml/data_model.rb'
|
|
21
|
+
- 'spec/canon/comparison/encoding_normalization_spec.rb'
|
|
22
|
+
- 'spec/canon/comparison/xml_whitespace_spec.rb'
|
|
23
|
+
|
|
24
|
+
# Offense count: 1
|
|
25
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
26
|
+
# Configuration parameters: EnforcedStyle, IndentationWidth.
|
|
27
|
+
# SupportedStyles: with_first_element, with_fixed_indentation
|
|
28
|
+
Layout/ArrayAlignment:
|
|
29
|
+
Exclude:
|
|
30
|
+
- 'lib/canon/comparison/format_detector.rb'
|
|
31
|
+
|
|
32
|
+
# Offense count: 1
|
|
33
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
34
|
+
Layout/EmptyLineAfterGuardClause:
|
|
35
|
+
Exclude:
|
|
36
|
+
- 'lib/canon/xml/data_model.rb'
|
|
37
|
+
|
|
38
|
+
# Offense count: 1
|
|
39
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
40
|
+
# Configuration parameters: AllowMultipleStyles, EnforcedHashRocketStyle, EnforcedColonStyle, EnforcedLastArgumentHashStyle.
|
|
41
|
+
# SupportedHashRocketStyles: key, separator, table
|
|
42
|
+
# SupportedColonStyles: key, separator, table
|
|
43
|
+
# SupportedLastArgumentHashStyles: always_inspect, always_ignore, ignore_implicit, ignore_explicit
|
|
44
|
+
Layout/HashAlignment:
|
|
45
|
+
Exclude:
|
|
46
|
+
- 'lib/canon/comparison/format_detector.rb'
|
|
47
|
+
|
|
48
|
+
# Offense count: 831
|
|
15
49
|
# This cop supports safe autocorrection (--autocorrect).
|
|
16
50
|
# Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
|
|
17
51
|
# URISchemes: http, https
|
|
18
52
|
Layout/LineLength:
|
|
19
53
|
Enabled: false
|
|
20
54
|
|
|
55
|
+
# Offense count: 9
|
|
56
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
57
|
+
# Configuration parameters: EnforcedStyle.
|
|
58
|
+
# SupportedStyles: symmetrical, new_line, same_line
|
|
59
|
+
Layout/MultilineMethodCallBraceLayout:
|
|
60
|
+
Exclude:
|
|
61
|
+
- 'spec/canon/comparison/encoding_normalization_spec.rb'
|
|
62
|
+
- 'spec/canon/comparison/xml_whitespace_spec.rb'
|
|
63
|
+
|
|
64
|
+
# Offense count: 2
|
|
65
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
66
|
+
# Configuration parameters: AllowInHeredoc.
|
|
67
|
+
Layout/TrailingWhitespace:
|
|
68
|
+
Exclude:
|
|
69
|
+
- 'lib/canon/comparison/format_detector.rb'
|
|
70
|
+
|
|
21
71
|
# Offense count: 49
|
|
22
72
|
# Configuration parameters: IgnoreLiteralBranches, IgnoreConstantBranches, IgnoreDuplicateElseBranch.
|
|
23
73
|
Lint/DuplicateBranch:
|
|
@@ -47,6 +97,13 @@ Lint/UnreachableCode:
|
|
|
47
97
|
Exclude:
|
|
48
98
|
- 'lib/canon/diff_formatter/debug_output.rb'
|
|
49
99
|
|
|
100
|
+
# Offense count: 1
|
|
101
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
102
|
+
# Configuration parameters: IgnoreEmptyBlocks, AllowUnusedKeywordArguments.
|
|
103
|
+
Lint/UnusedBlockArgument:
|
|
104
|
+
Exclude:
|
|
105
|
+
- 'lib/canon/xml/data_model.rb'
|
|
106
|
+
|
|
50
107
|
# Offense count: 6
|
|
51
108
|
# This cop supports safe autocorrection (--autocorrect).
|
|
52
109
|
# Configuration parameters: AllowUnusedKeywordArguments, IgnoreEmptyMethods, IgnoreNotImplementedMethods, NotImplementedExceptions.
|
|
@@ -58,7 +115,7 @@ Lint/UnusedMethodArgument:
|
|
|
58
115
|
- 'lib/canon/diff_formatter/by_line/xml_formatter.rb'
|
|
59
116
|
- 'lib/canon/diff_formatter/by_object/base_formatter.rb'
|
|
60
117
|
|
|
61
|
-
# Offense count:
|
|
118
|
+
# Offense count: 236
|
|
62
119
|
# Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
|
|
63
120
|
Metrics/AbcSize:
|
|
64
121
|
Enabled: false
|
|
@@ -69,12 +126,12 @@ Metrics/AbcSize:
|
|
|
69
126
|
Metrics/BlockLength:
|
|
70
127
|
Max: 84
|
|
71
128
|
|
|
72
|
-
# Offense count:
|
|
129
|
+
# Offense count: 193
|
|
73
130
|
# Configuration parameters: AllowedMethods, AllowedPatterns, Max.
|
|
74
131
|
Metrics/CyclomaticComplexity:
|
|
75
132
|
Enabled: false
|
|
76
133
|
|
|
77
|
-
# Offense count:
|
|
134
|
+
# Offense count: 403
|
|
78
135
|
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
|
79
136
|
Metrics/MethodLength:
|
|
80
137
|
Max: 95
|
|
@@ -84,7 +141,7 @@ Metrics/MethodLength:
|
|
|
84
141
|
Metrics/ParameterLists:
|
|
85
142
|
Max: 9
|
|
86
143
|
|
|
87
|
-
# Offense count:
|
|
144
|
+
# Offense count: 160
|
|
88
145
|
# Configuration parameters: AllowedMethods, AllowedPatterns, Max.
|
|
89
146
|
Metrics/PerceivedComplexity:
|
|
90
147
|
Enabled: false
|
|
@@ -115,13 +172,19 @@ Performance/CollectionLiteralInLoop:
|
|
|
115
172
|
- 'lib/canon/comparison/html_comparator.rb'
|
|
116
173
|
- 'lib/canon/xml/xml_base_handler.rb'
|
|
117
174
|
|
|
175
|
+
# Offense count: 1
|
|
176
|
+
# This cop supports unsafe autocorrection (--autocorrect-all).
|
|
177
|
+
Performance/UnfreezeString:
|
|
178
|
+
Exclude:
|
|
179
|
+
- 'spec/canon/comparison/encoding_normalization_spec.rb'
|
|
180
|
+
|
|
118
181
|
# Offense count: 68
|
|
119
182
|
# Configuration parameters: Prefixes, AllowedPatterns.
|
|
120
183
|
# Prefixes: when, with, without
|
|
121
184
|
RSpec/ContextWording:
|
|
122
185
|
Enabled: false
|
|
123
186
|
|
|
124
|
-
# Offense count:
|
|
187
|
+
# Offense count: 29
|
|
125
188
|
# Configuration parameters: IgnoredMetadata.
|
|
126
189
|
RSpec/DescribeClass:
|
|
127
190
|
Enabled: false
|
|
@@ -217,6 +280,11 @@ RSpec/NoExpectationExample:
|
|
|
217
280
|
- 'spec/canon/isodoc_blockquotes_spec.rb'
|
|
218
281
|
- 'spec/canon/match_scenarios_spec.rb'
|
|
219
282
|
|
|
283
|
+
# Offense count: 2
|
|
284
|
+
RSpec/RepeatedExample:
|
|
285
|
+
Exclude:
|
|
286
|
+
- 'spec/canon/comparison/encoding_normalization_spec.rb'
|
|
287
|
+
|
|
220
288
|
# Offense count: 7
|
|
221
289
|
# Configuration parameters: CustomTransform, IgnoreMethods, IgnoreMetadata, InflectorPath, EnforcedInflector.
|
|
222
290
|
# SupportedInflectors: default, active_support
|
|
@@ -241,6 +309,17 @@ RSpec/VerifiedDoubles:
|
|
|
241
309
|
- 'spec/canon/diff/xml_serialization_formatter_spec.rb'
|
|
242
310
|
- 'spec/canon/tree_diff/operation_converter_spec.rb'
|
|
243
311
|
|
|
312
|
+
# Offense count: 1
|
|
313
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
314
|
+
# Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
|
|
315
|
+
# SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
|
|
316
|
+
# ProceduralMethods: benchmark, bm, bmbm, create, each_with_object, measure, new, realtime, tap, with_object
|
|
317
|
+
# FunctionalMethods: let, let!, subject, watch
|
|
318
|
+
# AllowedMethods: lambda, proc, it
|
|
319
|
+
Style/BlockDelimiters:
|
|
320
|
+
Exclude:
|
|
321
|
+
- 'spec/canon/comparison/encoding_normalization_spec.rb'
|
|
322
|
+
|
|
244
323
|
# Offense count: 1
|
|
245
324
|
# This cop supports safe autocorrection (--autocorrect).
|
|
246
325
|
# Configuration parameters: EnforcedStyle, AllowComments.
|
|
@@ -263,6 +342,13 @@ Style/IdenticalConditionalBranches:
|
|
|
263
342
|
- 'lib/canon/diff_formatter/by_object/base_formatter.rb'
|
|
264
343
|
- 'lib/canon/diff_formatter/legend.rb'
|
|
265
344
|
|
|
345
|
+
# Offense count: 1
|
|
346
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
347
|
+
# Configuration parameters: AllowMethodComparison, ComparisonsThreshold.
|
|
348
|
+
Style/MultipleComparison:
|
|
349
|
+
Exclude:
|
|
350
|
+
- 'lib/canon/comparison/format_detector.rb'
|
|
351
|
+
|
|
266
352
|
# Offense count: 1
|
|
267
353
|
# Configuration parameters: AllowedMethods.
|
|
268
354
|
# AllowedMethods: respond_to_missing?
|
data/README.adoc
CHANGED
|
@@ -214,6 +214,7 @@ Compare documents based on meaning, not formatting:
|
|
|
214
214
|
* Comment handling with display control
|
|
215
215
|
* Multiple match dimensions with behaviors
|
|
216
216
|
* Predefined match profiles (strict, rendered, spec_friendly, content_only)
|
|
217
|
+
* **Cross-encoding comparison**: Compare XML documents with different character encodings (UTF-8, Shift_JIS, ISO-8859-1, UTF-16) — Canon automatically normalizes to UTF-8 before comparison
|
|
217
218
|
|
|
218
219
|
See link:docs/MATCH_OPTIONS[Match options] for details.
|
|
219
220
|
|
|
data/docs/features/match-options/index.adoc
CHANGED
|
@@ -151,6 +151,43 @@ sensitivity in XML instance documents:
|
|
|
151
151
|
</text>
|
|
152
152
|
----
|
|
153
153
|
|
|
154
|
+
The `xml:space` attribute affects both structural whitespace and text content:
|
|
155
|
+
|
|
156
|
+
* **Structural whitespace** (whitespace-only text nodes between child elements)
|
|
157
|
+
* **Text content whitespace** (whitespace within text nodes)
|
|
158
|
+
|
|
159
|
+
.xml:space with structural_whitespace
|
|
160
|
+
[example]
|
|
161
|
+
====
|
|
162
|
+
[source,ruby]
|
|
163
|
+
----
|
|
164
|
+
# With xml:space="preserve", structural whitespace is preserved
|
|
165
|
+
xml1 = "<root xml:space='preserve'>\n <text>Hello</text>\n</root>"
|
|
166
|
+
xml2 = "<root xml:space='preserve'><text>Hello</text></root>"
|
|
167
|
+
|
|
168
|
+
# These are NOT equivalent (structural whitespace differs)
|
|
169
|
+
Canon::Comparison.equivalent?(xml1, xml2)
|
|
170
|
+
# => false
|
|
171
|
+
----
|
|
172
|
+
====
|
|
173
|
+
|
|
174
|
+
.xml:space with text_content
|
|
175
|
+
[example]
|
|
176
|
+
====
|
|
177
|
+
[source,ruby]
|
|
178
|
+
----
|
|
179
|
+
# With xml:space="preserve", text content whitespace is preserved
|
|
180
|
+
xml1 = '<root xml:space="preserve"><code> indented </code></root>'
|
|
181
|
+
xml2 = '<root xml:space="preserve"><code>indented</code></root>'
|
|
182
|
+
|
|
183
|
+
# These are NOT equivalent (text whitespace differs)
|
|
184
|
+
Canon::Comparison.equivalent?(xml1, xml2,
|
|
185
|
+
match: { text_content: :strict }
|
|
186
|
+
)
|
|
187
|
+
# => false
|
|
188
|
+
----
|
|
189
|
+
====
|
|
190
|
+
|
|
154
191
|
==== Whitelist and blacklist options
|
|
155
192
|
|
|
156
193
|
You can explicitly specify which elements are whitespace-sensitive using either short or long option names:
|
|
@@ -260,29 +297,44 @@ Canon::Comparison.equivalent?(xml1, xml2,
|
|
|
260
297
|
|
|
261
298
|
==== Examples
|
|
262
299
|
|
|
263
|
-
.Using xml:space
|
|
300
|
+
.Using xml:space="preserve" for structural whitespace
|
|
301
|
+
[source,ruby]
|
|
302
|
+
----
|
|
303
|
+
xml1 = "<root xml:space='preserve'>\n <text>Hello</text>\n</root>"
|
|
304
|
+
xml2 = "<root xml:space='preserve'><text>Hello</text></root>"
|
|
305
|
+
|
|
306
|
+
# Structural whitespace differs - NOT equivalent
|
|
307
|
+
Canon::Comparison.equivalent?(xml1, xml2)
|
|
308
|
+
# => false
|
|
309
|
+
----
|
|
310
|
+
|
|
311
|
+
.Using xml:space="preserve" for text content
|
|
264
312
|
[source,ruby]
|
|
265
313
|
----
|
|
266
|
-
xml1 = '<root><code xml:space="preserve">
|
|
267
|
-
xml2 = '<root><code xml:space="preserve">
|
|
314
|
+
xml1 = '<root><code xml:space="preserve"> multiple spaces </code></root>'
|
|
315
|
+
xml2 = '<root><code xml:space="preserve">multiple spaces</code></root>'
|
|
268
316
|
|
|
269
|
-
#
|
|
317
|
+
# Text content whitespace differs - NOT equivalent with text_content: :strict
|
|
270
318
|
Canon::Comparison.equivalent?(xml1, xml2,
|
|
271
|
-
match: {
|
|
319
|
+
match: { text_content: :strict }
|
|
272
320
|
)
|
|
273
321
|
# => false
|
|
274
322
|
----
|
|
275
323
|
|
|
276
|
-
.Using whitelist
|
|
324
|
+
.Using sensitive_elements whitelist
|
|
277
325
|
[source,ruby]
|
|
278
326
|
----
|
|
279
|
-
# Make <
|
|
327
|
+
# Make <sample> elements whitespace-sensitive (strings, not symbols)
|
|
328
|
+
xml1 = "<sample>\n content\n</sample>"
|
|
329
|
+
xml2 = "<sample>content</sample>"
|
|
330
|
+
|
|
280
331
|
Canon::Comparison.equivalent?(xml1, xml2,
|
|
281
332
|
match: {
|
|
282
333
|
structural_whitespace: :strict,
|
|
283
|
-
sensitive_elements: ["
|
|
334
|
+
sensitive_elements: ["sample"]
|
|
284
335
|
}
|
|
285
336
|
)
|
|
337
|
+
# => false (structural whitespace differs in <sample>)
|
|
286
338
|
----
|
|
287
339
|
|
|
288
340
|
.Overriding HTML defaults
|
|
data/docs/understanding/formats/xml.adoc
CHANGED
|
@@ -340,6 +340,44 @@ Special attributes like `xml:lang`, `xml:space`, `xml:id`, and `xml:base` are pr
|
|
|
340
340
|
When `xml:space="preserve"` is set, whitespace is preserved in descendants.
|
|
341
341
|
----
|
|
342
342
|
|
|
343
|
+
=== Cross-encoding comparison
|
|
344
|
+
|
|
345
|
+
Canon automatically normalizes XML character encodings before comparison, enabling
|
|
346
|
+
cross-encoding comparisons to work correctly.
|
|
347
|
+
|
|
348
|
+
**Supported encodings**: UTF-8, UTF-16 (all variants), Shift_JIS, EUC-JP, ISO-8859-1, and more.
|
|
349
|
+
|
|
350
|
+
**How it works**:
|
|
351
|
+
|
|
352
|
+
1. Extract the declared encoding from the XML declaration (e.g., `encoding="Shift_JIS"`)
|
|
353
|
+
2. If declared encoding differs from UTF-8, transcode to UTF-8
|
|
354
|
+
3. Handle cases where the declared encoding doesn't match actual bytes
|
|
355
|
+
4. Use safe transcoding with replacement characters for invalid sequences
|
|
356
|
+
|
|
357
|
+
.Cross-encoding comparison example
|
|
358
|
+
[example]
|
|
359
|
+
====
|
|
360
|
+
[source,ruby]
|
|
361
|
+
----
|
|
362
|
+
# UTF-8 vs Shift_JIS - automatically normalized
|
|
363
|
+
xml1 = "<root>日本語</root>" # UTF-8
|
|
364
|
+
xml2 = "<root>日本語</root>".encode("Shift_JIS") # Shift_JIS
|
|
365
|
+
|
|
366
|
+
Canon::Comparison.equivalent?(xml1, xml2)
|
|
367
|
+
# => true (automatically transcoded to UTF-8 before comparison)
|
|
368
|
+
|
|
369
|
+
# ASCII content works across all encodings
|
|
370
|
+
xml3 = "<root>hello</root>"
|
|
371
|
+
xml4 = "<root>hello</root>".encode("ISO-8859-1")
|
|
372
|
+
|
|
373
|
+
Canon::Comparison.equivalent?(xml3, xml4)
|
|
374
|
+
# => true
|
|
375
|
+
----
|
|
376
|
+
====
|
|
377
|
+
|
|
378
|
+
This means you can compare XML files from different sources or systems without
|
|
379
|
+
worrying about their native encoding.
|
|
380
|
+
|
|
343
381
|
== Usage examples
|
|
344
382
|
|
|
345
383
|
=== Basic XML comparison
|
data/lib/canon/cache.rb
CHANGED
|
@@ -89,7 +89,8 @@ module Canon
|
|
|
89
89
|
# @return [String] Cache key
|
|
90
90
|
def key_for_format_detection(content)
|
|
91
91
|
# Use first 100 chars for quick key, plus length
|
|
92
|
-
|
|
92
|
+
# Force to binary to avoid encoding compatibility issues
|
|
93
|
+
preview = content[0..100].b
|
|
93
94
|
digest = Digest::SHA256.hexdigest(preview + content.length.to_s)
|
|
94
95
|
"fmt:#{digest[0..16]}"
|
|
95
96
|
end
|
|
data/lib/canon/comparison/format_detector.rb
CHANGED
|
@@ -62,7 +62,21 @@ module Canon
|
|
|
62
62
|
# @param str [String] String to detect format of
|
|
63
63
|
# @return [Symbol] Format type
|
|
64
64
|
def detect_string_uncached(str)
|
|
65
|
-
|
|
65
|
+
# Convert to UTF-8 for consistent handling if possible
|
|
66
|
+
# This handles cases like UTF-16 encoded XML that would otherwise fail string operations
|
|
67
|
+
str_utf8 = if ["UTF-16", "UTF-16BE",
|
|
68
|
+
"UTF-16LE"].include?(str.encoding.name)
|
|
69
|
+
begin
|
|
70
|
+
str.encode("UTF-8", str.encoding, invalid: :replace,
|
|
71
|
+
undef: :replace, replace: "?")
|
|
72
|
+
rescue EncodingError
|
|
73
|
+
str.dup.force_encoding("BINARY").encode("UTF-8")
|
|
74
|
+
end
|
|
75
|
+
else
|
|
76
|
+
str
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
trimmed = str_utf8.strip
|
|
66
80
|
|
|
67
81
|
# YAML indicators
|
|
68
82
|
return :yaml if trimmed.start_with?("---")
|
|
data/lib/canon/comparison/whitespace_sensitivity.rb
CHANGED
|
@@ -89,6 +89,15 @@ module Canon
|
|
|
89
89
|
insensitive = (insensitive_raw || []).map(&:to_s)
|
|
90
90
|
return false if insensitive.include?(elem_name)
|
|
91
91
|
|
|
92
|
+
# Check if we should ignore xml:space (user override)
|
|
93
|
+
if respect_xml_space?(match_opts)
|
|
94
|
+
# Check xml:space="preserve" (document declaration)
|
|
95
|
+
return true if xml_space_preserve?(element)
|
|
96
|
+
|
|
97
|
+
# Check xml:space="default" (use configured behavior)
|
|
98
|
+
return false if xml_space_default?(element)
|
|
99
|
+
end
|
|
100
|
+
|
|
92
101
|
# Whitelist: preserve whitespace
|
|
93
102
|
sensitive = resolved_sensitive_elements(match_opts)
|
|
94
103
|
return true if sensitive.include?(elem_name)
|
|
data/lib/canon/comparison/xml_comparator/node_parser.rb
CHANGED
|
@@ -25,6 +25,9 @@ module Canon
|
|
|
25
25
|
preserve_whitespace: preserve_whitespace)
|
|
26
26
|
end
|
|
27
27
|
|
|
28
|
+
# Normalize encoding before preprocessing (UTF-16 strings can't use strip, etc.)
|
|
29
|
+
node = Canon::Xml::DataModel.normalize_encoding(node)
|
|
30
|
+
|
|
28
31
|
# Apply preprocessing to XML string before parsing
|
|
29
32
|
xml_string = apply_preprocessing(node, preprocessing).strip
|
|
30
33
|
|
data/lib/canon/version.rb
CHANGED
data/lib/canon/xml/data_model.rb
CHANGED
|
@@ -21,8 +21,11 @@ module Canon
|
|
|
21
21
|
# @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
|
|
22
22
|
# @return [Nodes::RootNode] Root of the data model tree
|
|
23
23
|
def self.from_xml(xml_string, preserve_whitespace: false)
|
|
24
|
+
# Normalize encoding before parsing
|
|
25
|
+
normalized_xml = normalize_encoding(xml_string)
|
|
26
|
+
|
|
24
27
|
# Parse with Nokogiri
|
|
25
|
-
doc = Nokogiri::XML(
|
|
28
|
+
doc = Nokogiri::XML(normalized_xml, &:nonet)
|
|
26
29
|
|
|
27
30
|
# Check for relative namespace URIs (prohibited by C14N 1.1)
|
|
28
31
|
check_for_relative_namespace_uris(doc)
|
|
@@ -31,6 +34,132 @@ module Canon
|
|
|
31
34
|
build_from_nokogiri(doc, preserve_whitespace: preserve_whitespace)
|
|
32
35
|
end
|
|
33
36
|
|
|
37
|
+
# Normalize XML string encoding to UTF-8
|
|
38
|
+
#
|
|
39
|
+
# Handles cases where:
|
|
40
|
+
# 1. The XML declaration specifies an encoding that doesn't match the actual encoding
|
|
41
|
+
# 2. The string's internal encoding is non-UTF-8 (without a declaration)
|
|
42
|
+
#
|
|
43
|
+
# For case 1, we check if the declared encoding matches the actual bytes.
|
|
44
|
+
# If bytes are valid UTF-8 despite the declaration, we update the declaration to UTF-8.
|
|
45
|
+
#
|
|
46
|
+
# @param xml_string [String] XML string to normalize
|
|
47
|
+
# @return [String] Normalized XML string with UTF-8 encoding
|
|
48
|
+
def self.normalize_encoding(xml_string)
|
|
49
|
+
return xml_string unless xml_string.is_a?(String)
|
|
50
|
+
|
|
51
|
+
# Extract declared encoding from XML declaration
|
|
52
|
+
declared_encoding = extract_xml_encoding(xml_string)
|
|
53
|
+
|
|
54
|
+
if declared_encoding
|
|
55
|
+
# Case 1: XML has a declaration
|
|
56
|
+
if declared_encoding.upcase != "UTF-8"
|
|
57
|
+
# Check if bytes are actually valid UTF-8 despite the declaration
|
|
58
|
+
utf8_reinterpreted = try_utf8_reinterpretation(xml_string)
|
|
59
|
+
if utf8_reinterpreted
|
|
60
|
+
# Bytes are valid UTF-8 - update declaration to UTF-8
|
|
61
|
+
return update_xml_declaration(xml_string, "UTF-8")
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
# Bytes aren't valid UTF-8 - must really be in declared encoding
|
|
65
|
+
return transcode_to_utf8(xml_string, declared_encoding)
|
|
66
|
+
end
|
|
67
|
+
elsif xml_string.encoding.name != "UTF-8"
|
|
68
|
+
# Case 2: No declaration but string encoding is non-UTF-8
|
|
69
|
+
# First, try to re-interpret bytes as UTF-8 (handles mislabeled strings)
|
|
70
|
+
reinterpreted = try_utf8_reinterpretation(xml_string)
|
|
71
|
+
return reinterpreted if reinterpreted
|
|
72
|
+
|
|
73
|
+
# If re-interpretation fails, try transcoding with the labeled encoding
|
|
74
|
+
return transcode_to_utf8(xml_string, xml_string.encoding.name)
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
xml_string
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
# Update the encoding declaration in an XML string
|
|
81
|
+
#
|
|
82
|
+
# @param xml_string [String] XML string
|
|
83
|
+
# @param new_encoding [String] New encoding to declare
|
|
84
|
+
# @return [String] XML string with updated declaration
|
|
85
|
+
def self.update_xml_declaration(xml_string, new_encoding)
|
|
86
|
+
xml_string.sub(/\bencoding\s*=\s*["'][^"']+["']/i) do |_match|
|
|
87
|
+
%(encoding="#{new_encoding}")
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# Transcode string to UTF-8
|
|
92
|
+
#
|
|
93
|
+
# @param xml_string [String] String to transcode
|
|
94
|
+
# @param source_encoding [String] Source encoding to interpret bytes as
|
|
95
|
+
# @return [String] UTF-8 transcoded string
|
|
96
|
+
def self.transcode_to_utf8(xml_string, source_encoding)
|
|
97
|
+
# First, check if the bytes are actually valid UTF-8 despite the declared encoding
|
|
98
|
+
# If so, just re-interpret as UTF-8 (common case: declaration is wrong)
|
|
99
|
+
if source_encoding != "UTF-8"
|
|
100
|
+
# Force the bytes to be interpreted as the declared encoding, then check validity
|
|
101
|
+
forced = xml_string.dup.force_encoding(source_encoding)
|
|
102
|
+
if forced.valid_encoding?
|
|
103
|
+
# Now check if the same bytes are valid UTF-8
|
|
104
|
+
utf8_check = xml_string.dup.force_encoding("UTF-8")
|
|
105
|
+
if utf8_check.valid_encoding?
|
|
106
|
+
# Bytes are valid UTF-8 - the declaration is likely wrong
|
|
107
|
+
# Return the string as UTF-8 (already is)
|
|
108
|
+
return xml_string.dup.force_encoding("UTF-8")
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
# Bytes aren't valid UTF-8, so they must really be in source_encoding
|
|
112
|
+
# Proceed with transcoding
|
|
113
|
+
return forced.encode("UTF-8", source_encoding,
|
|
114
|
+
invalid: :replace,
|
|
115
|
+
undef: :replace,
|
|
116
|
+
replace: "?")
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
# Already UTF-8 or transcoding failed, return as-is
|
|
121
|
+
xml_string.dup.force_encoding("UTF-8")
|
|
122
|
+
rescue EncodingError
|
|
123
|
+
xml_string
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
# Attempt to re-interpret string as UTF-8 if bytes are valid UTF-8
|
|
127
|
+
#
|
|
128
|
+
# This handles the case where a string was incorrectly labeled with a different
|
|
129
|
+
# encoding (e.g., `.encode("Shift_JIS")` on a UTF-8 string) but the actual
|
|
130
|
+
# bytes are valid UTF-8.
|
|
131
|
+
#
|
|
132
|
+
# @param xml_string [String] XML string to check
|
|
133
|
+
# @return [String, nil] UTF-8 re-interpreted string, or nil if not possible
|
|
134
|
+
def self.try_utf8_reinterpretation(xml_string)
|
|
135
|
+
return xml_string if xml_string.encoding.name == "UTF-8"
|
|
136
|
+
|
|
137
|
+
# Try forcing to UTF-8 and see if it's valid
|
|
138
|
+
forced = xml_string.dup.force_encoding("UTF-8")
|
|
139
|
+
return forced if forced.valid_encoding?
|
|
140
|
+
|
|
141
|
+
nil
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
# Extract encoding from XML declaration
|
|
145
|
+
#
|
|
146
|
+
# @param xml_string [String] XML string
|
|
147
|
+
# @return [String, nil] Declared encoding or nil if not found
|
|
148
|
+
def self.extract_xml_encoding(xml_string)
|
|
149
|
+
# Match XML declaration with encoding attribute
|
|
150
|
+
# Handles: <?xml version="1.0" encoding="UTF-8"?>
|
|
151
|
+
# and: <?xml version='1.0' encoding='UTF-8'?>
|
|
152
|
+
#
|
|
153
|
+
# Use binary encoding to avoid encoding compatibility issues
|
|
154
|
+
# when the string has non-ASCII compatible encoding (e.g., UTF-16)
|
|
155
|
+
binary_string = xml_string.dup.force_encoding("BINARY")
|
|
156
|
+
if binary_string =~ /\A\s*<\?xml[^>]*\bencoding\s*=\s*["']([^"']+)["'][^>]*\?>/i
|
|
157
|
+
return Regexp.last_match(1)
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
nil
|
|
161
|
+
end
|
|
162
|
+
|
|
34
163
|
# Alias for compatibility with base class interface
|
|
35
164
|
def self.parse(xml_string)
|
|
36
165
|
from_xml(xml_string)
|