canon 0.1.16 → 0.1.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +93 -7
- data/README.adoc +1 -0
- data/docs/features/match-options/index.adoc +113 -17
- data/docs/lychee.toml +1 -10
- data/docs/understanding/formats/xml.adoc +38 -0
- data/lib/canon/cache.rb +2 -1
- data/lib/canon/comparison/format_detector.rb +15 -1
- data/lib/canon/comparison/markup_comparator.rb +11 -6
- data/lib/canon/comparison/match_options/base_resolver.rb +2 -0
- data/lib/canon/comparison/match_options/xml_resolver.rb +9 -1
- data/lib/canon/comparison/whitespace_sensitivity.rb +82 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +3 -0
- data/lib/canon/comparison/xml_node_comparison.rb +11 -8
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +2 -2
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +130 -1
- data/lib/tasks/benchmark_runner.rb +83 -75
- data/lib/tasks/performance_helpers.rb +47 -3
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 34ab9a64b52d8598690536908941e38950bb4071fb6222b50d1cb584236e5286
|
|
4
|
+
data.tar.gz: 2dffcc8e29fcd1f75d78595ef73350208e7b369f2183c2df93672b94df7a6376
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 41d784c820a7bbafd9874bf369ef303376578e74f7171e8f00a7ed1ed0b8800576ca67f51272f1464f6f36cbaec35b6f1f1e9806916e7a8c59bc6c04e827ee79
|
|
7
|
+
data.tar.gz: d3e354615ed4b40447ae0be7de64bf6250c40650e25f897a5155f52a74f57a8601003efd37d539abf9f514d5cbe614298a2185db7d72b68b4ff95db6da1b5b99
|
data/.rubocop_todo.yml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# This configuration was generated by
|
|
2
2
|
# `rubocop --auto-gen-config`
|
|
3
|
-
# on 2026-03-
|
|
3
|
+
# on 2026-03-24 08:58:24 UTC using RuboCop version 1.85.1.
|
|
4
4
|
# The point is for the user to remove these configuration records
|
|
5
5
|
# one by one as the offenses are removed from the code base.
|
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
|
@@ -11,13 +11,63 @@ Gemspec/RequiredRubyVersion:
|
|
|
11
11
|
Exclude:
|
|
12
12
|
- 'canon.gemspec'
|
|
13
13
|
|
|
14
|
-
# Offense count:
|
|
14
|
+
# Offense count: 10
|
|
15
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
16
|
+
# Configuration parameters: EnforcedStyle, IndentationWidth.
|
|
17
|
+
# SupportedStyles: with_first_argument, with_fixed_indentation
|
|
18
|
+
Layout/ArgumentAlignment:
|
|
19
|
+
Exclude:
|
|
20
|
+
- 'lib/canon/xml/data_model.rb'
|
|
21
|
+
- 'spec/canon/comparison/encoding_normalization_spec.rb'
|
|
22
|
+
- 'spec/canon/comparison/xml_whitespace_spec.rb'
|
|
23
|
+
|
|
24
|
+
# Offense count: 1
|
|
25
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
26
|
+
# Configuration parameters: EnforcedStyle, IndentationWidth.
|
|
27
|
+
# SupportedStyles: with_first_element, with_fixed_indentation
|
|
28
|
+
Layout/ArrayAlignment:
|
|
29
|
+
Exclude:
|
|
30
|
+
- 'lib/canon/comparison/format_detector.rb'
|
|
31
|
+
|
|
32
|
+
# Offense count: 1
|
|
33
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
34
|
+
Layout/EmptyLineAfterGuardClause:
|
|
35
|
+
Exclude:
|
|
36
|
+
- 'lib/canon/xml/data_model.rb'
|
|
37
|
+
|
|
38
|
+
# Offense count: 1
|
|
39
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
40
|
+
# Configuration parameters: AllowMultipleStyles, EnforcedHashRocketStyle, EnforcedColonStyle, EnforcedLastArgumentHashStyle.
|
|
41
|
+
# SupportedHashRocketStyles: key, separator, table
|
|
42
|
+
# SupportedColonStyles: key, separator, table
|
|
43
|
+
# SupportedLastArgumentHashStyles: always_inspect, always_ignore, ignore_implicit, ignore_explicit
|
|
44
|
+
Layout/HashAlignment:
|
|
45
|
+
Exclude:
|
|
46
|
+
- 'lib/canon/comparison/format_detector.rb'
|
|
47
|
+
|
|
48
|
+
# Offense count: 831
|
|
15
49
|
# This cop supports safe autocorrection (--autocorrect).
|
|
16
50
|
# Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
|
|
17
51
|
# URISchemes: http, https
|
|
18
52
|
Layout/LineLength:
|
|
19
53
|
Enabled: false
|
|
20
54
|
|
|
55
|
+
# Offense count: 9
|
|
56
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
57
|
+
# Configuration parameters: EnforcedStyle.
|
|
58
|
+
# SupportedStyles: symmetrical, new_line, same_line
|
|
59
|
+
Layout/MultilineMethodCallBraceLayout:
|
|
60
|
+
Exclude:
|
|
61
|
+
- 'spec/canon/comparison/encoding_normalization_spec.rb'
|
|
62
|
+
- 'spec/canon/comparison/xml_whitespace_spec.rb'
|
|
63
|
+
|
|
64
|
+
# Offense count: 2
|
|
65
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
66
|
+
# Configuration parameters: AllowInHeredoc.
|
|
67
|
+
Layout/TrailingWhitespace:
|
|
68
|
+
Exclude:
|
|
69
|
+
- 'lib/canon/comparison/format_detector.rb'
|
|
70
|
+
|
|
21
71
|
# Offense count: 49
|
|
22
72
|
# Configuration parameters: IgnoreLiteralBranches, IgnoreConstantBranches, IgnoreDuplicateElseBranch.
|
|
23
73
|
Lint/DuplicateBranch:
|
|
@@ -47,6 +97,13 @@ Lint/UnreachableCode:
|
|
|
47
97
|
Exclude:
|
|
48
98
|
- 'lib/canon/diff_formatter/debug_output.rb'
|
|
49
99
|
|
|
100
|
+
# Offense count: 1
|
|
101
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
102
|
+
# Configuration parameters: IgnoreEmptyBlocks, AllowUnusedKeywordArguments.
|
|
103
|
+
Lint/UnusedBlockArgument:
|
|
104
|
+
Exclude:
|
|
105
|
+
- 'lib/canon/xml/data_model.rb'
|
|
106
|
+
|
|
50
107
|
# Offense count: 6
|
|
51
108
|
# This cop supports safe autocorrection (--autocorrect).
|
|
52
109
|
# Configuration parameters: AllowUnusedKeywordArguments, IgnoreEmptyMethods, IgnoreNotImplementedMethods, NotImplementedExceptions.
|
|
@@ -58,7 +115,7 @@ Lint/UnusedMethodArgument:
|
|
|
58
115
|
- 'lib/canon/diff_formatter/by_line/xml_formatter.rb'
|
|
59
116
|
- 'lib/canon/diff_formatter/by_object/base_formatter.rb'
|
|
60
117
|
|
|
61
|
-
# Offense count:
|
|
118
|
+
# Offense count: 236
|
|
62
119
|
# Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
|
|
63
120
|
Metrics/AbcSize:
|
|
64
121
|
Enabled: false
|
|
@@ -69,12 +126,12 @@ Metrics/AbcSize:
|
|
|
69
126
|
Metrics/BlockLength:
|
|
70
127
|
Max: 84
|
|
71
128
|
|
|
72
|
-
# Offense count:
|
|
129
|
+
# Offense count: 193
|
|
73
130
|
# Configuration parameters: AllowedMethods, AllowedPatterns, Max.
|
|
74
131
|
Metrics/CyclomaticComplexity:
|
|
75
132
|
Enabled: false
|
|
76
133
|
|
|
77
|
-
# Offense count:
|
|
134
|
+
# Offense count: 403
|
|
78
135
|
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
|
79
136
|
Metrics/MethodLength:
|
|
80
137
|
Max: 95
|
|
@@ -84,7 +141,7 @@ Metrics/MethodLength:
|
|
|
84
141
|
Metrics/ParameterLists:
|
|
85
142
|
Max: 9
|
|
86
143
|
|
|
87
|
-
# Offense count:
|
|
144
|
+
# Offense count: 160
|
|
88
145
|
# Configuration parameters: AllowedMethods, AllowedPatterns, Max.
|
|
89
146
|
Metrics/PerceivedComplexity:
|
|
90
147
|
Enabled: false
|
|
@@ -115,13 +172,19 @@ Performance/CollectionLiteralInLoop:
|
|
|
115
172
|
- 'lib/canon/comparison/html_comparator.rb'
|
|
116
173
|
- 'lib/canon/xml/xml_base_handler.rb'
|
|
117
174
|
|
|
175
|
+
# Offense count: 1
|
|
176
|
+
# This cop supports unsafe autocorrection (--autocorrect-all).
|
|
177
|
+
Performance/UnfreezeString:
|
|
178
|
+
Exclude:
|
|
179
|
+
- 'spec/canon/comparison/encoding_normalization_spec.rb'
|
|
180
|
+
|
|
118
181
|
# Offense count: 68
|
|
119
182
|
# Configuration parameters: Prefixes, AllowedPatterns.
|
|
120
183
|
# Prefixes: when, with, without
|
|
121
184
|
RSpec/ContextWording:
|
|
122
185
|
Enabled: false
|
|
123
186
|
|
|
124
|
-
# Offense count:
|
|
187
|
+
# Offense count: 29
|
|
125
188
|
# Configuration parameters: IgnoredMetadata.
|
|
126
189
|
RSpec/DescribeClass:
|
|
127
190
|
Enabled: false
|
|
@@ -217,6 +280,11 @@ RSpec/NoExpectationExample:
|
|
|
217
280
|
- 'spec/canon/isodoc_blockquotes_spec.rb'
|
|
218
281
|
- 'spec/canon/match_scenarios_spec.rb'
|
|
219
282
|
|
|
283
|
+
# Offense count: 2
|
|
284
|
+
RSpec/RepeatedExample:
|
|
285
|
+
Exclude:
|
|
286
|
+
- 'spec/canon/comparison/encoding_normalization_spec.rb'
|
|
287
|
+
|
|
220
288
|
# Offense count: 7
|
|
221
289
|
# Configuration parameters: CustomTransform, IgnoreMethods, IgnoreMetadata, InflectorPath, EnforcedInflector.
|
|
222
290
|
# SupportedInflectors: default, active_support
|
|
@@ -241,6 +309,17 @@ RSpec/VerifiedDoubles:
|
|
|
241
309
|
- 'spec/canon/diff/xml_serialization_formatter_spec.rb'
|
|
242
310
|
- 'spec/canon/tree_diff/operation_converter_spec.rb'
|
|
243
311
|
|
|
312
|
+
# Offense count: 1
|
|
313
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
314
|
+
# Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
|
|
315
|
+
# SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
|
|
316
|
+
# ProceduralMethods: benchmark, bm, bmbm, create, each_with_object, measure, new, realtime, tap, with_object
|
|
317
|
+
# FunctionalMethods: let, let!, subject, watch
|
|
318
|
+
# AllowedMethods: lambda, proc, it
|
|
319
|
+
Style/BlockDelimiters:
|
|
320
|
+
Exclude:
|
|
321
|
+
- 'spec/canon/comparison/encoding_normalization_spec.rb'
|
|
322
|
+
|
|
244
323
|
# Offense count: 1
|
|
245
324
|
# This cop supports safe autocorrection (--autocorrect).
|
|
246
325
|
# Configuration parameters: EnforcedStyle, AllowComments.
|
|
@@ -263,6 +342,13 @@ Style/IdenticalConditionalBranches:
|
|
|
263
342
|
- 'lib/canon/diff_formatter/by_object/base_formatter.rb'
|
|
264
343
|
- 'lib/canon/diff_formatter/legend.rb'
|
|
265
344
|
|
|
345
|
+
# Offense count: 1
|
|
346
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
347
|
+
# Configuration parameters: AllowMethodComparison, ComparisonsThreshold.
|
|
348
|
+
Style/MultipleComparison:
|
|
349
|
+
Exclude:
|
|
350
|
+
- 'lib/canon/comparison/format_detector.rb'
|
|
351
|
+
|
|
266
352
|
# Offense count: 1
|
|
267
353
|
# Configuration parameters: AllowedMethods.
|
|
268
354
|
# AllowedMethods: respond_to_missing?
|
data/README.adoc
CHANGED
|
@@ -214,6 +214,7 @@ Compare documents based on meaning, not formatting:
|
|
|
214
214
|
* Comment handling with display control
|
|
215
215
|
* Multiple match dimensions with behaviors
|
|
216
216
|
* Predefined match profiles (strict, rendered, spec_friendly, content_only)
|
|
217
|
+
* **Cross-encoding comparison**: Compare XML documents with different character encodings (UTF-8, Shift_JIS, ISO-8859-1, UTF-16) — Canon automatically normalizes to UTF-8 before comparison
|
|
217
218
|
|
|
218
219
|
See link:docs/MATCH_OPTIONS[Match options] for details.
|
|
219
220
|
|
|
@@ -151,22 +151,72 @@ sensitivity in XML instance documents:
|
|
|
151
151
|
</text>
|
|
152
152
|
----
|
|
153
153
|
|
|
154
|
+
The `xml:space` attribute affects both structural whitespace and text content:
|
|
155
|
+
|
|
156
|
+
* **Structural whitespace** (whitespace-only text nodes between child elements)
|
|
157
|
+
* **Text content whitespace** (whitespace within text nodes)
|
|
158
|
+
|
|
159
|
+
.xml:space with structural_whitespace
|
|
160
|
+
[example]
|
|
161
|
+
====
|
|
162
|
+
[source,ruby]
|
|
163
|
+
----
|
|
164
|
+
# With xml:space="preserve", structural whitespace is preserved
|
|
165
|
+
xml1 = "<root xml:space='preserve'>\n <text>Hello</text>\n</root>"
|
|
166
|
+
xml2 = "<root xml:space='preserve'><text>Hello</text></root>"
|
|
167
|
+
|
|
168
|
+
# These are NOT equivalent (structural whitespace differs)
|
|
169
|
+
Canon::Comparison.equivalent?(xml1, xml2)
|
|
170
|
+
# => false
|
|
171
|
+
----
|
|
172
|
+
====
|
|
173
|
+
|
|
174
|
+
.xml:space with text_content
|
|
175
|
+
[example]
|
|
176
|
+
====
|
|
177
|
+
[source,ruby]
|
|
178
|
+
----
|
|
179
|
+
# With xml:space="preserve", text content whitespace is preserved
|
|
180
|
+
xml1 = '<root xml:space="preserve"><code> indented </code></root>'
|
|
181
|
+
xml2 = '<root xml:space="preserve"><code>indented</code></root>'
|
|
182
|
+
|
|
183
|
+
# These are NOT equivalent (text whitespace differs)
|
|
184
|
+
Canon::Comparison.equivalent?(xml1, xml2,
|
|
185
|
+
match: { text_content: :strict }
|
|
186
|
+
)
|
|
187
|
+
# => false
|
|
188
|
+
----
|
|
189
|
+
====
|
|
190
|
+
|
|
154
191
|
==== Whitelist and blacklist options
|
|
155
192
|
|
|
156
|
-
You can explicitly specify which elements are whitespace-sensitive:
|
|
193
|
+
You can explicitly specify which elements are whitespace-sensitive using either short or long option names:
|
|
157
194
|
|
|
158
195
|
[source,ruby]
|
|
159
196
|
----
|
|
160
|
-
#
|
|
197
|
+
# Short names (preferred)
|
|
198
|
+
Canon::Comparison.equivalent?(xml1, xml2,
|
|
199
|
+
match: {
|
|
200
|
+
structural_whitespace: :strict,
|
|
201
|
+
sensitive_elements: ["pre", "code", "sample"],
|
|
202
|
+
insensitive_elements: ["div", "span"]
|
|
203
|
+
}
|
|
204
|
+
)
|
|
205
|
+
|
|
206
|
+
# Long names (backward-compatible)
|
|
161
207
|
Canon::Comparison.equivalent?(xml1, xml2,
|
|
162
208
|
match: {
|
|
163
209
|
structural_whitespace: :strict,
|
|
164
|
-
whitespace_sensitive_elements: [
|
|
165
|
-
whitespace_insensitive_elements: [
|
|
210
|
+
whitespace_sensitive_elements: ["pre", "code", "sample"],
|
|
211
|
+
whitespace_insensitive_elements: ["div", "span"]
|
|
166
212
|
}
|
|
167
213
|
)
|
|
168
214
|
----
|
|
169
215
|
|
|
216
|
+
**Element names are strings** (not symbols) for consistency with XML/HTML conventions.
|
|
217
|
+
|
|
218
|
+
**Blacklist takes precedence over whitelist** — if an element appears in both lists, whitespace is stripped.
|
|
219
|
+
|
|
170
220
|
==== respect_xml_space option
|
|
171
221
|
|
|
172
222
|
Control whether xml:space attributes in the document are honored:
|
|
@@ -211,34 +261,80 @@ When determining if an element is whitespace-sensitive, Canon uses this priority
|
|
|
211
261
|
|
|
212
262
|
==== Format-specific defaults
|
|
213
263
|
|
|
214
|
-
**HTML**:: `[
|
|
264
|
+
**HTML**:: `["pre", "textarea", "script", "style"]` - These elements preserve whitespace by HTML specification
|
|
215
265
|
**XML**:: `[]` - No default whitespace-sensitive elements, purely user-controlled
|
|
216
266
|
|
|
267
|
+
==== Two types of whitespace sensitivity
|
|
268
|
+
|
|
269
|
+
Canon handles two distinct whitespace concerns:
|
|
270
|
+
|
|
271
|
+
**1. Structural whitespace stripping** — whitespace-only text nodes between sibling elements (indentation, newlines). These are never semantically meaningful and are stripped by default for XML to enable ElementMatcher to work correctly.
|
|
272
|
+
|
|
273
|
+
**2. Text content comparison** — how non-whitespace text content is compared. Controlled by `structural_whitespace` and `text_content` dimension behaviors (`:strict`, `:normalize`, `:ignore`).
|
|
274
|
+
|
|
275
|
+
The `sensitive_elements` / `insensitive_elements` options control both concerns:
|
|
276
|
+
|
|
277
|
+
[source,ruby]
|
|
278
|
+
----
|
|
279
|
+
# For XML: structural whitespace is stripped by default
|
|
280
|
+
# Use sensitive_elements to preserve whitespace in specific elements
|
|
281
|
+
xml1 = "<root><item>Test</item></root>"
|
|
282
|
+
xml2 = "<root>\n <item>Test</item>\n</root>"
|
|
283
|
+
|
|
284
|
+
# Only <item> is whitespace-sensitive; the indentation here sits directly under <root>, so it is still stripped
|
|
285
|
+
Canon::Comparison.equivalent?(xml1, xml2,
|
|
286
|
+
match: {
|
|
287
|
+
structural_whitespace: :strict,
|
|
288
|
+
sensitive_elements: ["item"]
|
|
289
|
+
}
|
|
290
|
+
)
|
|
291
|
+
# => true
|
|
292
|
+
----
|
|
293
|
+
|
|
294
|
+
**Precedence**: blacklist (`insensitive_elements`) > whitelist (`sensitive_elements`) > format defaults
|
|
295
|
+
|
|
296
|
+
**No inheritance**: Only the immediate parent element's name is checked — not ancestor elements.
|
|
297
|
+
|
|
217
298
|
==== Examples
|
|
218
299
|
|
|
219
|
-
.Using xml:space
|
|
300
|
+
.Using xml:space="preserve" for structural whitespace
|
|
301
|
+
[source,ruby]
|
|
302
|
+
----
|
|
303
|
+
xml1 = "<root xml:space='preserve'>\n <text>Hello</text>\n</root>"
|
|
304
|
+
xml2 = "<root xml:space='preserve'><text>Hello</text></root>"
|
|
305
|
+
|
|
306
|
+
# Structural whitespace differs - NOT equivalent
|
|
307
|
+
Canon::Comparison.equivalent?(xml1, xml2)
|
|
308
|
+
# => false
|
|
309
|
+
----
|
|
310
|
+
|
|
311
|
+
.Using xml:space="preserve" for text content
|
|
220
312
|
[source,ruby]
|
|
221
313
|
----
|
|
222
|
-
xml1 = '<root><code xml:space="preserve">
|
|
223
|
-
xml2 = '<root><code xml:space="preserve">
|
|
314
|
+
xml1 = '<root><code xml:space="preserve"> multiple spaces </code></root>'
|
|
315
|
+
xml2 = '<root><code xml:space="preserve">multiple spaces</code></root>'
|
|
224
316
|
|
|
225
|
-
#
|
|
317
|
+
# Text content whitespace differs - NOT equivalent with text_content: :strict
|
|
226
318
|
Canon::Comparison.equivalent?(xml1, xml2,
|
|
227
|
-
match: {
|
|
319
|
+
match: { text_content: :strict }
|
|
228
320
|
)
|
|
229
321
|
# => false
|
|
230
322
|
----
|
|
231
323
|
|
|
232
|
-
.Using whitelist
|
|
324
|
+
.Using sensitive_elements whitelist
|
|
233
325
|
[source,ruby]
|
|
234
326
|
----
|
|
235
|
-
# Make <
|
|
327
|
+
# Make <sample> elements whitespace-sensitive (strings, not symbols)
|
|
328
|
+
xml1 = "<sample>\n content\n</sample>"
|
|
329
|
+
xml2 = "<sample>content</sample>"
|
|
330
|
+
|
|
236
331
|
Canon::Comparison.equivalent?(xml1, xml2,
|
|
237
332
|
match: {
|
|
238
333
|
structural_whitespace: :strict,
|
|
239
|
-
|
|
334
|
+
sensitive_elements: ["sample"]
|
|
240
335
|
}
|
|
241
336
|
)
|
|
337
|
+
# => false (structural whitespace differs in <sample>)
|
|
242
338
|
----
|
|
243
339
|
|
|
244
340
|
.Overriding HTML defaults
|
|
@@ -249,7 +345,7 @@ Canon::Comparison.equivalent?(html1, html2,
|
|
|
249
345
|
format: :html,
|
|
250
346
|
match: {
|
|
251
347
|
structural_whitespace: :strict,
|
|
252
|
-
|
|
348
|
+
insensitive_elements: ["script"]
|
|
253
349
|
}
|
|
254
350
|
)
|
|
255
351
|
----
|
|
@@ -636,12 +732,12 @@ expect(actual).to be_xml_equivalent_to(expected,
|
|
|
636
732
|
element_hierarchy: :ignore
|
|
637
733
|
)
|
|
638
734
|
|
|
639
|
-
# Element-level whitespace sensitivity
|
|
735
|
+
# Element-level whitespace sensitivity (strings, not symbols)
|
|
640
736
|
expect(actual).to be_xml_equivalent_to(expected,
|
|
641
737
|
match: { structural_whitespace: :strict }
|
|
642
738
|
)
|
|
643
739
|
.with_options(
|
|
644
|
-
|
|
740
|
+
sensitive_elements: ["pre", "code", "sample"],
|
|
645
741
|
respect_xml_space: true
|
|
646
742
|
)
|
|
647
743
|
|
|
@@ -650,7 +746,7 @@ expect(html).to be_html_equivalent_to(expected,
|
|
|
650
746
|
match: { structural_whitespace: :strict }
|
|
651
747
|
)
|
|
652
748
|
.with_options(
|
|
653
|
-
|
|
749
|
+
insensitive_elements: ["script", "style"]
|
|
654
750
|
)
|
|
655
751
|
====
|
|
656
752
|
|
data/docs/lychee.toml
CHANGED
|
@@ -9,9 +9,6 @@ max_cache_age = "1d"
|
|
|
9
9
|
# Check both source files and built site
|
|
10
10
|
include_verbatim = true
|
|
11
11
|
|
|
12
|
-
# Recursively check all files
|
|
13
|
-
recursive = true
|
|
14
|
-
|
|
15
12
|
# File types to check (regex patterns)
|
|
16
13
|
include = [
|
|
17
14
|
"_site/**/*.html",
|
|
@@ -50,9 +47,6 @@ user_agent = "lychee/canon-docs-link-checker"
|
|
|
50
47
|
# Check HTTP, HTTPS, and file:// schemes
|
|
51
48
|
scheme = ["https", "http", "file"]
|
|
52
49
|
|
|
53
|
-
# Include file:// URLs for local link checking
|
|
54
|
-
include_file = true
|
|
55
|
-
|
|
56
50
|
# Handle different link types
|
|
57
51
|
include_mail = false # Don't check mailto: links
|
|
58
52
|
|
|
@@ -66,7 +60,4 @@ verbose = "warn"
|
|
|
66
60
|
require_https = false # Don't enforce
|
|
67
61
|
|
|
68
62
|
# Index files for directory URLs
|
|
69
|
-
index_files = ["index.html"]
|
|
70
|
-
|
|
71
|
-
# Ignore patterns file
|
|
72
|
-
ignore_file = ".lycheeignore"
|
|
63
|
+
index_files = ["index.html"]
|
|
@@ -340,6 +340,44 @@ Special attributes like `xml:lang`, `xml:space`, `xml:id`, and `xml:base` are pr
|
|
|
340
340
|
When `xml:space="preserve"` is set, whitespace is preserved in descendants.
|
|
341
341
|
----
|
|
342
342
|
|
|
343
|
+
=== Cross-encoding comparison
|
|
344
|
+
|
|
345
|
+
Canon automatically normalizes XML character encodings before comparison, enabling
|
|
346
|
+
cross-encoding comparisons to work correctly.
|
|
347
|
+
|
|
348
|
+
**Supported encodings**: UTF-8, UTF-16 (all variants), Shift_JIS, EUC-JP, ISO-8859-1, and more.
|
|
349
|
+
|
|
350
|
+
**How it works**:
|
|
351
|
+
|
|
352
|
+
1. Extract the declared encoding from the XML declaration (e.g., `encoding="Shift_JIS"`)
|
|
353
|
+
2. If declared encoding differs from UTF-8, transcode to UTF-8
|
|
354
|
+
3. Handle cases where the declared encoding doesn't match actual bytes
|
|
355
|
+
4. Use safe transcoding with replacement characters for invalid sequences
|
|
356
|
+
|
|
357
|
+
.Cross-encoding comparison example
|
|
358
|
+
[example]
|
|
359
|
+
====
|
|
360
|
+
[source,ruby]
|
|
361
|
+
----
|
|
362
|
+
# UTF-8 vs Shift_JIS - automatically normalized
|
|
363
|
+
xml1 = "<root>日本語</root>" # UTF-8
|
|
364
|
+
xml2 = "<root>日本語</root>".encode("Shift_JIS") # Shift_JIS
|
|
365
|
+
|
|
366
|
+
Canon::Comparison.equivalent?(xml1, xml2)
|
|
367
|
+
# => true (automatically transcoded to UTF-8 before comparison)
|
|
368
|
+
|
|
369
|
+
# ASCII content works across all encodings
|
|
370
|
+
xml3 = "<root>hello</root>"
|
|
371
|
+
xml4 = "<root>hello</root>".encode("ISO-8859-1")
|
|
372
|
+
|
|
373
|
+
Canon::Comparison.equivalent?(xml3, xml4)
|
|
374
|
+
# => true
|
|
375
|
+
----
|
|
376
|
+
====
|
|
377
|
+
|
|
378
|
+
This means you can compare XML files from different sources or systems without
|
|
379
|
+
worrying about their native encoding.
|
|
380
|
+
|
|
343
381
|
== Usage examples
|
|
344
382
|
|
|
345
383
|
=== Basic XML comparison
|
data/lib/canon/cache.rb
CHANGED
|
@@ -89,7 +89,8 @@ module Canon
|
|
|
89
89
|
# @return [String] Cache key
|
|
90
90
|
def key_for_format_detection(content)
|
|
91
91
|
# Use the first 101 characters (indices 0..100) for a quick key, plus length
|
|
92
|
-
|
|
92
|
+
# Force to binary to avoid encoding compatibility issues
|
|
93
|
+
preview = content[0..100].b
|
|
93
94
|
digest = Digest::SHA256.hexdigest(preview + content.length.to_s)
|
|
94
95
|
"fmt:#{digest[0..16]}"
|
|
95
96
|
end
|
|
@@ -62,7 +62,21 @@ module Canon
|
|
|
62
62
|
# @param str [String] String to detect format of
|
|
63
63
|
# @return [Symbol] Format type
|
|
64
64
|
def detect_string_uncached(str)
|
|
65
|
-
|
|
65
|
+
# Convert to UTF-8 for consistent handling if possible
|
|
66
|
+
# This handles cases like UTF-16 encoded XML that would otherwise fail string operations
|
|
67
|
+
str_utf8 = if ["UTF-16", "UTF-16BE",
|
|
68
|
+
"UTF-16LE"].include?(str.encoding.name)
|
|
69
|
+
begin
|
|
70
|
+
str.encode("UTF-8", str.encoding, invalid: :replace,
|
|
71
|
+
undef: :replace, replace: "?")
|
|
72
|
+
rescue EncodingError
|
|
73
|
+
str.dup.force_encoding("BINARY").encode("UTF-8")
|
|
74
|
+
end
|
|
75
|
+
else
|
|
76
|
+
str
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
trimmed = str_utf8.strip
|
|
66
80
|
|
|
67
81
|
# YAML indicators
|
|
68
82
|
return :yaml if trimmed.start_with?("---")
|
|
@@ -174,12 +174,17 @@ module Canon
|
|
|
174
174
|
end
|
|
175
175
|
end
|
|
176
176
|
|
|
177
|
-
#
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
177
|
+
# Strip whitespace-only text nodes based on parent element configuration.
|
|
178
|
+
# Use sensitive_elements / insensitive_elements to control.
|
|
179
|
+
# Blacklist (insensitive) > whitelist (sensitive) > format defaults.
|
|
180
|
+
return false unless text_node?(node) && node.parent
|
|
181
|
+
return false unless MatchOptions.normalize_text(node_text(node)).empty?
|
|
182
|
+
|
|
183
|
+
return true unless WhitespaceSensitivity.whitespace_preserved?(
|
|
184
|
+
node.parent, match_opts
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
false
|
|
183
188
|
|
|
184
189
|
false
|
|
185
190
|
end
|
|
@@ -8,6 +8,12 @@ module Canon
|
|
|
8
8
|
# XML/HTML-specific match options resolver
|
|
9
9
|
class XmlResolver < BaseResolver
|
|
10
10
|
# Format-specific defaults for XML/HTML
|
|
11
|
+
#
|
|
12
|
+
# Sensitive elements (preserve structural whitespace):
|
|
13
|
+
# - XML: none by default — all structural whitespace stripped
|
|
14
|
+
# - HTML: pre, code, textarea, script, style by default
|
|
15
|
+
# Use sensitive_elements option to add elements that preserve whitespace.
|
|
16
|
+
#
|
|
11
17
|
FORMAT_DEFAULTS = {
|
|
12
18
|
html: {
|
|
13
19
|
preprocessing: :rendered,
|
|
@@ -33,7 +39,9 @@ module Canon
|
|
|
33
39
|
|
|
34
40
|
# Predefined match profiles for XML/HTML
|
|
35
41
|
MATCH_PROFILES = {
|
|
36
|
-
# Strict: Match exactly as written in source (XML default)
|
|
42
|
+
# Strict: Match exactly as written in source (XML default).
|
|
43
|
+
# Structural whitespace is stripped by default for XML.
|
|
44
|
+
# Use sensitive_elements to preserve structural whitespace in specific elements.
|
|
37
45
|
strict: {
|
|
38
46
|
preprocessing: :none,
|
|
39
47
|
text_content: :strict,
|
|
@@ -66,6 +66,88 @@ module Canon
|
|
|
66
66
|
element_sensitive?(node, opts)
|
|
67
67
|
end
|
|
68
68
|
|
|
69
|
+
# Check if structural whitespace is preserved (not stripped) for an element.
|
|
70
|
+
#
|
|
71
|
+
# Uses sensitive_elements (whitelist) and insensitive_elements (blacklist)
|
|
72
|
+
# from match_opts. Blacklist takes precedence over whitelist.
|
|
73
|
+
# Format defaults apply when neither is configured.
|
|
74
|
+
#
|
|
75
|
+
# No inheritance from ancestors — checks only the immediate parent element name.
|
|
76
|
+
#
|
|
77
|
+
# @param element [Object] Element node to check
|
|
78
|
+
# @param match_opts [Hash] Resolved match options
|
|
79
|
+
# @return [Boolean] true if whitespace is preserved (not stripped)
|
|
80
|
+
def whitespace_preserved?(element, match_opts)
|
|
81
|
+
return false unless element
|
|
82
|
+
return false unless element.respond_to?(:name)
|
|
83
|
+
|
|
84
|
+
elem_name = element.name.to_s
|
|
85
|
+
|
|
86
|
+
# Blacklist: always strip (highest priority)
|
|
87
|
+
insensitive_raw = match_opts[:insensitive_elements]
|
|
88
|
+
insensitive_raw ||= match_opts[:whitespace_insensitive_elements]
|
|
89
|
+
insensitive = (insensitive_raw || []).map(&:to_s)
|
|
90
|
+
return false if insensitive.include?(elem_name)
|
|
91
|
+
|
|
92
|
+
# Honor xml:space attributes only when respect_xml_space? allows it (user can override)
|
|
93
|
+
if respect_xml_space?(match_opts)
|
|
94
|
+
# Check xml:space="preserve" (document declaration)
|
|
95
|
+
return true if xml_space_preserve?(element)
|
|
96
|
+
|
|
97
|
+
# xml:space="default": report not preserved (strip) here, before the whitelist below is consulted
|
|
98
|
+
return false if xml_space_default?(element)
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
# Whitelist: preserve whitespace
|
|
102
|
+
sensitive = resolved_sensitive_elements(match_opts)
|
|
103
|
+
return true if sensitive.include?(elem_name)
|
|
104
|
+
|
|
105
|
+
# Default: preserve for HTML, strip for XML
|
|
106
|
+
format = match_opts[:format] || :xml
|
|
107
|
+
case format
|
|
108
|
+
when :html, :html4, :html5
|
|
109
|
+
true
|
|
110
|
+
else
|
|
111
|
+
false
|
|
112
|
+
end
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
# Get resolved list of whitespace-sensitive element names (strings).
|
|
116
|
+
#
|
|
117
|
+
# Combines format defaults + user whitelist, minus user blacklist.
|
|
118
|
+
# Supports both short names (sensitive_elements) and long names
|
|
119
|
+
# (whitespace_sensitive_elements) for backward compatibility.
|
|
120
|
+
#
|
|
121
|
+
# @param match_opts [Hash] Resolved match options
|
|
122
|
+
# @return [Array<String>] Sensitive element names
|
|
123
|
+
def resolved_sensitive_elements(match_opts)
|
|
124
|
+
sensitive = []
|
|
125
|
+
|
|
126
|
+
# 1. Format defaults
|
|
127
|
+
format = match_opts[:format] || :xml
|
|
128
|
+
case format
|
|
129
|
+
when :html, :html4, :html5
|
|
130
|
+
sensitive += %w[pre code textarea script style]
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
# 2. User whitelist (additive to format defaults)
|
|
134
|
+
whitelist = match_opts[:sensitive_elements]
|
|
135
|
+
whitelist ||= match_opts[:whitespace_sensitive_elements]
|
|
136
|
+
if whitelist
|
|
137
|
+
sensitive += whitelist.map(&:to_s)
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
# 3. User blacklist removes from combined set
|
|
141
|
+
blacklist_raw = match_opts[:insensitive_elements]
|
|
142
|
+
blacklist_raw ||= match_opts[:whitespace_insensitive_elements]
|
|
143
|
+
if blacklist_raw
|
|
144
|
+
blacklist = blacklist_raw.to_set(&:to_s)
|
|
145
|
+
sensitive.reject! { |e| blacklist.include?(e) }
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
sensitive.uniq
|
|
149
|
+
end
|
|
150
|
+
|
|
69
151
|
# Get format-specific default sensitive elements
|
|
70
152
|
#
|
|
71
153
|
# This is the SINGLE SOURCE OF TRUTH for default whitespace-sensitive
|
|
@@ -25,6 +25,9 @@ module Canon
|
|
|
25
25
|
preserve_whitespace: preserve_whitespace)
|
|
26
26
|
end
|
|
27
27
|
|
|
28
|
+
# Normalize encoding before preprocessing (UTF-16 strings can't use strip, etc.)
|
|
29
|
+
node = Canon::Xml::DataModel.normalize_encoding(node)
|
|
30
|
+
|
|
28
31
|
# Apply preprocessing to XML string before parsing
|
|
29
32
|
xml_string = apply_preprocessing(node, preprocessing).strip
|
|
30
33
|
|
|
@@ -190,14 +190,17 @@ diff_children, differences)
|
|
|
190
190
|
end
|
|
191
191
|
end
|
|
192
192
|
|
|
193
|
-
#
|
|
194
|
-
#
|
|
195
|
-
#
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
193
|
+
# Strip whitespace-only text nodes based on parent element configuration.
|
|
194
|
+
# Use sensitive_elements / insensitive_elements to control.
|
|
195
|
+
# Blacklist (insensitive) > whitelist (sensitive) > format defaults.
|
|
196
|
+
return false unless text_node?(node) && node.parent
|
|
197
|
+
return false unless MatchOptions.normalize_text(node_text(node)).empty?
|
|
198
|
+
|
|
199
|
+
return true unless WhitespaceSensitivity.whitespace_preserved?(
|
|
200
|
+
node.parent, match_opts
|
|
201
|
+
)
|
|
202
|
+
|
|
203
|
+
false
|
|
201
204
|
|
|
202
205
|
false
|
|
203
206
|
end
|
|
@@ -326,9 +326,9 @@ module Canon
|
|
|
326
326
|
# Handle cases where one node is missing (e.g. text added or removed)
|
|
327
327
|
if node1.nil? || node2.nil?
|
|
328
328
|
if node1.nil?
|
|
329
|
-
text2 = node2
|
|
329
|
+
text2 = NodeUtils.get_node_text(node2)
|
|
330
330
|
else
|
|
331
|
-
text1 = node1
|
|
331
|
+
text1 = NodeUtils.get_node_text(node1)
|
|
332
332
|
end
|
|
333
333
|
end
|
|
334
334
|
|
data/lib/canon/version.rb
CHANGED
data/lib/canon/xml/data_model.rb
CHANGED
|
@@ -21,8 +21,11 @@ module Canon
|
|
|
21
21
|
# @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
|
|
22
22
|
# @return [Nodes::RootNode] Root of the data model tree
|
|
23
23
|
def self.from_xml(xml_string, preserve_whitespace: false)
|
|
24
|
+
# Normalize encoding before parsing
|
|
25
|
+
normalized_xml = normalize_encoding(xml_string)
|
|
26
|
+
|
|
24
27
|
# Parse with Nokogiri
|
|
25
|
-
doc = Nokogiri::XML(
|
|
28
|
+
doc = Nokogiri::XML(normalized_xml, &:nonet)
|
|
26
29
|
|
|
27
30
|
# Check for relative namespace URIs (prohibited by C14N 1.1)
|
|
28
31
|
check_for_relative_namespace_uris(doc)
|
|
@@ -31,6 +34,132 @@ module Canon
|
|
|
31
34
|
build_from_nokogiri(doc, preserve_whitespace: preserve_whitespace)
|
|
32
35
|
end
|
|
33
36
|
|
|
37
|
+
# Normalize XML string encoding to UTF-8
|
|
38
|
+
#
|
|
39
|
+
# Handles cases where:
|
|
40
|
+
# 1. The XML declaration specifies an encoding that doesn't match the actual encoding
|
|
41
|
+
# 2. The string's internal encoding is non-UTF-8 (without a declaration)
|
|
42
|
+
#
|
|
43
|
+
# For case 1, we check if the declared encoding matches the actual bytes.
|
|
44
|
+
# If bytes are valid UTF-8 despite the declaration, we update the declaration to UTF-8.
|
|
45
|
+
#
|
|
46
|
+
# @param xml_string [String] XML string to normalize
|
|
47
|
+
# @return [String] Normalized XML string with UTF-8 encoding
|
|
48
|
+
def self.normalize_encoding(xml_string)
|
|
49
|
+
return xml_string unless xml_string.is_a?(String)
|
|
50
|
+
|
|
51
|
+
# Extract declared encoding from XML declaration
|
|
52
|
+
declared_encoding = extract_xml_encoding(xml_string)
|
|
53
|
+
|
|
54
|
+
if declared_encoding
|
|
55
|
+
# Case 1: XML has a declaration
|
|
56
|
+
if declared_encoding.upcase != "UTF-8"
|
|
57
|
+
# Check if bytes are actually valid UTF-8 despite the declaration
|
|
58
|
+
utf8_reinterpreted = try_utf8_reinterpretation(xml_string)
|
|
59
|
+
if utf8_reinterpreted
|
|
60
|
+
# Bytes are valid UTF-8 - update declaration to UTF-8
|
|
61
|
+
return update_xml_declaration(xml_string, "UTF-8")
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
# Bytes aren't valid UTF-8 - must really be in declared encoding
|
|
65
|
+
return transcode_to_utf8(xml_string, declared_encoding)
|
|
66
|
+
end
|
|
67
|
+
elsif xml_string.encoding.name != "UTF-8"
|
|
68
|
+
# Case 2: No declaration but string encoding is non-UTF-8
|
|
69
|
+
# First, try to re-interpret bytes as UTF-8 (handles mislabeled strings)
|
|
70
|
+
reinterpreted = try_utf8_reinterpretation(xml_string)
|
|
71
|
+
return reinterpreted if reinterpreted
|
|
72
|
+
|
|
73
|
+
# If re-interpretation fails, try transcoding with the labeled encoding
|
|
74
|
+
return transcode_to_utf8(xml_string, xml_string.encoding.name)
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
xml_string
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
# Update the encoding declaration in an XML string
|
|
81
|
+
#
|
|
82
|
+
# @param xml_string [String] XML string
|
|
83
|
+
# @param new_encoding [String] New encoding to declare
|
|
84
|
+
# @return [String] XML string with updated declaration
|
|
85
|
+
def self.update_xml_declaration(xml_string, new_encoding)
|
|
86
|
+
xml_string.sub(/\bencoding\s*=\s*["'][^"']+["']/i) do |_match|
|
|
87
|
+
%(encoding="#{new_encoding}")
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# Transcode string to UTF-8
|
|
92
|
+
#
|
|
93
|
+
# @param xml_string [String] String to transcode
|
|
94
|
+
# @param source_encoding [String] Source encoding to interpret bytes as
|
|
95
|
+
# @return [String] UTF-8 transcoded string
|
|
96
|
+
def self.transcode_to_utf8(xml_string, source_encoding)
|
|
97
|
+
# First, check if the bytes are actually valid UTF-8 despite the declared encoding
|
|
98
|
+
# If so, just re-interpret as UTF-8 (common case: declaration is wrong)
|
|
99
|
+
if source_encoding != "UTF-8"
|
|
100
|
+
# Force the bytes to be interpreted as the declared encoding, then check validity
|
|
101
|
+
forced = xml_string.dup.force_encoding(source_encoding)
|
|
102
|
+
if forced.valid_encoding?
|
|
103
|
+
# Now check if the same bytes are valid UTF-8
|
|
104
|
+
utf8_check = xml_string.dup.force_encoding("UTF-8")
|
|
105
|
+
if utf8_check.valid_encoding?
|
|
106
|
+
# Bytes are valid UTF-8 - the declaration is likely wrong
|
|
107
|
+
# Return the string as UTF-8 (already is)
|
|
108
|
+
return xml_string.dup.force_encoding("UTF-8")
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
# Bytes aren't valid UTF-8, so they must really be in source_encoding
|
|
112
|
+
# Proceed with transcoding
|
|
113
|
+
return forced.encode("UTF-8", source_encoding,
|
|
114
|
+
invalid: :replace,
|
|
115
|
+
undef: :replace,
|
|
116
|
+
replace: "?")
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
# Already UTF-8 or transcoding failed, return as-is
|
|
121
|
+
xml_string.dup.force_encoding("UTF-8")
|
|
122
|
+
rescue EncodingError
|
|
123
|
+
xml_string
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
# Attempt to re-interpret string as UTF-8 if bytes are valid UTF-8
|
|
127
|
+
#
|
|
128
|
+
# This handles the case where a string was incorrectly labeled with a different
|
|
129
|
+
# encoding (e.g., `.encode("Shift_JIS")` on a UTF-8 string) but the actual
|
|
130
|
+
# bytes are valid UTF-8.
|
|
131
|
+
#
|
|
132
|
+
# @param xml_string [String] XML string to check
|
|
133
|
+
# @return [String, nil] UTF-8 re-interpreted string, or nil if not possible
|
|
134
|
+
def self.try_utf8_reinterpretation(xml_string)
|
|
135
|
+
return xml_string if xml_string.encoding.name == "UTF-8"
|
|
136
|
+
|
|
137
|
+
# Try forcing to UTF-8 and see if it's valid
|
|
138
|
+
forced = xml_string.dup.force_encoding("UTF-8")
|
|
139
|
+
return forced if forced.valid_encoding?
|
|
140
|
+
|
|
141
|
+
nil
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
# Extract encoding from XML declaration
|
|
145
|
+
#
|
|
146
|
+
# @param xml_string [String] XML string
|
|
147
|
+
# @return [String, nil] Declared encoding or nil if not found
|
|
148
|
+
def self.extract_xml_encoding(xml_string)
|
|
149
|
+
# Match XML declaration with encoding attribute
|
|
150
|
+
# Handles: <?xml version="1.0" encoding="UTF-8"?>
|
|
151
|
+
# and: <?xml version='1.0' encoding='UTF-8'?>
|
|
152
|
+
#
|
|
153
|
+
# Use binary encoding to avoid encoding compatibility issues
|
|
154
|
+
# when the string has non-ASCII compatible encoding (e.g., UTF-16)
|
|
155
|
+
binary_string = xml_string.dup.force_encoding("BINARY")
|
|
156
|
+
if binary_string =~ /\A\s*<\?xml[^>]*\bencoding\s*=\s*["']([^"']+)["'][^>]*\?>/i
|
|
157
|
+
return Regexp.last_match(1)
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
nil
|
|
161
|
+
end
|
|
162
|
+
|
|
34
163
|
# Alias for compatibility with base class interface
|
|
35
164
|
def self.parse(xml_string)
|
|
36
165
|
from_xml(xml_string)
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "benchmark/ips"
|
|
4
|
+
require "table_tennis"
|
|
4
5
|
|
|
5
6
|
# Ensure lib/ is on the load path regardless of tmp location
|
|
6
7
|
lib_path = File.expand_path(File.join(__dir__, "..", "..", "lib"))
|
|
@@ -101,7 +102,8 @@ class BenchmarkRunner
|
|
|
101
102
|
end
|
|
102
103
|
|
|
103
104
|
# Category section with description
|
|
104
|
-
def self.category(title, icon:, description:, failure_means:, compare_against: nil)
|
|
105
|
+
def self.category(title, icon:, description:, failure_means:,
|
|
106
|
+
compare_against: nil)
|
|
105
107
|
puts
|
|
106
108
|
puts "#{CYAN}#{VL}#{CLEAR} #{BOLD}#{MAGENTA}#{icon} #{title}#{CLEAR}"
|
|
107
109
|
puts
|
|
@@ -124,26 +126,21 @@ class BenchmarkRunner
|
|
|
124
126
|
puts
|
|
125
127
|
end
|
|
126
128
|
|
|
127
|
-
# Results table for a category
|
|
128
|
-
def self.
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
129
|
+
# Results table for a category using TableTennis
|
|
130
|
+
def self.table(results)
|
|
131
|
+
rows = results.map do |r|
|
|
132
|
+
{
|
|
133
|
+
test: r[:name],
|
|
134
|
+
ips: r[:ips],
|
|
135
|
+
deviation: "#{r[:deviation].round(1)}%",
|
|
136
|
+
status: r[:is_best] ? "BEST" : "",
|
|
137
|
+
}
|
|
138
|
+
end
|
|
133
139
|
|
|
134
|
-
|
|
135
|
-
speedup_str = speedup ? " ⚡#{speedup.round(2)}x" : ""
|
|
136
|
-
label_str = is_best ? "#{GREEN}#{label}#{CLEAR}" : label
|
|
137
|
-
bar = render_bar(ips)
|
|
140
|
+
return if rows.empty?
|
|
138
141
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
puts
|
|
142
|
-
end
|
|
143
|
-
|
|
144
|
-
def self.table_footer
|
|
145
|
-
sep(char: "─", width: 76)
|
|
146
|
-
puts
|
|
142
|
+
table = TableTennis.new(rows, theme: :dark)
|
|
143
|
+
table.render
|
|
147
144
|
end
|
|
148
145
|
|
|
149
146
|
def self.speedup_badge(factor, label)
|
|
@@ -151,24 +148,7 @@ class BenchmarkRunner
|
|
|
151
148
|
puts " #{GREEN} #{factor.round(2)}x faster#{CLEAR}"
|
|
152
149
|
end
|
|
153
150
|
|
|
154
|
-
|
|
155
|
-
@max_ips = nil
|
|
156
|
-
end
|
|
157
|
-
|
|
158
|
-
def self.set_max_ips(val)
|
|
159
|
-
@max_ips = val
|
|
160
|
-
end
|
|
161
|
-
|
|
162
|
-
def self.render_bar(ips, max_width: 20)
|
|
163
|
-
@max_ips ||= ips
|
|
164
|
-
ratio = ips / @max_ips.to_f
|
|
165
|
-
width = [(ratio * max_width).round, 1].max
|
|
166
|
-
filled = [width, max_width].min
|
|
167
|
-
empty = max_width - filled
|
|
168
|
-
("█" * filled) + ("░" * empty)
|
|
169
|
-
end
|
|
170
|
-
|
|
171
|
-
# Summary card
|
|
151
|
+
# Summary card using TableTennis
|
|
172
152
|
def self.summary_card(results)
|
|
173
153
|
puts
|
|
174
154
|
sep(width: 78)
|
|
@@ -176,16 +156,23 @@ class BenchmarkRunner
|
|
|
176
156
|
puts " #{BOLD}#{MAGENTA}SUMMARY#{CLEAR}"
|
|
177
157
|
puts
|
|
178
158
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
puts " #{DIM}◆#{CLEAR} #{format('%-35s', r[:label])} #{ips_str}"
|
|
159
|
+
rows = results.map do |r|
|
|
160
|
+
{
|
|
161
|
+
benchmark: r[:label],
|
|
162
|
+
ips: r[:ips]&.round(1),
|
|
163
|
+
}
|
|
185
164
|
end
|
|
186
165
|
|
|
166
|
+
return if rows.empty?
|
|
167
|
+
|
|
168
|
+
table = TableTennis.new(rows,
|
|
169
|
+
title: "Performance Results",
|
|
170
|
+
theme: :dark,
|
|
171
|
+
headers: { benchmark: "Benchmark", ips: "IPS" })
|
|
172
|
+
table.render
|
|
173
|
+
|
|
187
174
|
puts
|
|
188
|
-
puts " #{DIM}#{
|
|
175
|
+
puts " #{DIM}#{results.length} benchmarks completed#{CLEAR}"
|
|
189
176
|
puts
|
|
190
177
|
end
|
|
191
178
|
end
|
|
@@ -239,24 +226,35 @@ class BenchmarkRunner
|
|
|
239
226
|
# Test definitions
|
|
240
227
|
BENCHMARKS = {
|
|
241
228
|
xml_parsing: [
|
|
242
|
-
{ name: "DOM (simple)", method: :xml_parse_dom_simple,
|
|
243
|
-
|
|
244
|
-
{ name: "
|
|
245
|
-
|
|
229
|
+
{ name: "DOM (simple)", method: :xml_parse_dom_simple,
|
|
230
|
+
desc: "Standard DOM parsing" },
|
|
231
|
+
{ name: "SAX (simple)", method: :xml_parse_sax_simple,
|
|
232
|
+
desc: "Streaming SAX parsing" },
|
|
233
|
+
{ name: "DOM (large)", method: :xml_parse_dom_large,
|
|
234
|
+
desc: "Large document DOM" },
|
|
235
|
+
{ name: "SAX (large)", method: :xml_parse_sax_large,
|
|
236
|
+
desc: "Large document SAX" },
|
|
246
237
|
],
|
|
247
238
|
html_parsing: [
|
|
248
239
|
{ name: "Simple HTML", method: :html_parse_simple, desc: "Basic HTML" },
|
|
249
|
-
{ name: "Complex HTML", method: :html_parse_complex,
|
|
240
|
+
{ name: "Complex HTML", method: :html_parse_complex,
|
|
241
|
+
desc: "HTML with scripts/tables" },
|
|
250
242
|
],
|
|
251
243
|
xml_comparison: [
|
|
252
|
-
{ name: "Identical XML", method: :xml_compare_identical,
|
|
253
|
-
|
|
254
|
-
{ name: "
|
|
244
|
+
{ name: "Identical XML", method: :xml_compare_identical,
|
|
245
|
+
desc: "Same documents" },
|
|
246
|
+
{ name: "Similar XML", method: :xml_compare_similar,
|
|
247
|
+
desc: "Slightly different" },
|
|
248
|
+
{ name: "Different XML", method: :xml_compare_different,
|
|
249
|
+
desc: "Different namespaces" },
|
|
255
250
|
],
|
|
256
251
|
html_comparison: [
|
|
257
|
-
{ name: "Identical HTML", method: :html_compare_identical,
|
|
258
|
-
|
|
259
|
-
{ name: "
|
|
252
|
+
{ name: "Identical HTML", method: :html_compare_identical,
|
|
253
|
+
desc: "Same HTML" },
|
|
254
|
+
{ name: "Similar HTML", method: :html_compare_similar,
|
|
255
|
+
desc: "Slightly different" },
|
|
256
|
+
{ name: "Different HTML", method: :html_compare_different,
|
|
257
|
+
desc: "Different structure" },
|
|
260
258
|
],
|
|
261
259
|
formatting: [
|
|
262
260
|
{ name: "XML C14N", method: :xml_c14n_format, desc: "Canonical XML" },
|
|
@@ -287,7 +285,8 @@ class BenchmarkRunner
|
|
|
287
285
|
end.join
|
|
288
286
|
"<#{prefix}root#{ns_attr}#{attrs}>#{children}</#{prefix}root>"
|
|
289
287
|
else
|
|
290
|
-
child = build_xml_element(items / 2, depth - 1, prefix, with_attrs, "")
|
|
288
|
+
child = build_xml_element(items / 2, depth - 1, prefix, with_attrs,
|
|
289
|
+
"")
|
|
291
290
|
"<#{prefix}root#{ns_attr}#{attrs}>#{child}</#{prefix}root>"
|
|
292
291
|
end
|
|
293
292
|
end
|
|
@@ -401,8 +400,6 @@ class BenchmarkRunner
|
|
|
401
400
|
end
|
|
402
401
|
|
|
403
402
|
def run_benchmarks
|
|
404
|
-
Term.reset_max_ips
|
|
405
|
-
|
|
406
403
|
# Header
|
|
407
404
|
Term.header("Canon Performance Benchmarks", color: Term::CYAN)
|
|
408
405
|
|
|
@@ -434,8 +431,6 @@ class BenchmarkRunner
|
|
|
434
431
|
compare_against: config[:compare_against],
|
|
435
432
|
)
|
|
436
433
|
|
|
437
|
-
Term.table_header
|
|
438
|
-
|
|
439
434
|
# Run each test in category
|
|
440
435
|
category_results = []
|
|
441
436
|
max_ips = 0
|
|
@@ -457,23 +452,32 @@ class BenchmarkRunner
|
|
|
457
452
|
$stdout = original_stdout
|
|
458
453
|
end
|
|
459
454
|
|
|
460
|
-
#
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
# Print results with relative bars
|
|
464
|
-
category_results.each do |r|
|
|
455
|
+
# Build results for TableTennis table
|
|
456
|
+
table_rows = category_results.map do |r|
|
|
465
457
|
is_best = r[:result][:upper] >= max_ips
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
458
|
+
label = "#{config[:name]}: #{r[:name]}"
|
|
459
|
+
@all_results << { label: label,
|
|
460
|
+
ips: (r[:result][:lower] + r[:result][:upper]) / 2.0 }
|
|
461
|
+
@results[label] = r[:result] # Populate @results for comparison
|
|
462
|
+
{
|
|
463
|
+
name: r[:name],
|
|
464
|
+
ips: (r[:result][:lower] + r[:result][:upper]) / 2.0,
|
|
465
|
+
deviation: calculate_deviation(r[:result]),
|
|
466
|
+
is_best: is_best,
|
|
467
|
+
}
|
|
469
468
|
end
|
|
470
469
|
|
|
471
|
-
|
|
470
|
+
# Render TableTennis table
|
|
471
|
+
Term.table(table_rows)
|
|
472
472
|
|
|
473
473
|
# SAX vs DOM comparison for XML parsing
|
|
474
474
|
if category == :xml_parsing && SAX_AVAILABLE
|
|
475
|
-
sax = category_results.find
|
|
476
|
-
|
|
475
|
+
sax = category_results.find do |r|
|
|
476
|
+
r[:name].include?("SAX") && r[:name].include?("large")
|
|
477
|
+
end
|
|
478
|
+
dom = category_results.find do |r|
|
|
479
|
+
r[:name].include?("DOM") && r[:name].include?("large")
|
|
480
|
+
end
|
|
477
481
|
|
|
478
482
|
if sax && dom
|
|
479
483
|
sax_ips = (sax[:result][:lower] + sax[:result][:upper]) / 2.0
|
|
@@ -481,9 +485,11 @@ class BenchmarkRunner
|
|
|
481
485
|
speedup = sax_ips / dom_ips
|
|
482
486
|
|
|
483
487
|
if speedup > 1.0
|
|
484
|
-
Term.speedup_badge(speedup, "SAX is faster than DOM for large documents")
|
|
488
|
+
Term.speedup_badge(speedup,
|
|
489
|
+
"SAX is faster than DOM for large documents")
|
|
485
490
|
else
|
|
486
|
-
Term.hint("DOM is #{format('%.2f', 1 / speedup)}x faster than SAX for large documents")
|
|
491
|
+
Term.hint("DOM is #{format('%.2f',
|
|
492
|
+
1 / speedup)}x faster than SAX for large documents")
|
|
487
493
|
end
|
|
488
494
|
end
|
|
489
495
|
end
|
|
@@ -509,7 +515,8 @@ class BenchmarkRunner
|
|
|
509
515
|
html = DataGenerator.generate_html(items: @items)
|
|
510
516
|
measure { Canon.parse_html(html) }
|
|
511
517
|
when :html_parse_complex
|
|
512
|
-
html = DataGenerator.generate_html(items: @items, with_scripts: true, with_tables: true)
|
|
518
|
+
html = DataGenerator.generate_html(items: @items, with_scripts: true,
|
|
519
|
+
with_tables: true)
|
|
513
520
|
measure { Canon.parse_html(html) }
|
|
514
521
|
when :xml_compare_identical
|
|
515
522
|
xml = DataGenerator.generate_xml(items: @items)
|
|
@@ -566,7 +573,8 @@ class BenchmarkRunner
|
|
|
566
573
|
error_margin = std_dev / mean
|
|
567
574
|
error_pct = error_margin.round(4)
|
|
568
575
|
|
|
569
|
-
{ lower: mean.round(4) * (1 - error_pct), upper: mean.round(4) * (1 + error_pct) }
|
|
576
|
+
{ lower: mean.round(4) * (1 - error_pct),
|
|
577
|
+
upper: mean.round(4) * (1 + error_pct) }
|
|
570
578
|
end
|
|
571
579
|
|
|
572
580
|
def measure_time
|
|
@@ -4,6 +4,7 @@ require "json"
|
|
|
4
4
|
require "open3"
|
|
5
5
|
require "tmpdir"
|
|
6
6
|
require "fileutils"
|
|
7
|
+
require "table_tennis"
|
|
7
8
|
|
|
8
9
|
module PerformanceHelpers
|
|
9
10
|
# ANSI color codes for terminal output
|
|
@@ -97,9 +98,49 @@ module PerformanceHelpers
|
|
|
97
98
|
all_base.merge!(base_results)
|
|
98
99
|
all_current.merge!(curr_results)
|
|
99
100
|
|
|
101
|
+
# Collect comparison results for TableTennis table
|
|
102
|
+
comparison_rows = []
|
|
103
|
+
|
|
100
104
|
curr_results.each do |label, result|
|
|
101
|
-
|
|
105
|
+
base_result = base_results[label]
|
|
106
|
+
cmp = compare_metrics(label, result, base_result, threshold)
|
|
107
|
+
comparison_rows << cmp
|
|
102
108
|
end
|
|
109
|
+
|
|
110
|
+
print_comparison_table(comparison_rows, threshold)
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
def print_comparison_table(comparison_rows, threshold)
|
|
114
|
+
rows = comparison_rows.map do |cmp|
|
|
115
|
+
{
|
|
116
|
+
benchmark: cmp[:label],
|
|
117
|
+
base_ips: cmp[:base_ips]&.round(1),
|
|
118
|
+
curr_ips: cmp[:curr_ips]&.round(1),
|
|
119
|
+
change: cmp[:change] ? "#{(cmp[:change] * 100).round(1)}%" : "N/A",
|
|
120
|
+
status: if cmp[:base_ips].nil?
|
|
121
|
+
"NEW"
|
|
122
|
+
elsif cmp[:change] < -threshold
|
|
123
|
+
"REGRESSED"
|
|
124
|
+
else
|
|
125
|
+
"OK"
|
|
126
|
+
end,
|
|
127
|
+
}
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
return if rows.empty?
|
|
131
|
+
|
|
132
|
+
table = TableTennis.new(rows,
|
|
133
|
+
title: "Performance Comparison",
|
|
134
|
+
theme: :dark,
|
|
135
|
+
headers: {
|
|
136
|
+
benchmark: "Benchmark",
|
|
137
|
+
base_ips: "Base IPS",
|
|
138
|
+
curr_ips: "Curr IPS",
|
|
139
|
+
change: "Change",
|
|
140
|
+
status: "Status",
|
|
141
|
+
})
|
|
142
|
+
table.render
|
|
143
|
+
puts
|
|
103
144
|
end
|
|
104
145
|
|
|
105
146
|
def compare_metrics(label, curr, base, threshold)
|
|
@@ -197,7 +238,9 @@ module PerformanceHelpers
|
|
|
197
238
|
# Handle new benchmarks that don't exist in base
|
|
198
239
|
if base_metrics.nil?
|
|
199
240
|
curr_ips = (curr_metrics[:lower] + curr_metrics[:upper]) / 2.0
|
|
200
|
-
puts "#{format('%-30s', label)}: #{GREEN}NEW#{CLEAR} (current: #{format('%.2f', curr_ips)} IPS) [N/A]\n\n"
|
|
241
|
+
puts "#{format('%-30s',
|
|
242
|
+
label)}: #{GREEN}NEW#{CLEAR} (current: #{format('%.2f',
|
|
243
|
+
curr_ips)} IPS) [N/A]\n\n"
|
|
201
244
|
return
|
|
202
245
|
end
|
|
203
246
|
|
|
@@ -212,7 +255,8 @@ module PerformanceHelpers
|
|
|
212
255
|
base_str = format("%.2f", base_ips)
|
|
213
256
|
curr_str = format("%.2f", curr_ips)
|
|
214
257
|
|
|
215
|
-
puts "#{format('%-30s', label)}: #{GRAY}#{base_str}#{CLEAR} → #{color}#{curr_str}#{CLEAR} IPS " \
|
|
258
|
+
puts "#{format('%-30s',
|
|
259
|
+
label)}: #{GRAY}#{base_str}#{CLEAR} → #{color}#{curr_str}#{CLEAR} IPS " \
|
|
216
260
|
"(change: #{color}#{delta_str}#{CLEAR}) [#{color}#{status}#{CLEAR}]\n\n"
|
|
217
261
|
end
|
|
218
262
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: canon
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.16
|
|
4
|
+
version: 0.1.18
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-03-
|
|
11
|
+
date: 2026-03-24 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: diff-lcs
|