canon 0.1.13 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +69 -21
- data/README.adoc +41 -0
- data/docs/interfaces/ruby-api/index.adoc +26 -0
- data/docs/understanding/formats/xml.adoc +25 -0
- data/lib/canon/color_detector.rb +16 -13
- data/lib/canon/comparison/dimensions/text_content_dimension.rb +9 -1
- data/lib/canon/comparison/html_comparator.rb +89 -35
- data/lib/canon/comparison/html_parser.rb +22 -0
- data/lib/canon/comparison/markup_comparator.rb +39 -4
- data/lib/canon/comparison/profile_definition.rb +1 -1
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +87 -9
- data/lib/canon/comparison/xml_comparator.rb +44 -7
- data/lib/canon/comparison/xml_node_comparison.rb +107 -9
- data/lib/canon/comparison.rb +44 -0
- data/lib/canon/config/env_schema.rb +2 -1
- data/lib/canon/config.rb +10 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +9 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +4 -0
- data/lib/canon/formatters/xml_formatter.rb +20 -0
- data/lib/canon/html/data_model.rb +26 -4
- data/lib/canon/rspec_matchers.rb +15 -0
- data/lib/canon/tree_diff/adapters/html_adapter.rb +20 -2
- data/lib/canon/tree_diff/matchers/hash_matcher.rb +12 -2
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/element_matcher.rb +70 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 81995d22ec29adb9b2fb60f0ed8bc0219fe28e468c89a2001901b0f4521c757b
|
|
4
|
+
data.tar.gz: fabc6e6c77e92848783e747459377caa787330d3360f83f544b6372cc68ba227
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d33e2fcd54ae3b5cab9fdcfe980b1a8d1f2f97b1389ea430ecfda093b275e77f93e49e5a4f1171797df3fcb7f8d0ef28654301dbb846c16a4eb751018ea10129
|
|
7
|
+
data.tar.gz: 85ffc85bf577b631c9aee7e16f81e0be163de2025154dd197962943c533cd3a1aa0d79799d084194780ff8654e766b0ac3ac36b635425d47e64009c71a4edb6d
|
data/.rubocop_todo.yml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# This configuration was generated by
|
|
2
2
|
# `rubocop --auto-gen-config`
|
|
3
|
-
# on 2026-
|
|
3
|
+
# on 2026-02-17 14:18:53 UTC using RuboCop version 1.81.7.
|
|
4
4
|
# The point is for the user to remove these configuration records
|
|
5
5
|
# one by one as the offenses are removed from the code base.
|
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
|
@@ -12,13 +12,52 @@ Gemspec/RequiredRubyVersion:
|
|
|
12
12
|
Exclude:
|
|
13
13
|
- 'canon.gemspec'
|
|
14
14
|
|
|
15
|
-
# Offense count:
|
|
15
|
+
# Offense count: 1
|
|
16
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
17
|
+
# Configuration parameters: EnforcedStyle, IndentationWidth.
|
|
18
|
+
# SupportedStyles: with_first_argument, with_fixed_indentation
|
|
19
|
+
Layout/ArgumentAlignment:
|
|
20
|
+
Exclude:
|
|
21
|
+
- 'lib/canon/xml/element_matcher.rb'
|
|
22
|
+
|
|
23
|
+
# Offense count: 23
|
|
24
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
25
|
+
# Configuration parameters: EnforcedStyleAlignWith.
|
|
26
|
+
# SupportedStylesAlignWith: either, start_of_block, start_of_line
|
|
27
|
+
Layout/BlockAlignment:
|
|
28
|
+
Exclude:
|
|
29
|
+
- 'spec/canon/fixtures/isodoc_spec.rb'
|
|
30
|
+
- 'spec/canon/table_class_attribute_bug_spec.rb'
|
|
31
|
+
|
|
32
|
+
# Offense count: 23
|
|
33
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
34
|
+
Layout/BlockEndNewline:
|
|
35
|
+
Exclude:
|
|
36
|
+
- 'spec/canon/fixtures/isodoc_spec.rb'
|
|
37
|
+
- 'spec/canon/table_class_attribute_bug_spec.rb'
|
|
38
|
+
|
|
39
|
+
# Offense count: 46
|
|
40
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
41
|
+
# Configuration parameters: Width, AllowedPatterns.
|
|
42
|
+
Layout/IndentationWidth:
|
|
43
|
+
Exclude:
|
|
44
|
+
- 'spec/canon/fixtures/isodoc_spec.rb'
|
|
45
|
+
- 'spec/canon/table_class_attribute_bug_spec.rb'
|
|
46
|
+
|
|
47
|
+
# Offense count: 780
|
|
16
48
|
# This cop supports safe autocorrection (--autocorrect).
|
|
17
49
|
# Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, IgnoreCopDirectives, AllowedPatterns, SplitStrings.
|
|
18
50
|
# URISchemes: http, https
|
|
19
51
|
Layout/LineLength:
|
|
20
52
|
Enabled: false
|
|
21
53
|
|
|
54
|
+
# Offense count: 1
|
|
55
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
56
|
+
# Configuration parameters: AllowInHeredoc.
|
|
57
|
+
Layout/TrailingWhitespace:
|
|
58
|
+
Exclude:
|
|
59
|
+
- 'lib/canon/xml/element_matcher.rb'
|
|
60
|
+
|
|
22
61
|
# Offense count: 48
|
|
23
62
|
# Configuration parameters: IgnoreLiteralBranches, IgnoreConstantBranches, IgnoreDuplicateElseBranch.
|
|
24
63
|
Lint/DuplicateBranch:
|
|
@@ -48,44 +87,45 @@ Lint/UnreachableCode:
|
|
|
48
87
|
Exclude:
|
|
49
88
|
- 'lib/canon/diff_formatter/debug_output.rb'
|
|
50
89
|
|
|
51
|
-
# Offense count:
|
|
90
|
+
# Offense count: 7
|
|
52
91
|
# This cop supports safe autocorrection (--autocorrect).
|
|
53
92
|
# Configuration parameters: AllowUnusedKeywordArguments, IgnoreEmptyMethods, IgnoreNotImplementedMethods, NotImplementedExceptions.
|
|
54
93
|
# NotImplementedExceptions: NotImplementedError
|
|
55
94
|
Lint/UnusedMethodArgument:
|
|
56
95
|
Exclude:
|
|
96
|
+
- 'lib/canon/comparison.rb'
|
|
57
97
|
- 'lib/canon/diff/path_builder.rb'
|
|
58
98
|
- 'lib/canon/diff_formatter/by_line/base_formatter.rb'
|
|
59
99
|
- 'lib/canon/diff_formatter/by_line/xml_formatter.rb'
|
|
60
100
|
- 'lib/canon/diff_formatter/by_object/base_formatter.rb'
|
|
61
101
|
|
|
62
|
-
# Offense count:
|
|
102
|
+
# Offense count: 215
|
|
63
103
|
# Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
|
|
64
104
|
Metrics/AbcSize:
|
|
65
105
|
Enabled: false
|
|
66
106
|
|
|
67
|
-
# Offense count:
|
|
107
|
+
# Offense count: 21
|
|
68
108
|
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns, inherit_mode.
|
|
69
109
|
# AllowedMethods: refine
|
|
70
110
|
Metrics/BlockLength:
|
|
71
111
|
Max: 84
|
|
72
112
|
|
|
73
|
-
# Offense count:
|
|
113
|
+
# Offense count: 183
|
|
74
114
|
# Configuration parameters: AllowedMethods, AllowedPatterns, Max.
|
|
75
115
|
Metrics/CyclomaticComplexity:
|
|
76
116
|
Enabled: false
|
|
77
117
|
|
|
78
|
-
# Offense count:
|
|
118
|
+
# Offense count: 369
|
|
79
119
|
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
|
80
120
|
Metrics/MethodLength:
|
|
81
|
-
Max:
|
|
121
|
+
Max: 115
|
|
82
122
|
|
|
83
123
|
# Offense count: 44
|
|
84
124
|
# Configuration parameters: CountKeywordArgs, MaxOptionalParameters.
|
|
85
125
|
Metrics/ParameterLists:
|
|
86
126
|
Max: 9
|
|
87
127
|
|
|
88
|
-
# Offense count:
|
|
128
|
+
# Offense count: 149
|
|
89
129
|
# Configuration parameters: AllowedMethods, AllowedPatterns, Max.
|
|
90
130
|
Metrics/PerceivedComplexity:
|
|
91
131
|
Enabled: false
|
|
@@ -119,12 +159,13 @@ Naming/VariableNumber:
|
|
|
119
159
|
- 'lib/canon/comparison/markup_comparator.rb'
|
|
120
160
|
- 'lib/canon/comparison/xml_comparator/diff_node_builder.rb'
|
|
121
161
|
|
|
122
|
-
# Offense count:
|
|
162
|
+
# Offense count: 13
|
|
123
163
|
# Configuration parameters: MinSize.
|
|
124
164
|
Performance/CollectionLiteralInLoop:
|
|
125
165
|
Exclude:
|
|
126
166
|
- 'lib/canon/comparison/html_comparator.rb'
|
|
127
167
|
- 'lib/canon/xml/xml_base_handler.rb'
|
|
168
|
+
- 'spec/canon/table_class_attribute_bug_spec.rb'
|
|
128
169
|
|
|
129
170
|
# Offense count: 68
|
|
130
171
|
# Configuration parameters: Prefixes, AllowedPatterns.
|
|
@@ -132,7 +173,7 @@ Performance/CollectionLiteralInLoop:
|
|
|
132
173
|
RSpec/ContextWording:
|
|
133
174
|
Enabled: false
|
|
134
175
|
|
|
135
|
-
# Offense count:
|
|
176
|
+
# Offense count: 27
|
|
136
177
|
# Configuration parameters: IgnoredMetadata.
|
|
137
178
|
RSpec/DescribeClass:
|
|
138
179
|
Enabled: false
|
|
@@ -143,13 +184,7 @@ RSpec/DescribeMethod:
|
|
|
143
184
|
- 'spec/canon/comparison/multiple_differences_spec.rb'
|
|
144
185
|
- 'spec/canon/diff_formatter/character_map_customization_spec.rb'
|
|
145
186
|
|
|
146
|
-
# Offense count:
|
|
147
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
148
|
-
RSpec/EmptyHook:
|
|
149
|
-
Exclude:
|
|
150
|
-
- 'spec/canon/color_detector_spec.rb'
|
|
151
|
-
|
|
152
|
-
# Offense count: 679
|
|
187
|
+
# Offense count: 696
|
|
153
188
|
# Configuration parameters: CountAsOne.
|
|
154
189
|
RSpec/ExampleLength:
|
|
155
190
|
Max: 67
|
|
@@ -201,11 +236,11 @@ RSpec/MultipleDescribes:
|
|
|
201
236
|
Exclude:
|
|
202
237
|
- 'spec/canon/comparison/match_options_spec.rb'
|
|
203
238
|
|
|
204
|
-
# Offense count:
|
|
239
|
+
# Offense count: 536
|
|
205
240
|
RSpec/MultipleExpectations:
|
|
206
241
|
Max: 15
|
|
207
242
|
|
|
208
|
-
# Offense count:
|
|
243
|
+
# Offense count: 70
|
|
209
244
|
# Configuration parameters: AllowSubject.
|
|
210
245
|
RSpec/MultipleMemoizedHelpers:
|
|
211
246
|
Max: 13
|
|
@@ -224,12 +259,13 @@ RSpec/NamedSubject:
|
|
|
224
259
|
RSpec/NestedGroups:
|
|
225
260
|
Max: 4
|
|
226
261
|
|
|
227
|
-
# Offense count:
|
|
262
|
+
# Offense count: 11
|
|
228
263
|
# Configuration parameters: AllowedPatterns.
|
|
229
264
|
# AllowedPatterns: ^expect_, ^assert_
|
|
230
265
|
RSpec/NoExpectationExample:
|
|
231
266
|
Exclude:
|
|
232
267
|
- 'spec/canon/context_grouping_spec.rb'
|
|
268
|
+
- 'spec/canon/fixtures/isodoc_spec.rb'
|
|
233
269
|
- 'spec/canon/informative_diffs_debug_spec.rb'
|
|
234
270
|
- 'spec/canon/isodoc_blockquotes_spec.rb'
|
|
235
271
|
- 'spec/canon/match_scenarios_spec.rb'
|
|
@@ -257,6 +293,18 @@ RSpec/VerifiedDoubles:
|
|
|
257
293
|
- 'spec/canon/diff/xml_serialization_formatter_spec.rb'
|
|
258
294
|
- 'spec/canon/tree_diff/operation_converter_spec.rb'
|
|
259
295
|
|
|
296
|
+
# Offense count: 44
|
|
297
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
298
|
+
# Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
|
|
299
|
+
# SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
|
|
300
|
+
# ProceduralMethods: benchmark, bm, bmbm, create, each_with_object, measure, new, realtime, tap, with_object
|
|
301
|
+
# FunctionalMethods: let, let!, subject, watch
|
|
302
|
+
# AllowedMethods: lambda, proc, it
|
|
303
|
+
Style/BlockDelimiters:
|
|
304
|
+
Exclude:
|
|
305
|
+
- 'spec/canon/fixtures/isodoc_spec.rb'
|
|
306
|
+
- 'spec/canon/table_class_attribute_bug_spec.rb'
|
|
307
|
+
|
|
260
308
|
# Offense count: 1
|
|
261
309
|
# This cop supports safe autocorrection (--autocorrect).
|
|
262
310
|
# Configuration parameters: EnforcedStyle, AllowComments.
|
data/README.adoc
CHANGED
|
@@ -16,6 +16,47 @@ Key features:
|
|
|
16
16
|
* **Multiple interfaces**: Ruby API, CLI, RSpec matchers
|
|
17
17
|
* **Smart diff output**: By-line or by-object modes with syntax highlighting
|
|
18
18
|
|
|
19
|
+
== When to use formatting vs comparison
|
|
20
|
+
|
|
21
|
+
Canon provides two main APIs with different purposes:
|
|
22
|
+
|
|
23
|
+
*Use `Canon.format` for formatting/canonicalization:*
|
|
24
|
+
|
|
25
|
+
* Pretty-printing XML/JSON/YAML for display
|
|
26
|
+
* Canonicalizing documents for storage
|
|
27
|
+
* Normalizing formatting
|
|
28
|
+
|
|
29
|
+
*Use `Canon::Comparison.equivalent?` for semantic comparison:*
|
|
30
|
+
|
|
31
|
+
* Test assertions
|
|
32
|
+
* Document equivalence checking
|
|
33
|
+
* Diff generation
|
|
34
|
+
|
|
35
|
+
[IMPORTANT]
|
|
36
|
+
====
|
|
37
|
+
Do NOT use `Canon.format_xml` output for string comparison in tests.
|
|
38
|
+
The formatting process changes line counts and whitespace, which causes
|
|
39
|
+
false test failures.
|
|
40
|
+
|
|
41
|
+
Use `Canon::Comparison.equivalent?` instead, which performs semantic
|
|
42
|
+
comparison and properly handles XML declarations.
|
|
43
|
+
====
|
|
44
|
+
|
|
45
|
+
[example]
|
|
46
|
+
====
|
|
47
|
+
[source,ruby]
|
|
48
|
+
----
|
|
49
|
+
# WRONG - formatting changes line counts
|
|
50
|
+
expect(Canon.format_xml(actual)).to eq(expected_formatted)
|
|
51
|
+
|
|
52
|
+
# RIGHT - semantic comparison ignores formatting differences
|
|
53
|
+
expect(Canon::Comparison.equivalent?(actual, expected, format: :xml)).to be true
|
|
54
|
+
|
|
55
|
+
# BEST - use RSpec matchers
|
|
56
|
+
expect(actual).to be_xml_equivalent_to(expected)
|
|
57
|
+
----
|
|
58
|
+
====
|
|
59
|
+
|
|
19
60
|
== Installation
|
|
20
61
|
|
|
21
62
|
Add to your application's Gemfile:
|
|
@@ -18,6 +18,32 @@ For command-line usage, see link:../cli/[CLI documentation].
|
|
|
18
18
|
|
|
19
19
|
For RSpec testing, see link:../rspec/[RSpec documentation].
|
|
20
20
|
|
|
21
|
+
== Choosing the right API
|
|
22
|
+
|
|
23
|
+
Canon provides two main categories of APIs with different purposes.
|
|
24
|
+
|
|
25
|
+
=== Formatting APIs
|
|
26
|
+
|
|
27
|
+
Use `Canon.format` or `Canon.format_xml` when you need to:
|
|
28
|
+
|
|
29
|
+
* Pretty-print documents for display
|
|
30
|
+
* Canonicalize documents for storage
|
|
31
|
+
* Normalize document formatting
|
|
32
|
+
|
|
33
|
+
NOTE: XML declarations are preserved in pretty-print mode and removed in
|
|
34
|
+
canonicalization mode.
|
|
35
|
+
|
|
36
|
+
=== Comparison APIs
|
|
37
|
+
|
|
38
|
+
Use `Canon::Comparison.equivalent?` when you need to:
|
|
39
|
+
|
|
40
|
+
* Compare documents semantically
|
|
41
|
+
* Generate diffs
|
|
42
|
+
* Make test assertions
|
|
43
|
+
|
|
44
|
+
NOTE: XML declarations are stripped during preprocessing for semantic comparison.
|
|
45
|
+
Documents with and without XML declarations are considered equivalent.
|
|
46
|
+
|
|
21
47
|
== General
|
|
22
48
|
|
|
23
49
|
Canon provides a unified Ruby API for working with XML, HTML, JSON, and YAML
|
|
@@ -183,6 +183,31 @@ configures preprocessing, match options, diff algorithm, and formatting.
|
|
|
183
183
|
|
|
184
184
|
== XML-specific features
|
|
185
185
|
|
|
186
|
+
=== XML declaration handling
|
|
187
|
+
|
|
188
|
+
The XML declaration (`<?xml version="1.0" encoding="UTF-8"?>`) is handled
|
|
189
|
+
differently depending on the operation:
|
|
190
|
+
|
|
191
|
+
[cols="2,3"]
|
|
192
|
+
|===
|
|
193
|
+
| Operation | XML Declaration
|
|
194
|
+
|
|
195
|
+
| `Canon.format_xml` (pretty)
|
|
196
|
+
| Preserved
|
|
197
|
+
|
|
198
|
+
| `Canon.format_xml` (c14n)
|
|
199
|
+
| Removed (per W3C C14N spec)
|
|
200
|
+
|
|
201
|
+
| `Canon::Comparison.equivalent?`
|
|
202
|
+
| Stripped during preprocessing
|
|
203
|
+
|
|
204
|
+
| RSpec matchers
|
|
205
|
+
| Stripped during preprocessing
|
|
206
|
+
|===
|
|
207
|
+
|
|
208
|
+
This means documents with and without XML declarations are considered
|
|
209
|
+
equivalent when using the comparison API.
|
|
210
|
+
|
|
186
211
|
=== Comment handling
|
|
187
212
|
|
|
188
213
|
XML comments are removed in canonical form unless `--with-comments` is explicitly set.
|
data/lib/canon/color_detector.rb
CHANGED
|
@@ -81,24 +81,27 @@ module Canon
|
|
|
81
81
|
#
|
|
82
82
|
# @return [Boolean] true if colors appear to be supported
|
|
83
83
|
def detect_from_env
|
|
84
|
-
# Check for known color-capable terminals
|
|
85
|
-
colorterm = ENV["COLORTERM"]
|
|
86
|
-
return true if COLOR_TERM_VALUES.include?(colorterm)
|
|
87
|
-
|
|
88
84
|
# Check TERM variable
|
|
89
85
|
term = ENV["TERM"]
|
|
90
|
-
if term
|
|
86
|
+
if term && NO_COLOR_TERMS.any? { |t| term.include?(t) }
|
|
91
87
|
# Known no-color terminals
|
|
92
|
-
return false
|
|
88
|
+
return false
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# Check CI environments
|
|
92
|
+
# Some CI systems support colors, others don't
|
|
93
|
+
return detect_ci_colors if ci_environment?
|
|
94
|
+
|
|
95
|
+
if term
|
|
93
96
|
# Known color-capable terminals
|
|
94
97
|
return true if COLOR_TERM_SUFFIXES.any? { |s| term.end_with?(s) }
|
|
95
98
|
# Most modern terminals support basic ANSI colors
|
|
96
99
|
return true unless term.empty? || term == "unknown"
|
|
97
100
|
end
|
|
98
101
|
|
|
99
|
-
# Check
|
|
100
|
-
|
|
101
|
-
return
|
|
102
|
+
# Check for known color-capable terminals
|
|
103
|
+
colorterm = ENV["COLORTERM"]
|
|
104
|
+
return true if COLOR_TERM_VALUES.include?(colorterm)
|
|
102
105
|
|
|
103
106
|
# Default: assume colors are supported on modern terminals
|
|
104
107
|
# This is a safe default for most use cases
|
|
@@ -123,16 +126,16 @@ module Canon
|
|
|
123
126
|
#
|
|
124
127
|
# @return [Boolean] true if CI environment likely supports colors
|
|
125
128
|
def detect_ci_colors
|
|
129
|
+
# Most modern CI systems support ANSI colors
|
|
130
|
+
# Only disable for explicitly known non-color CI
|
|
131
|
+
return false if ENV["TERM"] == "dumb"
|
|
132
|
+
|
|
126
133
|
# GitHub Actions explicitly supports colors
|
|
127
134
|
return true if ENV["GITHUB_ACTIONS"]
|
|
128
135
|
|
|
129
136
|
# TeamCity supports colors with specific env var
|
|
130
137
|
return true if ENV["TEAMCITY_VERSION"]
|
|
131
138
|
|
|
132
|
-
# Most modern CI systems support ANSI colors
|
|
133
|
-
# Only disable for explicitly known non-color CI
|
|
134
|
-
return false if ENV["TERM"] == "dumb"
|
|
135
|
-
|
|
136
139
|
# Default to supporting colors in CI
|
|
137
140
|
true
|
|
138
141
|
end
|
|
@@ -44,12 +44,20 @@ module Canon
|
|
|
44
44
|
# Normalized text comparison
|
|
45
45
|
#
|
|
46
46
|
# Collapses whitespace and compares.
|
|
47
|
+
# Two whitespace-only strings that both normalize to empty are equivalent.
|
|
47
48
|
#
|
|
48
49
|
# @param text1 [String, nil] First text
|
|
49
50
|
# @param text2 [String, nil] Second text
|
|
50
51
|
# @return [Boolean] true if normalized texts are equal
|
|
51
52
|
def compare_normalize(text1, text2)
|
|
52
|
-
|
|
53
|
+
normalized1 = normalize_text(text1)
|
|
54
|
+
normalized2 = normalize_text(text2)
|
|
55
|
+
|
|
56
|
+
# Both empty after normalization = equivalent
|
|
57
|
+
# This handles whitespace-only text nodes that normalize to empty
|
|
58
|
+
return true if normalized1.empty? && normalized2.empty?
|
|
59
|
+
|
|
60
|
+
normalized1 == normalized2
|
|
53
61
|
end
|
|
54
62
|
|
|
55
63
|
private
|
|
@@ -60,6 +60,11 @@ module Canon
|
|
|
60
60
|
def equivalent?(html1, html2, opts = {}, child_opts = {})
|
|
61
61
|
opts = DEFAULT_OPTS.merge(opts)
|
|
62
62
|
|
|
63
|
+
# Capture original HTML strings BEFORE any parsing/transformation
|
|
64
|
+
# These are used for display to preserve original formatting
|
|
65
|
+
original_str1 = extract_original_string(html1)
|
|
66
|
+
original_str2 = extract_original_string(html2)
|
|
67
|
+
|
|
63
68
|
# Resolve match options with format-specific defaults
|
|
64
69
|
match_opts_hash = MatchOptions::Xml.resolve(
|
|
65
70
|
format: :html,
|
|
@@ -117,41 +122,14 @@ module Canon
|
|
|
117
122
|
# This is a SAFETY CHECK for legacy cases where Nokogiri nodes might still be used
|
|
118
123
|
# The main path (parse_node) now returns Canon::Xml::Nodes::RootNode, so this
|
|
119
124
|
# check should rarely trigger, but we keep it for robustness
|
|
120
|
-
if (node1
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
all_children1 = node1.children.to_a
|
|
126
|
-
all_children2 = node2.children.to_a
|
|
127
|
-
|
|
128
|
-
# Filter children based on match options (e.g., ignore comments)
|
|
129
|
-
children1 = XmlNodeComparison.filter_children(all_children1, opts)
|
|
130
|
-
children2 = XmlNodeComparison.filter_children(all_children2, opts)
|
|
131
|
-
|
|
132
|
-
if children1.length != children2.length
|
|
133
|
-
result = Comparison::UNEQUAL_ELEMENTS
|
|
134
|
-
elsif children1.empty?
|
|
135
|
-
result = Comparison::EQUIVALENT
|
|
136
|
-
else
|
|
137
|
-
# Compare each pair of children
|
|
138
|
-
result = Comparison::EQUIVALENT
|
|
139
|
-
children1.zip(children2).each do |child1, child2|
|
|
140
|
-
child_result = XmlNodeComparison.compare_nodes(child1, child2,
|
|
141
|
-
opts, child_opts,
|
|
142
|
-
diff_children,
|
|
143
|
-
differences)
|
|
144
|
-
if child_result != Comparison::EQUIVALENT
|
|
145
|
-
result = child_result
|
|
146
|
-
break
|
|
147
|
-
end
|
|
148
|
-
end
|
|
149
|
-
end
|
|
150
|
-
else
|
|
151
|
-
result = XmlNodeComparison.compare_nodes(node1, node2, opts,
|
|
125
|
+
result = if fragment_nodes?(node1, node2)
|
|
126
|
+
compare_fragment_children(node1, node2, opts, child_opts,
|
|
127
|
+
diff_children, differences)
|
|
128
|
+
else
|
|
129
|
+
XmlNodeComparison.compare_nodes(node1, node2, opts,
|
|
152
130
|
child_opts, diff_children,
|
|
153
131
|
differences)
|
|
154
|
-
|
|
132
|
+
end
|
|
155
133
|
|
|
156
134
|
# Classify DiffNodes as normative/informative if we have verbose output
|
|
157
135
|
if opts[:verbose] && !differences.empty?
|
|
@@ -165,6 +143,7 @@ module Canon
|
|
|
165
143
|
ComparisonResult.new(
|
|
166
144
|
differences: differences,
|
|
167
145
|
preprocessed_strings: [preprocessed_str1, preprocessed_str2],
|
|
146
|
+
original_strings: [original_str1, original_str2],
|
|
168
147
|
format: :html,
|
|
169
148
|
html_version: detect_html_version_from_node(node1),
|
|
170
149
|
match_options: match_opts_hash,
|
|
@@ -187,6 +166,53 @@ module Canon
|
|
|
187
166
|
|
|
188
167
|
private
|
|
189
168
|
|
|
169
|
+
# Check if both nodes are document fragments
|
|
170
|
+
#
|
|
171
|
+
# @param node1 [Object] First node
|
|
172
|
+
# @param node2 [Object] Second node
|
|
173
|
+
# @return [Boolean] true if both are document fragments
|
|
174
|
+
def fragment_nodes?(node1, node2)
|
|
175
|
+
(node1.is_a?(Nokogiri::HTML4::DocumentFragment) ||
|
|
176
|
+
node1.is_a?(Nokogiri::XML::DocumentFragment)) &&
|
|
177
|
+
(node2.is_a?(Nokogiri::HTML4::DocumentFragment) ||
|
|
178
|
+
node2.is_a?(Nokogiri::XML::DocumentFragment))
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
# Compare children of document fragments
|
|
182
|
+
#
|
|
183
|
+
# @param node1 [Nokogiri::HTML4::DocumentFragment, Nokogiri::XML::DocumentFragment] First fragment
|
|
184
|
+
# @param node2 [Nokogiri::HTML4::DocumentFragment, Nokogiri::XML::DocumentFragment] Second fragment
|
|
185
|
+
# @param opts [Hash] Comparison options
|
|
186
|
+
# @param child_opts [Hash] Child comparison options
|
|
187
|
+
# @param diff_children [Boolean] Whether to diff children
|
|
188
|
+
# @param differences [Array] Array to append differences to
|
|
189
|
+
# @return [Symbol] Comparison result constant
|
|
190
|
+
def compare_fragment_children(node1, node2, opts, child_opts,
|
|
191
|
+
diff_children, differences)
|
|
192
|
+
all_children1 = node1.children.to_a
|
|
193
|
+
all_children2 = node2.children.to_a
|
|
194
|
+
|
|
195
|
+
children1 = XmlNodeComparison.filter_children(all_children1, opts)
|
|
196
|
+
children2 = XmlNodeComparison.filter_children(all_children2, opts)
|
|
197
|
+
|
|
198
|
+
if children1.length != children2.length
|
|
199
|
+
return Comparison::UNEQUAL_ELEMENTS
|
|
200
|
+
elsif children1.empty?
|
|
201
|
+
return Comparison::EQUIVALENT
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
# Compare each pair of children
|
|
205
|
+
children1.zip(children2).each do |child1, child2|
|
|
206
|
+
child_result = XmlNodeComparison.compare_nodes(child1, child2,
|
|
207
|
+
opts, child_opts,
|
|
208
|
+
diff_children,
|
|
209
|
+
differences)
|
|
210
|
+
return child_result if child_result != Comparison::EQUIVALENT
|
|
211
|
+
end
|
|
212
|
+
|
|
213
|
+
Comparison::EQUIVALENT
|
|
214
|
+
end
|
|
215
|
+
|
|
190
216
|
# Perform semantic tree diff using SemanticTreeMatchStrategy
|
|
191
217
|
#
|
|
192
218
|
# @param html1 [String, Nokogiri::HTML::Document] First HTML
|
|
@@ -195,6 +221,11 @@ module Canon
|
|
|
195
221
|
# @param match_opts_hash [Hash] Resolved match options
|
|
196
222
|
# @return [Boolean, ComparisonResult] Result of tree diff comparison
|
|
197
223
|
def perform_semantic_tree_diff(html1, html2, opts, match_opts_hash)
|
|
224
|
+
# Capture original HTML strings BEFORE any parsing/transformation
|
|
225
|
+
# These are used for display to preserve original formatting
|
|
226
|
+
original_str1 = extract_original_string(html1)
|
|
227
|
+
original_str2 = extract_original_string(html2)
|
|
228
|
+
|
|
198
229
|
# Parse to Canon::Xml::Node (preserves preprocessing)
|
|
199
230
|
# For HTML, we parse as XML to get Canon::Xml::Node structure
|
|
200
231
|
node1 = parse_node_for_semantic(html1,
|
|
@@ -223,6 +254,7 @@ module Canon
|
|
|
223
254
|
ComparisonResult.new(
|
|
224
255
|
differences: differences,
|
|
225
256
|
preprocessed_strings: preprocessed,
|
|
257
|
+
original_strings: [original_str1, original_str2],
|
|
226
258
|
format: :html,
|
|
227
259
|
html_version: html_version,
|
|
228
260
|
match_options: match_opts_hash.merge(strategy.metadata),
|
|
@@ -343,7 +375,7 @@ module Canon
|
|
|
343
375
|
# If already a Nokogiri node, check for incompatible XML documents
|
|
344
376
|
unless node.is_a?(String)
|
|
345
377
|
# Detect if this is an XML document (not HTML)
|
|
346
|
-
if
|
|
378
|
+
if xml_document?(node)
|
|
347
379
|
raise Canon::CompareFormatMismatchError.new(:xml, :html)
|
|
348
380
|
end
|
|
349
381
|
|
|
@@ -508,6 +540,28 @@ module Canon
|
|
|
508
540
|
end
|
|
509
541
|
end
|
|
510
542
|
|
|
543
|
+
# Extract original HTML string from various input types
|
|
544
|
+
# This preserves the original formatting without minification
|
|
545
|
+
#
|
|
546
|
+
# @param html [String, Nokogiri::XML::Node, Canon::Xml::Node] Input HTML
|
|
547
|
+
# @return [String] Original HTML string
|
|
548
|
+
def extract_original_string(html)
|
|
549
|
+
if html.is_a?(String)
|
|
550
|
+
html
|
|
551
|
+
elsif html.is_a?(Canon::Xml::Node)
|
|
552
|
+
# Serialize Canon nodes to string
|
|
553
|
+
Canon::Xml::DataModel.serialize(html)
|
|
554
|
+
elsif html.respond_to?(:to_html)
|
|
555
|
+
# Nokogiri nodes - use to_html to preserve formatting
|
|
556
|
+
html.to_html
|
|
557
|
+
elsif html.respond_to?(:to_s)
|
|
558
|
+
html.to_s
|
|
559
|
+
else
|
|
560
|
+
raise Canon::Error,
|
|
561
|
+
"Unable to extract original string from: #{html.class}"
|
|
562
|
+
end
|
|
563
|
+
end
|
|
564
|
+
|
|
511
565
|
# Normalize HTML comments within style and script tags
|
|
512
566
|
# Also removes whitespace-only CDATA children that Nokogiri creates
|
|
513
567
|
def normalize_html_style_script_comments(doc)
|
|
@@ -637,7 +691,7 @@ compare_profile = nil)
|
|
|
637
691
|
# Check if a node is an XML document (not HTML)
|
|
638
692
|
# XML documents typically have XML processing instructions or are
|
|
639
693
|
# instances of Nokogiri::XML::Document (not HTML variants)
|
|
640
|
-
def
|
|
694
|
+
def xml_document?(node)
|
|
641
695
|
# Check if it's a pure XML document (not HTML4/HTML5 which also
|
|
642
696
|
# inherit from XML::Document)
|
|
643
697
|
# Check both Document and DocumentFragment variants
|
|
@@ -25,6 +25,11 @@ module Canon
|
|
|
25
25
|
return content unless content.is_a?(String)
|
|
26
26
|
return content if already_parsed?(content)
|
|
27
27
|
|
|
28
|
+
# Normalize HTML to ensure consistent parsing by HTML4.fragment
|
|
29
|
+
# The key issue is that HTML4.fragment treats whitespace after </head>
|
|
30
|
+
# differently than no whitespace, causing inconsistent parsing
|
|
31
|
+
content = normalize_html_for_parsing(content)
|
|
32
|
+
|
|
28
33
|
begin
|
|
29
34
|
case format
|
|
30
35
|
when :html5
|
|
@@ -74,6 +79,23 @@ module Canon
|
|
|
74
79
|
# Check for HTML5 DOCTYPE (NOTE: String#include? is case-sensitive, so only the exact "<!DOCTYPE html>" spelling matches)
|
|
75
80
|
content.include?("<!DOCTYPE html>") ? :html5 : :html4
|
|
76
81
|
end
|
|
82
|
+
|
|
83
|
+
# Normalize HTML to ensure consistent parsing by HTML4.fragment
|
|
84
|
+
#
|
|
85
|
+
# The key issue is that HTML4.fragment treats whitespace after </head>
|
|
86
|
+
# differently than no whitespace, causing inconsistent parsing:
|
|
87
|
+
# - "</head>\n<body>" parses to [body, ...] (body is treated as content)
|
|
88
|
+
# - "</head><body>" parses to [meta, div, ...] (wrapper tags stripped)
|
|
89
|
+
#
|
|
90
|
+
# This method normalizes the HTML to ensure consistent parsing.
|
|
91
|
+
#
|
|
92
|
+
# @param content [String] HTML content
|
|
93
|
+
# @return [String] Normalized HTML content
|
|
94
|
+
def normalize_html_for_parsing(content)
|
|
95
|
+
# Remove whitespace between </head> and <body> to ensure consistent parsing
|
|
96
|
+
# This makes formatted and minified HTML parse the same way
|
|
97
|
+
content.gsub(%r{</head>\s*<body>}i, "</head><body>")
|
|
98
|
+
end
|
|
77
99
|
end
|
|
78
100
|
end
|
|
79
101
|
end
|