canon 0.1.13 → 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 60a7f65c6d95c4c10672244fe19a027022aadb8de563d9f3ac58da151085e883
4
- data.tar.gz: 47f17024dd3d1a7055cef281439038d42bfb92527765f92d68392fbb14390d39
3
+ metadata.gz: 81995d22ec29adb9b2fb60f0ed8bc0219fe28e468c89a2001901b0f4521c757b
4
+ data.tar.gz: fabc6e6c77e92848783e747459377caa787330d3360f83f544b6372cc68ba227
5
5
  SHA512:
6
- metadata.gz: 526cfa7a890447be2abc8bc358ac96a67a58ed6cb8016beebb65d087e44de48ef742c90be9ab50960c2b3d543bc7e3a7118af88a4f02af0a3e85eeea83161f14
7
- data.tar.gz: 95e16e97ee9b71a1f4d220bc3c4f004f3f39f76d1b2631c68e3a5805b9f9aea4390bab9f9537dc0be254c690e6fcfa4146497ce8867bb15a28fec85387d1a0d1
6
+ metadata.gz: d33e2fcd54ae3b5cab9fdcfe980b1a8d1f2f97b1389ea430ecfda093b275e77f93e49e5a4f1171797df3fcb7f8d0ef28654301dbb846c16a4eb751018ea10129
7
+ data.tar.gz: 85ffc85bf577b631c9aee7e16f81e0be163de2025154dd197962943c533cd3a1aa0d79799d084194780ff8654e766b0ac3ac36b635425d47e64009c71a4edb6d
data/.rubocop_todo.yml CHANGED
@@ -1,6 +1,6 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2026-01-21 09:17:44 UTC using RuboCop version 1.81.7.
3
+ # on 2026-02-17 14:18:53 UTC using RuboCop version 1.81.7.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
@@ -12,13 +12,52 @@ Gemspec/RequiredRubyVersion:
12
12
  Exclude:
13
13
  - 'canon.gemspec'
14
14
 
15
- # Offense count: 700
15
+ # Offense count: 1
16
+ # This cop supports safe autocorrection (--autocorrect).
17
+ # Configuration parameters: EnforcedStyle, IndentationWidth.
18
+ # SupportedStyles: with_first_argument, with_fixed_indentation
19
+ Layout/ArgumentAlignment:
20
+ Exclude:
21
+ - 'lib/canon/xml/element_matcher.rb'
22
+
23
+ # Offense count: 23
24
+ # This cop supports safe autocorrection (--autocorrect).
25
+ # Configuration parameters: EnforcedStyleAlignWith.
26
+ # SupportedStylesAlignWith: either, start_of_block, start_of_line
27
+ Layout/BlockAlignment:
28
+ Exclude:
29
+ - 'spec/canon/fixtures/isodoc_spec.rb'
30
+ - 'spec/canon/table_class_attribute_bug_spec.rb'
31
+
32
+ # Offense count: 23
33
+ # This cop supports safe autocorrection (--autocorrect).
34
+ Layout/BlockEndNewline:
35
+ Exclude:
36
+ - 'spec/canon/fixtures/isodoc_spec.rb'
37
+ - 'spec/canon/table_class_attribute_bug_spec.rb'
38
+
39
+ # Offense count: 46
40
+ # This cop supports safe autocorrection (--autocorrect).
41
+ # Configuration parameters: Width, AllowedPatterns.
42
+ Layout/IndentationWidth:
43
+ Exclude:
44
+ - 'spec/canon/fixtures/isodoc_spec.rb'
45
+ - 'spec/canon/table_class_attribute_bug_spec.rb'
46
+
47
+ # Offense count: 780
16
48
  # This cop supports safe autocorrection (--autocorrect).
17
49
  # Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, IgnoreCopDirectives, AllowedPatterns, SplitStrings.
18
50
  # URISchemes: http, https
19
51
  Layout/LineLength:
20
52
  Enabled: false
21
53
 
54
+ # Offense count: 1
55
+ # This cop supports safe autocorrection (--autocorrect).
56
+ # Configuration parameters: AllowInHeredoc.
57
+ Layout/TrailingWhitespace:
58
+ Exclude:
59
+ - 'lib/canon/xml/element_matcher.rb'
60
+
22
61
  # Offense count: 48
23
62
  # Configuration parameters: IgnoreLiteralBranches, IgnoreConstantBranches, IgnoreDuplicateElseBranch.
24
63
  Lint/DuplicateBranch:
@@ -48,44 +87,45 @@ Lint/UnreachableCode:
48
87
  Exclude:
49
88
  - 'lib/canon/diff_formatter/debug_output.rb'
50
89
 
51
- # Offense count: 6
90
+ # Offense count: 7
52
91
  # This cop supports safe autocorrection (--autocorrect).
53
92
  # Configuration parameters: AllowUnusedKeywordArguments, IgnoreEmptyMethods, IgnoreNotImplementedMethods, NotImplementedExceptions.
54
93
  # NotImplementedExceptions: NotImplementedError
55
94
  Lint/UnusedMethodArgument:
56
95
  Exclude:
96
+ - 'lib/canon/comparison.rb'
57
97
  - 'lib/canon/diff/path_builder.rb'
58
98
  - 'lib/canon/diff_formatter/by_line/base_formatter.rb'
59
99
  - 'lib/canon/diff_formatter/by_line/xml_formatter.rb'
60
100
  - 'lib/canon/diff_formatter/by_object/base_formatter.rb'
61
101
 
62
- # Offense count: 209
102
+ # Offense count: 215
63
103
  # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
64
104
  Metrics/AbcSize:
65
105
  Enabled: false
66
106
 
67
- # Offense count: 20
107
+ # Offense count: 21
68
108
  # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns, inherit_mode.
69
109
  # AllowedMethods: refine
70
110
  Metrics/BlockLength:
71
111
  Max: 84
72
112
 
73
- # Offense count: 177
113
+ # Offense count: 183
74
114
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
75
115
  Metrics/CyclomaticComplexity:
76
116
  Enabled: false
77
117
 
78
- # Offense count: 363
118
+ # Offense count: 369
79
119
  # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
80
120
  Metrics/MethodLength:
81
- Max: 110
121
+ Max: 115
82
122
 
83
123
  # Offense count: 44
84
124
  # Configuration parameters: CountKeywordArgs, MaxOptionalParameters.
85
125
  Metrics/ParameterLists:
86
126
  Max: 9
87
127
 
88
- # Offense count: 143
128
+ # Offense count: 149
89
129
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
90
130
  Metrics/PerceivedComplexity:
91
131
  Enabled: false
@@ -119,12 +159,13 @@ Naming/VariableNumber:
119
159
  - 'lib/canon/comparison/markup_comparator.rb'
120
160
  - 'lib/canon/comparison/xml_comparator/diff_node_builder.rb'
121
161
 
122
- # Offense count: 2
162
+ # Offense count: 13
123
163
  # Configuration parameters: MinSize.
124
164
  Performance/CollectionLiteralInLoop:
125
165
  Exclude:
126
166
  - 'lib/canon/comparison/html_comparator.rb'
127
167
  - 'lib/canon/xml/xml_base_handler.rb'
168
+ - 'spec/canon/table_class_attribute_bug_spec.rb'
128
169
 
129
170
  # Offense count: 68
130
171
  # Configuration parameters: Prefixes, AllowedPatterns.
@@ -132,7 +173,7 @@ Performance/CollectionLiteralInLoop:
132
173
  RSpec/ContextWording:
133
174
  Enabled: false
134
175
 
135
- # Offense count: 25
176
+ # Offense count: 27
136
177
  # Configuration parameters: IgnoredMetadata.
137
178
  RSpec/DescribeClass:
138
179
  Enabled: false
@@ -143,13 +184,7 @@ RSpec/DescribeMethod:
143
184
  - 'spec/canon/comparison/multiple_differences_spec.rb'
144
185
  - 'spec/canon/diff_formatter/character_map_customization_spec.rb'
145
186
 
146
- # Offense count: 1
147
- # This cop supports safe autocorrection (--autocorrect).
148
- RSpec/EmptyHook:
149
- Exclude:
150
- - 'spec/canon/color_detector_spec.rb'
151
-
152
- # Offense count: 679
187
+ # Offense count: 696
153
188
  # Configuration parameters: CountAsOne.
154
189
  RSpec/ExampleLength:
155
190
  Max: 67
@@ -201,11 +236,11 @@ RSpec/MultipleDescribes:
201
236
  Exclude:
202
237
  - 'spec/canon/comparison/match_options_spec.rb'
203
238
 
204
- # Offense count: 522
239
+ # Offense count: 536
205
240
  RSpec/MultipleExpectations:
206
241
  Max: 15
207
242
 
208
- # Offense count: 69
243
+ # Offense count: 70
209
244
  # Configuration parameters: AllowSubject.
210
245
  RSpec/MultipleMemoizedHelpers:
211
246
  Max: 13
@@ -224,12 +259,13 @@ RSpec/NamedSubject:
224
259
  RSpec/NestedGroups:
225
260
  Max: 4
226
261
 
227
- # Offense count: 10
262
+ # Offense count: 11
228
263
  # Configuration parameters: AllowedPatterns.
229
264
  # AllowedPatterns: ^expect_, ^assert_
230
265
  RSpec/NoExpectationExample:
231
266
  Exclude:
232
267
  - 'spec/canon/context_grouping_spec.rb'
268
+ - 'spec/canon/fixtures/isodoc_spec.rb'
233
269
  - 'spec/canon/informative_diffs_debug_spec.rb'
234
270
  - 'spec/canon/isodoc_blockquotes_spec.rb'
235
271
  - 'spec/canon/match_scenarios_spec.rb'
@@ -257,6 +293,18 @@ RSpec/VerifiedDoubles:
257
293
  - 'spec/canon/diff/xml_serialization_formatter_spec.rb'
258
294
  - 'spec/canon/tree_diff/operation_converter_spec.rb'
259
295
 
296
+ # Offense count: 44
297
+ # This cop supports safe autocorrection (--autocorrect).
298
+ # Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
299
+ # SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
300
+ # ProceduralMethods: benchmark, bm, bmbm, create, each_with_object, measure, new, realtime, tap, with_object
301
+ # FunctionalMethods: let, let!, subject, watch
302
+ # AllowedMethods: lambda, proc, it
303
+ Style/BlockDelimiters:
304
+ Exclude:
305
+ - 'spec/canon/fixtures/isodoc_spec.rb'
306
+ - 'spec/canon/table_class_attribute_bug_spec.rb'
307
+
260
308
  # Offense count: 1
261
309
  # This cop supports safe autocorrection (--autocorrect).
262
310
  # Configuration parameters: EnforcedStyle, AllowComments.
data/README.adoc CHANGED
@@ -16,6 +16,47 @@ Key features:
16
16
  * **Multiple interfaces**: Ruby API, CLI, RSpec matchers
17
17
  * **Smart diff output**: By-line or by-object modes with syntax highlighting
18
18
 
19
+ == When to use formatting vs comparison
20
+
21
+ Canon provides two main APIs with different purposes:
22
+
23
+ *Use `Canon.format` for formatting/canonicalization:*
24
+
25
+ * Pretty-printing XML/JSON/YAML for display
26
+ * Canonicalizing documents for storage
27
+ * Normalizing formatting
28
+
29
+ *Use `Canon::Comparison.equivalent?` for semantic comparison:*
30
+
31
+ * Test assertions
32
+ * Document equivalence checking
33
+ * Diff generation
34
+
35
+ [IMPORTANT]
36
+ ====
37
+ Do NOT use `Canon.format_xml` output for string comparison in tests.
38
+ The formatting process changes line counts and whitespace, which causes
39
+ false test failures.
40
+
41
+ Use `Canon::Comparison.equivalent?` instead, which performs semantic
42
+ comparison and properly handles XML declarations.
43
+ ====
44
+
45
+ [example]
46
+ ====
47
+ [source,ruby]
48
+ ----
49
+ # WRONG - formatting changes line counts
50
+ expect(Canon.format_xml(actual)).to eq(expected_formatted)
51
+
52
+ # RIGHT - semantic comparison ignores formatting differences
53
+ expect(Canon::Comparison.equivalent?(actual, expected, format: :xml)).to be true
54
+
55
+ # BEST - use RSpec matchers
56
+ expect(actual).to be_xml_equivalent_to(expected)
57
+ ----
58
+ ====
59
+
19
60
  == Installation
20
61
 
21
62
  Add to your application's Gemfile:
@@ -18,6 +18,32 @@ For command-line usage, see link:../cli/[CLI documentation].
18
18
 
19
19
  For RSpec testing, see link:../rspec/[RSpec documentation].
20
20
 
21
+ == Choosing the right API
22
+
23
+ Canon provides two main categories of APIs with different purposes.
24
+
25
+ === Formatting APIs
26
+
27
+ Use `Canon.format` or `Canon.format_xml` when you need to:
28
+
29
+ * Pretty-print documents for display
30
+ * Canonicalize documents for storage
31
+ * Normalize document formatting
32
+
33
+ NOTE: XML declarations are preserved in pretty-print mode and removed in
34
+ canonicalization mode.
35
+
36
+ === Comparison APIs
37
+
38
+ Use `Canon::Comparison.equivalent?` when you need to:
39
+
40
+ * Compare documents semantically
41
+ * Generate diffs
42
+ * Make test assertions
43
+
44
+ NOTE: XML declarations are stripped during preprocessing for semantic comparison.
45
+ Documents with and without XML declarations are considered equivalent.
46
+
21
47
  == General
22
48
 
23
49
  Canon provides a unified Ruby API for working with XML, HTML, JSON, and YAML
@@ -183,6 +183,31 @@ configures preprocessing, match options, diff algorithm, and formatting.
183
183
 
184
184
  == XML-specific features
185
185
 
186
+ === XML declaration handling
187
+
188
+ The XML declaration (`<?xml version="1.0" encoding="UTF-8"?>`) is handled
189
+ differently depending on the operation:
190
+
191
+ [cols="2,3"]
192
+ |===
193
+ | Operation | XML Declaration
194
+
195
+ | `Canon.format_xml` (pretty)
196
+ | Preserved
197
+
198
+ | `Canon.format_xml` (c14n)
199
+ | Removed (per W3C C14N spec)
200
+
201
+ | `Canon::Comparison.equivalent?`
202
+ | Stripped during preprocessing
203
+
204
+ | RSpec matchers
205
+ | Stripped during preprocessing
206
+ |===
207
+
208
+ This means documents with and without XML declarations are considered
209
+ equivalent when using the comparison API.
210
+
186
211
  === Comment handling
187
212
 
188
213
  XML comments are removed in canonical form unless `--with-comments` is explicitly set.
@@ -81,24 +81,27 @@ module Canon
81
81
  #
82
82
  # @return [Boolean] true if colors appear to be supported
83
83
  def detect_from_env
84
- # Check for known color-capable terminals
85
- colorterm = ENV["COLORTERM"]
86
- return true if COLOR_TERM_VALUES.include?(colorterm)
87
-
88
84
  # Check TERM variable
89
85
  term = ENV["TERM"]
90
- if term
86
+ if term && NO_COLOR_TERMS.any? { |t| term.include?(t) }
91
87
  # Known no-color terminals
92
- return false if NO_COLOR_TERMS.any? { |t| term.include?(t) }
88
+ return false
89
+ end
90
+
91
+ # Check CI environments
92
+ # Some CI systems support colors, others don't
93
+ return detect_ci_colors if ci_environment?
94
+
95
+ if term
93
96
  # Known color-capable terminals
94
97
  return true if COLOR_TERM_SUFFIXES.any? { |s| term.end_with?(s) }
95
98
  # Most modern terminals support basic ANSI colors
96
99
  return true unless term.empty? || term == "unknown"
97
100
  end
98
101
 
99
- # Check CI environments
100
- # Some CI systems support colors, others don't
101
- return detect_ci_colors if ci_environment?
102
+ # Check for known color-capable terminals
103
+ colorterm = ENV["COLORTERM"]
104
+ return true if COLOR_TERM_VALUES.include?(colorterm)
102
105
 
103
106
  # Default: assume colors are supported on modern terminals
104
107
  # This is a safe default for most use cases
@@ -123,16 +126,16 @@ module Canon
123
126
  #
124
127
  # @return [Boolean] true if CI environment likely supports colors
125
128
  def detect_ci_colors
129
+ # Most modern CI systems support ANSI colors
130
+ # Only disable for explicitly known non-color CI
131
+ return false if ENV["TERM"] == "dumb"
132
+
126
133
  # GitHub Actions explicitly supports colors
127
134
  return true if ENV["GITHUB_ACTIONS"]
128
135
 
129
136
  # TeamCity supports colors with specific env var
130
137
  return true if ENV["TEAMCITY_VERSION"]
131
138
 
132
- # Most modern CI systems support ANSI colors
133
- # Only disable for explicitly known non-color CI
134
- return false if ENV["TERM"] == "dumb"
135
-
136
139
  # Default to supporting colors in CI
137
140
  true
138
141
  end
@@ -44,12 +44,20 @@ module Canon
44
44
  # Normalized text comparison
45
45
  #
46
46
  # Collapses whitespace and compares.
47
+ # Two whitespace-only strings that both normalize to empty are equivalent.
47
48
  #
48
49
  # @param text1 [String, nil] First text
49
50
  # @param text2 [String, nil] Second text
50
51
  # @return [Boolean] true if normalized texts are equal
51
52
  def compare_normalize(text1, text2)
52
- normalize_text(text1) == normalize_text(text2)
53
+ normalized1 = normalize_text(text1)
54
+ normalized2 = normalize_text(text2)
55
+
56
+ # Both empty after normalization = equivalent
57
+ # This handles whitespace-only text nodes that normalize to empty
58
+ return true if normalized1.empty? && normalized2.empty?
59
+
60
+ normalized1 == normalized2
53
61
  end
54
62
 
55
63
  private
@@ -60,6 +60,11 @@ module Canon
60
60
  def equivalent?(html1, html2, opts = {}, child_opts = {})
61
61
  opts = DEFAULT_OPTS.merge(opts)
62
62
 
63
+ # Capture original HTML strings BEFORE any parsing/transformation
64
+ # These are used for display to preserve original formatting
65
+ original_str1 = extract_original_string(html1)
66
+ original_str2 = extract_original_string(html2)
67
+
63
68
  # Resolve match options with format-specific defaults
64
69
  match_opts_hash = MatchOptions::Xml.resolve(
65
70
  format: :html,
@@ -117,41 +122,14 @@ module Canon
117
122
  # This is a SAFETY CHECK for legacy cases where Nokogiri nodes might still be used
118
123
  # The main path (parse_node) now returns Canon::Xml::Nodes::RootNode, so this
119
124
  # check should rarely trigger, but we keep it for robustness
120
- if (node1.is_a?(Nokogiri::HTML4::DocumentFragment) ||
121
- node1.is_a?(Nokogiri::XML::DocumentFragment)) &&
122
- (node2.is_a?(Nokogiri::HTML4::DocumentFragment) ||
123
- node2.is_a?(Nokogiri::XML::DocumentFragment))
124
- # Compare children of fragments - filter them first
125
- all_children1 = node1.children.to_a
126
- all_children2 = node2.children.to_a
127
-
128
- # Filter children based on match options (e.g., ignore comments)
129
- children1 = XmlNodeComparison.filter_children(all_children1, opts)
130
- children2 = XmlNodeComparison.filter_children(all_children2, opts)
131
-
132
- if children1.length != children2.length
133
- result = Comparison::UNEQUAL_ELEMENTS
134
- elsif children1.empty?
135
- result = Comparison::EQUIVALENT
136
- else
137
- # Compare each pair of children
138
- result = Comparison::EQUIVALENT
139
- children1.zip(children2).each do |child1, child2|
140
- child_result = XmlNodeComparison.compare_nodes(child1, child2,
141
- opts, child_opts,
142
- diff_children,
143
- differences)
144
- if child_result != Comparison::EQUIVALENT
145
- result = child_result
146
- break
147
- end
148
- end
149
- end
150
- else
151
- result = XmlNodeComparison.compare_nodes(node1, node2, opts,
125
+ result = if fragment_nodes?(node1, node2)
126
+ compare_fragment_children(node1, node2, opts, child_opts,
127
+ diff_children, differences)
128
+ else
129
+ XmlNodeComparison.compare_nodes(node1, node2, opts,
152
130
  child_opts, diff_children,
153
131
  differences)
154
- end
132
+ end
155
133
 
156
134
  # Classify DiffNodes as normative/informative if we have verbose output
157
135
  if opts[:verbose] && !differences.empty?
@@ -165,6 +143,7 @@ module Canon
165
143
  ComparisonResult.new(
166
144
  differences: differences,
167
145
  preprocessed_strings: [preprocessed_str1, preprocessed_str2],
146
+ original_strings: [original_str1, original_str2],
168
147
  format: :html,
169
148
  html_version: detect_html_version_from_node(node1),
170
149
  match_options: match_opts_hash,
@@ -187,6 +166,53 @@ module Canon
187
166
 
188
167
  private
189
168
 
169
+ # Check if both nodes are document fragments
170
+ #
171
+ # @param node1 [Object] First node
172
+ # @param node2 [Object] Second node
173
+ # @return [Boolean] true if both are document fragments
174
+ def fragment_nodes?(node1, node2)
175
+ (node1.is_a?(Nokogiri::HTML4::DocumentFragment) ||
176
+ node1.is_a?(Nokogiri::XML::DocumentFragment)) &&
177
+ (node2.is_a?(Nokogiri::HTML4::DocumentFragment) ||
178
+ node2.is_a?(Nokogiri::XML::DocumentFragment))
179
+ end
180
+
181
+ # Compare children of document fragments
182
+ #
183
+ # @param node1 [Nokogiri::HTML4::DocumentFragment, Nokogiri::XML::DocumentFragment] First fragment
184
+ # @param node2 [Nokogiri::HTML4::DocumentFragment, Nokogiri::XML::DocumentFragment] Second fragment
185
+ # @param opts [Hash] Comparison options
186
+ # @param child_opts [Hash] Child comparison options
187
+ # @param diff_children [Boolean] Whether to diff children
188
+ # @param differences [Array] Array to append differences to
189
+ # @return [Symbol] Comparison result constant
190
+ def compare_fragment_children(node1, node2, opts, child_opts,
191
+ diff_children, differences)
192
+ all_children1 = node1.children.to_a
193
+ all_children2 = node2.children.to_a
194
+
195
+ children1 = XmlNodeComparison.filter_children(all_children1, opts)
196
+ children2 = XmlNodeComparison.filter_children(all_children2, opts)
197
+
198
+ if children1.length != children2.length
199
+ return Comparison::UNEQUAL_ELEMENTS
200
+ elsif children1.empty?
201
+ return Comparison::EQUIVALENT
202
+ end
203
+
204
+ # Compare each pair of children
205
+ children1.zip(children2).each do |child1, child2|
206
+ child_result = XmlNodeComparison.compare_nodes(child1, child2,
207
+ opts, child_opts,
208
+ diff_children,
209
+ differences)
210
+ return child_result if child_result != Comparison::EQUIVALENT
211
+ end
212
+
213
+ Comparison::EQUIVALENT
214
+ end
215
+
190
216
  # Perform semantic tree diff using SemanticTreeMatchStrategy
191
217
  #
192
218
  # @param html1 [String, Nokogiri::HTML::Document] First HTML
@@ -195,6 +221,11 @@ module Canon
195
221
  # @param match_opts_hash [Hash] Resolved match options
196
222
  # @return [Boolean, ComparisonResult] Result of tree diff comparison
197
223
  def perform_semantic_tree_diff(html1, html2, opts, match_opts_hash)
224
+ # Capture original HTML strings BEFORE any parsing/transformation
225
+ # These are used for display to preserve original formatting
226
+ original_str1 = extract_original_string(html1)
227
+ original_str2 = extract_original_string(html2)
228
+
198
229
  # Parse to Canon::Xml::Node (preserves preprocessing)
199
230
  # For HTML, we parse as XML to get Canon::Xml::Node structure
200
231
  node1 = parse_node_for_semantic(html1,
@@ -223,6 +254,7 @@ module Canon
223
254
  ComparisonResult.new(
224
255
  differences: differences,
225
256
  preprocessed_strings: preprocessed,
257
+ original_strings: [original_str1, original_str2],
226
258
  format: :html,
227
259
  html_version: html_version,
228
260
  match_options: match_opts_hash.merge(strategy.metadata),
@@ -343,7 +375,7 @@ module Canon
343
375
  # If already a Nokogiri node, check for incompatible XML documents
344
376
  unless node.is_a?(String)
345
377
  # Detect if this is an XML document (not HTML)
346
- if is_xml_document?(node)
378
+ if xml_document?(node)
347
379
  raise Canon::CompareFormatMismatchError.new(:xml, :html)
348
380
  end
349
381
 
@@ -508,6 +540,28 @@ module Canon
508
540
  end
509
541
  end
510
542
 
543
+ # Extract original HTML string from various input types
544
+ # This preserves the original formatting without minification
545
+ #
546
+ # @param html [String, Nokogiri::XML::Node, Canon::Xml::Node] Input HTML
547
+ # @return [String] Original HTML string
548
+ def extract_original_string(html)
549
+ if html.is_a?(String)
550
+ html
551
+ elsif html.is_a?(Canon::Xml::Node)
552
+ # Serialize Canon nodes to string
553
+ Canon::Xml::DataModel.serialize(html)
554
+ elsif html.respond_to?(:to_html)
555
+ # Nokogiri nodes - use to_html to preserve formatting
556
+ html.to_html
557
+ elsif html.respond_to?(:to_s)
558
+ html.to_s
559
+ else
560
+ raise Canon::Error,
561
+ "Unable to extract original string from: #{html.class}"
562
+ end
563
+ end
564
+
511
565
  # Normalize HTML comments within style and script tags
512
566
  # Also removes whitespace-only CDATA children that Nokogiri creates
513
567
  def normalize_html_style_script_comments(doc)
@@ -637,7 +691,7 @@ compare_profile = nil)
637
691
  # Check if a node is an XML document (not HTML)
638
692
  # XML documents typically have XML processing instructions or are
639
693
  # instances of Nokogiri::XML::Document (not HTML variants)
640
- def is_xml_document?(node)
694
+ def xml_document?(node)
641
695
  # Check if it's a pure XML document (not HTML4/HTML5 which also
642
696
  # inherit from XML::Document)
643
697
  # Check both Document and DocumentFragment variants
@@ -25,6 +25,11 @@ module Canon
25
25
  return content unless content.is_a?(String)
26
26
  return content if already_parsed?(content)
27
27
 
28
+ # Normalize HTML to ensure consistent parsing by HTML4.fragment
29
+ # The key issue is that HTML4.fragment treats whitespace after </head>
30
+ # differently than no whitespace, causing inconsistent parsing
31
+ content = normalize_html_for_parsing(content)
32
+
28
33
  begin
29
34
  case format
30
35
  when :html5
@@ -74,6 +79,23 @@ module Canon
74
79
  # Check for HTML5 DOCTYPE (case-sensitive literal match)
75
80
  content.include?("<!DOCTYPE html>") ? :html5 : :html4
76
81
  end
82
+
83
+ # Normalize HTML to ensure consistent parsing by HTML4.fragment
84
+ #
85
+ # The key issue is that HTML4.fragment treats whitespace after </head>
86
+ # differently than no whitespace, causing inconsistent parsing:
87
+ # - "</head>\n<body>" parses to [body, ...] (body is treated as content)
88
+ # - "</head><body>" parses to [meta, div, ...] (wrapper tags stripped)
89
+ #
90
+ # This method normalizes the HTML to ensure consistent parsing.
91
+ #
92
+ # @param content [String] HTML content
93
+ # @return [String] Normalized HTML content
94
+ def normalize_html_for_parsing(content)
95
+ # Remove whitespace between </head> and <body> to ensure consistent parsing
96
+ # This makes formatted and minified HTML parse the same way
97
+ content.gsub(%r{</head>\s*<body>}i, "</head><body>")
98
+ end
77
99
  end
78
100
  end
79
101
  end