canon 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +163 -67
  3. data/README.adoc +400 -7
  4. data/docs/Gemfile +9 -0
  5. data/docs/INDEX.adoc +99 -182
  6. data/docs/_config.yml +100 -0
  7. data/docs/advanced/diff-classification.adoc +547 -0
  8. data/docs/advanced/diff-pipeline.adoc +358 -0
  9. data/docs/advanced/index.adoc +214 -0
  10. data/docs/advanced/semantic-diff-report.adoc +390 -0
  11. data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
  12. data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
  13. data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
  14. data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
  15. data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
  16. data/docs/features/diff-formatting/display-filtering.adoc +472 -0
  17. data/docs/features/diff-formatting/index.adoc +140 -0
  18. data/docs/features/environment-configuration/index.adoc +327 -0
  19. data/docs/features/environment-configuration/override-system.adoc +436 -0
  20. data/docs/features/environment-configuration/size-limits.adoc +273 -0
  21. data/docs/features/index.adoc +173 -0
  22. data/docs/features/input-validation/index.adoc +521 -0
  23. data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
  24. data/docs/features/match-options/html-policies.adoc +312 -0
  25. data/docs/features/match-options/index.adoc +621 -0
  26. data/docs/getting-started/index.adoc +83 -0
  27. data/docs/getting-started/quick-start.adoc +76 -0
  28. data/docs/guides/choosing-configuration.adoc +689 -0
  29. data/docs/guides/index.adoc +181 -0
  30. data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
  31. data/docs/interfaces/index.adoc +101 -0
  32. data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
  33. data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
  34. data/docs/lychee.toml +65 -0
  35. data/docs/reference/cli-options.adoc +418 -0
  36. data/docs/reference/environment-variables.adoc +375 -0
  37. data/docs/reference/index.adoc +204 -0
  38. data/docs/reference/options-across-interfaces.adoc +417 -0
  39. data/docs/understanding/algorithms/dom-diff.adoc +389 -0
  40. data/docs/understanding/algorithms/index.adoc +314 -0
  41. data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
  42. data/docs/understanding/architecture.adoc +447 -0
  43. data/docs/understanding/comparison-pipeline.adoc +317 -0
  44. data/docs/understanding/formats/html.adoc +380 -0
  45. data/docs/understanding/formats/index.adoc +261 -0
  46. data/docs/understanding/formats/json.adoc +390 -0
  47. data/docs/understanding/formats/xml.adoc +366 -0
  48. data/docs/understanding/formats/yaml.adoc +504 -0
  49. data/docs/understanding/index.adoc +130 -0
  50. data/lib/canon/cli.rb +42 -1
  51. data/lib/canon/commands/diff_command.rb +108 -23
  52. data/lib/canon/comparison/compare_profile.rb +101 -0
  53. data/lib/canon/comparison/comparison_result.rb +41 -2
  54. data/lib/canon/comparison/html_comparator.rb +292 -71
  55. data/lib/canon/comparison/html_compare_profile.rb +117 -0
  56. data/lib/canon/comparison/match_options.rb +42 -4
  57. data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
  58. data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
  59. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
  60. data/lib/canon/comparison/xml_comparator.rb +695 -91
  61. data/lib/canon/comparison.rb +207 -2
  62. data/lib/canon/config/env_provider.rb +71 -0
  63. data/lib/canon/config/env_schema.rb +58 -0
  64. data/lib/canon/config/override_resolver.rb +55 -0
  65. data/lib/canon/config/type_converter.rb +59 -0
  66. data/lib/canon/config.rb +158 -29
  67. data/lib/canon/data_model.rb +29 -0
  68. data/lib/canon/diff/diff_classifier.rb +74 -14
  69. data/lib/canon/diff/diff_context_builder.rb +41 -0
  70. data/lib/canon/diff/diff_line.rb +18 -2
  71. data/lib/canon/diff/diff_node.rb +18 -3
  72. data/lib/canon/diff/diff_node_mapper.rb +71 -12
  73. data/lib/canon/diff/formatting_detector.rb +53 -0
  74. data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
  75. data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
  76. data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
  77. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
  78. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
  79. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
  80. data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
  81. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
  82. data/lib/canon/diff_formatter/debug_output.rb +7 -1
  83. data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
  84. data/lib/canon/diff_formatter/legend.rb +42 -0
  85. data/lib/canon/diff_formatter.rb +78 -9
  86. data/lib/canon/errors.rb +56 -0
  87. data/lib/canon/formatters/html_formatter_base.rb +35 -1
  88. data/lib/canon/formatters/json_formatter.rb +3 -0
  89. data/lib/canon/formatters/yaml_formatter.rb +3 -0
  90. data/lib/canon/html/data_model.rb +229 -0
  91. data/lib/canon/html.rb +9 -0
  92. data/lib/canon/options/cli_generator.rb +70 -0
  93. data/lib/canon/options/registry.rb +234 -0
  94. data/lib/canon/rspec_matchers.rb +34 -13
  95. data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
  96. data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
  97. data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
  98. data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
  99. data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
  100. data/lib/canon/tree_diff/core/matching.rb +241 -0
  101. data/lib/canon/tree_diff/core/node_signature.rb +164 -0
  102. data/lib/canon/tree_diff/core/node_weight.rb +135 -0
  103. data/lib/canon/tree_diff/core/tree_node.rb +450 -0
  104. data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
  105. data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
  106. data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
  107. data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
  108. data/lib/canon/tree_diff/operation_converter.rb +631 -0
  109. data/lib/canon/tree_diff/operations/operation.rb +92 -0
  110. data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
  111. data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
  112. data/lib/canon/tree_diff.rb +33 -0
  113. data/lib/canon/validators/json_validator.rb +3 -1
  114. data/lib/canon/validators/yaml_validator.rb +3 -1
  115. data/lib/canon/version.rb +1 -1
  116. data/lib/canon/xml/data_model.rb +22 -23
  117. data/lib/canon/xml/element_matcher.rb +128 -20
  118. data/lib/canon/xml/namespace_helper.rb +110 -0
  119. data/lib/canon.rb +3 -0
  120. metadata +81 -23
  121. data/_config.yml +0 -116
  122. data/docs/ADVANCED_TOPICS.adoc +0 -20
  123. data/docs/BASIC_USAGE.adoc +0 -16
  124. data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  125. data/docs/DIFF_ARCHITECTURE.adoc +0 -435
  126. data/docs/DIFF_FORMATTING.adoc +0 -540
  127. data/docs/FORMATS.adoc +0 -447
  128. data/docs/INPUT_VALIDATION.adoc +0 -477
  129. data/docs/MATCH_ARCHITECTURE.adoc +0 -463
  130. data/docs/MATCH_OPTIONS.adoc +0 -719
  131. data/docs/MODES.adoc +0 -432
  132. data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  133. data/docs/OPTIONS.adoc +0 -1387
  134. data/docs/PREPROCESSING.adoc +0 -491
  135. data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
  136. data/docs/UNDERSTANDING_CANON.adoc +0 -17
@@ -0,0 +1,365 @@
1
+ ---
2
+ title: Algorithm-Specific Behavior
3
+ parent: Match Options
4
+ grand_parent: Features
5
+ nav_order: 4
6
+ ---
7
+ = Algorithm-Specific Behavior
8
+
9
+ == Purpose
10
+
11
+ Match options control what to compare and how strictly, but **different algorithms interpret these options differently**. Understanding these differences is crucial for choosing the right configuration.
12
+
13
+ This page explains how the DOM and Semantic algorithms each handle match dimensions and provides guidance for migrating between algorithms.
14
+
15
+ == Key Concept
16
+
17
+ The same match option settings can produce different comparison behavior depending on which algorithm you choose:
18
+
19
+ * **DOM algorithm** uses options for element-by-element positional comparison
20
+ * **Semantic algorithm** uses options during signature calculation and similarity matching
21
+
22
+ == Algorithm Comparison
23
+
24
+ === DOM Algorithm Match Behavior
25
+
26
+ The DOM algorithm applies match options during **positional element comparison**:
27
+
28
+ **Characteristics**:
29
+ * Elements matched by position in document tree
30
+ * Match options control comparison strictness at each position
31
+ * No understanding of semantic relationships
32
+ * Order matters significantly
33
+
34
+ **How Options Are Used**:
35
+ * `text_content` - Controls how text at each position is compared
36
+ * `structural_whitespace` - Controls whitespace comparison in structure
37
+ * `attribute_order` - Controls whether attribute order must match
38
+ * `attribute_values` - Controls how attribute values are compared
39
+
40
+ **Best For**:
41
+ * Documents with similar structure
42
+ * Traditional diff workflows
43
+ * Fast comparisons
44
+ * Stable, predictable results
45
+
46
+ === Semantic Algorithm Match Behavior
47
+
48
+ The Semantic algorithm applies match options during **signature calculation and similarity matching**:
49
+
50
+ **Characteristics**:
51
+ * Elements matched by semantic signatures
52
+ * Match options influence signature generation
53
+ * Understands moves, merges, splits
54
+ * Order less critical (uses similarity scoring)
55
+
56
+ **How Options Are Used**:
57
+ * `text_content` - Included in element signature
58
+ * `structural_whitespace` - Affects structural signatures
59
+ * `attribute_order` - Ignored (attributes are unordered in signatures)
60
+ * `attribute_values` - Included in element signature
61
+
62
+ **Best For**:
63
+ * Restructured documents
64
+ * Detecting semantic changes
65
+ * Operation-level analysis
66
+ * Content evolution tracking
67
+
68
+ == Match Dimension Handling by Algorithm
69
+
70
+ This table shows how each algorithm interprets each match dimension:
71
+
72
+ [cols="2,3,3"]
73
+ |===
74
+ |Match Dimension |DOM Algorithm |Semantic Algorithm
75
+
76
+ |**text_content**
77
+ |Compares text at each position. `strict` requires exact match, `normalize` normalizes whitespace, `ignore` skips text comparison
78
+ |Influences element signature. `strict` includes exact text, `normalize` includes normalized text, `ignore` excludes text from signature
79
+
80
+ |**structural_whitespace**
81
+ |Compares whitespace-only text nodes at each position
82
+ |Affects structural signature calculation. Normalized whitespace creates different signatures
83
+
84
+ |**attribute_whitespace**
85
+ |Compares whitespace in attribute values at each position
86
+ |Affects attribute value signatures. Normalized values create different signatures
87
+
88
+ |**attribute_order**
89
+ |`strict` requires same attribute order, `ignore` allows any order at each position
90
+ |Always ignored - attributes are unordered in semantic signatures
91
+
92
+ |**attribute_values**
93
+ |Compares attribute values at each position
94
+ |Attribute values included in element signature
95
+
96
+ |**key_order** (JSON/YAML)
97
+ |`strict` requires same key order, `ignore` allows any order
98
+ |Always ignored - keys are unordered in semantic signatures
99
+
100
+ |**comments**
101
+ |Compares comments at each position. `strict` requires exact match, `normalize` normalizes, `ignore` skips
102
+ |Comments can be included in signatures or ignored. Less impact than in DOM
103
+
104
+ |**namespace_uri** (XML)
105
+ |Always compared strictly - elements must have same namespace URI to match at each position
106
+ |Always included in element signature - elements must have same namespace URI for signatures to match
107
+ |===
108
+
109
+ == Example: Same Options, Different Results
110
+
111
+ Here's an example showing how the same match options produce different results with each algorithm:
112
+
113
+ === Document Pair
114
+
115
+ [source,xml]
116
+ ----
117
+ <!-- Document 1 -->
118
+ <book>
119
+ <title>Canon Guide</title>
120
+ <author>John Doe</author>
121
+ </book>
122
+
123
+ <!-- Document 2 -->
124
+ <book>
125
+ <author>John Doe</author>
126
+ <title>Canon Guide</title>
127
+ </book>
128
+ ----
129
+
130
+ === DOM Algorithm Result
131
+
132
+ [source,ruby]
133
+ ----
134
+ Canon::Comparison.equivalent?(doc1, doc2,
135
+ diff_algorithm: :dom,
136
+ match: {
137
+ text_content: :normalize,
138
+ attribute_order: :ignore
139
+ },
140
+ verbose: true
141
+ )
142
+ # Result: NOT EQUIVALENT
143
+ # Reason: Elements at positions don't match (title vs author)
144
+ # Even though content is identical, position matters
145
+ ----
146
+
147
+ === Semantic Algorithm Result
148
+
149
+ [source,ruby]
150
+ ----
151
+ Canon::Comparison.equivalent?(doc1, doc2,
152
+ diff_algorithm: :semantic,
153
+ match: {
154
+ text_content: :normalize,
155
+ attribute_order: :ignore
156
+ },
157
+ verbose: true
158
+ )
159
+ # Result: EQUIVALENT (with MOVE operation)
160
+ # Reason: Elements have same signatures, just reordered
161
+ # Semantic algorithm detects this as a MOVE operation
162
+ ----
163
+
164
+ == Match Profile Behavior Differences
165
+
166
+ Match profiles also behave differently with each algorithm:
167
+
168
+ === `strict` Profile
169
+
170
+ **DOM Algorithm**:
171
+ * Exact positional matching
172
+ * All elements must be in same positions
173
+ * Whitespace must match exactly
174
+ * Fast comparison
175
+
176
+ **Semantic Algorithm**:
177
+ * Exact signature matching
178
+ * Elements can be reordered
179
+ * Signatures must match exactly
180
+ * Slower but detects moves
181
+
182
+ === `spec_friendly` Profile
183
+
184
+ **DOM Algorithm**:
185
+ * Ignores formatting at each position
186
+ * Position still matters
187
+ * Good for test assertions with similar structure
188
+
189
+ **Semantic Algorithm**:
190
+ * Ignores formatting in signatures
191
+ * Position doesn't matter
192
+ * Good for test assertions with any structure
193
+
194
+ === `content_only` Profile
195
+
196
+ **DOM Algorithm**:
197
+ * Compares only text content at positions
198
+ * Still position-dependent
199
+ * Ignores all structural differences at each position
200
+
201
+ **Semantic Algorithm**:
202
+ * Generates signatures from content only
203
+ * Position-independent
204
+ * True content-only comparison
205
+
206
+ == Migration Guide
207
+
208
+ === Switching from DOM to Semantic
209
+
210
+ When migrating from DOM to Semantic algorithm:
211
+
212
+ **Expected Changes**:
213
+ 1. **Reordered elements** will be detected as MOVEs instead of DELETE+INSERT
214
+ 2. **attribute_order** setting becomes irrelevant (always ignored)
215
+ 3. **Performance** will be slower, in exchange for more intelligent matching
216
+ 4. **Output format** changes to operation-based
217
+
218
+ **Configuration Adjustments**:
219
+
220
+ [source,ruby]
221
+ ----
222
+ # Before (DOM)
223
+ Canon::Comparison.equivalent?(doc1, doc2,
224
+ diff_algorithm: :dom,
225
+ match: {
226
+ attribute_order: :strict # This mattered
227
+ },
228
+ diff_mode: :by_line
229
+ )
230
+
231
+ # After (Semantic)
232
+ Canon::Comparison.equivalent?(doc1, doc2,
233
+ diff_algorithm: :semantic,
234
+ match: {
235
+ attribute_order: :ignore # Changed (but actually doesn't matter)
236
+ },
237
+ diff_mode: :by_object # Better for semantic output
238
+ )
239
+ ----
240
+
241
+ **What to Watch For**:
242
+ * Tests expecting positional differences may now pass (moves detected)
243
+ * Diff output format changes significantly
244
+ * Performance may be slower on large documents
245
+
246
+ === Switching from Semantic to DOM
247
+
248
+ When migrating from Semantic to DOM algorithm:
249
+
250
+ **Expected Changes**:
251
+ 1. **MOVE operations** will become DELETE+INSERT pairs
252
+ 2. **Reordered content** will show as differences
253
+ 3. **Performance** will be faster
254
+ 4. **Output format** changes to line-based
255
+
256
+ **Configuration Adjustments**:
257
+
258
+ [source,ruby]
259
+ ----
260
+ # Before (Semantic)
261
+ Canon::Comparison.equivalent?(doc1, doc2,
262
+ diff_algorithm: :semantic,
263
+ diff_mode: :by_object
264
+ )
265
+
266
+ # After (DOM)
267
+ Canon::Comparison.equivalent?(doc1, doc2,
268
+ diff_algorithm: :dom,
269
+ match: {
270
+ attribute_order: :ignore # May want to add this
271
+ },
272
+ diff_mode: :by_line # Better for DOM output
273
+ )
274
+ ----
275
+
276
+ **What to Watch For**:
277
+ * Tests may now fail on reordered content
278
+ * Need to add `attribute_order: :ignore` if attribute order shouldn't matter
279
+ * Diff output is less semantic, more positional
280
+
281
+ == Choosing the Right Algorithm
282
+
283
+ === Use DOM Algorithm When
284
+
285
+ * Documents have similar structure
286
+ * Position matters
287
+ * Fast performance is critical
288
+ * Traditional diff output is sufficient
289
+ * Stability is important (production use)
290
+
291
+ === Use Semantic Algorithm When
292
+
293
+ * Documents may be restructured
294
+ * Need to detect moves/reorders
295
+ * Operation-level analysis is valuable
296
+ * Content evolution tracking is needed
297
+ * Willing to accept experimental status
298
+
299
+ == Common Patterns
300
+
301
+ === Pattern 1: Test-Friendly DOM Comparison
302
+
303
+ [source,ruby]
304
+ ----
305
+ Canon::Comparison.equivalent?(expected, actual,
306
+ diff_algorithm: :dom,
307
+ match_profile: :spec_friendly,
308
+ verbose: true
309
+ )
310
+ # Ignores formatting but requires same structure
311
+ ----
312
+
313
+ === Pattern 2: Content-Only Semantic Comparison
314
+
315
+ [source,ruby]
316
+ ----
317
+ Canon::Comparison.equivalent?(doc1, doc2,
318
+ diff_algorithm: :semantic,
319
+ match_profile: :content_only,
320
+ verbose: true,
321
+ diff_mode: :by_object
322
+ )
323
+ # True content comparison, structure-independent
324
+ ----
325
+
326
+ === Pattern 3: Hybrid Approach
327
+
328
+ [source,ruby]
329
+ ----
330
+ # Try DOM first (fast)
331
+ if Canon::Comparison.equivalent?(doc1, doc2, diff_algorithm: :dom)
332
+ puts "Documents equivalent"
333
+ else
334
+ # Use semantic for detailed analysis
335
+ result = Canon::Comparison.equivalent?(doc1, doc2,
336
+ diff_algorithm: :semantic,
337
+ verbose: true,
338
+ diff_mode: :by_object
339
+ )
340
+ puts result.operations
341
+ end
342
+ ----
343
+
344
+ == Performance Implications
345
+
346
+ === DOM Algorithm Performance
347
+
348
+ * **Speed**: Fast (linear with document size)
349
+ * **Memory**: Low (processes line-by-line)
350
+ * **Best for**: Documents < 100KB
351
+
352
+ === Semantic Algorithm Performance
353
+
354
+ * **Speed**: Slower (quadratic worst case)
355
+ * **Memory**: Higher (builds tree structures)
356
+ * **Best for**: Documents < 10KB or where intelligence is worth the cost
357
+
358
+ == See Also
359
+
360
+ * link:index.adoc[Match Options Overview]
361
+ * link:../../understanding/algorithms/[Algorithms] - Detailed algorithm documentation
362
+ * link:dimensions.adoc[Match Dimensions] - All available dimensions
363
+ * link:profiles.adoc[Match Profiles] - Preset configurations
364
+ * link:../diff-formatting/algorithm-specific-output.adoc[Algorithm-Specific Output] - How output differs
365
+ * link:../../guides/choosing-configuration.adoc[Choosing Configuration] - Decision guide
@@ -0,0 +1,312 @@
1
+ ---
2
+ layout: default
3
+ title: HTML-Specific Policies
4
+ parent: Match Options
5
+ grand_parent: Features
6
+ nav_order: 4
7
+ ---
8
+
9
+ :toc:
10
+ :toclevels: 3
11
+
12
+ == HTML-Specific Comparison Policies
13
+
14
+ === Overview
15
+
16
+ HTML comparison has specific policies that differ from XML due to HTML's unique
17
+ characteristics and rendering behavior. Canon uses `HtmlCompareProfile` to
18
+ implement these format-specific policies.
19
+
20
+ === Default Policies
21
+
22
+ HTML uses the `:rendered` profile by default:
23
+
24
+ [source,ruby]
25
+ ----
26
+ {
27
+ preprocessing: :rendered,
28
+ text_content: :normalize,
29
+ structural_whitespace: :normalize,
30
+ comments: :ignore,
31
+ attribute_order: :ignore
32
+ }
33
+ ----
34
+
35
+ This reflects how browsers render HTML - whitespace is normalized and comments
36
+ are not rendered.
37
+
38
+ === HTML Version Detection
39
+
40
+ Canon automatically detects HTML version:
41
+
42
+ * **HTML4**: Case-insensitive element/attribute names
43
+ * **HTML5**: Case-sensitive (preserves case)
44
+
45
+ Detection is based on DOCTYPE or parsing mode.
46
+
47
+ === Whitespace Preservation
48
+
49
+ Certain HTML elements require strict whitespace preservation regardless of the
50
+ `text_content` policy:
51
+
52
+ [cols="1,3"]
53
+ |===
54
+ |Element |Purpose
55
+
56
+ |`<pre>`
57
+ |Preformatted text blocks
58
+
59
+ |`<code>`
60
+ |Code snippets
61
+
62
+ |`<textarea>`
63
+ |Form input fields
64
+
65
+ |`<script>`
66
+ |JavaScript code
67
+
68
+ |`<style>`
69
+ |CSS stylesheets
70
+ |===
71
+
72
+ Inside these elements, ALL whitespace is preserved even when `text_content:
73
+ :normalize` is set.
74
+
75
+ .Example: Whitespace preservation in <pre>
76
+ ====
77
+ [source,ruby]
78
+ ----
79
+ html1 = "<pre>Line 1\n Line 2</pre>"
80
+ html2 = "<pre>Line 1\nLine 2</pre>"
81
+
82
+ # Whitespace is preserved - not equivalent
83
+ Canon::Comparison.equivalent?(html1, html2, preprocessing: :rendered)
84
+ # => false
85
+ ----
86
+
87
+ The indentation difference matters in `<pre>` elements.
88
+ ====
89
+
90
+ .Example: Whitespace normalization in <div>
91
+ ====
92
+ [source,ruby]
93
+ ----
94
+ html1 = '<div>Text   with   spaces</div>'
95
+ html2 = '<div>Text with spaces</div>'
96
+
97
+ # Whitespace is normalized - equivalent
98
+ Canon::Comparison.equivalent?(html1, html2, preprocessing: :rendered)
99
+ # => true
100
+ ----
101
+
102
+ Multiple spaces are normalized to single spaces in regular elements.
103
+ ====
104
+
105
+ === Comment Handling
106
+
107
+ HTML comments are treated as informative (non-normative) by default, because they do not affect rendering:
108
+
109
+ [source,ruby]
110
+ ----
111
+ # Default: comments ignored (informative)
112
+ html1 = '<div><!-- comment --><p>Text</p></div>'
113
+ html2 = '<div><p>Text</p></div>'
114
+
115
+ Canon::Comparison.equivalent?(html1, html2)
116
+ # => true (comments don't affect equivalence)
117
+
118
+ # Strict mode: comments compared (normative)
119
+ Canon::Comparison.equivalent?(html1, html2, match: { comments: :strict })
120
+ # => false (comments affect equivalence)
121
+ ----
122
+
123
+ ==== Why comments are ignored by default
124
+
125
+ In HTML, comments are typically used for:
126
+ * Developer notes
127
+ * Conditional comments (IE hacks)
128
+ * Disabled code blocks
129
+ * Build tool markers
130
+
131
+ They don't affect rendering or semantic meaning, so they're
132
+ informative by default.
133
+
134
+ === Case Sensitivity
135
+
136
+ HTML4 and HTML5 have different case sensitivity rules:
137
+
138
+ .HTML4 (case-insensitive)
139
+ ====
140
+ [source,ruby]
141
+ ----
142
+ html1 = '<DIV CLASS="test">Content</DIV>'
143
+ html2 = '<div class="test">Content</div>'
144
+
145
+ Canon::Comparison.equivalent?(html1, html2, format: :html4)
146
+ # => true (case doesn't matter in HTML4)
147
+ ----
148
+ ====
149
+
150
+ .HTML5 (case-sensitive)
151
+ ====
152
+ [source,ruby]
153
+ ----
154
+ html1 = '<DIV CLASS="test">Content</DIV>'
155
+ html2 = '<div class="test">Content</div>'
156
+
157
+ Canon::Comparison.equivalent?(html1, html2, format: :html5)
158
+ # => false (case matters in HTML5, though uncommon)
159
+ ----
160
+ ====
161
+
162
+ === Usage Examples
163
+
164
+ ==== Default HTML comparison
165
+
166
+ [source,ruby]
167
+ ----
168
+ require 'canon/comparison'
169
+
170
+ html1 = '<div> <p> Text </p> </div>'
171
+ html2 = '<div><p>Text</p></div>'
172
+
173
+ # Uses HtmlCompareProfile automatically
174
+ result = Canon::Comparison.equivalent?(html1, html2)
175
+ # => true (whitespace normalized, comments ignored)
176
+ ----
177
+
178
+ ==== Strict HTML comparison
179
+
180
+ [source,ruby]
181
+ ----
182
+ # All differences matter
183
+ result = Canon::Comparison.equivalent?(html1, html2,
184
+ match: {
185
+ text_content: :strict,
186
+ structural_whitespace: :strict,
187
+ comments: :strict,
188
+ attribute_order: :strict
189
+ }
190
+ )
191
+ # => false (whitespace differences are normative)
192
+ ----
193
+
194
+ ==== Mixed policies
195
+
196
+ [source,ruby]
197
+ ----
198
+ # Normalize whitespace but compare comments strictly
199
+ result = Canon::Comparison.equivalent?(html1, html2,
200
+ match: {
201
+ text_content: :normalize,
202
+ structural_whitespace: :normalize,
203
+ comments: :strict
204
+ }
205
+ )
206
+ ----
207
+
208
+ === Preprocessing Options
209
+
210
+ HTML supports several preprocessing modes:
211
+
212
+ ==== `:rendered` (default)
213
+
214
+ Simulates browser rendering:
215
+ - Normalizes whitespace
216
+ - Preserves whitespace in special elements
217
+ - Ignores comments
218
+
219
+ [source,ruby]
220
+ ----
221
+ Canon::Comparison.equivalent?(html1, html2, preprocessing: :rendered)
222
+ ----
223
+
224
+ ==== `:format`
225
+
226
+ Pretty-prints before comparison:
227
+ - Consistent indentation
228
+ - One element per line
229
+ - Good for visual diffs
230
+
231
+ [source,ruby]
232
+ ----
233
+ Canon::Comparison.equivalent?(html1, html2, preprocessing: :format)
234
+ ----
235
+
236
+ ==== `:none`
237
+
238
+ No preprocessing:
239
+ - Raw comparison
240
+ - Useful for exact matching
241
+
242
+ [source,ruby]
243
+ ----
244
+ Canon::Comparison.equivalent?(html1, html2, preprocessing: :none)
245
+ ----
246
+
247
+ === Advanced Examples
248
+
249
+ ==== Compare HTML with mixed content
250
+
251
+ [source,ruby]
252
+ ----
253
+ html1 = '<p>This  is  <em>important</em>  text.</p>'
254
+ html2 = '<p>This is <em>important</em> text.</p>'
255
+
256
+ result = Canon::Comparison.equivalent?(
257
+ html1, html2,
258
+ verbose: true,
259
+ match: { text_content: :normalize, structural_whitespace: :normalize }
260
+ )
261
+
262
+ result.equivalent? # => true
263
+ result.differences # => [#<DiffNode formatting: true, normative: false>]
264
+ ----
265
+
266
+ ==== Compare with element-specific preservation
267
+
268
+ [source,ruby]
269
+ ----
270
+ html1 = '<div><pre> Code </pre></div>'
271
+ html2 = '<div><pre>Code</pre></div>'
272
+
273
+ # Whitespace preserved in <pre>, normalized in <div>
274
+ result = Canon::Comparison.equivalent?(html1, html2)
275
+ # => false (whitespace matters in <pre>)
276
+ ----
277
+
278
+ ==== Detect normative vs informative differences
279
+
280
+ [source,ruby]
281
+ ----
282
+ html1 = '<div class="a" id="1"><!-- v1 --><p>Text</p></div>'
283
+ html2 = '<div id="1" class="b"><!-- v2 --><p>Text</p></div>'
284
+
285
+ result = Canon::Comparison.equivalent?(
286
+ html1, html2,
287
+ verbose: true,
288
+ match: { attribute_order: :ignore, comments: :ignore }
289
+ )
290
+
291
+ # Attribute order: informative (ignored)
292
+ # Comments: informative (ignored)
293
+ # Attribute value (class): normative (different)
294
+
295
+ result.equivalent? # => false
296
+ result.differences.select(&:normative?) # => [class attribute diff]
297
+ result.differences.reject(&:normative?) # => [order diff, comment diff]
298
+ ----
299
+
300
+ === Implementation
301
+
302
+ See the following files for implementation details:
303
+
304
+ * link:../../lib/canon/comparison/html_compare_profile.rb[`lib/canon/comparison/html_compare_profile.rb`] - HTML-specific profile
305
+ * link:../../lib/canon/comparison/compare_profile.rb[`lib/canon/comparison/compare_profile.rb`] - Base profile
306
+ * link:../../spec/canon/comparison/html_compare_profile_spec.rb[`spec/canon/comparison/html_compare_profile_spec.rb`] - Comprehensive examples
307
+
308
+ === See Also
309
+
310
+ * link:index.html[Match Options] - Overview of match system
311
+ * link:algorithm-specific-behavior.html[Algorithm-Specific Behavior] - How algorithms handle options
312
+ * link:../../advanced/diff-classification.html[Diff Classification] - Normative vs informative