canon 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +163 -67
- data/README.adoc +400 -7
- data/docs/Gemfile +9 -0
- data/docs/INDEX.adoc +99 -182
- data/docs/_config.yml +100 -0
- data/docs/advanced/diff-classification.adoc +547 -0
- data/docs/advanced/diff-pipeline.adoc +358 -0
- data/docs/advanced/index.adoc +214 -0
- data/docs/advanced/semantic-diff-report.adoc +390 -0
- data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
- data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
- data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
- data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
- data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
- data/docs/features/diff-formatting/display-filtering.adoc +472 -0
- data/docs/features/diff-formatting/index.adoc +140 -0
- data/docs/features/environment-configuration/index.adoc +327 -0
- data/docs/features/environment-configuration/override-system.adoc +436 -0
- data/docs/features/environment-configuration/size-limits.adoc +273 -0
- data/docs/features/index.adoc +173 -0
- data/docs/features/input-validation/index.adoc +521 -0
- data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
- data/docs/features/match-options/html-policies.adoc +312 -0
- data/docs/features/match-options/index.adoc +621 -0
- data/docs/getting-started/index.adoc +83 -0
- data/docs/getting-started/quick-start.adoc +76 -0
- data/docs/guides/choosing-configuration.adoc +689 -0
- data/docs/guides/index.adoc +181 -0
- data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
- data/docs/interfaces/index.adoc +101 -0
- data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
- data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
- data/docs/lychee.toml +65 -0
- data/docs/reference/cli-options.adoc +418 -0
- data/docs/reference/environment-variables.adoc +375 -0
- data/docs/reference/index.adoc +204 -0
- data/docs/reference/options-across-interfaces.adoc +417 -0
- data/docs/understanding/algorithms/dom-diff.adoc +389 -0
- data/docs/understanding/algorithms/index.adoc +314 -0
- data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
- data/docs/understanding/architecture.adoc +447 -0
- data/docs/understanding/comparison-pipeline.adoc +317 -0
- data/docs/understanding/formats/html.adoc +380 -0
- data/docs/understanding/formats/index.adoc +261 -0
- data/docs/understanding/formats/json.adoc +390 -0
- data/docs/understanding/formats/xml.adoc +366 -0
- data/docs/understanding/formats/yaml.adoc +504 -0
- data/docs/understanding/index.adoc +130 -0
- data/lib/canon/cli.rb +42 -1
- data/lib/canon/commands/diff_command.rb +108 -23
- data/lib/canon/comparison/compare_profile.rb +101 -0
- data/lib/canon/comparison/comparison_result.rb +41 -2
- data/lib/canon/comparison/html_comparator.rb +292 -71
- data/lib/canon/comparison/html_compare_profile.rb +117 -0
- data/lib/canon/comparison/match_options.rb +42 -4
- data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
- data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
- data/lib/canon/comparison/xml_comparator.rb +695 -91
- data/lib/canon/comparison.rb +207 -2
- data/lib/canon/config/env_provider.rb +71 -0
- data/lib/canon/config/env_schema.rb +58 -0
- data/lib/canon/config/override_resolver.rb +55 -0
- data/lib/canon/config/type_converter.rb +59 -0
- data/lib/canon/config.rb +158 -29
- data/lib/canon/data_model.rb +29 -0
- data/lib/canon/diff/diff_classifier.rb +74 -14
- data/lib/canon/diff/diff_context_builder.rb +41 -0
- data/lib/canon/diff/diff_line.rb +18 -2
- data/lib/canon/diff/diff_node.rb +18 -3
- data/lib/canon/diff/diff_node_mapper.rb +71 -12
- data/lib/canon/diff/formatting_detector.rb +53 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
- data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
- data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
- data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
- data/lib/canon/diff_formatter/debug_output.rb +7 -1
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
- data/lib/canon/diff_formatter/legend.rb +42 -0
- data/lib/canon/diff_formatter.rb +78 -9
- data/lib/canon/errors.rb +56 -0
- data/lib/canon/formatters/html_formatter_base.rb +35 -1
- data/lib/canon/formatters/json_formatter.rb +3 -0
- data/lib/canon/formatters/yaml_formatter.rb +3 -0
- data/lib/canon/html/data_model.rb +229 -0
- data/lib/canon/html.rb +9 -0
- data/lib/canon/options/cli_generator.rb +70 -0
- data/lib/canon/options/registry.rb +234 -0
- data/lib/canon/rspec_matchers.rb +34 -13
- data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
- data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
- data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
- data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
- data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
- data/lib/canon/tree_diff/core/matching.rb +241 -0
- data/lib/canon/tree_diff/core/node_signature.rb +164 -0
- data/lib/canon/tree_diff/core/node_weight.rb +135 -0
- data/lib/canon/tree_diff/core/tree_node.rb +450 -0
- data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
- data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
- data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
- data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
- data/lib/canon/tree_diff/operation_converter.rb +631 -0
- data/lib/canon/tree_diff/operations/operation.rb +92 -0
- data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
- data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
- data/lib/canon/tree_diff.rb +33 -0
- data/lib/canon/validators/json_validator.rb +3 -1
- data/lib/canon/validators/yaml_validator.rb +3 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +22 -23
- data/lib/canon/xml/element_matcher.rb +128 -20
- data/lib/canon/xml/namespace_helper.rb +110 -0
- data/lib/canon.rb +3 -0
- metadata +81 -23
- data/_config.yml +0 -116
- data/docs/ADVANCED_TOPICS.adoc +0 -20
- data/docs/BASIC_USAGE.adoc +0 -16
- data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/docs/DIFF_FORMATTING.adoc +0 -540
- data/docs/FORMATS.adoc +0 -447
- data/docs/INPUT_VALIDATION.adoc +0 -477
- data/docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/docs/MATCH_OPTIONS.adoc +0 -719
- data/docs/MODES.adoc +0 -432
- data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/docs/OPTIONS.adoc +0 -1387
- data/docs/PREPROCESSING.adoc +0 -491
- data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
- data/docs/UNDERSTANDING_CANON.adoc +0 -17
|
@@ -0,0 +1,365 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: Algorithm-Specific Behavior
|
|
3
|
+
parent: Match Options
|
|
4
|
+
grand_parent: Features
|
|
5
|
+
nav_order: 4
|
|
6
|
+
---
|
|
7
|
+
= Algorithm-Specific Behavior
|
|
8
|
+
|
|
9
|
+
== Purpose
|
|
10
|
+
|
|
11
|
+
Match options control what to compare and how strictly, but **different algorithms interpret these options differently**. Understanding these differences is crucial for choosing the right configuration.
|
|
12
|
+
|
|
13
|
+
This page explains how the DOM and Semantic algorithms each handle match dimensions and provides guidance for migrating between algorithms.
|
|
14
|
+
|
|
15
|
+
== Key Concept
|
|
16
|
+
|
|
17
|
+
The same match option settings can produce different comparison behavior depending on which algorithm you choose:
|
|
18
|
+
|
|
19
|
+
* **DOM algorithm** uses options for element-by-element positional comparison
|
|
20
|
+
* **Semantic algorithm** uses options during signature calculation and similarity matching
|
|
21
|
+
|
|
22
|
+
== Algorithm Comparison
|
|
23
|
+
|
|
24
|
+
=== DOM Algorithm Match Behavior
|
|
25
|
+
|
|
26
|
+
The DOM algorithm applies match options during **positional element comparison**:
|
|
27
|
+
|
|
28
|
+
**Characteristics**:
|
|
29
|
+
* Elements matched by position in document tree
|
|
30
|
+
* Match options control comparison strictness at each position
|
|
31
|
+
* No understanding of semantic relationships
|
|
32
|
+
* Order matters significantly
|
|
33
|
+
|
|
34
|
+
**How Options Are Used**:
|
|
35
|
+
* `text_content` - Controls how text at each position is compared
|
|
36
|
+
* `structural_whitespace` - Controls whitespace comparison in structure
|
|
37
|
+
* `attribute_order` - Controls whether attribute order must match
|
|
38
|
+
* `attribute_values` - Controls how attribute values are compared
|
|
39
|
+
|
|
40
|
+
**Best For**:
|
|
41
|
+
* Documents with similar structure
|
|
42
|
+
* Traditional diff workflows
|
|
43
|
+
* Fast comparisons
|
|
44
|
+
* Stable, predictable results
|
|
45
|
+
|
|
46
|
+
=== Semantic Algorithm Match Behavior
|
|
47
|
+
|
|
48
|
+
The Semantic algorithm applies match options during **signature calculation and similarity matching**:
|
|
49
|
+
|
|
50
|
+
**Characteristics**:
|
|
51
|
+
* Elements matched by semantic signatures
|
|
52
|
+
* Match options influence signature generation
|
|
53
|
+
* Understands moves, merges, splits
|
|
54
|
+
* Order less critical (uses similarity scoring)
|
|
55
|
+
|
|
56
|
+
**How Options Are Used**:
|
|
57
|
+
* `text_content` - Included in element signature
|
|
58
|
+
* `structural_whitespace` - Affects structural signatures
|
|
59
|
+
* `attribute_order` - Ignored (attributes are unordered in signatures)
|
|
60
|
+
* `attribute_values` - Included in element signature
|
|
61
|
+
|
|
62
|
+
**Best For**:
|
|
63
|
+
* Restructured documents
|
|
64
|
+
* Detecting semantic changes
|
|
65
|
+
* Operation-level analysis
|
|
66
|
+
* Content evolution tracking
|
|
67
|
+
|
|
68
|
+
== Match Dimension Handling by Algorithm
|
|
69
|
+
|
|
70
|
+
This table shows how each algorithm interprets each match dimension:
|
|
71
|
+
|
|
72
|
+
[cols="2,3,3"]
|
|
73
|
+
|===
|
|
74
|
+
|Match Dimension |DOM Algorithm |Semantic Algorithm
|
|
75
|
+
|
|
76
|
+
|**text_content**
|
|
77
|
+
|Compares text at each position. `strict` requires exact match, `normalize` normalizes whitespace, `ignore` skips text comparison
|
|
78
|
+
|Influences element signature. `strict` includes exact text, `normalize` includes normalized text, `ignore` excludes text from signature
|
|
79
|
+
|
|
80
|
+
|**structural_whitespace**
|
|
81
|
+
|Compares whitespace-only text nodes at each position
|
|
82
|
+
|Affects structural signature calculation. Normalized whitespace creates different signatures
|
|
83
|
+
|
|
84
|
+
|**attribute_whitespace**
|
|
85
|
+
|Compares whitespace in attribute values at each position
|
|
86
|
+
|Affects attribute value signatures. Normalized values create different signatures
|
|
87
|
+
|
|
88
|
+
|**attribute_order**
|
|
89
|
+
|`strict` requires same attribute order, `ignore` allows any order at each position
|
|
90
|
+
|Always ignored - attributes are unordered in semantic signatures
|
|
91
|
+
|
|
92
|
+
|**attribute_values**
|
|
93
|
+
|Compares attribute values at each position
|
|
94
|
+
|Attribute values included in element signature
|
|
95
|
+
|
|
96
|
+
|**key_order** (JSON/YAML)
|
|
97
|
+
|`strict` requires same key order, `ignore` allows any order
|
|
98
|
+
|Always ignored - keys are unordered in semantic signatures
|
|
99
|
+
|
|
100
|
+
|**comments**
|
|
101
|
+
|Compares comments at each position. `strict` requires exact match, `normalize` normalizes, `ignore` skips
|
|
102
|
+
|Comments can be included in signatures or ignored. Less impact than in DOM
|
|
103
|
+
|
|
104
|
+
|**namespace_uri** (XML)
|
|
105
|
+
|Always compared strictly - elements must have same namespace URI to match at each position
|
|
106
|
+
|Always included in element signature - elements must have same namespace URI for signatures to match
|
|
107
|
+
|===
|
|
108
|
+
|
|
109
|
+
== Example: Same Options, Different Results
|
|
110
|
+
|
|
111
|
+
Here's an example showing how the same match options produce different results with each algorithm:
|
|
112
|
+
|
|
113
|
+
=== Document Pair
|
|
114
|
+
|
|
115
|
+
[source,xml]
|
|
116
|
+
----
|
|
117
|
+
<!-- Document 1 -->
|
|
118
|
+
<book>
|
|
119
|
+
<title>Canon Guide</title>
|
|
120
|
+
<author>John Doe</author>
|
|
121
|
+
</book>
|
|
122
|
+
|
|
123
|
+
<!-- Document 2 -->
|
|
124
|
+
<book>
|
|
125
|
+
<author>John Doe</author>
|
|
126
|
+
<title>Canon Guide</title>
|
|
127
|
+
</book>
|
|
128
|
+
----
|
|
129
|
+
|
|
130
|
+
=== DOM Algorithm Result
|
|
131
|
+
|
|
132
|
+
[source,ruby]
|
|
133
|
+
----
|
|
134
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
135
|
+
diff_algorithm: :dom,
|
|
136
|
+
match: {
|
|
137
|
+
text_content: :normalize,
|
|
138
|
+
attribute_order: :ignore
|
|
139
|
+
},
|
|
140
|
+
verbose: true
|
|
141
|
+
)
|
|
142
|
+
# Result: NOT EQUIVALENT
|
|
143
|
+
# Reason: Elements at positions don't match (title vs author)
|
|
144
|
+
# Even though content is identical, position matters
|
|
145
|
+
----
|
|
146
|
+
|
|
147
|
+
=== Semantic Algorithm Result
|
|
148
|
+
|
|
149
|
+
[source,ruby]
|
|
150
|
+
----
|
|
151
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
152
|
+
diff_algorithm: :semantic,
|
|
153
|
+
match: {
|
|
154
|
+
text_content: :normalize,
|
|
155
|
+
attribute_order: :ignore
|
|
156
|
+
},
|
|
157
|
+
verbose: true
|
|
158
|
+
)
|
|
159
|
+
# Result: EQUIVALENT (with MOVE operation)
|
|
160
|
+
# Reason: Elements have same signatures, just reordered
|
|
161
|
+
# Semantic algorithm detects this as a MOVE operation
|
|
162
|
+
----
|
|
163
|
+
|
|
164
|
+
== Match Profile Behavior Differences
|
|
165
|
+
|
|
166
|
+
Match profiles also behave differently with each algorithm:
|
|
167
|
+
|
|
168
|
+
=== `strict` Profile
|
|
169
|
+
|
|
170
|
+
**DOM Algorithm**:
|
|
171
|
+
* Exact positional matching
|
|
172
|
+
* All elements must be in same positions
|
|
173
|
+
* Whitespace must match exactly
|
|
174
|
+
* Fast comparison
|
|
175
|
+
|
|
176
|
+
**Semantic Algorithm**:
|
|
177
|
+
* Exact signature matching
|
|
178
|
+
* Elements can be reordered
|
|
179
|
+
* Signatures must match exactly
|
|
180
|
+
* Slower but detects moves
|
|
181
|
+
|
|
182
|
+
=== `spec_friendly` Profile
|
|
183
|
+
|
|
184
|
+
**DOM Algorithm**:
|
|
185
|
+
* Ignores formatting at each position
|
|
186
|
+
* Position still matters
|
|
187
|
+
* Good for test assertions with similar structure
|
|
188
|
+
|
|
189
|
+
**Semantic Algorithm**:
|
|
190
|
+
* Ignores formatting in signatures
|
|
191
|
+
* Position doesn't matter
|
|
192
|
+
* Good for test assertions with any structure
|
|
193
|
+
|
|
194
|
+
=== `content_only` Profile
|
|
195
|
+
|
|
196
|
+
**DOM Algorithm**:
|
|
197
|
+
* Compares only text content at positions
|
|
198
|
+
* Still position-dependent
|
|
199
|
+
* Ignores all structural differences at each position
|
|
200
|
+
|
|
201
|
+
**Semantic Algorithm**:
|
|
202
|
+
* Generates signatures from content only
|
|
203
|
+
* Position-independent
|
|
204
|
+
* True content-only comparison
|
|
205
|
+
|
|
206
|
+
== Migration Guide
|
|
207
|
+
|
|
208
|
+
=== Switching from DOM to Semantic
|
|
209
|
+
|
|
210
|
+
When migrating from DOM to Semantic algorithm:
|
|
211
|
+
|
|
212
|
+
**Expected Changes**:
|
|
213
|
+
1. **Reordered elements** will be detected as MOVEs instead of DELETE+INSERT
|
|
214
|
+
2. **attribute_order** setting becomes irrelevant (always ignored)
|
|
215
|
+
3. **Performance** will be slower, but matching will be more intelligent
|
|
216
|
+
4. **Output format** changes to operation-based
|
|
217
|
+
|
|
218
|
+
**Configuration Adjustments**:
|
|
219
|
+
|
|
220
|
+
[source,ruby]
|
|
221
|
+
----
|
|
222
|
+
# Before (DOM)
|
|
223
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
224
|
+
diff_algorithm: :dom,
|
|
225
|
+
match: {
|
|
226
|
+
attribute_order: :strict # This mattered
|
|
227
|
+
},
|
|
228
|
+
diff_mode: :by_line
|
|
229
|
+
)
|
|
230
|
+
|
|
231
|
+
# After (Semantic)
|
|
232
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
233
|
+
diff_algorithm: :semantic,
|
|
234
|
+
match: {
|
|
235
|
+
attribute_order: :ignore # Changed (but actually doesn't matter)
|
|
236
|
+
},
|
|
237
|
+
diff_mode: :by_object # Better for semantic output
|
|
238
|
+
)
|
|
239
|
+
----
|
|
240
|
+
|
|
241
|
+
**What to Watch For**:
|
|
242
|
+
* Comparisons that previously failed because of positional differences may now pass (reordering is detected as moves)
|
|
243
|
+
* Diff output format changes significantly
|
|
244
|
+
* Performance may be slower on large documents
|
|
245
|
+
|
|
246
|
+
=== Switching from Semantic to DOM
|
|
247
|
+
|
|
248
|
+
When migrating from Semantic to DOM algorithm:
|
|
249
|
+
|
|
250
|
+
**Expected Changes**:
|
|
251
|
+
1. **MOVE operations** will become DELETE+INSERT pairs
|
|
252
|
+
2. **Reordered content** will show as differences
|
|
253
|
+
3. **Performance** will be faster
|
|
254
|
+
4. **Output format** changes to line-based
|
|
255
|
+
|
|
256
|
+
**Configuration Adjustments**:
|
|
257
|
+
|
|
258
|
+
[source,ruby]
|
|
259
|
+
----
|
|
260
|
+
# Before (Semantic)
|
|
261
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
262
|
+
diff_algorithm: :semantic,
|
|
263
|
+
diff_mode: :by_object
|
|
264
|
+
)
|
|
265
|
+
|
|
266
|
+
# After (DOM)
|
|
267
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
268
|
+
diff_algorithm: :dom,
|
|
269
|
+
match: {
|
|
270
|
+
attribute_order: :ignore # May want to add this
|
|
271
|
+
},
|
|
272
|
+
diff_mode: :by_line # Better for DOM output
|
|
273
|
+
)
|
|
274
|
+
----
|
|
275
|
+
|
|
276
|
+
**What to Watch For**:
|
|
277
|
+
* Tests may now fail on reordered content
|
|
278
|
+
* Need to add `attribute_order: :ignore` if attribute order shouldn't matter
|
|
279
|
+
* Diff output is less semantic, more positional
|
|
280
|
+
|
|
281
|
+
== Choosing the Right Algorithm
|
|
282
|
+
|
|
283
|
+
=== Use DOM Algorithm When
|
|
284
|
+
|
|
285
|
+
* Documents have similar structure
|
|
286
|
+
* Position matters
|
|
287
|
+
* Fast performance is critical
|
|
288
|
+
* Traditional diff output is sufficient
|
|
289
|
+
* Stability is important (production use)
|
|
290
|
+
|
|
291
|
+
=== Use Semantic Algorithm When
|
|
292
|
+
|
|
293
|
+
* Documents may be restructured
|
|
294
|
+
* Need to detect moves/reorders
|
|
295
|
+
* Operation-level analysis is valuable
|
|
296
|
+
* Content evolution tracking is needed
|
|
297
|
+
* Willing to accept experimental status
|
|
298
|
+
|
|
299
|
+
== Common Patterns
|
|
300
|
+
|
|
301
|
+
=== Pattern 1: Test-Friendly DOM Comparison
|
|
302
|
+
|
|
303
|
+
[source,ruby]
|
|
304
|
+
----
|
|
305
|
+
Canon::Comparison.equivalent?(expected, actual,
|
|
306
|
+
diff_algorithm: :dom,
|
|
307
|
+
match_profile: :spec_friendly,
|
|
308
|
+
verbose: true
|
|
309
|
+
)
|
|
310
|
+
# Ignores formatting but requires same structure
|
|
311
|
+
----
|
|
312
|
+
|
|
313
|
+
=== Pattern 2: Content-Only Semantic Comparison
|
|
314
|
+
|
|
315
|
+
[source,ruby]
|
|
316
|
+
----
|
|
317
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
318
|
+
diff_algorithm: :semantic,
|
|
319
|
+
match_profile: :content_only,
|
|
320
|
+
verbose: true,
|
|
321
|
+
diff_mode: :by_object
|
|
322
|
+
)
|
|
323
|
+
# True content comparison, structure-independent
|
|
324
|
+
----
|
|
325
|
+
|
|
326
|
+
=== Pattern 3: Hybrid Approach
|
|
327
|
+
|
|
328
|
+
[source,ruby]
|
|
329
|
+
----
|
|
330
|
+
# Try DOM first (fast)
|
|
331
|
+
if Canon::Comparison.equivalent?(doc1, doc2, diff_algorithm: :dom)
|
|
332
|
+
puts "Documents identical"
|
|
333
|
+
else
|
|
334
|
+
# Use semantic for detailed analysis
|
|
335
|
+
result = Canon::Comparison.equivalent?(doc1, doc2,
|
|
336
|
+
diff_algorithm: :semantic,
|
|
337
|
+
verbose: true,
|
|
338
|
+
diff_mode: :by_object
|
|
339
|
+
)
|
|
340
|
+
puts result.operations
|
|
341
|
+
end
|
|
342
|
+
----
|
|
343
|
+
|
|
344
|
+
== Performance Implications
|
|
345
|
+
|
|
346
|
+
=== DOM Algorithm Performance
|
|
347
|
+
|
|
348
|
+
* **Speed**: Fast (linear with document size)
|
|
349
|
+
* **Memory**: Low (processes line-by-line)
|
|
350
|
+
* **Best for**: Documents < 100KB
|
|
351
|
+
|
|
352
|
+
=== Semantic Algorithm Performance
|
|
353
|
+
|
|
354
|
+
* **Speed**: Slower (quadratic worst case)
|
|
355
|
+
* **Memory**: Higher (builds tree structures)
|
|
356
|
+
* **Best for**: Documents < 10KB or where intelligence is worth the cost
|
|
357
|
+
|
|
358
|
+
== See Also
|
|
359
|
+
|
|
360
|
+
* link:index.adoc[Match Options Overview]
|
|
361
|
+
* link:../../understanding/algorithms/[Algorithms] - Detailed algorithm documentation
|
|
362
|
+
* link:dimensions.adoc[Match Dimensions] - All available dimensions
|
|
363
|
+
* link:profiles.adoc[Match Profiles] - Preset configurations
|
|
364
|
+
* link:../diff-formatting/algorithm-specific-output.adoc[Algorithm-Specific Output] - How output differs
|
|
365
|
+
* link:../../guides/choosing-configuration.adoc[Choosing Configuration] - Decision guide
|
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
---
|
|
2
|
+
layout: default
|
|
3
|
+
title: HTML-Specific Policies
|
|
4
|
+
parent: Match Options
|
|
5
|
+
grand_parent: Features
|
|
6
|
+
nav_order: 4
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
:toc:
|
|
10
|
+
:toclevels: 3
|
|
11
|
+
|
|
12
|
+
== HTML-Specific Comparison Policies
|
|
13
|
+
|
|
14
|
+
=== Overview
|
|
15
|
+
|
|
16
|
+
HTML comparison has specific policies that differ from XML due to HTML's unique
|
|
17
|
+
characteristics and rendering behavior. Canon uses `HtmlCompareProfile` to
|
|
18
|
+
implement these format-specific policies.
|
|
19
|
+
|
|
20
|
+
=== Default Policies
|
|
21
|
+
|
|
22
|
+
HTML uses the `:rendered` profile by default:
|
|
23
|
+
|
|
24
|
+
[source,ruby]
|
|
25
|
+
----
|
|
26
|
+
{
|
|
27
|
+
preprocessing: :rendered,
|
|
28
|
+
text_content: :normalize,
|
|
29
|
+
structural_whitespace: :normalize,
|
|
30
|
+
comments: :ignore,
|
|
31
|
+
attribute_order: :ignore
|
|
32
|
+
}
|
|
33
|
+
----
|
|
34
|
+
|
|
35
|
+
This reflects how browsers render HTML - whitespace is normalized and comments
|
|
36
|
+
are presentational.
|
|
37
|
+
|
|
38
|
+
=== HTML Version Detection
|
|
39
|
+
|
|
40
|
+
Canon automatically detects HTML version:
|
|
41
|
+
|
|
42
|
+
* **HTML4**: Case-insensitive element/attribute names
|
|
43
|
+
* **HTML5**: Case-sensitive (preserves case)
|
|
44
|
+
|
|
45
|
+
Detection is based on DOCTYPE or parsing mode.
|
|
46
|
+
|
|
47
|
+
=== Whitespace Preservation
|
|
48
|
+
|
|
49
|
+
Certain HTML elements require strict whitespace preservation regardless of the
|
|
50
|
+
`text_content` policy:
|
|
51
|
+
|
|
52
|
+
[cols="1,3"]
|
|
53
|
+
|===
|
|
54
|
+
|Element |Purpose
|
|
55
|
+
|
|
56
|
+
|`<pre>`
|
|
57
|
+
|Preformatted text blocks
|
|
58
|
+
|
|
59
|
+
|`<code>`
|
|
60
|
+
|Code snippets
|
|
61
|
+
|
|
62
|
+
|`<textarea>`
|
|
63
|
+
|Form input fields
|
|
64
|
+
|
|
65
|
+
|`<script>`
|
|
66
|
+
|JavaScript code
|
|
67
|
+
|
|
68
|
+
|`<style>`
|
|
69
|
+
|CSS stylesheets
|
|
70
|
+
|===
|
|
71
|
+
|
|
72
|
+
Inside these elements, ALL whitespace is preserved even when `text_content:
|
|
73
|
+
:normalize` is set.
|
|
74
|
+
|
|
75
|
+
.Example: Whitespace preservation in <pre>
|
|
76
|
+
====
|
|
77
|
+
[source,ruby]
|
|
78
|
+
----
|
|
79
|
+
html1 = '<pre>Line 1\n Line 2</pre>'
|
|
80
|
+
html2 = '<pre>Line 1\nLine 2</pre>'
|
|
81
|
+
|
|
82
|
+
# Whitespace is preserved - not equivalent
|
|
83
|
+
Canon::Comparison.equivalent?(html1, html2, preprocessing: :rendered)
|
|
84
|
+
# => false
|
|
85
|
+
----
|
|
86
|
+
|
|
87
|
+
The indentation difference matters in `<pre>` elements.
|
|
88
|
+
====
|
|
89
|
+
|
|
90
|
+
.Example: Whitespace normalization in <div>
|
|
91
|
+
====
|
|
92
|
+
[source,ruby]
|
|
93
|
+
----
|
|
94
|
+
html1 = '<div>Text   with   spaces</div>'
|
|
95
|
+
html2 = '<div>Text with spaces</div>'
|
|
96
|
+
|
|
97
|
+
# Whitespace is normalized - equivalent
|
|
98
|
+
Canon::Comparison.equivalent?(html1, html2, preprocessing: :rendered)
|
|
99
|
+
# => true
|
|
100
|
+
----
|
|
101
|
+
|
|
102
|
+
Multiple spaces are normalized to single spaces in regular elements.
|
|
103
|
+
====
|
|
104
|
+
|
|
105
|
+
=== Comment Handling
|
|
106
|
+
|
|
107
|
+
By default, HTML comments are treated as presentational (informative) content, much like CSS styling:
|
|
108
|
+
|
|
109
|
+
[source,ruby]
|
|
110
|
+
----
|
|
111
|
+
# Default: comments ignored (informative)
|
|
112
|
+
html1 = '<div><!-- comment --><p>Text</p></div>'
|
|
113
|
+
html2 = '<div><p>Text</p></div>'
|
|
114
|
+
|
|
115
|
+
Canon::Comparison.equivalent?(html1, html2)
|
|
116
|
+
# => true (comments don't affect equivalence)
|
|
117
|
+
|
|
118
|
+
# Strict mode: comments compared (normative)
|
|
119
|
+
Canon::Comparison.equivalent?(html1, html2, match: { comments: :strict })
|
|
120
|
+
# => false (comments affect equivalence)
|
|
121
|
+
----
|
|
122
|
+
|
|
123
|
+
==== Why comments are ignored by default
|
|
124
|
+
|
|
125
|
+
Like CSS, HTML comments do not change a document's content. They are typically used for:
|
|
126
|
+
* Developer notes
|
|
127
|
+
* Conditional comments (IE hacks)
|
|
128
|
+
* Disabled code blocks
|
|
129
|
+
* Build tool markers
|
|
130
|
+
|
|
131
|
+
They don't affect rendering or semantic meaning, so they're
|
|
132
|
+
informative by default.
|
|
133
|
+
|
|
134
|
+
=== Case Sensitivity
|
|
135
|
+
|
|
136
|
+
HTML4 and HTML5 have different case sensitivity rules:
|
|
137
|
+
|
|
138
|
+
.HTML4 (case-insensitive)
|
|
139
|
+
====
|
|
140
|
+
[source,ruby]
|
|
141
|
+
----
|
|
142
|
+
html1 = '<DIV CLASS="test">Content</DIV>'
|
|
143
|
+
html2 = '<div class="test">Content</div>'
|
|
144
|
+
|
|
145
|
+
Canon::Comparison.equivalent?(html1, html2, format: :html4)
|
|
146
|
+
# => true (case doesn't matter in HTML4)
|
|
147
|
+
----
|
|
148
|
+
====
|
|
149
|
+
|
|
150
|
+
.HTML5 (case-sensitive)
|
|
151
|
+
====
|
|
152
|
+
[source,ruby]
|
|
153
|
+
----
|
|
154
|
+
html1 = '<DIV CLASS="test">Content</DIV>'
|
|
155
|
+
html2 = '<div class="test">Content</div>'
|
|
156
|
+
|
|
157
|
+
Canon::Comparison.equivalent?(html1, html2, format: :html5)
|
|
158
|
+
# => false (case matters in HTML5, though uncommon)
|
|
159
|
+
----
|
|
160
|
+
====
|
|
161
|
+
|
|
162
|
+
=== Usage Examples
|
|
163
|
+
|
|
164
|
+
==== Default HTML comparison
|
|
165
|
+
|
|
166
|
+
[source,ruby]
|
|
167
|
+
----
|
|
168
|
+
require 'canon/comparison'
|
|
169
|
+
|
|
170
|
+
html1 = '<div> <p> Text </p> </div>'
|
|
171
|
+
html2 = '<div><p>Text</p></div>'
|
|
172
|
+
|
|
173
|
+
# Uses HtmlCompareProfile automatically
|
|
174
|
+
result = Canon::Comparison.equivalent?(html1, html2)
|
|
175
|
+
# => true (whitespace normalized, comments ignored)
|
|
176
|
+
----
|
|
177
|
+
|
|
178
|
+
==== Strict HTML comparison
|
|
179
|
+
|
|
180
|
+
[source,ruby]
|
|
181
|
+
----
|
|
182
|
+
# All differences matter
|
|
183
|
+
result = Canon::Comparison.equivalent?(html1, html2,
|
|
184
|
+
match: {
|
|
185
|
+
text_content: :strict,
|
|
186
|
+
structural_whitespace: :strict,
|
|
187
|
+
comments: :strict,
|
|
188
|
+
attribute_order: :strict
|
|
189
|
+
}
|
|
190
|
+
)
|
|
191
|
+
# => false (whitespace differences are normative)
|
|
192
|
+
----
|
|
193
|
+
|
|
194
|
+
==== Mixed policies
|
|
195
|
+
|
|
196
|
+
[source,ruby]
|
|
197
|
+
----
|
|
198
|
+
# Normalize whitespace but compare comments strictly
|
|
199
|
+
result = Canon::Comparison.equivalent?(html1, html2,
|
|
200
|
+
match: {
|
|
201
|
+
text_content: :normalize,
|
|
202
|
+
structural_whitespace: :normalize,
|
|
203
|
+
comments: :strict
|
|
204
|
+
}
|
|
205
|
+
)
|
|
206
|
+
----
|
|
207
|
+
|
|
208
|
+
=== Preprocessing Options
|
|
209
|
+
|
|
210
|
+
HTML supports several preprocessing modes:
|
|
211
|
+
|
|
212
|
+
==== `:rendered` (default)
|
|
213
|
+
|
|
214
|
+
Simulates browser rendering:
|
|
215
|
+
- Normalizes whitespace
|
|
216
|
+
- Preserves whitespace in special elements
|
|
217
|
+
- Ignores comments
|
|
218
|
+
|
|
219
|
+
[source,ruby]
|
|
220
|
+
----
|
|
221
|
+
Canon::Comparison.equivalent?(html1, html2, preprocessing: :rendered)
|
|
222
|
+
----
|
|
223
|
+
|
|
224
|
+
==== `:format`
|
|
225
|
+
|
|
226
|
+
Pretty-prints before comparison:
|
|
227
|
+
- Consistent indentation
|
|
228
|
+
- One element per line
|
|
229
|
+
- Good for visual diffs
|
|
230
|
+
|
|
231
|
+
[source,ruby]
|
|
232
|
+
----
|
|
233
|
+
Canon::Comparison.equivalent?(html1, html2, preprocessing: :format)
|
|
234
|
+
----
|
|
235
|
+
|
|
236
|
+
==== `:none`
|
|
237
|
+
|
|
238
|
+
No preprocessing:
|
|
239
|
+
- Raw comparison
|
|
240
|
+
- Useful for exact matching
|
|
241
|
+
|
|
242
|
+
[source,ruby]
|
|
243
|
+
----
|
|
244
|
+
Canon::Comparison.equivalent?(html1, html2, preprocessing: :none)
|
|
245
|
+
----
|
|
246
|
+
|
|
247
|
+
=== Advanced Examples
|
|
248
|
+
|
|
249
|
+
==== Compare HTML with mixed content
|
|
250
|
+
|
|
251
|
+
[source,ruby]
|
|
252
|
+
----
|
|
253
|
+
html1 = '<p>This  is  <em>important</em>  text.</p>'
|
|
254
|
+
html2 = '<p>This is <em>important</em> text.</p>'
|
|
255
|
+
|
|
256
|
+
result = Canon::Comparison.equivalent?(
|
|
257
|
+
html1, html2,
|
|
258
|
+
verbose: true,
|
|
259
|
+
match: { text_content: :normalize, structural_whitespace: :normalize }
|
|
260
|
+
)
|
|
261
|
+
|
|
262
|
+
result.equivalent? # => true
|
|
263
|
+
result.differences # => [#<DiffNode formatting: true, normative: false>]
|
|
264
|
+
----
|
|
265
|
+
|
|
266
|
+
==== Compare with element-specific preservation
|
|
267
|
+
|
|
268
|
+
[source,ruby]
|
|
269
|
+
----
|
|
270
|
+
html1 = '<div><pre> Code </pre></div>'
|
|
271
|
+
html2 = '<div><pre>Code</pre></div>'
|
|
272
|
+
|
|
273
|
+
# Whitespace preserved in <pre>, normalized in <div>
|
|
274
|
+
result = Canon::Comparison.equivalent?(html1, html2)
|
|
275
|
+
# => false (whitespace matters in <pre>)
|
|
276
|
+
----
|
|
277
|
+
|
|
278
|
+
==== Detect normative vs informative differences
|
|
279
|
+
|
|
280
|
+
[source,ruby]
|
|
281
|
+
----
|
|
282
|
+
html1 = '<div class="a" id="1"><!-- v1 --><p>Text</p></div>'
|
|
283
|
+
html2 = '<div id="1" class="b"><!-- v2 --><p>Text</p></div>'
|
|
284
|
+
|
|
285
|
+
result = Canon::Comparison.equivalent?(
|
|
286
|
+
html1, html2,
|
|
287
|
+
verbose: true,
|
|
288
|
+
match: { attribute_order: :ignore, comments: :ignore }
|
|
289
|
+
)
|
|
290
|
+
|
|
291
|
+
# Attribute order: informative (ignored)
|
|
292
|
+
# Comments: informative (ignored)
|
|
293
|
+
# Attribute value (class): normative (different)
|
|
294
|
+
|
|
295
|
+
result.equivalent? # => false
|
|
296
|
+
result.differences.select(&:normative?) # => [class attribute diff]
|
|
297
|
+
result.differences.reject(&:normative?) # => [order diff, comment diff]
|
|
298
|
+
----
|
|
299
|
+
|
|
300
|
+
=== Implementation
|
|
301
|
+
|
|
302
|
+
See the following files for implementation details:
|
|
303
|
+
|
|
304
|
+
* link:../../lib/canon/comparison/html_compare_profile.rb[`lib/canon/comparison/html_compare_profile.rb`] - HTML-specific profile
|
|
305
|
+
* link:../../lib/canon/comparison/compare_profile.rb[`lib/canon/comparison/compare_profile.rb`] - Base profile
|
|
306
|
+
* link:../../spec/canon/comparison/html_compare_profile_spec.rb[`spec/canon/comparison/html_compare_profile_spec.rb`] - Comprehensive examples
|
|
307
|
+
|
|
308
|
+
=== See Also
|
|
309
|
+
|
|
310
|
+
* link:index.html[Match Options] - Overview of match system
|
|
311
|
+
* link:algorithm-specific-behavior.html[Algorithm-Specific Behavior] - How algorithms handle options
|
|
312
|
+
* link:../../advanced/diff-classification.html[Diff Classification] - Normative vs informative
|