canon 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +163 -67
- data/README.adoc +400 -7
- data/docs/Gemfile +9 -0
- data/docs/INDEX.adoc +99 -182
- data/docs/_config.yml +100 -0
- data/docs/advanced/diff-classification.adoc +547 -0
- data/docs/advanced/diff-pipeline.adoc +358 -0
- data/docs/advanced/index.adoc +214 -0
- data/docs/advanced/semantic-diff-report.adoc +390 -0
- data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
- data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
- data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
- data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
- data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
- data/docs/features/diff-formatting/display-filtering.adoc +472 -0
- data/docs/features/diff-formatting/index.adoc +140 -0
- data/docs/features/environment-configuration/index.adoc +327 -0
- data/docs/features/environment-configuration/override-system.adoc +436 -0
- data/docs/features/environment-configuration/size-limits.adoc +273 -0
- data/docs/features/index.adoc +173 -0
- data/docs/features/input-validation/index.adoc +521 -0
- data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
- data/docs/features/match-options/html-policies.adoc +312 -0
- data/docs/features/match-options/index.adoc +621 -0
- data/docs/getting-started/index.adoc +83 -0
- data/docs/getting-started/quick-start.adoc +76 -0
- data/docs/guides/choosing-configuration.adoc +689 -0
- data/docs/guides/index.adoc +181 -0
- data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
- data/docs/interfaces/index.adoc +101 -0
- data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
- data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
- data/docs/lychee.toml +65 -0
- data/docs/reference/cli-options.adoc +418 -0
- data/docs/reference/environment-variables.adoc +375 -0
- data/docs/reference/index.adoc +204 -0
- data/docs/reference/options-across-interfaces.adoc +417 -0
- data/docs/understanding/algorithms/dom-diff.adoc +389 -0
- data/docs/understanding/algorithms/index.adoc +314 -0
- data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
- data/docs/understanding/architecture.adoc +447 -0
- data/docs/understanding/comparison-pipeline.adoc +317 -0
- data/docs/understanding/formats/html.adoc +380 -0
- data/docs/understanding/formats/index.adoc +261 -0
- data/docs/understanding/formats/json.adoc +390 -0
- data/docs/understanding/formats/xml.adoc +366 -0
- data/docs/understanding/formats/yaml.adoc +504 -0
- data/docs/understanding/index.adoc +130 -0
- data/lib/canon/cli.rb +42 -1
- data/lib/canon/commands/diff_command.rb +108 -23
- data/lib/canon/comparison/compare_profile.rb +101 -0
- data/lib/canon/comparison/comparison_result.rb +41 -2
- data/lib/canon/comparison/html_comparator.rb +292 -71
- data/lib/canon/comparison/html_compare_profile.rb +117 -0
- data/lib/canon/comparison/match_options.rb +42 -4
- data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
- data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
- data/lib/canon/comparison/xml_comparator.rb +695 -91
- data/lib/canon/comparison.rb +207 -2
- data/lib/canon/config/env_provider.rb +71 -0
- data/lib/canon/config/env_schema.rb +58 -0
- data/lib/canon/config/override_resolver.rb +55 -0
- data/lib/canon/config/type_converter.rb +59 -0
- data/lib/canon/config.rb +158 -29
- data/lib/canon/data_model.rb +29 -0
- data/lib/canon/diff/diff_classifier.rb +74 -14
- data/lib/canon/diff/diff_context_builder.rb +41 -0
- data/lib/canon/diff/diff_line.rb +18 -2
- data/lib/canon/diff/diff_node.rb +18 -3
- data/lib/canon/diff/diff_node_mapper.rb +71 -12
- data/lib/canon/diff/formatting_detector.rb +53 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
- data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
- data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
- data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
- data/lib/canon/diff_formatter/debug_output.rb +7 -1
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
- data/lib/canon/diff_formatter/legend.rb +42 -0
- data/lib/canon/diff_formatter.rb +78 -9
- data/lib/canon/errors.rb +56 -0
- data/lib/canon/formatters/html_formatter_base.rb +35 -1
- data/lib/canon/formatters/json_formatter.rb +3 -0
- data/lib/canon/formatters/yaml_formatter.rb +3 -0
- data/lib/canon/html/data_model.rb +229 -0
- data/lib/canon/html.rb +9 -0
- data/lib/canon/options/cli_generator.rb +70 -0
- data/lib/canon/options/registry.rb +234 -0
- data/lib/canon/rspec_matchers.rb +34 -13
- data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
- data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
- data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
- data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
- data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
- data/lib/canon/tree_diff/core/matching.rb +241 -0
- data/lib/canon/tree_diff/core/node_signature.rb +164 -0
- data/lib/canon/tree_diff/core/node_weight.rb +135 -0
- data/lib/canon/tree_diff/core/tree_node.rb +450 -0
- data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
- data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
- data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
- data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
- data/lib/canon/tree_diff/operation_converter.rb +631 -0
- data/lib/canon/tree_diff/operations/operation.rb +92 -0
- data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
- data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
- data/lib/canon/tree_diff.rb +33 -0
- data/lib/canon/validators/json_validator.rb +3 -1
- data/lib/canon/validators/yaml_validator.rb +3 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +22 -23
- data/lib/canon/xml/element_matcher.rb +128 -20
- data/lib/canon/xml/namespace_helper.rb +110 -0
- data/lib/canon.rb +3 -0
- metadata +81 -23
- data/_config.yml +0 -116
- data/docs/ADVANCED_TOPICS.adoc +0 -20
- data/docs/BASIC_USAGE.adoc +0 -16
- data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/docs/DIFF_FORMATTING.adoc +0 -540
- data/docs/FORMATS.adoc +0 -447
- data/docs/INPUT_VALIDATION.adoc +0 -477
- data/docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/docs/MATCH_OPTIONS.adoc +0 -719
- data/docs/MODES.adoc +0 -432
- data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/docs/OPTIONS.adoc +0 -1387
- data/docs/PREPROCESSING.adoc +0 -491
- data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
- data/docs/UNDERSTANDING_CANON.adoc +0 -17
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: Comparison Pipeline
|
|
3
|
+
parent: Understanding
|
|
4
|
+
nav_order: 2
|
|
5
|
+
---
|
|
6
|
+
= Comparison Pipeline
|
|
7
|
+
|
|
8
|
+
== Purpose
|
|
9
|
+
|
|
10
|
+
Canon's comparison system uses a 4-layer architecture where each layer has a distinct responsibility. Understanding this pipeline is essential for configuring Canon effectively.
|
|
11
|
+
|
|
12
|
+
== The 4-Layer Flow
|
|
13
|
+
|
|
14
|
+
Canon processes document comparisons through four sequential layers:
|
|
15
|
+
|
|
16
|
+
[mermaid]
|
|
17
|
+
----
|
|
18
|
+
graph TD
|
|
19
|
+
A[Input Documents] --> B[Layer 1: Preprocessing]
|
|
20
|
+
B --> C[Layer 2: Algorithm Selection]
|
|
21
|
+
C --> D[Layer 3: Match Options]
|
|
22
|
+
D --> E[Layer 4: Diff Formatting]
|
|
23
|
+
E --> F[Output]
|
|
24
|
+
|
|
25
|
+
style B fill:#e1f5ff
|
|
26
|
+
style C fill:#fff4e1
|
|
27
|
+
style D fill:#ffe1f5
|
|
28
|
+
style E fill:#e1ffe1
|
|
29
|
+
----
|
|
30
|
+
|
|
31
|
+
== Layer Overview
|
|
32
|
+
|
|
33
|
+
=== Layer 1: Preprocessing
|
|
34
|
+
|
|
35
|
+
**Purpose**: Normalize documents before comparison
|
|
36
|
+
|
|
37
|
+
**Options**:
|
|
38
|
+
* `none` - No preprocessing (default)
|
|
39
|
+
* `c14n` - Canonicalize using format-specific rules
|
|
40
|
+
* `normalize` - Normalize whitespace
|
|
41
|
+
* `format` - Pretty-print with consistent formatting
|
|
42
|
+
|
|
43
|
+
**When this runs**: Before any comparison takes place
|
|
44
|
+
|
|
45
|
+
**Documentation**: See link:../features/preprocessing/[Preprocessing]
|
|
46
|
+
|
|
47
|
+
=== Layer 2: Algorithm Selection
|
|
48
|
+
|
|
49
|
+
**Purpose**: Choose the comparison strategy
|
|
50
|
+
|
|
51
|
+
**Options**:
|
|
52
|
+
* `dom` - DOM-based positional comparison (default, stable)
|
|
53
|
+
* `semantic` - Tree-based semantic diff (experimental, intelligent)
|
|
54
|
+
|
|
55
|
+
**Impact**: Determines how Layers 3 and 4 behave
|
|
56
|
+
|
|
57
|
+
**Documentation**: See link:algorithms/[Algorithms]
|
|
58
|
+
|
|
59
|
+
=== Layer 3: Match Options
|
|
60
|
+
|
|
61
|
+
**Purpose**: Configure what to compare and how strictly
|
|
62
|
+
|
|
63
|
+
**Key Concept**: This layer is **algorithm-specific** - each algorithm interprets match options differently.
|
|
64
|
+
|
|
65
|
+
**Components**:
|
|
66
|
+
* Match dimensions (granular control)
|
|
67
|
+
* Match profiles (preset combinations)
|
|
68
|
+
* Algorithm-specific behaviors
|
|
69
|
+
|
|
70
|
+
**Documentation**: See link:../features/match-options/[Match Options] and link:../features/match-options/algorithm-specific-behavior.adoc[Algorithm-Specific Behavior]
|
|
71
|
+
|
|
72
|
+
=== Layer 4: Diff Formatting
|
|
73
|
+
|
|
74
|
+
**Purpose**: Control how differences are displayed
|
|
75
|
+
|
|
76
|
+
**Key Concept**: This layer is **algorithm-specific** - each algorithm generates different output types.
|
|
77
|
+
|
|
78
|
+
**Components**:
|
|
79
|
+
* Diff mode (`by_line`, `by_object`)
|
|
80
|
+
* Colors and symbols
|
|
81
|
+
* Context and grouping
|
|
82
|
+
* Character visualization
|
|
83
|
+
|
|
84
|
+
**Documentation**: See link:../features/diff-formatting/[Diff Formatting] and link:../features/diff-formatting/algorithm-specific-output.adoc[Algorithm-Specific Output]
|
|
85
|
+
|
|
86
|
+
== Complete Example
|
|
87
|
+
|
|
88
|
+
Here's a full 4-layer configuration showing all layers working together:
|
|
89
|
+
|
|
90
|
+
=== Ruby API
|
|
91
|
+
|
|
92
|
+
[source,ruby]
|
|
93
|
+
----
|
|
94
|
+
result = Canon::Comparison.equivalent?(doc1, doc2,
|
|
95
|
+
# Layer 1: Preprocessing
|
|
96
|
+
preprocessing: :normalize,
|
|
97
|
+
|
|
98
|
+
# Layer 2: Algorithm
|
|
99
|
+
diff_algorithm: :semantic,
|
|
100
|
+
|
|
101
|
+
# Layer 3: Match Options
|
|
102
|
+
match_profile: :spec_friendly,
|
|
103
|
+
|
|
104
|
+
# Layer 4: Diff Formatting
|
|
105
|
+
verbose: true,
|
|
106
|
+
diff_mode: :by_object,
|
|
107
|
+
use_color: true,
|
|
108
|
+
context_lines: 3
|
|
109
|
+
)
|
|
110
|
+
----
|
|
111
|
+
|
|
112
|
+
=== CLI
|
|
113
|
+
|
|
114
|
+
[source,bash]
|
|
115
|
+
----
|
|
116
|
+
# Layer 1: --preprocessing, Layer 2: --diff-algorithm, Layer 3: --match-profile
# Layer 4: --diff-mode, --verbose (enables Layer 4 output), --context-lines
# (comments cannot follow a trailing backslash, so they are listed here)
canon diff file1.xml file2.xml \
|
|
117
|
+
--preprocessing normalize \
|
|
118
|
+
--diff-algorithm semantic \
|
|
119
|
+
--match-profile spec_friendly \
|
|
120
|
+
--diff-mode by_object \
|
|
121
|
+
--verbose \
|
|
122
|
+
--context-lines 5
|
|
123
|
+
----
|
|
124
|
+
|
|
125
|
+
== Layer-by-Layer Build-Up
|
|
126
|
+
|
|
127
|
+
Let's see how each layer adds to the configuration:
|
|
128
|
+
|
|
129
|
+
=== Just Layer 1
|
|
130
|
+
|
|
131
|
+
[source,ruby]
|
|
132
|
+
----
|
|
133
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
134
|
+
preprocessing: :normalize
|
|
135
|
+
)
|
|
136
|
+
# Documents are normalized, then compared using default DOM algorithm
|
|
137
|
+
# with strict matching and by-line diff output
|
|
138
|
+
----
|
|
139
|
+
|
|
140
|
+
=== Layers 1 + 2
|
|
141
|
+
|
|
142
|
+
[source,ruby]
|
|
143
|
+
----
|
|
144
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
145
|
+
preprocessing: :normalize,
|
|
146
|
+
diff_algorithm: :semantic
|
|
147
|
+
)
|
|
148
|
+
# Documents are normalized, then compared using semantic algorithm
|
|
149
|
+
# with strict matching and by-object diff output (semantic's natural mode)
|
|
150
|
+
----
|
|
151
|
+
|
|
152
|
+
=== Layers 1 + 2 + 3
|
|
153
|
+
|
|
154
|
+
[source,ruby]
|
|
155
|
+
----
|
|
156
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
157
|
+
preprocessing: :normalize,
|
|
158
|
+
diff_algorithm: :semantic,
|
|
159
|
+
match_profile: :spec_friendly
|
|
160
|
+
)
|
|
161
|
+
# Documents are normalized, compared using semantic algorithm
|
|
162
|
+
# with spec_friendly matching (ignores formatting differences)
|
|
163
|
+
# Default diff output is by-object
|
|
164
|
+
----
|
|
165
|
+
|
|
166
|
+
=== All 4 Layers
|
|
167
|
+
|
|
168
|
+
[source,ruby]
|
|
169
|
+
----
|
|
170
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
171
|
+
preprocessing: :normalize,
|
|
172
|
+
diff_algorithm: :semantic,
|
|
173
|
+
match_profile: :spec_friendly,
|
|
174
|
+
verbose: true,
|
|
175
|
+
diff_mode: :by_line,
|
|
176
|
+
use_color: true
|
|
177
|
+
)
|
|
178
|
+
# Complete configuration with all layers specified
|
|
179
|
+
# Documents normalized, semantic comparison, spec-friendly matching,
|
|
180
|
+
# traditional line-based diff output with colors
|
|
181
|
+
----
|
|
182
|
+
|
|
183
|
+
== Layer Interaction Matrix
|
|
184
|
+
|
|
185
|
+
This table shows common configuration patterns:
|
|
186
|
+
|
|
187
|
+
[cols="1,1,1,1,2"]
|
|
188
|
+
|===
|
|
189
|
+
|Layer 1 |Layer 2 |Layer 3 |Layer 4 |Result
|
|
190
|
+
|
|
191
|
+
|none
|
|
192
|
+
|dom
|
|
193
|
+
|strict
|
|
194
|
+
|by_line
|
|
195
|
+
|Traditional exact comparison
|
|
196
|
+
|
|
197
|
+
|normalize
|
|
198
|
+
|dom
|
|
199
|
+
|spec_friendly
|
|
200
|
+
|by_line
|
|
201
|
+
|Test-friendly comparison
|
|
202
|
+
|
|
203
|
+
|c14n
|
|
204
|
+
|dom
|
|
205
|
+
|content_only
|
|
206
|
+
|by_object
|
|
207
|
+
|Canonical structure view
|
|
208
|
+
|
|
209
|
+
|none
|
|
210
|
+
|semantic
|
|
211
|
+
|strict
|
|
212
|
+
|by_object
|
|
213
|
+
|Semantic operations view
|
|
214
|
+
|
|
215
|
+
|normalize
|
|
216
|
+
|semantic
|
|
217
|
+
|rendered
|
|
218
|
+
|by_line
|
|
219
|
+
|Rendered diff with operations
|
|
220
|
+
|===
|
|
221
|
+
|
|
222
|
+
== Key Principles
|
|
223
|
+
|
|
224
|
+
=== Layer Independence
|
|
225
|
+
|
|
226
|
+
Each layer has a distinct purpose:
|
|
227
|
+
* **Layer 1**: Document transformation
|
|
228
|
+
* **Layer 2**: Comparison strategy
|
|
229
|
+
* **Layer 3**: Match criteria
|
|
230
|
+
* **Layer 4**: Output presentation
|
|
231
|
+
|
|
232
|
+
=== Algorithm Specificity
|
|
233
|
+
|
|
234
|
+
Layers 3 and 4 are interpreted differently by each algorithm:
|
|
235
|
+
* The same match options may behave differently with DOM vs Semantic
|
|
236
|
+
* Diff modes have different natural fits for each algorithm
|
|
237
|
+
* Understanding this is crucial for effective configuration
|
|
238
|
+
|
|
239
|
+
=== Default Behavior
|
|
240
|
+
|
|
241
|
+
If you don't specify a layer:
|
|
242
|
+
* **Layer 1**: `none` (no preprocessing)
|
|
243
|
+
* **Layer 2**: `dom` (stable algorithm)
|
|
244
|
+
* **Layer 3**: `strict` (exact matching)
|
|
245
|
+
* **Layer 4**: `by_line` for DOM, `by_object` for Semantic
|
|
246
|
+
|
|
247
|
+
== Common Patterns
|
|
248
|
+
|
|
249
|
+
=== Testing XML Generation
|
|
250
|
+
|
|
251
|
+
[source,ruby]
|
|
252
|
+
----
|
|
253
|
+
Canon::Comparison.equivalent?(expected, actual,
|
|
254
|
+
preprocessing: :normalize, # Ignore formatting
|
|
255
|
+
match_profile: :spec_friendly, # Focus on content
|
|
256
|
+
verbose: true # Show differences
|
|
257
|
+
)
|
|
258
|
+
----
|
|
259
|
+
|
|
260
|
+
=== Debugging Test Failures
|
|
261
|
+
|
|
262
|
+
[source,ruby]
|
|
263
|
+
----
|
|
264
|
+
Canon::Comparison.equivalent?(expected, actual,
|
|
265
|
+
diff_algorithm: :semantic, # Detect moves/changes
|
|
266
|
+
verbose: true,
|
|
267
|
+
diff_mode: :by_object, # See operations
|
|
268
|
+
use_color: true # Easier to read
|
|
269
|
+
)
|
|
270
|
+
----
|
|
271
|
+
|
|
272
|
+
=== Content-Only Comparison
|
|
273
|
+
|
|
274
|
+
[source,ruby]
|
|
275
|
+
----
|
|
276
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
277
|
+
preprocessing: :format, # Normalize structure
|
|
278
|
+
match_profile: :content_only, # Ignore all formatting
|
|
279
|
+
verbose: true
|
|
280
|
+
)
|
|
281
|
+
----
|
|
282
|
+
|
|
283
|
+
== Anti-Patterns to Avoid
|
|
284
|
+
|
|
285
|
+
=== Over-Configuration
|
|
286
|
+
|
|
287
|
+
[source,ruby]
|
|
288
|
+
----
|
|
289
|
+
# DON'T: Too many conflicting options
|
|
290
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
291
|
+
preprocessing: :c14n,
|
|
292
|
+
diff_algorithm: :dom,
|
|
293
|
+
match: {
|
|
294
|
+
text_content: :ignore, # Conflicts with preprocessing
|
|
295
|
+
structural_whitespace: :strict # Conflicts with preprocessing
|
|
296
|
+
}
|
|
297
|
+
)
|
|
298
|
+
----
|
|
299
|
+
|
|
300
|
+
=== Wrong Algorithm/Mode Combination
|
|
301
|
+
|
|
302
|
+
[source,ruby]
|
|
303
|
+
----
|
|
304
|
+
# SUBOPTIMAL: Semantic algorithm with by-line mode loses operation info
|
|
305
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
306
|
+
diff_algorithm: :semantic,
|
|
307
|
+
diff_mode: :by_line # Better to use by_object with semantic
|
|
308
|
+
)
|
|
309
|
+
----
|
|
310
|
+
|
|
311
|
+
== See Also
|
|
312
|
+
|
|
313
|
+
* link:architecture.adoc[Architecture] - Overall system design
|
|
314
|
+
* link:algorithms/[Algorithms] - Detailed algorithm documentation
|
|
315
|
+
* link:../features/match-options/algorithm-specific-behavior.adoc[Algorithm-Specific Behavior] - How algorithms interpret match options
|
|
316
|
+
* link:../features/diff-formatting/algorithm-specific-output.adoc[Algorithm-Specific Output] - Different output formats
|
|
317
|
+
* link:../guides/choosing-configuration.adoc[Choosing Configuration] - Decision guide for all layers
|
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: HTML Format
|
|
3
|
+
parent: Format Support
|
|
4
|
+
grand_parent: Understanding
|
|
5
|
+
nav_order: 2
|
|
6
|
+
---
|
|
7
|
+
= HTML format
|
|
8
|
+
:toc:
|
|
9
|
+
:toclevels: 3
|
|
10
|
+
|
|
11
|
+
== Purpose
|
|
12
|
+
|
|
13
|
+
This page describes Canon's HTML format support, including automatic HTML4/HTML5/XHTML detection, browser rendering simulation, and HTML-specific features.
|
|
14
|
+
|
|
15
|
+
== Canonicalization
|
|
16
|
+
|
|
17
|
+
Canon supports HTML 4, HTML5, and XHTML with automatic format detection.
|
|
18
|
+
|
|
19
|
+
**Key features:**
|
|
20
|
+
|
|
21
|
+
* Automatic HTML vs XHTML detection
|
|
22
|
+
* HTML5 parser for modern HTML
|
|
23
|
+
* XML parser for XHTML
|
|
24
|
+
* Consistent attribute ordering
|
|
25
|
+
* Whitespace normalization
|
|
26
|
+
* Comment handling in `<style>` and `<script>` tags
|
|
27
|
+
|
|
28
|
+
.HTML canonicalization example
|
|
29
|
+
[example]
|
|
30
|
+
====
|
|
31
|
+
[source,ruby]
|
|
32
|
+
----
|
|
33
|
+
html = <<~HTML
|
|
34
|
+
<!DOCTYPE html>
|
|
35
|
+
<html>
|
|
36
|
+
<body>
|
|
37
|
+
<div class="foo" id="bar">
|
|
38
|
+
Content
|
|
39
|
+
</div>
|
|
40
|
+
</body>
|
|
41
|
+
</html>
|
|
42
|
+
HTML
|
|
43
|
+
|
|
44
|
+
Canon.format(html, :html)
|
|
45
|
+
# => Normalized structure with consistent formatting
|
|
46
|
+
----
|
|
47
|
+
====
|
|
48
|
+
|
|
49
|
+
== Format defaults
|
|
50
|
+
|
|
51
|
+
[cols="1,1"]
|
|
52
|
+
|===
|
|
53
|
+
|Dimension |Default Behavior
|
|
54
|
+
|
|
55
|
+
|`text_content`
|
|
56
|
+
|`:normalize`
|
|
57
|
+
|
|
58
|
+
|`structural_whitespace`
|
|
59
|
+
|`:normalize`
|
|
60
|
+
|
|
61
|
+
|`attribute_whitespace`
|
|
62
|
+
|`:normalize`
|
|
63
|
+
|
|
64
|
+
|`attribute_order`
|
|
65
|
+
|`:ignore`
|
|
66
|
+
|
|
67
|
+
|`attribute_values`
|
|
68
|
+
|`:strict`
|
|
69
|
+
|
|
70
|
+
|`comments`
|
|
71
|
+
|`:ignore`
|
|
72
|
+
|===
|
|
73
|
+
|
|
74
|
+
Default diff mode: `:by_line` (line-based diff)
|
|
75
|
+
|
|
76
|
+
== Match profiles for HTML
|
|
77
|
+
|
|
78
|
+
Canon provides predefined profiles optimized for HTML documents. Each profile configures preprocessing, match options, diff algorithm, and formatting.
|
|
79
|
+
|
|
80
|
+
=== strict profile
|
|
81
|
+
|
|
82
|
+
**Purpose**: Character-perfect HTML matching
|
|
83
|
+
|
|
84
|
+
**Configuration**:
|
|
85
|
+
|
|
86
|
+
[source,ruby]
|
|
87
|
+
----
|
|
88
|
+
{
|
|
89
|
+
preprocessing: :none,
|
|
90
|
+
diff_algorithm: :dom,
|
|
91
|
+
diff_mode: :by_line, # Line-based diff output (HTML default)
|
|
92
|
+
match: {
|
|
93
|
+
text_content: :strict,
|
|
94
|
+
structural_whitespace: :strict,
|
|
95
|
+
attribute_whitespace: :strict,
|
|
96
|
+
attribute_order: :strict,
|
|
97
|
+
attribute_values: :strict,
|
|
98
|
+
comments: :strict
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
----
|
|
102
|
+
|
|
103
|
+
**Use when**: Testing exact HTML formatter output, verifying HTML formatting compliance.
|
|
104
|
+
|
|
105
|
+
=== rendered profile
|
|
106
|
+
|
|
107
|
+
**Purpose**: Browser-rendered equivalence (most common for HTML)
|
|
108
|
+
|
|
109
|
+
**Configuration**:
|
|
110
|
+
|
|
111
|
+
[source,ruby]
|
|
112
|
+
----
|
|
113
|
+
{
|
|
114
|
+
preprocessing: :none,
|
|
115
|
+
diff_algorithm: :dom,
|
|
116
|
+
diff_mode: :by_line,
|
|
117
|
+
match: {
|
|
118
|
+
text_content: :normalize,
|
|
119
|
+
structural_whitespace: :normalize,
|
|
120
|
+
attribute_whitespace: :normalize,
|
|
121
|
+
attribute_order: :ignore, # HTML attributes are unordered
|
|
122
|
+
attribute_values: :strict,
|
|
123
|
+
comments: :ignore
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
----
|
|
127
|
+
|
|
128
|
+
**Use when**: Comparing HTML as browsers render it, testing web page output, ignoring formatting that doesn't affect display. This is the recommended profile for most HTML comparisons.
|
|
129
|
+
|
|
130
|
+
=== spec_friendly profile
|
|
131
|
+
|
|
132
|
+
**Purpose**: Test-friendly comparison for RSpec
|
|
133
|
+
|
|
134
|
+
**Configuration**:
|
|
135
|
+
|
|
136
|
+
[source,ruby]
|
|
137
|
+
----
|
|
138
|
+
{
|
|
139
|
+
preprocessing: :normalize,
|
|
140
|
+
diff_algorithm: :dom,
|
|
141
|
+
diff_mode: :by_object, # Tree-based for better test output
|
|
142
|
+
match: {
|
|
143
|
+
text_content: :normalize,
|
|
144
|
+
structural_whitespace: :ignore,
|
|
145
|
+
attribute_whitespace: :normalize,
|
|
146
|
+
attribute_order: :ignore,
|
|
147
|
+
attribute_values: :strict,
|
|
148
|
+
comments: :ignore
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
----
|
|
152
|
+
|
|
153
|
+
**Use when**: Writing RSpec tests for HTML generation, testing semantic HTML correctness.
|
|
154
|
+
|
|
155
|
+
=== content_only profile
|
|
156
|
+
|
|
157
|
+
**Purpose**: Maximum tolerance - only element structure and normalized text content matter
|
|
158
|
+
|
|
159
|
+
**Configuration**:
|
|
160
|
+
|
|
161
|
+
[source,ruby]
|
|
162
|
+
----
|
|
163
|
+
{
|
|
164
|
+
preprocessing: :normalize,
|
|
165
|
+
diff_algorithm: :dom,
|
|
166
|
+
diff_mode: :by_object,
|
|
167
|
+
match: {
|
|
168
|
+
text_content: :normalize,
|
|
169
|
+
structural_whitespace: :ignore,
|
|
170
|
+
attribute_whitespace: :ignore,
|
|
171
|
+
attribute_order: :ignore,
|
|
172
|
+
attribute_values: :ignore,
|
|
173
|
+
comments: :ignore
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
----
|
|
177
|
+
|
|
178
|
+
**Use when**: Only HTML structure and text content need to match, with maximum flexibility for all formatting and attribute differences.
|
|
179
|
+
|
|
180
|
+
== HTML-specific features
|
|
181
|
+
|
|
182
|
+
=== Format detection
|
|
183
|
+
|
|
184
|
+
Canon automatically detects HTML5, HTML4, or XHTML based on the DOCTYPE and document structure.
|
|
185
|
+
|
|
186
|
+
.Format detection examples
|
|
187
|
+
[example]
|
|
188
|
+
====
|
|
189
|
+
[source,html]
|
|
190
|
+
----
|
|
191
|
+
<!-- HTML5 detected -->
|
|
192
|
+
<!DOCTYPE html>
|
|
193
|
+
<html>...</html>
|
|
194
|
+
|
|
195
|
+
<!-- HTML4 detected -->
|
|
196
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">
|
|
197
|
+
<html>...</html>
|
|
198
|
+
|
|
199
|
+
<!-- XHTML detected -->
|
|
200
|
+
<?xml version="1.0"?>
|
|
201
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN">
|
|
202
|
+
<html xmlns="http://www.w3.org/1999/xhtml">...</html>
|
|
203
|
+
----
|
|
204
|
+
====
|
|
205
|
+
|
|
206
|
+
=== Whitespace handling
|
|
207
|
+
|
|
208
|
+
HTML whitespace is collapsed per CSS rendering rules. Empty text nodes between elements are removed.
|
|
209
|
+
|
|
210
|
+
.Whitespace handling example
|
|
211
|
+
[example]
|
|
212
|
+
====
|
|
213
|
+
[source,html]
|
|
214
|
+
----
|
|
215
|
+
<!-- Before -->
|
|
216
|
+
<div>
|
|
217
|
+
<p>Hello    world</p>
|
|
218
|
+
<p>Second paragraph</p>
|
|
219
|
+
</div>
|
|
220
|
+
|
|
221
|
+
<!-- After normalization (with text_content: :normalize) -->
|
|
222
|
+
<div>
|
|
223
|
+
<p>Hello world</p>
|
|
224
|
+
<p>Second paragraph</p>
|
|
225
|
+
</div>
|
|
226
|
+
----
|
|
227
|
+
|
|
228
|
+
Multiple spaces within text content are collapsed to single spaces when `text_content: :normalize` is used.
|
|
229
|
+
====
|
|
230
|
+
|
|
231
|
+
=== Attribute order
|
|
232
|
+
|
|
233
|
+
HTML attributes are inherently unordered per the HTML specification, so the default for `attribute_order` is `:ignore`.
|
|
234
|
+
|
|
235
|
+
.Attribute order example
|
|
236
|
+
[example]
|
|
237
|
+
====
|
|
238
|
+
[source,html]
|
|
239
|
+
----
|
|
240
|
+
<!-- These are equivalent by default for HTML (attribute_order: :ignore) -->
|
|
241
|
+
<input type="text" id="name" class="form-control">
|
|
242
|
+
<input class="form-control" id="name" type="text">
|
|
243
|
+
----
|
|
244
|
+
|
|
245
|
+
The HTML specification states that attribute order has no meaning, so Canon ignores attribute order by default for HTML.
|
|
246
|
+
====
|
|
247
|
+
|
|
248
|
+
=== Special tags
|
|
249
|
+
|
|
250
|
+
Comments in `<style>` and `<script>` tags are normalized specially to handle CSS/JavaScript syntax.
|
|
251
|
+
|
|
252
|
+
.Special tag handling
|
|
253
|
+
[example]
|
|
254
|
+
====
|
|
255
|
+
[source,html]
|
|
256
|
+
----
|
|
257
|
+
<style>
|
|
258
|
+
/* CSS comments preserved */
|
|
259
|
+
body { margin: 0; }
|
|
260
|
+
</style>
|
|
261
|
+
|
|
262
|
+
<script>
|
|
263
|
+
// JavaScript comments preserved
|
|
264
|
+
console.log("test");
|
|
265
|
+
</script>
|
|
266
|
+
----
|
|
267
|
+
|
|
268
|
+
Canon recognizes that `<style>` and `<script>` tags contain non-HTML content and handles them appropriately.
|
|
269
|
+
====
|
|
270
|
+
|
|
271
|
+
=== Class attribute normalization
|
|
272
|
+
|
|
273
|
+
The HTML `class` attribute contains space-separated class names, making normalization particularly useful.
|
|
274
|
+
|
|
275
|
+
.Class attribute example
|
|
276
|
+
[example]
|
|
277
|
+
====
|
|
278
|
+
[source,html]
|
|
279
|
+
----
|
|
280
|
+
<!-- These are equivalent with attribute_whitespace: :normalize -->
|
|
281
|
+
<div class="btn  primary   active">Click</div>
|
|
282
|
+
<div class="btn primary active">Click</div>
|
|
283
|
+
----
|
|
284
|
+
|
|
285
|
+
Multiple spaces between class names are normalized to single spaces.
|
|
286
|
+
====
|
|
287
|
+
|
|
288
|
+
== Usage examples
|
|
289
|
+
|
|
290
|
+
=== Basic HTML comparison
|
|
291
|
+
|
|
292
|
+
[source,ruby]
|
|
293
|
+
----
|
|
294
|
+
html1 = File.read("page1.html")
|
|
295
|
+
html2 = File.read("page2.html")
|
|
296
|
+
|
|
297
|
+
Canon::Comparison.equivalent?(html1, html2,
|
|
298
|
+
match_profile: :rendered
|
|
299
|
+
)
|
|
300
|
+
----
|
|
301
|
+
|
|
302
|
+
=== Test-friendly HTML comparison
|
|
303
|
+
|
|
304
|
+
[source,ruby]
|
|
305
|
+
----
|
|
306
|
+
expect(actual_html).to be_html_equivalent_to(expected_html)
|
|
307
|
+
.with_profile(:rendered)
|
|
308
|
+
----
|
|
309
|
+
|
|
310
|
+
=== Using HTML comparator directly
|
|
311
|
+
|
|
312
|
+
[source,ruby]
|
|
313
|
+
----
|
|
314
|
+
Canon::Comparison::HtmlComparator.equivalent?(html1, html2,
|
|
315
|
+
match_profile: :rendered
|
|
316
|
+
)
|
|
317
|
+
----
|
|
318
|
+
|
|
319
|
+
=== CLI usage
|
|
320
|
+
|
|
321
|
+
[source,bash]
|
|
322
|
+
----
|
|
323
|
+
# Basic comparison with rendered profile
|
|
324
|
+
canon diff page1.html page2.html \
|
|
325
|
+
--match-profile rendered \
|
|
326
|
+
--verbose
|
|
327
|
+
|
|
328
|
+
# Strict HTML comparison
|
|
329
|
+
canon diff file1.html file2.html \
|
|
330
|
+
--match-profile strict \
|
|
331
|
+
--verbose
|
|
332
|
+
----
|
|
333
|
+
|
|
334
|
+
== HTML vs XHTML
|
|
335
|
+
|
|
336
|
+
Canon handles HTML and XHTML differently:
|
|
337
|
+
|
|
338
|
+
=== HTML (HTML4/HTML5)
|
|
339
|
+
|
|
340
|
+
* Uses HTML parser (more lenient)
|
|
341
|
+
* Attribute order ignored by default
|
|
342
|
+
* Whitespace normalized by default
|
|
343
|
+
* Comments ignored by default
|
|
344
|
+
|
|
345
|
+
=== XHTML
|
|
346
|
+
|
|
347
|
+
* Uses XML parser (stricter)
|
|
348
|
+
* Follows XML rules
|
|
349
|
+
* Can use XML-specific features
|
|
350
|
+
* Namespace-aware
|
|
351
|
+
|
|
352
|
+
.XHTML example
|
|
353
|
+
[example]
|
|
354
|
+
====
|
|
355
|
+
[source,xhtml]
|
|
356
|
+
----
|
|
357
|
+
<?xml version="1.0"?>
|
|
358
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
|
359
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
|
360
|
+
<html xmlns="http://www.w3.org/1999/xhtml">
|
|
361
|
+
<head>
|
|
362
|
+
<title>XHTML Document</title>
|
|
363
|
+
</head>
|
|
364
|
+
<body>
|
|
365
|
+
<p>Content</p>
|
|
366
|
+
</body>
|
|
367
|
+
</html>
|
|
368
|
+
----
|
|
369
|
+
|
|
370
|
+
XHTML is treated as XML and follows stricter rules.
|
|
371
|
+
====
|
|
372
|
+
|
|
373
|
+
== See also
|
|
374
|
+
|
|
375
|
+
* link:../comparison-pipeline.adoc[Comparison Pipeline] - Understanding the 4 layers
|
|
376
|
+
* link:../../features/match-options/[Match Options] - All matching options
|
|
377
|
+
* link:../../guides/choosing-configuration.adoc[Choosing Configuration] - Decision guide
|
|
378
|
+
* link:index.adoc[Format Support] - Overview of all formats
|
|
379
|
+
* link:xml.adoc[XML Format] - XML-specific features
|
|
380
|
+
* link:json.adoc[JSON Format] - JSON-specific features
|