canon 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +163 -67
- data/README.adoc +400 -7
- data/docs/Gemfile +9 -0
- data/docs/INDEX.adoc +99 -182
- data/docs/_config.yml +100 -0
- data/docs/advanced/diff-classification.adoc +547 -0
- data/docs/advanced/diff-pipeline.adoc +358 -0
- data/docs/advanced/index.adoc +214 -0
- data/docs/advanced/semantic-diff-report.adoc +390 -0
- data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
- data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
- data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
- data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
- data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
- data/docs/features/diff-formatting/display-filtering.adoc +472 -0
- data/docs/features/diff-formatting/index.adoc +140 -0
- data/docs/features/environment-configuration/index.adoc +327 -0
- data/docs/features/environment-configuration/override-system.adoc +436 -0
- data/docs/features/environment-configuration/size-limits.adoc +273 -0
- data/docs/features/index.adoc +173 -0
- data/docs/features/input-validation/index.adoc +521 -0
- data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
- data/docs/features/match-options/html-policies.adoc +312 -0
- data/docs/features/match-options/index.adoc +621 -0
- data/docs/getting-started/index.adoc +83 -0
- data/docs/getting-started/quick-start.adoc +76 -0
- data/docs/guides/choosing-configuration.adoc +689 -0
- data/docs/guides/index.adoc +181 -0
- data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
- data/docs/interfaces/index.adoc +101 -0
- data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
- data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
- data/docs/lychee.toml +65 -0
- data/docs/reference/cli-options.adoc +418 -0
- data/docs/reference/environment-variables.adoc +375 -0
- data/docs/reference/index.adoc +204 -0
- data/docs/reference/options-across-interfaces.adoc +417 -0
- data/docs/understanding/algorithms/dom-diff.adoc +389 -0
- data/docs/understanding/algorithms/index.adoc +314 -0
- data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
- data/docs/understanding/architecture.adoc +447 -0
- data/docs/understanding/comparison-pipeline.adoc +317 -0
- data/docs/understanding/formats/html.adoc +380 -0
- data/docs/understanding/formats/index.adoc +261 -0
- data/docs/understanding/formats/json.adoc +390 -0
- data/docs/understanding/formats/xml.adoc +366 -0
- data/docs/understanding/formats/yaml.adoc +504 -0
- data/docs/understanding/index.adoc +130 -0
- data/lib/canon/cli.rb +42 -1
- data/lib/canon/commands/diff_command.rb +108 -23
- data/lib/canon/comparison/compare_profile.rb +101 -0
- data/lib/canon/comparison/comparison_result.rb +41 -2
- data/lib/canon/comparison/html_comparator.rb +292 -71
- data/lib/canon/comparison/html_compare_profile.rb +117 -0
- data/lib/canon/comparison/match_options.rb +42 -4
- data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
- data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
- data/lib/canon/comparison/xml_comparator.rb +695 -91
- data/lib/canon/comparison.rb +207 -2
- data/lib/canon/config/env_provider.rb +71 -0
- data/lib/canon/config/env_schema.rb +58 -0
- data/lib/canon/config/override_resolver.rb +55 -0
- data/lib/canon/config/type_converter.rb +59 -0
- data/lib/canon/config.rb +158 -29
- data/lib/canon/data_model.rb +29 -0
- data/lib/canon/diff/diff_classifier.rb +74 -14
- data/lib/canon/diff/diff_context_builder.rb +41 -0
- data/lib/canon/diff/diff_line.rb +18 -2
- data/lib/canon/diff/diff_node.rb +18 -3
- data/lib/canon/diff/diff_node_mapper.rb +71 -12
- data/lib/canon/diff/formatting_detector.rb +53 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
- data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
- data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
- data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
- data/lib/canon/diff_formatter/debug_output.rb +7 -1
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
- data/lib/canon/diff_formatter/legend.rb +42 -0
- data/lib/canon/diff_formatter.rb +78 -9
- data/lib/canon/errors.rb +56 -0
- data/lib/canon/formatters/html_formatter_base.rb +35 -1
- data/lib/canon/formatters/json_formatter.rb +3 -0
- data/lib/canon/formatters/yaml_formatter.rb +3 -0
- data/lib/canon/html/data_model.rb +229 -0
- data/lib/canon/html.rb +9 -0
- data/lib/canon/options/cli_generator.rb +70 -0
- data/lib/canon/options/registry.rb +234 -0
- data/lib/canon/rspec_matchers.rb +34 -13
- data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
- data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
- data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
- data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
- data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
- data/lib/canon/tree_diff/core/matching.rb +241 -0
- data/lib/canon/tree_diff/core/node_signature.rb +164 -0
- data/lib/canon/tree_diff/core/node_weight.rb +135 -0
- data/lib/canon/tree_diff/core/tree_node.rb +450 -0
- data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
- data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
- data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
- data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
- data/lib/canon/tree_diff/operation_converter.rb +631 -0
- data/lib/canon/tree_diff/operations/operation.rb +92 -0
- data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
- data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
- data/lib/canon/tree_diff.rb +33 -0
- data/lib/canon/validators/json_validator.rb +3 -1
- data/lib/canon/validators/yaml_validator.rb +3 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +22 -23
- data/lib/canon/xml/element_matcher.rb +128 -20
- data/lib/canon/xml/namespace_helper.rb +110 -0
- data/lib/canon.rb +3 -0
- metadata +81 -23
- data/_config.yml +0 -116
- data/docs/ADVANCED_TOPICS.adoc +0 -20
- data/docs/BASIC_USAGE.adoc +0 -16
- data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/docs/DIFF_FORMATTING.adoc +0 -540
- data/docs/FORMATS.adoc +0 -447
- data/docs/INPUT_VALIDATION.adoc +0 -477
- data/docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/docs/MATCH_OPTIONS.adoc +0 -719
- data/docs/MODES.adoc +0 -432
- data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/docs/OPTIONS.adoc +0 -1387
- data/docs/PREPROCESSING.adoc +0 -491
- data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
- data/docs/UNDERSTANDING_CANON.adoc +0 -17
data/README.adoc
CHANGED
|
@@ -49,11 +49,17 @@ require 'canon'
|
|
|
49
49
|
|
|
50
50
|
# Canonical form (compact)
|
|
51
51
|
Canon.format('<root><b>2</b><a>1</a></root>', :xml)
|
|
52
|
-
# =>
|
|
52
|
+
# => Pretty-printed XML (default behavior)
|
|
53
53
|
|
|
54
|
-
#
|
|
54
|
+
# Compact canonical form
|
|
55
|
+
require 'canon/xml/c14n'
|
|
56
|
+
Canon::Xml::C14n.canonicalize('<root><b>2</b><a>1</a></root>', with_comments: false)
|
|
57
|
+
# => "<root><b>2</b><a>1</a></root>"
|
|
58
|
+
|
|
59
|
+
# Pretty-print (human-readable with custom indent)
|
|
55
60
|
require 'canon/pretty_printer/xml'
|
|
56
|
-
|
|
61
|
+
xml_input = '<root><b>2</b><a>1</a></root>'
|
|
62
|
+
Canon::PrettyPrinter::Xml.new(indent: 2).format(xml_input)
|
|
57
63
|
----
|
|
58
64
|
|
|
59
65
|
=== Compare documents
|
|
@@ -67,6 +73,13 @@ xml2 = '<root> <b>2</b> <a>1</a> </root>'
|
|
|
67
73
|
|
|
68
74
|
Canon::Comparison.equivalent?(xml1, xml2)
|
|
69
75
|
# => true (semantically equivalent despite formatting differences)
|
|
76
|
+
|
|
77
|
+
# Use semantic tree diff for operation-level analysis
|
|
78
|
+
result = Canon::Comparison.equivalent?(xml1, xml2,
|
|
79
|
+
verbose: true,
|
|
80
|
+
diff_algorithm: :semantic
|
|
81
|
+
)
|
|
82
|
+
result.operations # => [INSERT, DELETE, UPDATE, MOVE operations]
|
|
70
83
|
----
|
|
71
84
|
|
|
72
85
|
=== Use in tests
|
|
@@ -117,6 +130,10 @@ $ canon help
|
|
|
117
130
|
options
|
|
118
131
|
* **link:docs/MATCH_OPTIONS[Match options]** - Match dimensions and
|
|
119
132
|
profiles
|
|
133
|
+
* **link:docs/TREE_DIFF[Semantic tree diff]** - Operation-level tree
|
|
134
|
+
comparison
|
|
135
|
+
* **link:docs/SEMANTIC_TREE_DIFF[Semantic tree diff algorithm]** - Comprehensive guide to semantic diff
|
|
136
|
+
* **link:docs/ENV_CONFIG[Environment configuration]** - Configure via ENV variables including size limits
|
|
120
137
|
* **link:docs/DIFF_FORMATTING[Diff formatting]** - Customizing diff output
|
|
121
138
|
* **link:docs/CHARACTER_VISUALIZATION[Character visualization]** -
|
|
122
139
|
Whitespace and special characters
|
|
@@ -131,6 +148,7 @@ $ canon help
|
|
|
131
148
|
classification
|
|
132
149
|
* **link:docs/DIFF_ARCHITECTURE[Diff architecture]** - Technical pipeline
|
|
133
150
|
details
|
|
151
|
+
* **link:docs/COMPARE_PROFILE[CompareProfile architecture]** - Format-specific policies
|
|
134
152
|
|
|
135
153
|
== Features
|
|
136
154
|
|
|
@@ -152,12 +170,182 @@ Compare documents based on meaning, not formatting:
|
|
|
152
170
|
|
|
153
171
|
* Whitespace normalization options
|
|
154
172
|
* Attribute/key order handling
|
|
155
|
-
* Comment handling
|
|
173
|
+
* Comment handling with display control
|
|
156
174
|
* Multiple match dimensions with behaviors
|
|
157
175
|
* Predefined match profiles (strict, rendered, spec_friendly, content_only)
|
|
158
176
|
|
|
159
177
|
See link:docs/MATCH_OPTIONS[Match options] for details.
|
|
160
178
|
|
|
179
|
+
==== Comment display control
|
|
180
|
+
|
|
181
|
+
Control which differences are displayed in diff output:
|
|
182
|
+
|
|
183
|
+
[source,ruby]
|
|
184
|
+
----
|
|
185
|
+
# Show all differences (default)
|
|
186
|
+
result = Canon::Comparison.equivalent?(xml1, xml2,
|
|
187
|
+
verbose: true,
|
|
188
|
+
match: { comments: :ignore },
|
|
189
|
+
show_diffs: :all
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
# Show only normative differences (affect equivalence)
|
|
193
|
+
result = Canon::Comparison.equivalent?(xml1, xml2,
|
|
194
|
+
verbose: true,
|
|
195
|
+
match: { comments: :ignore },
|
|
196
|
+
show_diffs: :normative
|
|
197
|
+
)
|
|
198
|
+
|
|
199
|
+
# Show only informative differences
|
|
200
|
+
result = Canon::Comparison.equivalent?(xml1, xml2,
|
|
201
|
+
verbose: true,
|
|
202
|
+
match: { comments: :ignore },
|
|
203
|
+
show_diffs: :informative
|
|
204
|
+
)
|
|
205
|
+
----
|
|
206
|
+
|
|
207
|
+
**CLI usage:**
|
|
208
|
+
[source,bash]
|
|
209
|
+
----
|
|
210
|
+
# Show all differences
|
|
211
|
+
$ canon diff file1.xml file2.xml --show-diffs all
|
|
212
|
+
|
|
213
|
+
# Show only normative differences
|
|
214
|
+
$ canon diff file1.xml file2.xml --show-diffs normative
|
|
215
|
+
|
|
216
|
+
# Show only informative differences
|
|
217
|
+
$ canon diff file1.xml file2.xml --show-diffs informative
|
|
218
|
+
----
|
|
219
|
+
|
|
220
|
+
**RSpec usage:**
|
|
221
|
+
[source,ruby]
|
|
222
|
+
----
|
|
223
|
+
expect(actual).to be_xml_equivalent_to(expected)
|
|
224
|
+
.show_diffs(:normative)
|
|
225
|
+
----
|
|
226
|
+
|
|
227
|
+
=== Original input string display
|
|
228
|
+
|
|
229
|
+
When debugging test failures, it's often helpful to see the exact strings that
|
|
230
|
+
were passed to the comparison before any preprocessing or normalization. The
|
|
231
|
+
`verbose_diff` option displays the original input strings in an RSpec-style
|
|
232
|
+
format with line numbers.
|
|
233
|
+
|
|
234
|
+
[source,ruby]
|
|
235
|
+
----
|
|
236
|
+
# Enable original string display in configuration
|
|
237
|
+
Canon::Config.configure do |config|
|
|
238
|
+
config.xml.diff.verbose_diff = true
|
|
239
|
+
end
|
|
240
|
+
|
|
241
|
+
# Or programmatically for a specific comparison
|
|
242
|
+
result = Canon::Comparison.equivalent?(xml1, xml2,
|
|
243
|
+
verbose: true,
|
|
244
|
+
verbose_diff: true
|
|
245
|
+
)
|
|
246
|
+
----
|
|
247
|
+
|
|
248
|
+
**Output format:**
|
|
249
|
+
----
|
|
250
|
+
==================================================================
|
|
251
|
+
ORIGINAL INPUT STRINGS
|
|
252
|
+
==================================================================
|
|
253
|
+
|
|
254
|
+
Expected (as string):
|
|
255
|
+
1 | <root>
|
|
256
|
+
2 | <element>value1</element>
|
|
257
|
+
3 | </root>
|
|
258
|
+
|
|
259
|
+
Actual (as string):
|
|
260
|
+
1 | <root>
|
|
261
|
+
2 | <element>value2</element>
|
|
262
|
+
3 | </root>
|
|
263
|
+
|
|
264
|
+
==================================================================
|
|
265
|
+
----
|
|
266
|
+
|
|
267
|
+
**When to use this feature:**
|
|
268
|
+
|
|
269
|
+
* Debugging why two documents are considered different
|
|
270
|
+
* Understanding preprocessing effects (c14n, normalization, etc.)
|
|
271
|
+
* Verifying the exact input received by the comparison
|
|
272
|
+
* Comparing raw vs processed content
|
|
273
|
+
|
|
274
|
+
**Environment variable:**
|
|
275
|
+
[source,bash]
|
|
276
|
+
----
|
|
277
|
+
export CANON_XML_DIFF_VERBOSE_DIFF=true
|
|
278
|
+
export CANON_HTML_DIFF_VERBOSE_DIFF=true
|
|
279
|
+
export CANON_JSON_DIFF_VERBOSE_DIFF=true
|
|
280
|
+
export CANON_YAML_DIFF_VERBOSE_DIFF=true
|
|
281
|
+
----
|
|
282
|
+
|
|
283
|
+
=== Algorithm choice
|
|
284
|
+
|
|
285
|
+
Canon provides two diff algorithms:
|
|
286
|
+
|
|
287
|
+
* **DOM diff** (default): Stable, position-based comparison for traditional line-by-line output
|
|
288
|
+
* **Semantic tree diff** (experimental): Advanced operation detection (INSERT, DELETE, UPDATE, MOVE, MERGE, SPLIT, UPGRADE, DOWNGRADE)
|
|
289
|
+
|
|
290
|
+
[source,ruby]
|
|
291
|
+
----
|
|
292
|
+
# Use DOM diff (default, stable)
|
|
293
|
+
result = Canon::Comparison.equivalent?(doc1, doc2,
|
|
294
|
+
verbose: true,
|
|
295
|
+
diff_algorithm: :dom
|
|
296
|
+
)
|
|
297
|
+
|
|
298
|
+
# Use semantic tree diff (experimental, more intelligent)
|
|
299
|
+
result = Canon::Comparison.equivalent?(doc1, doc2,
|
|
300
|
+
verbose: true,
|
|
301
|
+
diff_algorithm: :semantic
|
|
302
|
+
)
|
|
303
|
+
----
|
|
304
|
+
|
|
305
|
+
**When to use semantic tree diff:**
|
|
306
|
+
|
|
307
|
+
* Need to detect high-level operations (moves, merges, splits)
|
|
308
|
+
* Documents have significant rearrangement
|
|
309
|
+
* Want statistical analysis of changes
|
|
310
|
+
* Need operation-level transformation analysis
|
|
311
|
+
|
|
312
|
+
**When to use DOM diff:**
|
|
313
|
+
|
|
314
|
+
* Need stable, well-tested comparison
|
|
315
|
+
* Want traditional line-by-line output
|
|
316
|
+
* Documents are similar in structure
|
|
317
|
+
* Maximum performance for large files
|
|
318
|
+
|
|
319
|
+
See link:docs/SEMANTIC_TREE_DIFF[Semantic tree diff algorithm] for a comprehensive guide.
|
|
320
|
+
|
|
321
|
+
=== Size limits for large files
|
|
322
|
+
|
|
323
|
+
Canon provides configurable size limits to prevent hangs on pathologically large files:
|
|
324
|
+
|
|
325
|
+
* **File size limit**: Default 5MB (configurable)
|
|
326
|
+
* **Node count limit**: Default 10,000 nodes (configurable)
|
|
327
|
+
* **Diff output limit**: Default 10,000 lines (configurable)
|
|
328
|
+
|
|
329
|
+
[source,bash]
|
|
330
|
+
----
|
|
331
|
+
# Configure via environment variables
|
|
332
|
+
export CANON_MAX_FILE_SIZE=10485760 # 10MB
|
|
333
|
+
export CANON_MAX_NODE_COUNT=50000 # 50,000 nodes
|
|
334
|
+
export CANON_MAX_DIFF_LINES=20000 # 20,000 lines
|
|
335
|
+
|
|
336
|
+
bundle exec rspec
|
|
337
|
+
----
|
|
338
|
+
|
|
339
|
+
[source,ruby]
|
|
340
|
+
----
|
|
341
|
+
# Or programmatically
|
|
342
|
+
Canon::Config.instance.xml.diff.max_file_size = 10_485_760
|
|
343
|
+
Canon::Config.instance.xml.diff.max_node_count = 50_000
|
|
344
|
+
Canon::Config.instance.xml.diff.max_diff_lines = 20_000
|
|
345
|
+
----
|
|
346
|
+
|
|
347
|
+
See link:docs/ENV_CONFIG#size-limits[ENV_CONFIG] for details on size limit configuration.
|
|
348
|
+
|
|
161
349
|
=== Smart diff output
|
|
162
350
|
|
|
163
351
|
**By-line mode**: Traditional line-by-line diff with:
|
|
@@ -177,8 +365,12 @@ See link:docs/MODES[Diff modes] for details.
|
|
|
177
365
|
|
|
178
366
|
=== Enhanced diff features
|
|
179
367
|
|
|
180
|
-
* **
|
|
181
|
-
|
|
368
|
+
* **Three-tier diff classification**: Formatting-only (`[` dark gray/`]` light gray), informative (`<` blue/`>` cyan), and normative (`-` red/`+` green) differences with directional colors
|
|
369
|
+
* **Directional color coding**: Removals and additions use different colors within each tier (red/green for normative, blue/cyan for informative, dark gray/light gray for formatting)
|
|
370
|
+
* **Namespace declaration tracking**: Separate dimension for tracking `xmlns` and `xmlns:*` attribute changes, reported independently from regular data attributes
|
|
371
|
+
* **Namespace rendering**: Explicit namespace display in XML diffs using `ns:[uri]` or `ns:[{blank}]` format
|
|
372
|
+
* **Informative diff visualization**: Visually distinct blue/cyan markers for differences that don't affect equivalence
|
|
373
|
+
* **Formatting diff detection**: Automatically detects and highlights purely cosmetic whitespace/line break differences
|
|
182
374
|
* **Whitespace visualization**: Make invisible characters visible with CJK-safe
|
|
183
375
|
Unicode symbols
|
|
184
376
|
* **Non-ASCII detection**: Warnings for unexpected Unicode characters
|
|
@@ -233,7 +425,7 @@ See link:docs/CLI[CLI documentation].
|
|
|
233
425
|
[source,ruby]
|
|
234
426
|
----
|
|
235
427
|
# Configure globally
|
|
236
|
-
Canon::
|
|
428
|
+
Canon::Config.configure do |config|
|
|
237
429
|
config.xml.match.profile = :spec_friendly
|
|
238
430
|
config.xml.diff.use_color = true
|
|
239
431
|
end
|
|
@@ -267,6 +459,207 @@ delegation to mode-specific formatters (by-line, by-object).
|
|
|
267
459
|
|
|
268
460
|
See link:docs/MATCH_ARCHITECTURE[Match architecture] for details.
|
|
269
461
|
|
|
462
|
+
=== CompareProfile architecture
|
|
463
|
+
|
|
464
|
+
Canon uses the **CompareProfile** class to encapsulate policy decisions about how differences in various dimensions should be handled during comparison. This provides clean separation of concerns between policy decisions, comparison logic, and difference classification.
|
|
465
|
+
|
|
466
|
+
==== Separation of concerns
|
|
467
|
+
|
|
468
|
+
The comparison system is divided into four distinct components:
|
|
469
|
+
|
|
470
|
+
**CompareProfile**:: Policy decisions (what to track, what affects equivalence)
|
|
471
|
+
**XmlComparator/HtmlComparator**:: Comparison logic (detect differences)
|
|
472
|
+
**DiffNode**:: Data representation (represents a difference)
|
|
473
|
+
**DiffClassifier**:: Classification logic (normative vs informative vs formatting)
|
|
474
|
+
|
|
475
|
+
Each component has ONE responsibility with no overlapping concerns:
|
|
476
|
+
|
|
477
|
+
* CompareProfile does NOT classify differences
|
|
478
|
+
* XmlComparator does NOT make policy decisions
|
|
479
|
+
* DiffClassifier does NOT compare documents
|
|
480
|
+
|
|
481
|
+
==== Policy methods
|
|
482
|
+
|
|
483
|
+
CompareProfile provides four key policy methods:
|
|
484
|
+
|
|
485
|
+
`track_dimension?(dimension)`:: Should DiffNodes be created for this dimension? Returns `true` in verbose mode to track all differences for reporting.
|
|
486
|
+
|
|
487
|
+
`affects_equivalence?(dimension)`:: Should differences affect equivalence? Determines the return value of the comparison.
|
|
488
|
+
Returns `false` for dimensions with `:ignore` behavior.
|
|
489
|
+
|
|
490
|
+
`normative_dimension?(dimension)`:: Is this dimension normative (affects equivalence) or informative (display only)?
|
|
491
|
+
Used by DiffClassifier to set the normative flag on DiffNodes.
|
|
492
|
+
|
|
493
|
+
`supports_formatting_detection?(dimension)`:: Can FormattingDetector apply to this dimension?
|
|
494
|
+
Returns `true` only for text/content dimensions (`:text_content`, `:structural_whitespace`, `:comments`).
|
|
495
|
+
|
|
496
|
+
=== CompareProfile architecture
|
|
497
|
+
|
|
498
|
+
Canon uses a `CompareProfile` system to define format-specific comparison policies.
|
|
499
|
+
This allows different formats (HTML, XML, JSON, YAML) to have their own default
|
|
500
|
+
behaviors while maintaining a consistent architecture.
|
|
501
|
+
|
|
502
|
+
==== How CompareProfile works
|
|
503
|
+
|
|
504
|
+
The `CompareProfile` class provides the foundation for policy-based comparison:
|
|
505
|
+
|
|
506
|
+
**Normative policy**: Determines what differences matter for equivalence. Each
|
|
507
|
+
dimension (`:text_content`, `:structural_whitespace`, `:comments`, etc.) has a
|
|
508
|
+
behavior (`:strict`, `:normalize`, `:ignore`) that determines whether differences
|
|
509
|
+
in that dimension affect equivalence.
|
|
510
|
+
|
|
511
|
+
**Dimension-based classification**: Each difference has a dimension and the
|
|
512
|
+
profile determines if that dimension is:
|
|
513
|
+
|
|
514
|
+
* **Normative**: Affects equivalence (documents not equivalent if different)
|
|
515
|
+
* **Informative**: Tracked but doesn't affect equivalence
|
|
516
|
+
* **Formatting-only**: Pure whitespace differences when normalized content matches
|
|
517
|
+
|
|
518
|
+
**Classification hierarchy**:
|
|
519
|
+
|
|
520
|
+
1. **Normative** (highest priority): Differences that make documents non-equivalent
|
|
521
|
+
2. **Informative** (medium priority): Differences that are tracked but don't affect equivalence
|
|
522
|
+
3. **Formatting-only** (lowest priority): Pure whitespace/formatting differences
|
|
523
|
+
|
|
524
|
+
==== Dimension behaviors
|
|
525
|
+
|
|
526
|
+
Each dimension can have one of three behaviors:
|
|
527
|
+
|
|
528
|
+
* **`:strict`**: Differences in this dimension are normative (affect equivalence)
|
|
529
|
+
* **`:normalize`**: Differences are normalized; only semantic changes are normative
|
|
530
|
+
* **`:ignore`**: Differences are informative only (don't affect equivalence)
|
|
531
|
+
|
|
532
|
+
.Example: Whitespace handling
|
|
533
|
+
[example]
|
|
534
|
+
====
|
|
535
|
+
----
|
|
536
|
+
# Default (strict mode): whitespace differences are normative
|
|
537
|
+
xml1 = '<root><p>Hello world</p></root>'
|
|
538
|
+
xml2 = "<root><p>Hello\nworld</p></root>"
|
|
539
|
+
Canon::Comparison.equivalent?(xml1, xml2) # => false
|
|
540
|
+
|
|
541
|
+
# Normalize mode: whitespace-only differences are formatting-only
|
|
542
|
+
Canon::Comparison.equivalent?(xml1, xml2,
|
|
543
|
+
match: { text_content: :normalize, structural_whitespace: :normalize }
|
|
544
|
+
) # => true
|
|
545
|
+
----
|
|
546
|
+
====
|
|
547
|
+
|
|
548
|
+
In normalize mode, the line break is detected as formatting-only because the
|
|
549
|
+
normalized content ("Hello world") is the same.
|
|
550
|
+
|
|
551
|
+
==== Format-specific profiles
|
|
552
|
+
|
|
553
|
+
Different formats can extend `CompareProfile` with format-specific policies:
|
|
554
|
+
|
|
555
|
+
* **XML** (base): Strict policies for all dimensions
|
|
556
|
+
* **HTML** (HtmlCompareProfile): Comments ignored by default, whitespace preserved in certain elements
|
|
557
|
+
* **JSON/YAML** (future): Key order policies, type handling
|
|
558
|
+
|
|
559
|
+
See `lib/canon/comparison/compare_profile.rb` for the base implementation and
|
|
560
|
+
`lib/canon/comparison/html_compare_profile.rb` for HTML-specific policies.
|
|
561
|
+
|
|
562
|
+
==== Format-specific policies for HTML
|
|
563
|
+
|
|
564
|
+
Canon provides a format-specific CompareProfile implementation called
|
|
565
|
+
HtmlCompareProfile that encapsulates policies specific to HTML comparison.
|
|
566
|
+
This profile is automatically used by HtmlComparator based on the detected
|
|
567
|
+
HTML version.
|
|
568
|
+
|
|
569
|
+
**Comments**: Default behavior is `:ignore` (presentational content in HTML),
|
|
570
|
+
unless explicitly set to `:strict`. When comments are set to `:strict`,
|
|
571
|
+
they will affect equivalence.
|
|
572
|
+
|
|
573
|
+
**Whitespace preservation**: HtmlCompareProfile automatically preserves
|
|
574
|
+
whitespace in elements where it's semantically significant (e.g., `<pre>`,
|
|
575
|
+
`<code>`, `<textarea>`, `<script>`, `<style>`). In other elements, whitespace
|
|
576
|
+
is normalized.
|
|
577
|
+
|
|
578
|
+
**Case sensitivity**: HTML5 is case-sensitive for element names, while
|
|
579
|
+
HTML4 is case-insensitive. HtmlCompareProfile uses HTML5 case-sensitivity
|
|
580
|
+
by default.
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
==== Usage example
|
|
584
|
+
|
|
585
|
+
When using `match: { comments: :ignore }`:
|
|
586
|
+
|
|
587
|
+
* `track_dimension?(:comments)` returns `true` (track in verbose mode)
|
|
588
|
+
* `affects_equivalence?(:comments)` returns `false` (doesn't affect equivalence)
|
|
589
|
+
* `normative_dimension?(:comments)` returns `false` (informative only)
|
|
590
|
+
|
|
591
|
+
This ensures that comment differences are tracked and displayed in verbose mode
|
|
592
|
+
but don't make documents non-equivalent.
|
|
593
|
+
|
|
594
|
+
.Example: Comment differences with :ignore behavior
|
|
595
|
+
====
|
|
596
|
+
[source,ruby]
|
|
597
|
+
----
|
|
598
|
+
xml1 = '<root><!-- comment 1 --><data>value</data></root>'
|
|
599
|
+
xml2 = '<root><!-- comment 2 --><data>value</data></root>'
|
|
600
|
+
|
|
601
|
+
result = Canon::Comparison.equivalent?(xml1, xml2,
|
|
602
|
+
verbose: true,
|
|
603
|
+
match: { comments: :ignore }
|
|
604
|
+
)
|
|
605
|
+
|
|
606
|
+
result.differences # => [#<DiffNode @dimension=:comments>]
|
|
607
|
+
result.differences[0].normative? # => false (informative)
|
|
608
|
+
result.equivalent? # => true (doesn't affect equivalence)
|
|
609
|
+
----
|
|
610
|
+
|
|
611
|
+
The comment difference is tracked and displayed, but the documents are still
|
|
612
|
+
considered equivalent because comments are set to `:ignore`.
|
|
613
|
+
====
|
|
614
|
+
|
|
615
|
+
.Example: HTML comment handling
|
|
616
|
+
====
|
|
617
|
+
[source,ruby]
|
|
618
|
+
----
|
|
619
|
+
html1 = '<div><!-- comment --><p>Text</p></div>'
|
|
620
|
+
html2 = '<div><p>Text</p></div>'
|
|
621
|
+
|
|
622
|
+
# HTML defaults: comments are ignored (presentational)
|
|
623
|
+
result = Canon::Comparison.equivalent?(html1, html2)
|
|
624
|
+
# => true (comments don't affect HTML equivalence by default)
|
|
625
|
+
|
|
626
|
+
# Explicit strict matching
|
|
627
|
+
result = Canon::Comparison.equivalent?(html1, html2,
|
|
628
|
+
match: { comments: :strict }
|
|
629
|
+
)
|
|
630
|
+
# => false (comments now affect equivalence)
|
|
631
|
+
----
|
|
632
|
+
|
|
633
|
+
Comments in HTML are considered presentational content (like CSS styles) and
|
|
634
|
+
don't affect the semantic meaning unless explicitly configured to `:strict`.
|
|
635
|
+
====
|
|
636
|
+
|
|
637
|
+
.Example: HTML whitespace preservation
|
|
638
|
+
====
|
|
639
|
+
[source,ruby]
|
|
640
|
+
----
|
|
641
|
+
html1 = "<pre>Line 1\n Line 2</pre>"
|
|
642
|
+
html2 = "<pre>Line 1\nLine 2</pre>"
|
|
643
|
+
|
|
644
|
+
# Whitespace is preserved in <pre> elements
|
|
645
|
+
result = Canon::Comparison.equivalent?(html1, html2)
|
|
646
|
+
# => false (whitespace differs in pre element)
|
|
647
|
+
|
|
648
|
+
# But normalized in other elements
|
|
649
|
+
html3 = '<div>Text  with   spaces</div>'
|
|
650
|
+
html4 = '<div>Text with spaces</div>'
|
|
651
|
+
result = Canon::Comparison.equivalent?(html3, html4)
|
|
652
|
+
# => true (whitespace normalized in regular elements)
|
|
653
|
+
----
|
|
654
|
+
|
|
655
|
+
HtmlCompareProfile automatically preserves whitespace in elements where it's
|
|
656
|
+
semantically significant (`<pre>`, `<code>`, `<textarea>`, `<script>`,
|
|
657
|
+
`<style>`), while normalizing it in other elements.
|
|
658
|
+
====
|
|
659
|
+
|
|
660
|
+
**Future format profiles**: The architecture supports additional format-specific
|
|
661
|
+
profiles for JSON, YAML, and other formats as needed.
|
|
662
|
+
|
|
270
663
|
== Development
|
|
271
664
|
|
|
272
665
|
After checking out the repo, run `bin/setup` to install dependencies. Then run
|