canon 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +163 -67
  3. data/README.adoc +400 -7
  4. data/docs/Gemfile +9 -0
  5. data/docs/INDEX.adoc +99 -182
  6. data/docs/_config.yml +100 -0
  7. data/docs/advanced/diff-classification.adoc +547 -0
  8. data/docs/advanced/diff-pipeline.adoc +358 -0
  9. data/docs/advanced/index.adoc +214 -0
  10. data/docs/advanced/semantic-diff-report.adoc +390 -0
  11. data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
  12. data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
  13. data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
  14. data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
  15. data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
  16. data/docs/features/diff-formatting/display-filtering.adoc +472 -0
  17. data/docs/features/diff-formatting/index.adoc +140 -0
  18. data/docs/features/environment-configuration/index.adoc +327 -0
  19. data/docs/features/environment-configuration/override-system.adoc +436 -0
  20. data/docs/features/environment-configuration/size-limits.adoc +273 -0
  21. data/docs/features/index.adoc +173 -0
  22. data/docs/features/input-validation/index.adoc +521 -0
  23. data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
  24. data/docs/features/match-options/html-policies.adoc +312 -0
  25. data/docs/features/match-options/index.adoc +621 -0
  26. data/docs/getting-started/index.adoc +83 -0
  27. data/docs/getting-started/quick-start.adoc +76 -0
  28. data/docs/guides/choosing-configuration.adoc +689 -0
  29. data/docs/guides/index.adoc +181 -0
  30. data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
  31. data/docs/interfaces/index.adoc +101 -0
  32. data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
  33. data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
  34. data/docs/lychee.toml +65 -0
  35. data/docs/reference/cli-options.adoc +418 -0
  36. data/docs/reference/environment-variables.adoc +375 -0
  37. data/docs/reference/index.adoc +204 -0
  38. data/docs/reference/options-across-interfaces.adoc +417 -0
  39. data/docs/understanding/algorithms/dom-diff.adoc +389 -0
  40. data/docs/understanding/algorithms/index.adoc +314 -0
  41. data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
  42. data/docs/understanding/architecture.adoc +447 -0
  43. data/docs/understanding/comparison-pipeline.adoc +317 -0
  44. data/docs/understanding/formats/html.adoc +380 -0
  45. data/docs/understanding/formats/index.adoc +261 -0
  46. data/docs/understanding/formats/json.adoc +390 -0
  47. data/docs/understanding/formats/xml.adoc +366 -0
  48. data/docs/understanding/formats/yaml.adoc +504 -0
  49. data/docs/understanding/index.adoc +130 -0
  50. data/lib/canon/cli.rb +42 -1
  51. data/lib/canon/commands/diff_command.rb +108 -23
  52. data/lib/canon/comparison/compare_profile.rb +101 -0
  53. data/lib/canon/comparison/comparison_result.rb +41 -2
  54. data/lib/canon/comparison/html_comparator.rb +292 -71
  55. data/lib/canon/comparison/html_compare_profile.rb +117 -0
  56. data/lib/canon/comparison/match_options.rb +42 -4
  57. data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
  58. data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
  59. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
  60. data/lib/canon/comparison/xml_comparator.rb +695 -91
  61. data/lib/canon/comparison.rb +207 -2
  62. data/lib/canon/config/env_provider.rb +71 -0
  63. data/lib/canon/config/env_schema.rb +58 -0
  64. data/lib/canon/config/override_resolver.rb +55 -0
  65. data/lib/canon/config/type_converter.rb +59 -0
  66. data/lib/canon/config.rb +158 -29
  67. data/lib/canon/data_model.rb +29 -0
  68. data/lib/canon/diff/diff_classifier.rb +74 -14
  69. data/lib/canon/diff/diff_context_builder.rb +41 -0
  70. data/lib/canon/diff/diff_line.rb +18 -2
  71. data/lib/canon/diff/diff_node.rb +18 -3
  72. data/lib/canon/diff/diff_node_mapper.rb +71 -12
  73. data/lib/canon/diff/formatting_detector.rb +53 -0
  74. data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
  75. data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
  76. data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
  77. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
  78. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
  79. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
  80. data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
  81. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
  82. data/lib/canon/diff_formatter/debug_output.rb +7 -1
  83. data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
  84. data/lib/canon/diff_formatter/legend.rb +42 -0
  85. data/lib/canon/diff_formatter.rb +78 -9
  86. data/lib/canon/errors.rb +56 -0
  87. data/lib/canon/formatters/html_formatter_base.rb +35 -1
  88. data/lib/canon/formatters/json_formatter.rb +3 -0
  89. data/lib/canon/formatters/yaml_formatter.rb +3 -0
  90. data/lib/canon/html/data_model.rb +229 -0
  91. data/lib/canon/html.rb +9 -0
  92. data/lib/canon/options/cli_generator.rb +70 -0
  93. data/lib/canon/options/registry.rb +234 -0
  94. data/lib/canon/rspec_matchers.rb +34 -13
  95. data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
  96. data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
  97. data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
  98. data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
  99. data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
  100. data/lib/canon/tree_diff/core/matching.rb +241 -0
  101. data/lib/canon/tree_diff/core/node_signature.rb +164 -0
  102. data/lib/canon/tree_diff/core/node_weight.rb +135 -0
  103. data/lib/canon/tree_diff/core/tree_node.rb +450 -0
  104. data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
  105. data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
  106. data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
  107. data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
  108. data/lib/canon/tree_diff/operation_converter.rb +631 -0
  109. data/lib/canon/tree_diff/operations/operation.rb +92 -0
  110. data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
  111. data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
  112. data/lib/canon/tree_diff.rb +33 -0
  113. data/lib/canon/validators/json_validator.rb +3 -1
  114. data/lib/canon/validators/yaml_validator.rb +3 -1
  115. data/lib/canon/version.rb +1 -1
  116. data/lib/canon/xml/data_model.rb +22 -23
  117. data/lib/canon/xml/element_matcher.rb +128 -20
  118. data/lib/canon/xml/namespace_helper.rb +110 -0
  119. data/lib/canon.rb +3 -0
  120. metadata +81 -23
  121. data/_config.yml +0 -116
  122. data/docs/ADVANCED_TOPICS.adoc +0 -20
  123. data/docs/BASIC_USAGE.adoc +0 -16
  124. data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  125. data/docs/DIFF_ARCHITECTURE.adoc +0 -435
  126. data/docs/DIFF_FORMATTING.adoc +0 -540
  127. data/docs/FORMATS.adoc +0 -447
  128. data/docs/INPUT_VALIDATION.adoc +0 -477
  129. data/docs/MATCH_ARCHITECTURE.adoc +0 -463
  130. data/docs/MATCH_OPTIONS.adoc +0 -719
  131. data/docs/MODES.adoc +0 -432
  132. data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  133. data/docs/OPTIONS.adoc +0 -1387
  134. data/docs/PREPROCESSING.adoc +0 -491
  135. data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
  136. data/docs/UNDERSTANDING_CANON.adoc +0 -17
data/README.adoc CHANGED
@@ -49,11 +49,17 @@ require 'canon'
49
49
 
50
50
  # Format XML (pretty-printed by default)
51
51
  Canon.format('<root><b>2</b><a>1</a></root>', :xml)
52
- # => "<root><a>1</a><b>2</b></root>"
52
+ # => Pretty-printed XML (default behavior)
53
53
 
54
- # Pretty-print (human-readable)
54
+ # Compact canonical form
55
+ require 'canon/xml/c14n'
56
+ Canon::Xml::C14n.canonicalize('<root><b>2</b><a>1</a></root>', with_comments: false)
57
+ # => "<root><b>2</b><a>1</a></root>"
58
+
59
+ # Pretty-print (human-readable with custom indent)
55
60
  require 'canon/pretty_printer/xml'
56
- Canon::Xml::PrettyPrinter.new(indent: 2).format(xml_input)
61
+ xml_input = '<root><b>2</b><a>1</a></root>'
62
+ Canon::PrettyPrinter::Xml.new(indent: 2).format(xml_input)
57
63
  ----
58
64
 
59
65
  === Compare documents
@@ -67,6 +73,13 @@ xml2 = '<root> <b>2</b> <a>1</a> </root>'
67
73
 
68
74
  Canon::Comparison.equivalent?(xml1, xml2)
69
75
  # => true (semantically equivalent despite formatting differences)
76
+
77
+ # Use semantic tree diff for operation-level analysis
78
+ result = Canon::Comparison.equivalent?(xml1, xml2,
79
+ verbose: true,
80
+ diff_algorithm: :semantic
81
+ )
82
+ result.operations # => [INSERT, DELETE, UPDATE, MOVE operations]
70
83
  ----
71
84
 
72
85
  === Use in tests
@@ -117,6 +130,10 @@ $ canon help
117
130
  options
118
131
  * **link:docs/MATCH_OPTIONS[Match options]** - Match dimensions and
119
132
  profiles
133
+ * **link:docs/TREE_DIFF[Semantic tree diff]** - Operation-level tree
134
+ comparison
135
+ * **link:docs/SEMANTIC_TREE_DIFF[Semantic tree diff algorithm]** - Comprehensive guide to semantic diff
136
+ * **link:docs/ENV_CONFIG[Environment configuration]** - Configure via ENV variables including size limits
120
137
  * **link:docs/DIFF_FORMATTING[Diff formatting]** - Customizing diff output
121
138
  * **link:docs/CHARACTER_VISUALIZATION[Character visualization]** -
122
139
  Whitespace and special characters
@@ -131,6 +148,7 @@ $ canon help
131
148
  classification
132
149
  * **link:docs/DIFF_ARCHITECTURE[Diff architecture]** - Technical pipeline
133
150
  details
151
+ * **link:docs/COMPARE_PROFILE[CompareProfile architecture]** - Format-specific policies
134
152
 
135
153
  == Features
136
154
 
@@ -152,12 +170,182 @@ Compare documents based on meaning, not formatting:
152
170
 
153
171
  * Whitespace normalization options
154
172
  * Attribute/key order handling
155
- * Comment handling
173
+ * Comment handling with display control
156
174
  * Multiple match dimensions with behaviors
157
175
  * Predefined match profiles (strict, rendered, spec_friendly, content_only)
158
176
 
159
177
  See link:docs/MATCH_OPTIONS[Match options] for details.
160
178
 
179
+ ==== Comment display control
180
+
181
+ Control which differences are displayed in diff output:
182
+
183
+ [source,ruby]
184
+ ----
185
+ # Show all differences (default)
186
+ result = Canon::Comparison.equivalent?(xml1, xml2,
187
+ verbose: true,
188
+ match: { comments: :ignore },
189
+ show_diffs: :all
190
+ )
191
+
192
+ # Show only normative differences (affect equivalence)
193
+ result = Canon::Comparison.equivalent?(xml1, xml2,
194
+ verbose: true,
195
+ match: { comments: :ignore },
196
+ show_diffs: :normative
197
+ )
198
+
199
+ # Show only informative differences
200
+ result = Canon::Comparison.equivalent?(xml1, xml2,
201
+ verbose: true,
202
+ match: { comments: :ignore },
203
+ show_diffs: :informative
204
+ )
205
+ ----
206
+
207
+ **CLI usage:**
208
+ [source,bash]
209
+ ----
210
+ # Show all differences
211
+ $ canon diff file1.xml file2.xml --show-diffs all
212
+
213
+ # Show only normative differences
214
+ $ canon diff file1.xml file2.xml --show-diffs normative
215
+
216
+ # Show only informative differences
217
+ $ canon diff file1.xml file2.xml --show-diffs informative
218
+ ----
219
+
220
+ **RSpec usage:**
221
+ [source,ruby]
222
+ ----
223
+ expect(actual).to be_xml_equivalent_to(expected)
224
+ .show_diffs(:normative)
225
+ ----
226
+
227
+ === Original input string display
228
+
229
+ When debugging test failures, it's often helpful to see the exact strings that
230
+ were passed to the comparison before any preprocessing or normalization. The
231
+ `verbose_diff` option displays the original input strings in an RSpec-style
232
+ format with line numbers.
233
+
234
+ [source,ruby]
235
+ ----
236
+ # Enable original string display in configuration
237
+ Canon::Config.configure do |config|
238
+ config.xml.diff.verbose_diff = true
239
+ end
240
+
241
+ # Or programmatically for a specific comparison
242
+ result = Canon::Comparison.equivalent?(xml1, xml2,
243
+ verbose: true,
244
+ verbose_diff: true
245
+ )
246
+ ----
247
+
248
+ **Output format:**
249
+ ----
250
+ ==================================================================
251
+ ORIGINAL INPUT STRINGS
252
+ ==================================================================
253
+
254
+ Expected (as string):
255
+ 1 | <root>
256
+ 2 | <element>value1</element>
257
+ 3 | </root>
258
+
259
+ Actual (as string):
260
+ 1 | <root>
261
+ 2 | <element>value2</element>
262
+ 3 | </root>
263
+
264
+ ==================================================================
265
+ ----
266
+
267
+ **When to use this feature:**
268
+
269
+ * Debugging why two documents are considered different
270
+ * Understanding preprocessing effects (c14n, normalization, etc.)
271
+ * Verifying the exact input received by the comparison
272
+ * Comparing raw vs processed content
273
+
274
+ **Environment variable:**
275
+ [source,bash]
276
+ ----
277
+ export CANON_XML_DIFF_VERBOSE_DIFF=true
278
+ export CANON_HTML_DIFF_VERBOSE_DIFF=true
279
+ export CANON_JSON_DIFF_VERBOSE_DIFF=true
280
+ export CANON_YAML_DIFF_VERBOSE_DIFF=true
281
+ ----
282
+
283
+ === Algorithm choice
284
+
285
+ Canon provides two diff algorithms:
286
+
287
+ * **DOM diff** (default): Stable, position-based comparison for traditional line-by-line output
288
+ * **Semantic tree diff** (experimental): Advanced operation detection (INSERT, DELETE, UPDATE, MOVE, MERGE, SPLIT, UPGRADE, DOWNGRADE)
289
+
290
+ [source,ruby]
291
+ ----
292
+ # Use DOM diff (default, stable)
293
+ result = Canon::Comparison.equivalent?(doc1, doc2,
294
+ verbose: true,
295
+ diff_algorithm: :dom
296
+ )
297
+
298
+ # Use semantic tree diff (experimental, more intelligent)
299
+ result = Canon::Comparison.equivalent?(doc1, doc2,
300
+ verbose: true,
301
+ diff_algorithm: :semantic
302
+ )
303
+ ----
304
+
305
+ **When to use semantic tree diff:**
306
+
307
+ * Need to detect high-level operations (moves, merges, splits)
308
+ * Documents have significant rearrangement
309
+ * Want statistical analysis of changes
310
+ * Need operation-level transformation analysis
311
+
312
+ **When to use DOM diff:**
313
+
314
+ * Need stable, well-tested comparison
315
+ * Want traditional line-by-line output
316
+ * Documents are similar in structure
317
+ * Maximum performance for large files
318
+
319
+ See link:docs/SEMANTIC_TREE_DIFF[Semantic tree diff algorithm] for a comprehensive guide.
320
+
321
+ === Size limits for large files
322
+
323
+ Canon provides configurable size limits to prevent hangs on pathologically large files:
324
+
325
+ * **File size limit**: Default 5MB (configurable)
326
+ * **Node count limit**: Default 10,000 nodes (configurable)
327
+ * **Diff output limit**: Default 10,000 lines (configurable)
328
+
329
+ [source,bash]
330
+ ----
331
+ # Configure via environment variables
332
+ export CANON_MAX_FILE_SIZE=10485760 # 10MB
333
+ export CANON_MAX_NODE_COUNT=50000 # 50,000 nodes
334
+ export CANON_MAX_DIFF_LINES=20000 # 20,000 lines
335
+
336
+ bundle exec rspec
337
+ ----
338
+
339
+ [source,ruby]
340
+ ----
341
+ # Or programmatically
342
+ Canon::Config.instance.xml.diff.max_file_size = 10_485_760
343
+ Canon::Config.instance.xml.diff.max_node_count = 50_000
344
+ Canon::Config.instance.xml.diff.max_diff_lines = 20_000
345
+ ----
346
+
347
+ See link:docs/ENV_CONFIG#size-limits[ENV_CONFIG] for details on size limit configuration.
348
+
161
349
  === Smart diff output
162
350
 
163
351
  **By-line mode**: Traditional line-by-line diff with:
@@ -177,8 +365,12 @@ See link:docs/MODES[Diff modes] for details.
177
365
 
178
366
  === Enhanced diff features
179
367
 
180
- * **Color-coded output**: Red (normative deletions), green (normative additions), yellow
181
- (normative structure), cyan (informative diffs)
368
+ * **Three-tier diff classification**: Formatting-only (`[` dark gray/`]` light gray), informative (`<` blue/`>` cyan), and normative (`-` red/`+` green) differences with directional colors
369
+ * **Directional color coding**: Removals and additions use different colors within each tier (red/green for normative, blue/cyan for informative, dark gray/light gray for formatting)
370
+ * **Namespace declaration tracking**: Separate dimension for tracking `xmlns` and `xmlns:*` attribute changes, reported independently from regular data attributes
371
+ * **Namespace rendering**: Explicit namespace display in XML diffs using `ns:[uri]` or `ns:[{blank}]` format
372
+ * **Informative diff visualization**: Visually distinct blue/cyan markers for differences that don't affect equivalence
373
+ * **Formatting diff detection**: Automatically detects and highlights purely cosmetic whitespace/line break differences
182
374
  * **Whitespace visualization**: Make invisible characters visible with CJK-safe
183
375
  Unicode symbols
184
376
  * **Non-ASCII detection**: Warnings for unexpected Unicode characters
@@ -233,7 +425,7 @@ See link:docs/CLI[CLI documentation].
233
425
  [source,ruby]
234
426
  ----
235
427
  # Configure globally
236
- Canon::RSpecMatchers.configure do |config|
428
+ Canon::Config.configure do |config|
237
429
  config.xml.match.profile = :spec_friendly
238
430
  config.xml.diff.use_color = true
239
431
  end
@@ -267,6 +459,207 @@ delegation to mode-specific formatters (by-line, by-object).
267
459
 
268
460
  See link:docs/MATCH_ARCHITECTURE[Match architecture] for details.
269
461
 
462
+ === CompareProfile architecture
463
+
464
+ Canon uses the **CompareProfile** class to encapsulate policy decisions about how differences in various dimensions should be handled during comparison. This provides clean separation of concerns between policy decisions, comparison logic, and difference classification.
465
+
466
+ ==== Separation of concerns
467
+
468
+ The comparison system is divided into four distinct components:
469
+
470
+ **CompareProfile**:: Policy decisions (what to track, what affects equivalence)
471
+ **XmlComparator/HtmlComparator**:: Comparison logic (detect differences)
472
+ **DiffNode**:: Data representation (represents a difference)
473
+ **DiffClassifier**:: Classification logic (normative vs informative vs formatting)
474
+
475
+ Each component has ONE responsibility with no overlapping concerns:
476
+
477
+ * CompareProfile does NOT classify differences
478
+ * XmlComparator does NOT make policy decisions
479
+ * DiffClassifier does NOT compare documents
480
+
481
+ ==== Policy methods
482
+
483
+ CompareProfile provides four key policy methods:
484
+
485
+ `track_dimension?(dimension)`:: Should DiffNodes be created for this dimension? Returns `true` in verbose mode to track all differences for reporting.
486
+
487
+ `affects_equivalence?(dimension)`:: Should differences affect equivalence? Determines the return value of the comparison.
488
+ Returns `false` for dimensions with `:ignore` behavior.
489
+
490
+ `normative_dimension?(dimension)`:: Is this dimension normative (affects equivalence) or informative (display only)?
491
+ Used by DiffClassifier to set the normative flag on DiffNodes.
492
+
493
+ `supports_formatting_detection?(dimension)`:: Can FormattingDetector apply to this dimension?
494
+ Returns `true` only for text/content dimensions (`:text_content`, `:structural_whitespace`, `:comments`).
495
+
496
+ === CompareProfile policies and behaviors
497
+
498
+ Canon uses a `CompareProfile` system to define format-specific comparison policies.
499
+ This allows different formats (HTML, XML, JSON, YAML) to have their own default
500
+ behaviors while maintaining a consistent architecture.
501
+
502
+ ==== How CompareProfile works
503
+
504
+ The `CompareProfile` class provides the foundation for policy-based comparison:
505
+
506
+ **Normative policy**: Determines what differences matter for equivalence. Each
507
+ dimension (`:text_content`, `:structural_whitespace`, `:comments`, etc.) has a
508
+ behavior (`:strict`, `:normalize`, `:ignore`) that determines whether differences
509
+ in that dimension affect equivalence.
510
+
511
+ **Dimension-based classification**: Each difference has a dimension and the
512
+ profile determines if that dimension is:
513
+
514
+ * **Normative**: Affects equivalence (documents not equivalent if different)
515
+ * **Informative**: Tracked but doesn't affect equivalence
516
+ * **Formatting-only**: Pure whitespace differences when normalized content matches
517
+
518
+ **Classification hierarchy**:
519
+
520
+ 1. **Normative** (highest priority): Differences that make documents non-equivalent
521
+ 2. **Informative** (medium priority): Differences that are tracked but don't affect equivalence
522
+ 3. **Formatting-only** (lowest priority): Pure whitespace/formatting differences
523
+
524
+ ==== Dimension behaviors
525
+
526
+ Each dimension can have one of three behaviors:
527
+
528
+ * **`:strict`**: Differences in this dimension are normative (affect equivalence)
529
+ * **`:normalize`**: Differences are normalized; only semantic changes are normative
530
+ * **`:ignore`**: Differences are informative only (don't affect equivalence)
531
+
532
+ .Example: Whitespace handling
533
+ [example]
534
+ ====
535
+ ----
536
+ # Default (strict mode): whitespace differences are normative
537
+ xml1 = '<root><p>Hello world</p></root>'
538
+ xml2 = "<root><p>Hello\nworld</p></root>"
539
+ Canon::Comparison.equivalent?(xml1, xml2) # => false
540
+
541
+ # Normalize mode: whitespace-only differences are formatting-only
542
+ Canon::Comparison.equivalent?(xml1, xml2,
543
+ match: { text_content: :normalize, structural_whitespace: :normalize }
544
+ ) # => true
545
+ ----
546
+ ====
547
+
548
+ In normalize mode, the line break is detected as formatting-only because the
549
+ normalized content ("Hello world") is the same.
550
+
551
+ ==== Format-specific profiles
552
+
553
+ Different formats can extend `CompareProfile` with format-specific policies:
554
+
555
+ * **XML** (base): Strict policies for all dimensions
556
+ * **HTML** (HtmlCompareProfile): Comments ignored by default, whitespace preserved in certain elements
557
+ * **JSON/YAML** (future): Key order policies, type handling
558
+
559
+ See `lib/canon/comparison/compare_profile.rb` for the base implementation and
560
+ `lib/canon/comparison/html_compare_profile.rb` for HTML-specific policies.
561
+
562
+ ==== Format-specific policies for HTML
563
+
564
+ Canon provides a format-specific CompareProfile implementation called
565
+ HtmlCompareProfile that encapsulates policies specific to HTML comparison.
566
+ This profile is automatically used by HtmlComparator based on detected
567
+ HTML version.
568
+
569
+ **Comments**: Default behavior is `:ignore` (presentational content in HTML),
570
+ unless explicitly set to `:strict`. When comments are set to `:strict`,
571
+ they will affect equivalence.
572
+
573
+ **Whitespace preservation**: HtmlCompareProfile automatically preserves
574
+ whitespace in elements where it's semantically significant (e.g., `<pre>`,
575
+ `<code>`, `<textarea>`, `<script>`, `<style>`). In other elements, whitespace
576
+ is normalized.
577
+
578
+ **Case sensitivity**: HTML5 is case-sensitive for element names, while
579
+ HTML4 is case-insensitive. HtmlCompareProfile uses HTML5 case-sensitivity
580
+ by default.
581
+
582
+
583
+ ==== Usage example
584
+
585
+ When using `match: { comments: :ignore }`:
586
+
587
+ * `track_dimension?(:comments)` returns `true` (track in verbose mode)
588
+ * `affects_equivalence?(:comments)` returns `false` (doesn't affect equivalence)
589
+ * `normative_dimension?(:comments)` returns `false` (informative only)
590
+
591
+ This ensures that comment differences are tracked and displayed in verbose mode
592
+ but don't make documents non-equivalent.
593
+
594
+ .Example: Comment differences with :ignore behavior
595
+ ====
596
+ [source,ruby]
597
+ ----
598
+ xml1 = '<root><!-- comment 1 --><data>value</data></root>'
599
+ xml2 = '<root><!-- comment 2 --><data>value</data></root>'
600
+
601
+ result = Canon::Comparison.equivalent?(xml1, xml2,
602
+ verbose: true,
603
+ match: { comments: :ignore }
604
+ )
605
+
606
+ result.differences # => [#<DiffNode @dimension=:comments>]
607
+ result.differences[0].normative? # => false (informative)
608
+ result.equivalent? # => true (doesn't affect equivalence)
609
+ ----
610
+
611
+ The comment difference is tracked and displayed, but the documents are still
612
+ considered equivalent because comments are set to `:ignore`.
613
+ ====
614
+
615
+ .Example: HTML comment handling
616
+ ====
617
+ [source,ruby]
618
+ ----
619
+ html1 = '<div><!-- comment --><p>Text</p></div>'
620
+ html2 = '<div><p>Text</p></div>'
621
+
622
+ # HTML defaults: comments are ignored (presentational)
623
+ result = Canon::Comparison.equivalent?(html1, html2)
624
+ # => true (comments don't affect HTML equivalence by default)
625
+
626
+ # Explicit strict matching
627
+ result = Canon::Comparison.equivalent?(html1, html2,
628
+ match: { comments: :strict }
629
+ )
630
+ # => false (comments now affect equivalence)
631
+ ----
632
+
633
+ Comments in HTML are considered presentational content (like CSS styles) and
634
+ don't affect the semantic meaning unless explicitly configured to `:strict`.
635
+ ====
636
+
637
+ .Example: HTML whitespace preservation
638
+ ====
639
+ [source,ruby]
640
+ ----
641
+ html1 = "<pre>Line 1\n Line 2</pre>"
642
+ html2 = "<pre>Line 1\nLine 2</pre>"
643
+
644
+ # Whitespace is preserved in <pre> elements
645
+ result = Canon::Comparison.equivalent?(html1, html2)
646
+ # => false (whitespace differs in pre element)
647
+
648
+ # But normalized in other elements
649
+ html3 = '<div>Text  with   spaces</div>'
650
+ html4 = '<div>Text with spaces</div>'
651
+ result = Canon::Comparison.equivalent?(html3, html4)
652
+ # => true (whitespace normalized in regular elements)
653
+ ----
654
+
655
+ HtmlCompareProfile automatically preserves whitespace in elements where it's
656
+ semantically significant (`<pre>`, `<code>`, `<textarea>`, `<script>`,
657
+ `<style>`), while normalizing it in other elements.
658
+ ====
659
+
660
+ **Future format profiles**: The architecture supports additional format-specific
661
+ profiles for JSON, YAML, and other formats as needed.
662
+
270
663
  == Development
271
664
 
272
665
  After checking out the repo, run `bin/setup` to install dependencies. Then run
data/docs/Gemfile ADDED
@@ -0,0 +1,9 @@
1
+ source "https://rubygems.org"
2
+
3
+ gem "jekyll", "~> 4.3"
4
+ gem "jekyll-asciidoc"
5
+ gem "just-the-docs"
6
+
7
+ group :jekyll_plugins do
8
+ gem "jekyll-seo-tag"
9
+ end