canon 0.1.8 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +83 -22
  3. data/docs/Gemfile +1 -0
  4. data/docs/_config.yml +90 -1
  5. data/docs/advanced/diff-classification.adoc +196 -24
  6. data/docs/features/match-options/index.adoc +239 -1
  7. data/lib/canon/comparison/format_detector.rb +2 -1
  8. data/lib/canon/comparison/html_comparator.rb +19 -8
  9. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  10. data/lib/canon/comparison/markup_comparator.rb +109 -2
  11. data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
  12. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  13. data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
  14. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +108 -0
  15. data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
  16. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
  17. data/lib/canon/comparison/xml_comparator.rb +240 -23
  18. data/lib/canon/comparison/xml_node_comparison.rb +25 -3
  19. data/lib/canon/diff/diff_classifier.rb +119 -5
  20. data/lib/canon/diff/formatting_detector.rb +1 -1
  21. data/lib/canon/diff/xml_serialization_formatter.rb +153 -0
  22. data/lib/canon/rspec_matchers.rb +37 -8
  23. data/lib/canon/version.rb +1 -1
  24. data/lib/canon/xml/data_model.rb +24 -13
  25. metadata +4 -78
  26. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
  27. data/false_positive_analysis.txt +0 -0
  28. data/file1.html +0 -1
  29. data/file2.html +0 -1
  30. data/old-docs/ADVANCED_TOPICS.adoc +0 -20
  31. data/old-docs/BASIC_USAGE.adoc +0 -16
  32. data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
  33. data/old-docs/CLI.adoc +0 -497
  34. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  35. data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
  36. data/old-docs/DIFF_FORMATTING.adoc +0 -540
  37. data/old-docs/DIFF_PARAMETERS.adoc +0 -261
  38. data/old-docs/DOM_DIFF.adoc +0 -1017
  39. data/old-docs/ENV_CONFIG.adoc +0 -876
  40. data/old-docs/FORMATS.adoc +0 -867
  41. data/old-docs/INPUT_VALIDATION.adoc +0 -477
  42. data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
  43. data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
  44. data/old-docs/MATCH_OPTIONS.adoc +0 -912
  45. data/old-docs/MODES.adoc +0 -432
  46. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  47. data/old-docs/OPTIONS.adoc +0 -1387
  48. data/old-docs/PREPROCESSING.adoc +0 -491
  49. data/old-docs/README.old.adoc +0 -2831
  50. data/old-docs/RSPEC.adoc +0 -814
  51. data/old-docs/RUBY_API.adoc +0 -485
  52. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
  53. data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
  54. data/old-docs/STRING_COMPARE.adoc +0 -345
  55. data/old-docs/TMP.adoc +0 -3384
  56. data/old-docs/TREE_DIFF.adoc +0 -1080
  57. data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
  58. data/old-docs/VERBOSE.adoc +0 -482
  59. data/old-docs/VISUALIZATION_MAP.adoc +0 -625
  60. data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
  61. data/scripts/analyze_current_state.rb +0 -85
  62. data/scripts/analyze_false_positives.rb +0 -114
  63. data/scripts/analyze_remaining_failures.rb +0 -105
  64. data/scripts/compare_current_failures.rb +0 -95
  65. data/scripts/compare_dom_tree_diff.rb +0 -158
  66. data/scripts/compare_failures.rb +0 -151
  67. data/scripts/debug_attribute_extraction.rb +0 -66
  68. data/scripts/debug_blocks_839.rb +0 -115
  69. data/scripts/debug_meta_matching.rb +0 -52
  70. data/scripts/debug_p_matching.rb +0 -192
  71. data/scripts/debug_signature_matching.rb +0 -118
  72. data/scripts/debug_sourcecode_124.rb +0 -32
  73. data/scripts/debug_whitespace_sensitive.rb +0 -192
  74. data/scripts/extract_false_positives.rb +0 -138
  75. data/scripts/find_actual_false_positives.rb +0 -125
  76. data/scripts/investigate_all_false_positives.rb +0 -161
  77. data/scripts/investigate_batch1.rb +0 -127
  78. data/scripts/investigate_classification.rb +0 -150
  79. data/scripts/investigate_classification_detailed.rb +0 -190
  80. data/scripts/investigate_common_failures.rb +0 -342
  81. data/scripts/investigate_false_negative.rb +0 -80
  82. data/scripts/investigate_false_positive.rb +0 -83
  83. data/scripts/investigate_false_positives.rb +0 -227
  84. data/scripts/investigate_false_positives_batch.rb +0 -163
  85. data/scripts/investigate_mixed_content.rb +0 -125
  86. data/scripts/investigate_remaining_16.rb +0 -214
  87. data/scripts/run_single_test.rb +0 -29
  88. data/scripts/test_all_false_positives.rb +0 -95
  89. data/scripts/test_attribute_details.rb +0 -61
  90. data/scripts/test_both_algorithms.rb +0 -49
  91. data/scripts/test_both_simple.rb +0 -49
  92. data/scripts/test_enhanced_semantic_output.rb +0 -125
  93. data/scripts/test_readme_examples.rb +0 -131
  94. data/scripts/test_semantic_tree_diff.rb +0 -99
  95. data/scripts/test_semantic_ux_improvements.rb +0 -135
  96. data/scripts/test_single_false_positive.rb +0 -119
  97. data/scripts/test_size_limits.rb +0 -99
  98. data/test_html_1.html +0 -21
  99. data/test_html_2.html +0 -21
  100. data/test_nokogiri.rb +0 -33
  101. data/test_normalize.rb +0 -45
@@ -1,1155 +0,0 @@
1
- = Flexible whitespace matching system
2
-
3
- ==== General
4
-
5
- Canon provides a flexible whitespace matching system for XML, HTML, JSON, and YAML comparisons. This system allows precise control over how whitespace and formatting differences are handled during comparison.
6
-
7
- The system uses a two-phase architecture:
8
-
9
- * *Preprocessing phase*: What to compare (normalization, canonicalization, formatting)
10
- * *Matching phase*: How to compare (4 dimensions × 3 behaviors)
11
-
12
- ==== Two-phase architecture
13
-
14
- === Preprocessing phase
15
-
16
- The preprocessing phase determines what content is compared. Canon supports four preprocessing options:
17
-
18
- [cols="1,3"]
19
- |===
20
- | Option | Description
21
-
22
- | `:none`
23
- | No preprocessing - compare raw content as-is
24
-
25
- | `:c14n`
26
- | Apply XML Canonicalization (C14N) to normalize structure
27
-
28
- | `:normalize`
29
- | Apply whitespace normalization
30
-
31
- | `:format`
32
- | Apply format-specific pretty-printing
33
- |===
34
-
35
- The preprocessing option is controlled via the `preprocessing` parameter; its default value depends on the format being compared.
36
-
37
- === Matching phase
38
-
39
- The matching phase defines how content is compared across four independent dimensions. Each dimension can be configured with one of three mutually exclusive behaviors.
40
-
41
- ==== Match dimensions
42
-
43
- The matching phase operates on four collectively exhaustive dimensions:
44
-
45
- [cols="1,3"]
46
- |===
47
- | Dimension | What it controls
48
-
49
- | `text_content`
50
- | Text content within elements/values
51
-
52
- | `structural_whitespace`
53
- | Whitespace between tags/elements (indentation, line breaks)
54
-
55
- | `attribute_whitespace`
56
- | Whitespace within attribute values
57
-
58
- | `comments`
59
- | How comments are handled
60
- |===
61
-
62
- These four dimensions are collectively exhaustive - they cover all aspects of whitespace and formatting in structured documents.
63
-
64
- ==== Match behaviors
65
-
66
- For each dimension, you can specify one of three mutually exclusive behaviors:
67
-
68
- [cols="1,3"]
69
- |===
70
- | Behavior | Description
71
-
72
- | `:strict`
73
- | Exact character-for-character matching (including all whitespace)
74
-
75
- | `:normalize`
76
- | Collapse consecutive whitespace to single spaces, trim leading/trailing whitespace
77
-
78
- | `:ignore`
79
- | Don't compare this dimension at all
80
- |===
81
-
82
- ==== Predefined match profiles
83
-
84
- Canon provides four predefined match profiles optimized for common use cases:
85
-
86
- === Profile comparison table
87
-
88
- The following table shows how each predefined profile configures the four match dimensions:
89
-
90
- [cols="1,1,1,1,1"]
91
- |===
92
- |Profile |text_content |structural_whitespace |attribute_whitespace |comments
93
-
94
- |`strict`
95
- |`:strict`
96
- |`:strict`
97
- |`:strict`
98
- |`:strict`
99
-
100
- |`rendered`
101
- |`:normalize`
102
- |`:ignore`
103
- |`:normalize`
104
- |`:ignore`
105
-
106
- |`spec_friendly`
107
- |`:normalize`
108
- |`:ignore`
109
- |`:normalize`
110
- |`:ignore`
111
-
112
- |`content_only`
113
- |`:normalize`
114
- |`:ignore`
115
- |`:ignore`
116
- |`:ignore`
117
- |===
118
-
119
- **Key differences between profiles:**
120
-
121
- * **strict**: Exact matching on all dimensions - use for byte-for-byte comparison
122
- * **rendered**: Mimics browser rendering - collapses text, ignores formatting and comments
123
- * **spec_friendly**: Same as rendered - ideal for test specifications
124
- * **content_only**: Most permissive - only compares text content, ignores all formatting and attribute whitespace
125
-
126
- NOTE: The `rendered` and `spec_friendly` profiles have identical configurations but serve different semantic purposes in your codebase.
127
-
128
- === Strict profile
129
-
130
- The `strict` profile is the default for XML and requires exact matching:
131
-
132
- [source,ruby]
133
- ----
134
- {
135
- text_content: :strict,
136
- structural_whitespace: :strict,
137
- attribute_whitespace: :strict,
138
- comments: :strict
139
- }
140
- ----
141
-
142
- Use this when:
143
-
144
- * You need exact byte-for-byte comparison
145
- * Whitespace is semantically significant
146
- * Working with canonicalized or pre-normalized content
147
-
148
- === Rendered profile
149
-
150
- The `rendered` profile mimics how browsers render HTML/XML:
151
-
152
- [source,ruby]
153
- ----
154
- {
155
- text_content: :normalize,
156
- structural_whitespace: :ignore,
157
- attribute_whitespace: :normalize,
158
- comments: :ignore
159
- }
160
- ----
161
-
162
- Use this when:
163
-
164
- * Comparing HTML documents where rendering matters
165
- * Whitespace between tags doesn't affect output
166
- * Comments are documentation-only
167
-
168
- This is the default profile for HTML comparisons.
169
-
170
- === Spec-friendly profile
171
-
172
- The `spec_friendly` profile ignores structural formatting and comments, and normalizes whitespace in text content and attribute values:
173
-
174
- [source,ruby]
175
- ----
176
- {
177
- text_content: :normalize,
178
- structural_whitespace: :ignore,
179
- attribute_whitespace: :normalize,
180
- comments: :ignore
181
- }
182
- ----
183
-
184
- Use this when:
185
-
186
- * Writing test specifications
187
- * Formatting/indentation style doesn't matter
188
- * Generated vs. hand-written content comparison
189
- * CI/CD environments with different formatters
190
-
191
- === Content-only profile
192
-
193
- The `content_only` profile focuses solely on actual content:
194
-
195
- [source,ruby]
196
- ----
197
- {
198
- text_content: :normalize,
199
- structural_whitespace: :ignore,
200
- attribute_whitespace: :ignore,
201
- comments: :ignore
202
- }
203
- ----
204
-
205
- Use this when:
206
-
207
- * Only semantic content matters
208
- * All whitespace (including in attributes) is insignificant
209
- * Maximum tolerance for formatting differences
210
-
211
- ==== Format-specific defaults
212
-
213
- Different formats have different default behaviors optimized for their typical use cases:
214
-
215
- === XML defaults
216
-
217
- [source,ruby]
218
- ----
219
- {
220
- preprocessing: :none,
221
- match_profile: :strict
222
- }
223
- ----
224
-
225
- XML defaults to strict matching because:
226
-
227
- * XML whitespace can be semantically significant
228
- * XML is often machine-generated with consistent formatting
229
- * Canonicalization (C14N) is available for normalization when needed
230
-
231
- === HTML defaults
232
-
233
- [source,ruby]
234
- ----
235
- {
236
- preprocessing: :none,
237
- match_profile: :rendered
238
- }
239
- ----
240
-
241
- HTML defaults to rendered-style matching because:
242
-
243
- * Browsers collapse whitespace when rendering
244
- * Indentation and formatting are for readability only
245
- * Comments are typically documentation
246
-
247
- === JSON defaults
248
-
249
- [source,ruby]
250
- ----
251
- {
252
- preprocessing: :format,
253
- match_profile: :rendered
254
- }
255
- ----
256
-
257
- JSON applies pretty-printing before comparison because:
258
-
259
- * JSON whitespace is never semantically significant
260
- * Minified vs. formatted JSON should be equivalent
261
- * Pretty-printing ensures consistent structure
262
-
263
- === YAML defaults
264
-
265
- [source,ruby]
266
- ----
267
- {
268
- preprocessing: :format,
269
- match_profile: :rendered
270
- }
271
- ----
272
-
273
- YAML applies pretty-printing because:
274
-
275
- * YAML formatting can vary significantly
276
- * Indentation styles differ between generators
277
- * Content equivalence is what matters
278
-
279
- ==== Usage examples
280
-
281
- === Using predefined profiles
282
-
283
- Use a profile for XML comparison:
284
-
285
- [source,ruby]
286
- ----
287
- expect(actual_xml).to be_xml_equivalent_to(
288
- expected_xml,
289
- match_profile: :spec_friendly
290
- )
291
- ----
292
-
293
- Use a profile for HTML comparison:
294
-
295
- [source,ruby]
296
- ----
297
- expect(actual_html).to be_html_equivalent_to(
298
- expected_html,
299
- match_profile: :content_only
300
- )
301
- ----
302
-
303
- === Using explicit match options
304
-
305
- Override specific dimensions:
306
-
307
- [source,ruby]
308
- ----
309
- expect(actual_xml).to be_xml_equivalent_to(
310
- expected_xml,
311
- match_options: {
312
- text_content: :normalize,
313
- structural_whitespace: :ignore,
314
- attribute_whitespace: :strict,
315
- comments: :ignore
316
- }
317
- )
318
- ----
319
-
320
- === Combining profiles and explicit options
321
-
322
- Explicit options override profile settings:
323
-
324
- [source,ruby]
325
- ----
326
- expect(actual_xml).to be_xml_equivalent_to(
327
- expected_xml,
328
- match_profile: :spec_friendly,
329
- match_options: {
330
- attribute_whitespace: :strict # Override just this dimension
331
- }
332
- )
333
- ----
334
-
335
- === Global configuration
336
-
337
- Set a global default profile for all tests:
338
-
339
- [source,ruby]
340
- ----
341
- # In spec_helper.rb
342
- Canon::RSpecMatchers.configure do |config|
343
- config.xml_match_profile = :spec_friendly
344
- config.html_match_profile = :rendered
345
- end
346
- ----
347
-
348
- Override global profile in specific tests:
349
-
350
- [source,ruby]
351
- ----
352
- # This test uses strict matching despite global spec_friendly
353
- expect(actual_xml).to be_xml_equivalent_to(
354
- expected_xml,
355
- match_profile: :strict
356
- )
357
- ----
358
-
359
- ==== Dimension-specific examples
360
-
361
- === Text content dimension
362
-
363
- The `text_content` dimension controls how text within elements is compared.
364
-
365
- ==== Strict behavior (exact whitespace)
366
-
367
- When `text_content: :strict`, all whitespace in text content must match exactly.
368
-
369
- .XML examples with strict text_content
370
- [example]
371
- The following XML strings are **not** considered equal because whitespace differs:
372
-
373
- [source,xml]
374
- ----
375
- <p> text with spaces </p>
376
- <p>text with spaces</p>
377
- ----
378
-
379
- [source,ruby]
380
- ----
381
- actual = "<p> text with spaces </p>"
382
- expected = "<p>text with spaces</p>"
383
-
384
- expect(actual).not_to be_xml_equivalent_to(
385
- expected,
386
- match_options: {
387
- text_content: :strict,
388
- structural_whitespace: :ignore,
389
- attribute_whitespace: :strict,
390
- comments: :ignore
391
- }
392
- )
393
- # => true (documents are NOT equivalent)
394
- ----
395
-
396
- Even differences in leading/trailing whitespace matter:
397
-
398
- [source,xml]
399
- ----
400
- <item> Value </item>
401
- <item>Value</item>
402
- ----
403
-
404
- [source,ruby]
405
- ----
406
- xml1 = "<item> Value </item>"
407
- xml2 = "<item>Value</item>"
408
-
409
- expect(xml1).not_to be_xml_equivalent_to(
410
- xml2,
411
- match_options: { text_content: :strict, structural_whitespace: :ignore }
412
- )
413
- # => true (documents are NOT equivalent)
414
- ----
415
-
416
- .HTML examples with strict text_content
417
- [example]
418
- [source,html]
419
- ----
420
- <a href="/admin"> SOME TEXT </a>
421
- <a href="/admin">SOME TEXT</a>
422
- ----
423
-
424
- [source,ruby]
425
- ----
426
- html1 = '<a href="/admin"> SOME TEXT </a>'
427
- html2 = '<a href="/admin">SOME TEXT</a>'
428
-
429
- expect(html1).not_to be_html_equivalent_to(
430
- html2,
431
- match_options: { text_content: :strict, structural_whitespace: :ignore }
432
- )
433
- # => true (documents are NOT equivalent)
434
- ----
435
-
436
- ==== Normalize behavior (collapse whitespace)
437
-
438
- When `text_content: :normalize`, consecutive whitespace is collapsed to single spaces and leading/trailing whitespace is trimmed.
439
-
440
- .XML examples with normalized text_content
441
- [example]
442
- The following XML strings **are** considered equal:
443
-
444
- [source,xml]
445
- ----
446
- <p> text with multiple spaces </p>
447
- <p>text with multiple spaces</p>
448
- ----
449
-
450
- [source,ruby]
451
- ----
452
- actual = "<p> text with multiple spaces </p>"
453
- expected = "<p>text with multiple spaces</p>"
454
-
455
- expect(actual).to be_xml_equivalent_to(
456
- expected,
457
- match_options: {
458
- text_content: :normalize,
459
- structural_whitespace: :ignore,
460
- attribute_whitespace: :strict,
461
- comments: :ignore
462
- }
463
- )
464
- # => true (documents are equivalent)
465
- ----
466
-
467
- Tabs and newlines are also normalized:
468
-
469
- [source,xml]
470
- ----
471
- <description>
472
- This is a
473
- multi-line
474
- description
475
- </description>
476
-
477
- <description>This is a multi-line description</description>
478
- ----
479
-
480
- [source,ruby]
481
- ----
482
- xml1 = <<~XML
483
- <description>
484
- This is a
485
- multi-line
486
- description
487
- </description>
488
- XML
489
-
490
- xml2 = "<description>This is a multi-line description</description>"
491
-
492
- expect(xml1).to be_xml_equivalent_to(
493
- xml2,
494
- match_options: { text_content: :normalize, structural_whitespace: :ignore }
495
- )
496
- # => true (documents are equivalent)
497
- ----
498
-
499
- .HTML examples with normalized text_content
500
- [example]
501
- [source,html]
502
- ----
503
- <a href="/admin"> SOME TEXT CONTENT </a>
504
- <a href="/admin">SOME TEXT CONTENT</a>
505
- ----
506
-
507
- [source,ruby]
508
- ----
509
- html1 = '<a href="/admin"> SOME TEXT CONTENT </a>'
510
- html2 = '<a href="/admin">SOME TEXT CONTENT</a>'
511
-
512
- expect(html1).to be_html_equivalent_to(
513
- html2,
514
- match_options: { text_content: :normalize, structural_whitespace: :ignore }
515
- )
516
- # => true (documents are equivalent)
517
- ----
518
-
519
- Multi-line HTML text:
520
-
521
- [source,html]
522
- ----
523
- <p>
524
- This is a paragraph
525
- with multiple lines
526
- of text.
527
- </p>
528
-
529
- <p>This is a paragraph with multiple lines of text.</p>
530
- ----
531
-
532
- [source,ruby]
533
- ----
534
- html1 = <<~HTML
535
- <p>
536
- This is a paragraph
537
- with multiple lines
538
- of text.
539
- </p>
540
- HTML
541
-
542
- html2 = "<p>This is a paragraph with multiple lines of text.</p>"
543
-
544
- expect(html1).to be_html_equivalent_to(
545
- html2,
546
- match_options: { text_content: :normalize, structural_whitespace: :ignore }
547
- )
548
- # => true (documents are equivalent)
549
- ----
550
-
551
- === Structural whitespace dimension
552
-
553
- The `structural_whitespace` dimension controls whitespace between tags (indentation, line breaks, formatting).
554
-
555
- ==== Strict behavior
556
-
557
- When `structural_whitespace: :strict`, all whitespace between tags must match exactly, including indentation and line breaks.
558
-
559
- .XML examples with strict structural_whitespace
560
- [example]
561
- These documents are **not** equivalent due to different indentation:
562
-
563
- [source,xml]
564
- ----
565
- <root>
566
- <item>Value</item>
567
- </root>
568
-
569
- <root>
570
- <item>Value</item>
571
- </root>
572
- ----
573
-
574
- [source,ruby]
575
- ----
576
- xml1 = "<root>\n <item>Value</item>\n</root>"
577
- xml2 = "<root>\n    <item>Value</item>\n</root>"
578
-
579
- expect(xml1).not_to be_xml_equivalent_to(
580
- xml2,
581
- match_options: {
582
- text_content: :normalize,
583
- structural_whitespace: :strict,
584
- attribute_whitespace: :strict,
585
- comments: :ignore
586
- }
587
- )
588
- # => true (documents are NOT equivalent - indentation differs)
589
- ----
590
-
591
- ==== Ignore behavior (formatting doesn't matter)
592
-
593
- When `structural_whitespace: :ignore`, all whitespace between tags is ignored, making pretty-printed and compact formats equivalent.
594
-
595
- .XML examples with ignored structural_whitespace
596
- [example]
597
- Pretty-printed and compact XML **are** considered equal:
598
-
599
- [source,xml]
600
- ----
601
- <!-- Pretty-printed with indentation -->
602
- <root>
603
- <a>
604
- <b>text</b>
605
- </a>
606
- </root>
607
-
608
- <!-- Compact on one line -->
609
- <root><a><b>text</b></a></root>
610
- ----
611
-
612
- [source,ruby]
613
- ----
614
- compact = "<root><a><b>text</b></a></root>"
615
- formatted = <<~XML
616
- <root>
617
- <a>
618
- <b>text</b>
619
- </a>
620
- </root>
621
- XML
622
-
623
- expect(compact).to be_xml_equivalent_to(
624
- formatted,
625
- match_options: {
626
- text_content: :normalize,
627
- structural_whitespace: :ignore,
628
- attribute_whitespace: :strict,
629
- comments: :ignore
630
- }
631
- )
632
- # => true (documents are equivalent)
633
- ----
634
-
635
- Complex nested structures with different indentation:
636
-
637
- [source,xml]
638
- ----
639
- <!-- 2-space indentation -->
640
- <document>
641
- <metadata>
642
- <title>My Document</title>
643
- <author>
644
- <name>John Doe</name>
645
- </author>
646
- </metadata>
647
- </document>
648
-
649
- <!-- 4-space indentation -->
650
- <document>
651
- <metadata>
652
- <title>My Document</title>
653
- <author>
654
- <name>John Doe</name>
655
- </author>
656
- </metadata>
657
- </document>
658
-
659
- <!-- Compact -->
660
- <document><metadata><title>My Document</title><author><name>John Doe</name></author></metadata></document>
661
- ----
662
-
663
- [source,ruby]
664
- ----
665
- two_spaces = <<~XML
666
- <document>
667
- <metadata>
668
- <title>My Document</title>
669
- <author>
670
- <name>John Doe</name>
671
- </author>
672
- </metadata>
673
- </document>
674
- XML
675
-
676
- four_spaces = "<document>\n    <metadata>\n        <title>My Document</title>\n        <author>\n            <name>John Doe</name>\n        </author>\n    </metadata>\n</document>"
677
-
678
- compact = "<document><metadata><title>My Document</title><author><name>John Doe</name></author></metadata></document>"
679
-
680
- expect(two_spaces).to be_xml_equivalent_to(
681
- four_spaces,
682
- match_options: { structural_whitespace: :ignore }
683
- )
684
- # => true
685
-
686
- expect(two_spaces).to be_xml_equivalent_to(
687
- compact,
688
- match_options: { structural_whitespace: :ignore }
689
- )
690
- # => true
691
- ----
692
-
693
- .HTML examples with ignored structural_whitespace
694
- [example]
695
- [source,html]
696
- ----
697
- <!-- Pretty-printed -->
698
- <div class="container">
699
- <header>
700
- <h1>Welcome</h1>
701
- <p>Introduction text</p>
702
- </header>
703
- </div>
704
-
705
- <!-- Compact -->
706
- <div class="container"><header><h1>Welcome</h1><p>Introduction text</p></header></div>
707
- ----
708
-
709
- [source,ruby]
710
- ----
711
- pretty_html = <<~HTML
712
- <div class="container">
713
- <header>
714
- <h1>Welcome</h1>
715
- <p>Introduction text</p>
716
- </header>
717
- </div>
718
- HTML
719
-
720
- compact_html = '<div class="container"><header><h1>Welcome</h1><p>Introduction text</p></header></div>'
721
-
722
- expect(pretty_html).to be_html_equivalent_to(
723
- compact_html,
724
- match_options: { structural_whitespace: :ignore }
725
- )
726
- # => true (documents are equivalent)
727
- ----
728
-
729
- ==== Normalize behavior
730
-
731
- When `structural_whitespace: :normalize`, whitespace between tags is collapsed to single spaces.
732
-
733
- .XML examples with normalized structural_whitespace
734
- [example]
735
- [source,xml]
736
- ----
737
- <root>
738
-
739
-
740
- <item>Value</item>
741
-
742
-
743
- </root>
744
-
745
- <root> <item>Value</item> </root>
746
- ----
747
-
748
- [source,ruby]
749
- ----
750
- xml1 = "<root>\n\n\n <item>Value</item>\n\n\n</root>"
751
- xml2 = "<root> <item>Value</item> </root>"
752
-
753
- expect(xml1).to be_xml_equivalent_to(
754
- xml2,
755
- match_options: { structural_whitespace: :normalize }
756
- )
757
- # => true (documents are equivalent - whitespace normalized)
758
- ----
759
-
760
- === Attribute whitespace dimension
761
-
762
- The `attribute_whitespace` dimension controls whitespace within attribute values.
763
-
764
- ==== Strict behavior (exact attribute whitespace)
765
-
766
- When `attribute_whitespace: :strict`, whitespace in attribute values must match exactly.
767
-
768
- .XML examples with strict attribute_whitespace
769
- [example]
770
- These documents are **not** equivalent due to attribute whitespace differences:
771
-
772
- [source,xml]
773
- ----
774
- <div class=" foo bar ">text</div>
775
- <div class="foo bar">text</div>
776
- ----
777
-
778
- [source,ruby]
779
- ----
780
- actual = '<div class=" foo bar ">text</div>'
781
- expected = '<div class="foo bar">text</div>'
782
-
783
- expect(actual).not_to be_xml_equivalent_to(
784
- expected,
785
- match_options: {
786
- text_content: :normalize,
787
- structural_whitespace: :ignore,
788
- attribute_whitespace: :strict,
789
- comments: :ignore
790
- }
791
- )
792
- # => true (documents are NOT equivalent)
793
- ----
794
-
795
- Leading/trailing whitespace in attributes:
796
-
797
- [source,xml]
798
- ----
799
- <item id=" 123 " name=" Widget "/>
800
- <item id="123" name="Widget"/>
801
- ----
802
-
803
- [source,ruby]
804
- ----
805
- xml1 = '<item id=" 123 " name=" Widget "/>'
806
- xml2 = '<item id="123" name="Widget"/>'
807
-
808
- expect(xml1).not_to be_xml_equivalent_to(
809
- xml2,
810
- match_options: { attribute_whitespace: :strict }
811
- )
812
- # => true (documents are NOT equivalent)
813
- ----
814
-
815
- .HTML examples with strict attribute_whitespace
816
- [example]
817
- [source,html]
818
- ----
819
- <a href="/admin" class=" button primary ">Link</a>
820
- <a href="/admin" class="button primary">Link</a>
821
- ----
822
-
823
- [source,ruby]
824
- ----
825
- html1 = '<a href="/admin" class=" button primary ">Link</a>'
826
- html2 = '<a href="/admin" class="button primary">Link</a>'
827
-
828
- expect(html1).not_to be_html_equivalent_to(
829
- html2,
830
- match_options: { attribute_whitespace: :strict }
831
- )
832
- # => true (documents are NOT equivalent)
833
- ----
834
-
835
- ==== Normalize behavior (collapse attribute whitespace)
836
-
837
- When `attribute_whitespace: :normalize`, whitespace in attribute values is collapsed and trimmed.
838
-
839
- .XML examples with normalized attribute_whitespace
840
- [example]
841
- These documents **are** considered equal:
842
-
843
- [source,xml]
844
- ----
845
- <div class=" foo bar ">text</div>
846
- <div class="foo bar">text</div>
847
- ----
848
-
849
- [source,ruby]
850
- ----
851
- actual = '<div class=" foo bar ">text</div>'
852
- expected = '<div class="foo bar">text</div>'
853
-
854
- expect(actual).to be_xml_equivalent_to(
855
- expected,
856
- match_options: {
857
- text_content: :normalize,
858
- structural_whitespace: :ignore,
859
- attribute_whitespace: :normalize,
860
- comments: :ignore
861
- }
862
- )
863
- # => true (documents are equivalent)
864
- ----
865
-
866
- Multiple attributes with whitespace:
867
-
868
- [source,xml]
869
- ----
870
- <item id=" 123 " name=" Widget " category=" tools "/>
871
- <item id="123" name="Widget" category="tools"/>
872
- ----
873
-
874
- [source,ruby]
875
- ----
876
- xml1 = '<item id=" 123 " name=" Widget " category=" tools "/>'
877
- xml2 = '<item id="123" name="Widget" category="tools"/>'
878
-
879
- expect(xml1).to be_xml_equivalent_to(
880
- xml2,
881
- match_options: { attribute_whitespace: :normalize }
882
- )
883
- # => true (documents are equivalent)
884
- ----
885
-
886
- .HTML examples with normalized attribute_whitespace
887
- [example]
888
- [source,html]
889
- ----
890
- <a href="/admin" class=" button primary " id=" main-link ">Link</a>
891
- <a href="/admin" class="button primary" id="main-link">Link</a>
892
- ----
893
-
894
- [source,ruby]
895
- ----
896
- html1 = '<a href="/admin" class=" button primary " id=" main-link ">Link</a>'
897
- html2 = '<a href="/admin" class="button primary" id="main-link">Link</a>'
898
-
899
- expect(html1).to be_html_equivalent_to(
900
- html2,
901
- match_options: { attribute_whitespace: :normalize }
902
- )
903
- # => true (documents are equivalent)
904
- ----
905
-
906
- ==== Ignore behavior
907
-
908
- When `attribute_whitespace: :ignore`, attribute values are not compared at all (only attribute names are checked).
909
-
910
- .Example with ignored attribute_whitespace
911
- [example]
912
- [source,ruby]
913
- ----
914
- xml1 = '<item class="foo">text</item>'
915
- xml2 = '<item class="completely different">text</item>'
916
-
917
- expect(xml1).to be_xml_equivalent_to(
918
- xml2,
919
- match_options: { attribute_whitespace: :ignore }
920
- )
921
- # => true (attribute values are not compared)
922
- ----
923
-
924
- === Comments dimension
925
-
926
- The `comments` dimension controls how XML/HTML comments are compared.
927
-
928
- ==== Strict behavior
929
-
930
- When `comments: :strict`, comments must match exactly, including their content and position.
931
-
932
- .XML examples with strict comments
933
- [example]
934
- These documents are **not** equivalent due to different comments:
935
-
936
- [source,xml]
937
- ----
938
- <root><!-- First comment --><a>text</a></root>
939
- <root><!-- Different comment --><a>text</a></root>
940
- ----
941
-
942
- [source,ruby]
943
- ----
944
- xml1 = "<root><!-- First comment --><a>text</a></root>"
945
- xml2 = "<root><!-- Different comment --><a>text</a></root>"
946
-
947
- expect(xml1).not_to be_xml_equivalent_to(
948
- xml2,
949
- match_options: { comments: :strict }
950
- )
951
- # => true (documents are NOT equivalent - comments differ)
952
- ----
953
-
954
- ==== Ignore behavior (comments don't affect comparison)
955
-
956
- When `comments: :ignore`, comments are completely ignored during comparison.
957
-
958
- .XML examples with ignored comments
959
- [example]
960
- These documents **are** considered equal despite different comments:
961
-
962
- [source,xml]
963
- ----
964
- <root><!-- comment --><a>text</a></root>
965
- <root><!-- different --><a>text</a></root>
966
- <root><a>text</a></root>
967
- ----
968
-
969
- [source,ruby]
970
- ----
971
- with_comment = "<root><!-- comment --><a>text</a></root>"
972
- different_comment = "<root><!-- different --><a>text</a></root>"
973
- no_comment = "<root><a>text</a></root>"
974
-
975
- expect(with_comment).to be_xml_equivalent_to(
976
- different_comment,
977
- match_options: {
978
- text_content: :normalize,
979
- structural_whitespace: :ignore,
980
- attribute_whitespace: :strict,
981
- comments: :ignore
982
- }
983
- )
984
- # => true (documents are equivalent - comments ignored)
985
-
986
- expect(with_comment).to be_xml_equivalent_to(
987
- no_comment,
988
- match_options: {
989
- text_content: :normalize,
990
- structural_whitespace: :ignore,
991
- attribute_whitespace: :strict,
992
- comments: :ignore
993
- }
994
- )
995
- # => true (documents are equivalent - comments ignored)
996
- ----
997
-
998
- Complex document with multiple comments:
999
-
1000
- [source,xml]
1001
- ----
1002
- <!-- Document header -->
1003
- <document>
1004
- <!-- Metadata section -->
1005
- <metadata>
1006
- <title>My Document</title>
1007
- <!-- Author information -->
1008
- <author>John Doe</author>
1009
- </metadata>
1010
- <!-- Main content -->
1011
- <content>
1012
- <p>Text</p>
1013
- </content>
1014
- </document>
1015
-
1016
- <document>
1017
- <metadata>
1018
- <title>My Document</title>
1019
- <author>John Doe</author>
1020
- </metadata>
1021
- <content>
1022
- <p>Text</p>
1023
- </content>
1024
- </document>
1025
- ----
1026
-
1027
- [source,ruby]
1028
- ----
1029
- with_comments = <<~XML
1030
- <!-- Document header -->
1031
- <document>
1032
- <!-- Metadata section -->
1033
- <metadata>
1034
- <title>My Document</title>
1035
- <!-- Author information -->
1036
- <author>John Doe</author>
1037
- </metadata>
1038
- <!-- Main content -->
1039
- <content>
1040
- <p>Text</p>
1041
- </content>
1042
- </document>
1043
- XML
1044
-
1045
- without_comments = <<~XML
1046
- <document>
1047
- <metadata>
1048
- <title>My Document</title>
1049
- <author>John Doe</author>
1050
- </metadata>
1051
- <content>
1052
- <p>Text</p>
1053
- </content>
1054
- </document>
1055
- XML
1056
-
1057
- expect(with_comments).to be_xml_equivalent_to(
1058
- without_comments,
1059
- match_options: { comments: :ignore }
1060
- )
1061
- # => true (documents are equivalent)
1062
- ----
1063
-
1064
- .HTML examples with ignored comments
1065
- [example]
1066
- [source,html]
1067
- ----
1068
- <!-- Navigation -->
1069
- <nav>
1070
- <!-- Primary menu -->
1071
- <ul>
1072
- <li>Home</li>
1073
- </ul>
1074
- </nav>
1075
-
1076
- <nav>
1077
- <ul>
1078
- <li>Home</li>
1079
- </ul>
1080
- </nav>
1081
- ----
1082
-
1083
- [source,ruby]
1084
- ----
1085
- html_with_comments = <<~HTML
1086
- <!-- Navigation -->
1087
- <nav>
1088
- <!-- Primary menu -->
1089
- <ul>
1090
- <li>Home</li>
1091
- </ul>
1092
- </nav>
1093
- HTML
1094
-
1095
- html_without_comments = <<~HTML
1096
- <nav>
1097
- <ul>
1098
- <li>Home</li>
1099
- </ul>
1100
- </nav>
1101
- HTML
1102
-
1103
- expect(html_with_comments).to be_html_equivalent_to(
1104
- html_without_comments,
1105
- match_options: { comments: :ignore }
1106
- )
1107
- # => true (documents are equivalent)
1108
- ----
1109
-
1110
- ==== Normalize behavior
1111
-
1112
- When `comments: :normalize`, comment content is trimmed and whitespace is collapsed before comparison.
1113
-
1114
- .Example with normalized comments
1115
- [example]
1116
- [source,ruby]
1117
- ----
1118
- xml1 = "<root><!--   comment   with   spaces   --><a>text</a></root>"
1119
- xml2 = "<root><!-- comment with spaces --><a>text</a></root>"
1120
-
1121
- expect(xml1).to be_xml_equivalent_to(
1122
- xml2,
1123
- match_options: { comments: :normalize }
1124
- )
1125
- # => true (comments are normalized before comparison)
1126
- ----
1127
-
1128
- ==== Precedence resolution
1129
-
1130
- When multiple configuration sources are present, Canon resolves them in this order (highest to lowest precedence):
1131
-
1132
- . Explicit `match_options` hash in the test
1133
- . Named `match_profile` in the test
1134
- . Global format-specific profile (e.g., `xml_match_profile`)
1135
- . Format-specific defaults (e.g., XML → strict, HTML → rendered)
1136
-
1137
- .Example of precedence resolution
1138
- ====
1139
- [source,ruby]
1140
- ----
1141
- # Global configuration
1142
- Canon::RSpecMatchers.configure do |config|
1143
- config.xml_match_profile = :spec_friendly
1144
- end
1145
-
1146
- # This uses strict for attribute_whitespace (explicit option)
1147
- # and spec_friendly for other dimensions (global profile)
1148
- expect(actual).to be_xml_equivalent_to(
1149
- expected,
1150
- match_options: {
1151
- attribute_whitespace: :strict
1152
- }
1153
- )
1154
- ----
1155
- ====