canon 0.1.8 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +83 -22
  3. data/docs/Gemfile +1 -0
  4. data/docs/_config.yml +90 -1
  5. data/docs/advanced/diff-classification.adoc +196 -24
  6. data/docs/features/match-options/index.adoc +239 -1
  7. data/lib/canon/comparison/format_detector.rb +2 -1
  8. data/lib/canon/comparison/html_comparator.rb +19 -8
  9. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  10. data/lib/canon/comparison/markup_comparator.rb +109 -2
  11. data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
  12. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  13. data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
  14. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +108 -0
  15. data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
  16. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
  17. data/lib/canon/comparison/xml_comparator.rb +240 -23
  18. data/lib/canon/comparison/xml_node_comparison.rb +25 -3
  19. data/lib/canon/diff/diff_classifier.rb +119 -5
  20. data/lib/canon/diff/formatting_detector.rb +1 -1
  21. data/lib/canon/diff/xml_serialization_formatter.rb +153 -0
  22. data/lib/canon/rspec_matchers.rb +37 -8
  23. data/lib/canon/version.rb +1 -1
  24. data/lib/canon/xml/data_model.rb +24 -13
  25. metadata +4 -78
  26. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
  27. data/false_positive_analysis.txt +0 -0
  28. data/file1.html +0 -1
  29. data/file2.html +0 -1
  30. data/old-docs/ADVANCED_TOPICS.adoc +0 -20
  31. data/old-docs/BASIC_USAGE.adoc +0 -16
  32. data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
  33. data/old-docs/CLI.adoc +0 -497
  34. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  35. data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
  36. data/old-docs/DIFF_FORMATTING.adoc +0 -540
  37. data/old-docs/DIFF_PARAMETERS.adoc +0 -261
  38. data/old-docs/DOM_DIFF.adoc +0 -1017
  39. data/old-docs/ENV_CONFIG.adoc +0 -876
  40. data/old-docs/FORMATS.adoc +0 -867
  41. data/old-docs/INPUT_VALIDATION.adoc +0 -477
  42. data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
  43. data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
  44. data/old-docs/MATCH_OPTIONS.adoc +0 -912
  45. data/old-docs/MODES.adoc +0 -432
  46. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  47. data/old-docs/OPTIONS.adoc +0 -1387
  48. data/old-docs/PREPROCESSING.adoc +0 -491
  49. data/old-docs/README.old.adoc +0 -2831
  50. data/old-docs/RSPEC.adoc +0 -814
  51. data/old-docs/RUBY_API.adoc +0 -485
  52. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
  53. data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
  54. data/old-docs/STRING_COMPARE.adoc +0 -345
  55. data/old-docs/TMP.adoc +0 -3384
  56. data/old-docs/TREE_DIFF.adoc +0 -1080
  57. data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
  58. data/old-docs/VERBOSE.adoc +0 -482
  59. data/old-docs/VISUALIZATION_MAP.adoc +0 -625
  60. data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
  61. data/scripts/analyze_current_state.rb +0 -85
  62. data/scripts/analyze_false_positives.rb +0 -114
  63. data/scripts/analyze_remaining_failures.rb +0 -105
  64. data/scripts/compare_current_failures.rb +0 -95
  65. data/scripts/compare_dom_tree_diff.rb +0 -158
  66. data/scripts/compare_failures.rb +0 -151
  67. data/scripts/debug_attribute_extraction.rb +0 -66
  68. data/scripts/debug_blocks_839.rb +0 -115
  69. data/scripts/debug_meta_matching.rb +0 -52
  70. data/scripts/debug_p_matching.rb +0 -192
  71. data/scripts/debug_signature_matching.rb +0 -118
  72. data/scripts/debug_sourcecode_124.rb +0 -32
  73. data/scripts/debug_whitespace_sensitive.rb +0 -192
  74. data/scripts/extract_false_positives.rb +0 -138
  75. data/scripts/find_actual_false_positives.rb +0 -125
  76. data/scripts/investigate_all_false_positives.rb +0 -161
  77. data/scripts/investigate_batch1.rb +0 -127
  78. data/scripts/investigate_classification.rb +0 -150
  79. data/scripts/investigate_classification_detailed.rb +0 -190
  80. data/scripts/investigate_common_failures.rb +0 -342
  81. data/scripts/investigate_false_negative.rb +0 -80
  82. data/scripts/investigate_false_positive.rb +0 -83
  83. data/scripts/investigate_false_positives.rb +0 -227
  84. data/scripts/investigate_false_positives_batch.rb +0 -163
  85. data/scripts/investigate_mixed_content.rb +0 -125
  86. data/scripts/investigate_remaining_16.rb +0 -214
  87. data/scripts/run_single_test.rb +0 -29
  88. data/scripts/test_all_false_positives.rb +0 -95
  89. data/scripts/test_attribute_details.rb +0 -61
  90. data/scripts/test_both_algorithms.rb +0 -49
  91. data/scripts/test_both_simple.rb +0 -49
  92. data/scripts/test_enhanced_semantic_output.rb +0 -125
  93. data/scripts/test_readme_examples.rb +0 -131
  94. data/scripts/test_semantic_tree_diff.rb +0 -99
  95. data/scripts/test_semantic_ux_improvements.rb +0 -135
  96. data/scripts/test_single_false_positive.rb +0 -119
  97. data/scripts/test_size_limits.rb +0 -99
  98. data/test_html_1.html +0 -21
  99. data/test_html_2.html +0 -21
  100. data/test_nokogiri.rb +0 -33
  101. data/test_normalize.rb +0 -45
@@ -1,1017 +0,0 @@
1
- ---
2
- layout: default
3
- title: DOM Diff
4
- nav_order: 42
5
- parent: Advanced Topics
6
- ---
7
- = DOM Diff Architecture
8
- :toc:
9
- :toclevels: 3
10
-
11
- == General
12
-
13
- Canon implements an advanced DOM (Document Object Model) diff algorithm as its default comparison method. Unlike simple text-based diff tools, Canon's DOM diff operates on the semantic structure of documents, providing precise, context-aware comparisons that respect the document's hierarchical nature.
14
-
15
- The DOM diff architecture is built on a sophisticated multi-layer pipeline that separates semantic comparison from textual representation, enabling features like normative/informative classification, intelligent grouping, and format-aware rendering.
16
-
17
- == When to use DOM diff
18
-
19
- DOM diff is Canon's **default algorithm** and is automatically used when you compare documents without explicitly specifying `diff_algorithm: :semantic`.
20
-
21
- DOM diff is ideal for:
22
-
23
- * **Standard document comparisons** where you need to see line-by-line differences
24
- * **Format-preserving comparisons** that respect the original document structure
25
- * **Normative vs informative analysis** of document changes
26
- * **Fine-grained control** over what constitutes a meaningful difference
27
- * **All document formats** (XML, HTML, JSON, YAML)
28
-
29
- For semantic tree-level operations (MERGE, SPLIT, UPGRADE, DOWNGRADE), see link:TREE_DIFF.adoc[Semantic Tree Diff].
30
-
31
- == Core architecture
32
-
33
- === Architectural layers
34
-
35
- Canon's DOM diff follows a strict separation of concerns across six distinct layers:
36
-
37
- [source]
38
- ----
39
- ┌─────────────────────────────────────────┐
40
- │ Layer 1: Comparison │
41
- │ Creates semantic DiffNodes │
42
- └─────────────────┬───────────────────────┘
43
-
44
- ┌─────────────────────────────────────────┐
45
- │ Layer 2: Mapping │
46
- │ Maps DiffNodes → DiffLines │
47
- └─────────────────┬───────────────────────┘
48
-
49
- ┌─────────────────────────────────────────┐
50
- │ Layer 3: Blocking │
51
- │ Groups DiffLines → DiffBlocks │
52
- └─────────────────┬───────────────────────┘
53
-
54
- ┌─────────────────────────────────────────┐
55
- │ Layer 4: Contexting │
56
- │ Adds context → DiffContexts │
57
- └─────────────────┬───────────────────────┘
58
-
59
- ┌─────────────────────────────────────────┐
60
- │ Layer 5: Reporting │
61
- │ Wraps in DiffReport │
62
- └─────────────────┬───────────────────────┘
63
-
64
- ┌─────────────────────────────────────────┐
65
- │ Layer 6: Formatting │
66
- │ Renders to human-readable output │
67
- └─────────────────────────────────────────┘
68
- ----
69
-
70
- Each layer has a single, well-defined responsibility and operates independently of the others. This separation enables powerful features like filtering normative-only differences or rendering the same comparison in multiple formats.
71
-
72
- === Key design principles
73
-
74
- **Separation of concerns**::
75
- Business logic (comparison, mapping, blocking) is completely separated from presentation (formatting). Formatters receive structured data and only handle rendering.
76
-
77
- **Information expert**::
78
- Each object knows about its own data: `DiffNode` knows whether it is normative, `DiffLine` inherits that status from its `DiffNode`, and `DiffBlock` aggregates it from its lines.
79
-
80
- **Single responsibility**::
81
- Each class does exactly one thing: Comparator compares, Mapper maps, BlockBuilder groups, Formatter renders.
82
-
83
- **Tell, don't ask**::
84
- Objects expose behavior (`normative?`) rather than raw data, encapsulating decision-making logic within the appropriate class.
85
-
86
- == Core classes
87
-
88
- === DiffNode
89
-
90
- Represents a **semantic difference** between two DOM nodes. Created during the Comparison Layer.
91
-
92
- [source,ruby]
93
- ----
94
- class DiffNode
95
- attr_reader :node1 # Element from first document
96
- attr_reader :node2 # Element from second document
97
- attr_reader :dimension # Match dimension causing difference
98
- attr_reader :reason # Human-readable explanation
99
- attr_accessor :normative # true/false (set by DiffClassifier)
100
- end
101
- ----
102
-
103
- **Purpose**:: Captures **what** differs at the semantic/DOM level, independent of text representation.
104
-
105
- **Key attributes**::
106
- * `node1`, `node2`: The actual DOM elements being compared
107
- * `dimension`: Which match dimension detected the difference (`:text_content`, `:attribute_whitespace`, `:element_structure`, etc.)
108
- * `reason`: Descriptive text explaining the difference (e.g., "7 vs 9", "attribute 'class' differs")
109
- * `normative`: Whether this difference affects semantic equivalence
110
-
111
- **Example**::
112
- [source,ruby]
113
- ----
114
- DiffNode.new(
115
- node1: <Element "div">,
116
- node2: <Element "div">,
117
- dimension: :text_content,
118
- reason: "Text differs: 'Old' vs 'New'"
119
- )
120
- ----
121
-
122
- === DiffLine
123
-
124
- Represents a **single line** in the diff output. Links textual representation to semantic `DiffNode`. Created during the Mapping Layer.
125
-
126
- [source,ruby]
127
- ----
128
- class DiffLine
129
- attr_reader :line_number # Line number in original text
130
- attr_reader :content # Text content of the line
131
- attr_reader :type # :unchanged, :added, :removed, :changed
132
- attr_reader :diff_node # Reference to semantic DiffNode
133
-
134
- def normative? # Inherited from diff_node
135
- def informative? # Opposite of normative?
136
- end
137
- ----
138
-
139
- **Purpose**:: Bridges semantic differences (DiffNodes) to textual representation (lines of text).
140
-
141
- **Key attributes**::
142
- * `line_number`: Position in the original document
143
- * `content`: The actual text of the line
144
- * `type`: Type of change (`:unchanged`, `:added`, `:removed`, `:changed`)
145
- * `diff_node`: Reference to the semantic DiffNode causing this line to differ
146
-
147
- **Example**::
148
- [source,ruby]
149
- ----
150
- DiffLine.new(
151
- line_number: 5,
152
- content: "<p>New text</p>",
153
- type: :changed,
154
- diff_node: diff_node # Links to semantic difference
155
- )
156
- # => normative? returns true if diff_node.normative? is true
157
- ----
158
-
159
- === DiffBlock
160
-
161
- Represents a **contiguous block of changes** in the diff. Groups consecutive DiffLines together. Created during the Blocking Layer.
162
-
163
- [source,ruby]
164
- ----
165
- class DiffBlock
166
- attr_reader :start_idx # Starting line index
167
- attr_reader :end_idx # Ending line index
168
- attr_reader :types # Array of change types ['-', '+', '!']
169
- attr_reader :diff_lines # Array of DiffLine objects
170
- attr_reader :diff_node # DiffNode if all lines from same node
171
- attr_accessor :normative # true if ANY diff_line is normative
172
-
173
- def size # Number of lines in block
174
- def includes_type?(type) # Check if block contains type
175
- end
176
- ----
177
-
178
- **Purpose**:: Groups contiguous changed lines into logical units for display and filtering.
179
-
180
- **Key attributes**::
181
- * `start_idx`, `end_idx`: Line range for this block
182
- * `types`: Types of changes in this block (e.g., `['-', '+']` for removal+addition)
183
- * `diff_lines`: The actual DiffLine objects in this block
184
- * `diff_node`: Reference to DiffNode if all lines belong to the same semantic difference
185
- * `normative`: `true` if **any** line in the block is normative
186
-
187
- **Example**::
188
- [source,ruby]
189
- ----
190
- DiffBlock.new(
191
- start_idx: 10,
192
- end_idx: 12,
193
- types: ['-', '+'],
194
- diff_lines: [removed_line, added_line],
195
- normative: true
196
- )
197
- # => This block represents a normative change spanning lines 10-12
198
- ----
199
-
200
- === DiffContext
201
-
202
- Represents a **context** - a group of DiffBlocks with surrounding context lines. Created during the Contexting Layer.
203
-
204
- [source,ruby]
205
- ----
206
- class DiffContext
207
- attr_reader :start_idx # Start of context (includes context lines)
208
- attr_reader :end_idx # End of context (includes context lines)
209
- attr_reader :blocks # Array of DiffBlock objects
210
- attr_reader :lines # Array of all lines (changes + context)
211
- attr_accessor :normative # true if contains normative blocks
212
-
213
- def size # Total lines in context
214
- def block_count # Number of diff blocks
215
- def gap_to(other) # Distance to another context
216
- def overlaps?(other) # Check if contexts overlap
217
- end
218
- ----
219
-
220
- **Purpose**:: Provides surrounding context for groups of changes, making diffs easier to understand.
221
-
222
- **Key attributes**::
223
- * `start_idx`, `end_idx`: Extended range including context lines before/after
224
- * `blocks`: The DiffBlock objects contained in this context
225
- * `lines`: All lines including context (unchanged lines around changes)
226
- * `normative`: `true` if this context contains any normative blocks
227
-
228
- **Example**::
229
- [source,ruby]
230
- ----
231
- DiffContext.new(
232
- start_idx: 8, # Starts 2 lines before first change
233
- end_idx: 15, # Ends 2 lines after last change
234
- blocks: [block1, block2],
235
- normative: true
236
- )
237
- # => Context spans lines 8-15, includes 2 diff blocks
238
- ----
239
-
240
- == The DOM diff pipeline
241
-
242
- === Layer 1: Comparison
243
-
244
- **Input**:: Two documents (doc1, doc2) + match options
245
-
246
- **Process**:: `XmlComparator.equivalent?(doc1, doc2, options)`
247
-
248
- **Output**:: Array of `DiffNode` objects
249
-
250
- **How it works**::
251
-
252
- 1. Parse both documents into DOM trees
253
- 2. Compare nodes recursively according to match dimensions
254
- 3. For each difference found, create a `DiffNode`
255
- 4. `DiffClassifier` sets `normative` based on match dimension behavior
256
- 5. Return array of classified `DiffNode` objects
257
-
258
- **Example**::
259
- [source,ruby]
260
- ----
261
- # Input documents
262
- doc1 = "<div><p>Old text</p></div>"
263
- doc2 = "<div><p>New text</p></div>"
264
-
265
- # Comparison creates DiffNode
266
- diff_node = DiffNode.new(
267
- node1: <Element "p">,
268
- node2: <Element "p">,
269
- dimension: :text_content,
270
- reason: "Text differs: 'Old text' vs 'New text'",
271
- normative: true # Set by DiffClassifier
272
- )
273
-
274
- result.diff_nodes
275
- # => [diff_node]
276
- ----
277
-
278
- === Layer 2: Mapping (DiffNodes → DiffLines)
279
-
280
- **Input**:: DiffNode array, text1, text2
281
-
282
- **Process**:: `DiffNodeMapper.map(diff_nodes, text1, text2)`
283
-
284
- **Output**:: Array of `DiffLine` objects
285
-
286
- **How it works**::
287
-
288
- 1. Run text-based diff (using `Diff::LCS`) on the serialized text
289
- 2. For each changed line, find the corresponding `DiffNode`
290
- 3. Create `DiffLine` linking the line to its semantic `DiffNode`
291
- 4. Inherit `normative` status from the linked `DiffNode`
292
-
293
- **Example**::
294
- [source,ruby]
295
- ----
296
- # From the DiffNode above, create DiffLines
297
- diff_lines = [
298
- DiffLine.new(
299
- line_number: 1,
300
- content: "<div>",
301
- type: :unchanged,
302
- diff_node: nil # No semantic difference for this line
303
- ),
304
- DiffLine.new(
305
- line_number: 2,
306
- content: "<p>Old text</p>",
307
- type: :removed,
308
- diff_node: diff_node # Links to semantic difference
309
- ),
310
- DiffLine.new(
311
- line_number: 2,
312
- content: "<p>New text</p>",
313
- type: :added,
314
- diff_node: diff_node # Same DiffNode
315
- )
316
- ]
317
- ----
318
-
319
- **Why this matters**:: The link between `DiffLine` and `DiffNode` preserves semantic information throughout the pipeline. A formatter can ask `diff_line.normative?` and get the answer from the original semantic comparison.
320
-
321
- === Layer 3: Blocking (DiffLines → DiffBlocks)
322
-
323
- **Input**:: DiffLine array, `show_diffs` option
324
-
325
- **Process**:: `DiffBlockBuilder.build_blocks(diff_lines, show_diffs)`
326
-
327
- **Output**:: Array of `DiffBlock` objects
328
-
329
- **How it works**::
330
-
331
- 1. Identify runs of consecutive changed lines
332
- 2. Group each run into a `DiffBlock`
333
- 3. Set `block.normative` based on constituent lines
334
- 4. Filter blocks according to `show_diffs` option
335
-
336
- **Filtering options**::
337
- * `show_diffs: :normative` - Keep only blocks with `normative? == true`
338
- * `show_diffs: :informative` - Keep only blocks with `normative? == false`
339
- * `show_diffs: :all` - Keep all blocks (default)
340
-
341
- **Example**::
342
- [source,ruby]
343
- ----
344
- # From DiffLines above
345
- diff_block = DiffBlock.new(
346
- start_idx: 1,
347
- end_idx: 2,
348
- types: ['-', '+'],
349
- diff_lines: [removed_line, added_line],
350
- normative: true # Any line is normative → block is normative
351
- )
352
-
353
- # With show_diffs: :normative
354
- # => This block is kept (normative? == true)
355
-
356
- # With show_diffs: :informative
357
- # => This block is filtered out
358
- ----
359
-
360
- === Layer 4: Contexting (DiffBlocks → DiffContexts)
361
-
362
- **Input**:: DiffBlock array, `context_lines`, `grouping_lines` options
363
-
364
- **Process**:: `DiffContextBuilder.build_contexts(blocks, options)`
365
-
366
- **Output**:: Array of `DiffContext` objects
367
-
368
- **How it works**::
369
-
370
- 1. **Group nearby blocks**: Blocks within `grouping_lines` of each other are grouped together
371
- 2. **Expand with context**: Add `context_lines` unchanged lines before and after each group
372
- 3. **Create contexts**: Wrap each group in a `DiffContext` object
373
-
374
- **Example**::
375
- [source,ruby]
376
- ----
377
- # Options
378
- context_lines = 3 # Show 3 lines before/after
379
- grouping_lines = 5 # Group blocks within 5 lines
380
-
381
- # If two blocks are 4 lines apart, they're grouped into one context
382
- # If they're 6 lines apart, they become separate contexts
383
-
384
- diff_context = DiffContext.new(
385
- start_idx: 0, # Includes 3 context lines before
386
- end_idx: 8, # Includes 3 context lines after
387
- blocks: [block1, block2],
388
- normative: true
389
- )
390
- ----
391
-
392
- === Layer 5: Reporting (Wrap in DiffReport)
393
-
394
- **Input**:: DiffNode array, text1, text2 + metadata (the DiffContext array is produced internally by layers 2-4)
395
-
396
- **Process**:: `DiffReportBuilder.build(diff_nodes, text1, text2, opts)`
397
-
398
- **Output**:: `DiffReport` object
399
-
400
- **How it works**::
401
-
402
- 1. Orchestrate layers 2-4 (Mapping → Blocking → Contexting)
403
- 2. Collect metadata (file names, element name, etc.)
404
- 3. Wrap everything in a `DiffReport` object
405
-
406
- **Example**::
407
- [source,ruby]
408
- ----
409
- diff_report = DiffReport.new(
410
- element_name: "document",
411
- file1_name: "old.xml",
412
- file2_name: "new.xml",
413
- contexts: [context1, context2],
414
- has_differences: true
415
- )
416
- ----
417
-
418
- === Layer 6: Formatting (Render to string)
419
-
420
- **Input**:: `DiffReport` object
421
-
422
- **Process**:: `Formatter.format(diff_report)`
423
-
424
- **Output**:: Formatted string
425
-
426
- **How it works**::
427
-
428
- 1. Iterate through contexts in the report
429
- 2. Render each context with appropriate visualization
430
- 3. Apply colors, line numbers, and symbols
431
- 4. Return formatted output
432
-
433
- **Example**::
434
- [source,ruby]
435
- ----
436
- # Formatter receives structured DiffReport
437
- # NO comparison, NO filtering, NO business logic
438
- # ONLY rendering
439
-
440
- formatted = ByLine::XmlFormatter.format(diff_report)
441
- # =>
442
- # 1 | <div>
443
- # 2-| <p>Old text</p>
444
- # +| <p>New text</p>
445
- # 3 | </div>
446
- ----
447
-
448
- == Matching vs formatting stages
449
-
450
- A key architectural feature of Canon's DOM diff is the **complete separation** between matching (semantic comparison) and formatting (visual representation).
451
-
452
- === Matching stage (Layers 1-2)
453
-
454
- The matching stage determines **what** differs:
455
-
456
- 1. **Semantic comparison**: Compare DOM nodes according to match dimensions
457
- 2. **Classification**: Classify differences as normative or informative
458
- 3. **Mapping**: Link semantic differences to text line positions
459
-
460
- **Output**: Structured data about differences (`DiffNode[]` → `DiffLine[]`)
461
-
462
- **No formatting**: The matching stage knows nothing about colors, symbols, or visual representation
463
-
464
- === Formatting stage (Layers 3-6)
465
-
466
- The formatting stage determines **how** to display differences:
467
-
468
- 1. **Grouping**: Group lines into blocks, blocks into contexts
469
- 2. **Filtering**: Apply `show_diffs` to filter normative/informative
470
- 3. **Rendering**: Apply visual representation (colors, line numbers, symbols)
471
-
472
- **Input**: Structured data from matching stage
473
-
474
- **No comparison**: The formatting stage never compares documents or makes semantic decisions
475
-
476
- === Why this matters
477
-
478
- **Flexibility**::
479
- The same comparison result can be formatted in multiple ways without re-running the comparison.
480
-
481
- [source,ruby]
482
- ----
483
- # Compare once
484
- result = Canon.compare(doc1, doc2, format: :xml)
485
-
486
- # Format multiple ways
487
- puts Canon::DiffFormatter::ByLine::XmlFormatter.format(result.report)
488
- puts Canon::DiffFormatter::ByObject::XmlFormatter.format(result.report)
489
-
490
- # No need to re-compare!
491
- ----
492
-
493
- **Testability**::
494
- Each stage can be tested independently: matchers test semantic comparison, while formatters test visual output.
495
-
496
- **Maintainability**::
497
- Changes to comparison logic don't affect formatting, and vice versa.
498
-
499
- == Normative vs informative classification
500
-
501
- One of Canon's most powerful features is the ability to classify differences as **normative** (semantically significant) or **informative** (cosmetic).
502
-
503
- === How classification works
504
-
505
- Classification happens in Layer 1 (Comparison) via `DiffClassifier`:
506
-
507
- [source,ruby]
508
- ----
509
- # For each DiffNode
510
- dimension = diff_node.dimension # e.g., :attribute_whitespace
511
- behavior = match_options[dimension] # e.g., :ignore
512
-
513
- diff_node.normative = (behavior != :ignore)
514
- ----
515
-
516
- **Logic**::
517
- * If the match dimension is set to `:ignore` → `normative = false` (informative only)
518
- * Otherwise → `normative = true` (normative difference)
519
-
520
- === Propagation through layers
521
-
522
- The `normative` flag propagates through the pipeline:
523
-
524
- [source]
525
- ----
526
- DiffNode.normative?
527
-
528
- DiffLine.normative? (inherits from diff_node)
529
-
530
- DiffBlock.normative? (true if ANY line is normative)
531
-
532
- DiffContext.normative? (true if ANY block is normative)
533
- ----
534
-
535
- === Filtering by classification
536
-
537
- Use `show_diffs` to filter based on classification:
538
-
539
- [source,ruby]
540
- ----
541
- # Show only normative differences
542
- Canon.compare(doc1, doc2,
543
- match: { attribute_whitespace: :ignore },
544
- show_diffs: :normative
545
- )
546
- # => Hides attribute whitespace changes
547
-
548
- # Show only informative differences
549
- Canon.compare(doc1, doc2,
550
- show_diffs: :informative
551
- )
552
- # => Shows only cosmetic changes
553
-
554
- # Show all differences
555
- Canon.compare(doc1, doc2,
556
- show_diffs: :all
557
- )
558
- # => Shows everything (default)
559
- ----
560
-
561
- == Comparison with semantic tree diff
562
-
563
- Canon provides two diff algorithms:
564
-
565
- [cols="1,2,2"]
566
- |===
567
- | Feature | DOM Diff (Default) | Semantic Tree Diff
568
-
569
- | **Algorithm**
570
- | Line-based diff with semantic awareness
571
- | Tree-edit distance with operation detection
572
-
573
- | **Output**
574
- | Line-by-line differences
575
- | High-level operations (MERGE, SPLIT, etc.)
576
-
577
- | **Granularity**
578
- | Fine-grained (every line)
579
- | Coarse-grained (tree structures)
580
-
581
- | **Use case**
582
- | Standard comparisons, detailed analysis
583
- | Semantic refactoring analysis
584
-
585
- | **Performance**
586
- | Fast for most documents
587
- | Slower for large trees
588
-
589
- | **API**
590
- | `Canon.compare(doc1, doc2)` (default)
591
- | `Canon.compare(doc1, doc2, diff_algorithm: :semantic)`
592
-
593
- | **Classes**
594
- | DiffNode, DiffLine, DiffBlock, DiffContext
595
- | TreeNode, Operation, NodeSignature
596
- |===
597
-
598
- **When to use DOM diff**::
599
- * For regular document comparisons
600
- * When you need line-level details
601
- * When performance is critical
602
- * For all document formats
603
-
604
- **When to use semantic tree diff**::
605
- * For analyzing structural refactoring
606
- * When you need operation-level insights (MERGE, SPLIT)
607
- * For detecting semantic patterns
608
- * See link:TREE_DIFF.adoc[Semantic Tree Diff] for details
609
-
610
- == Examples
611
-
612
- === Basic DOM diff
613
-
614
- [source,ruby]
615
- ----
616
- require 'canon'
617
-
618
- doc1 = <<~XML
619
- <doc>
620
- <p>Hello</p>
621
- </doc>
622
- XML
623
-
624
- doc2 = <<~XML
625
- <doc>
626
- <p>World</p>
627
- </doc>
628
- XML
629
-
630
- result = Canon.compare(doc1, doc2, format: :xml)
631
- puts result.diff
632
- # =>
633
- # 1 | <doc>
634
- # 2-| <p>Hello</p>
635
- # +| <p>World</p>
636
- # 3 | </doc>
637
- ----
638
-
639
- === Normative-only differences
640
-
641
- [source,ruby]
642
- ----
643
- doc1 = '<div class="foo" id="1">Text</div>'
644
- doc2 = '<div id="1" class="foo">Text</div>'
645
-
646
- # Attribute order is ignored by default
647
- result = Canon.compare(doc1, doc2,
648
- format: :xml,
649
- show_diffs: :normative
650
- )
651
-
652
- puts result.diff
653
- # => (empty - no normative differences)
654
- ----
655
-
656
- === Informative-only differences
657
-
658
- [source,ruby]
659
- ----
660
- doc1 = '<p>Text</p>'
661
- doc2 = '<p> Text </p>' # Extra whitespace
662
-
663
- result = Canon.compare(doc1, doc2,
664
- format: :xml,
665
- match: { text_whitespace: :ignore },
666
- show_diffs: :informative
667
- )
668
-
669
- puts result.diff
670
- # => Shows the whitespace difference (informative only)
671
- ----
672
-
673
- === Accessing structured data
674
-
675
- [source,ruby]
676
- ----
677
- result = Canon.compare(doc1, doc2, format: :xml)
678
-
679
- # Access DiffNodes
680
- result.diff_nodes.each do |node|
681
- puts "#{node.dimension}: #{node.reason}"
682
- puts "Normative: #{node.normative?}"
683
- end
684
-
685
- # Access DiffReport
686
- report = result.report
687
- puts "Total contexts: #{report.contexts.length}"
688
- puts "Has differences: #{report.has_differences?}"
689
-
690
- # Access DiffContexts
691
- report.contexts.each do |context|
692
- puts "Context lines #{context.start_idx}-#{context.end_idx}"
693
- puts "Normative: #{context.normative?}"
694
- end
695
- ----
696
-
697
- == Implementation details
698
-
699
- === Element name matching
700
-
701
- `DiffNodeMapper` links `DiffLine` objects to `DiffNode` objects by matching element names:
702
-
703
- [source,ruby]
704
- ----
705
- # Extract element name from line
706
- line = "<bibitem id='123'>"
707
- element_name = extract_element_name(line)
708
- # => "bibitem"
709
-
710
- # Find DiffNode with matching element
711
- diff_node = diff_nodes.find do |node|
712
- node.node1.name == element_name ||
713
- node.node2.name == element_name
714
- end
715
-
716
- # Create DiffLine linked to DiffNode
717
- DiffLine.new(
718
- content: line,
719
- diff_node: diff_node # Linked!
720
- )
721
- ----
722
-
723
- This ensures that each changed line is correctly associated with its semantic difference.
724
-
725
- === Block grouping
726
-
727
- `DiffBlockBuilder` groups consecutive changed lines:
728
-
729
- [source,ruby]
730
- ----
731
- # Lines (indexed from 0): U, U, C, C, C, U, C, U
732
- # Blocks: [C, C, C], [C]
733
- #
734
- # Block 1: lines 2-4 (3 consecutive changes)
735
- # Block 2: line 6 (1 change)
736
- ----
737
-
738
- === Context expansion
739
-
740
- `DiffContextBuilder` expands blocks with surrounding lines:
741
-
742
- [source,ruby]
743
- ----
744
- # Block at lines 10-12
745
- # context_lines = 3
746
- # → Context spans lines 7-15
747
- #
748
- # Lines 7-9: Context before
749
- # Lines 10-12: Actual changes
750
- # Lines 13-15: Context after
751
- ----
752
-
753
- === Context merging
754
-
755
- Nearby blocks are merged into a single context:
756
-
757
- [source,ruby]
758
- ----
759
- # Block 1 at lines 10-12
760
- # Block 2 at lines 18-20
761
- # Gap = 5 lines
762
- # grouping_lines = 5
763
- # → Both blocks in same context (gap ≤ grouping_lines)
764
-
765
- # Block 1 at lines 10-12
766
- # Block 2 at lines 25-27
767
- # Gap = 12 lines
768
- # grouping_lines = 5
769
- # → Separate contexts (gap > grouping_lines)
770
- ----
771
-
772
- == Real-world examples
773
-
774
- These examples demonstrate Canon's DOM diff output using actual test cases from IsoDoc specs. The examples use color highlighting to show different types of changes.
775
-
776
- === Understanding diff symbols
777
-
778
- Canon uses a dual-column line number format with specific symbols and colors to indicate different types of changes.
779
-
780
- **Line number format:**
781
-
782
- Canon displays two line numbers for each line:
783
- ```
784
- oldnum|newnum | content
785
- ```
786
-
787
- For example:
788
- ```
789
- 1| 1 | <div> # Line 1 in both files (unchanged)
790
- 2| - | <p>Old</p> # Line 2 in file 1, removed
791
- | 2+ | <p>New</p> # Line 2 in file 2, added
792
- 3| 3 | </div> # Line 3 in both files (unchanged)
793
- ```
794
-
795
- **Line-level symbols:**
796
-
797
- `-` (in new column):: Removed line - old file has this line, new file doesn't
798
- `+` (in new column):: Added line - new file has this line, old file doesn't
799
- `!` (in new column):: Changed line (character-level diff within the line)
800
- `~` (in new column):: Informative change (cosmetic difference)
801
- (blank):: Unchanged line (context)
802
-
803
- **Character visualization:**
804
-
805
- Canon visualizes invisible characters to make differences clear. The most common visualizations are:
806
-
807
- * Regular space (U+0020): `░` (light shade)
808
- * Non-breaking space (U+00A0): `␣` (open box)
809
- * Tab (U+0009): `⇥` (rightwards arrow to bar)
810
-
811
- For complete character visualization mappings, see link:CHARACTER_VISUALIZATION.adoc[Character Visualization].
812
-
813
- **Character-level highlighting:**
814
-
815
- For lines marked with `!` (changed), Canon highlights specific character ranges:
816
-
817
- * Deleted characters: red text
818
- * Added characters: green text
819
- * Unchanged characters: normal color
820
-
821
- **Normative vs informative:**
822
-
823
- * **Normative differences** (red/green) affect semantic equivalence
824
- * **Informative differences** (cyan) are cosmetic only
825
-
826
- === Example 1: Normative text content change
827
-
828
- This example shows a simple text content change from IsoDoc source code tests.
829
-
830
- **Input documents:**
831
-
832
- [source,xml]
833
- ----
834
- <!-- Document 1 -->
835
- <sourcecode>
836
- <body>puts "Hello, world."</body>
837
- </sourcecode>
838
-
839
- <!-- Document 2 -->
840
- <sourcecode>
841
- <body>puts "Goodbye, world."</body>
842
- </sourcecode>
843
- ----
844
-
845
- **Diff output:**
846
-
847
- ++++
848
- <div style="font-family: 'Courier New', monospace; background: #f5f5f5; padding: 15px; border: 1px solid #ddd; margin: 10px 0;">
849
- <div><span style="color: #cccc00;"> 1| 1 |</span> <span style="color: #999;">&lt;sourcecode&gt;</span></div>
850
- <div><span style="color: #cccc00;"> 2| - |</span> <span style="color: #cc0000;"> &lt;body&gt;puts░"Hello,░world."&lt;/body&gt;</span></div>
851
- <div><span style="color: #cccc00;"> | 2+ |</span> <span style="color: #00cc00;"> &lt;body&gt;puts░"Goodbye,░world."&lt;/body&gt;</span></div>
852
- <div><span style="color: #cccc00;"> 3| 3 |</span> <span style="color: #999;">&lt;/sourcecode&gt;</span></div>
854
- </div>
855
- ++++
856
-
857
- This is a **normative difference** - the text content actually changed, affecting semantic equivalence.
858
-
859
- === Example 2: Character-level change
860
-
861
- When only part of a line changes, Canon uses the `!` symbol and highlights specific character ranges.
862
-
863
- **Input documents:**
864
-
865
- [source,xml]
866
- ----
867
- <!-- Document 1 -->
868
- <p>Hello, world</p>
869
-
870
- <!-- Document 2 -->
871
- <p>Hello there, world</p>
872
- ----
873
-
874
- **Diff output:**
875
-
876
- ++++
877
- <div style="font-family: 'Courier New', monospace; background: #f5f5f5; padding: 15px; border: 1px solid #ddd; margin: 10px 0;">
878
- <div><span style="color: #cccc00;">1!|</span> &lt;p&gt;Hello<span style="color: #00cc00;"> there</span>, world&lt;/p&gt;</div>
879
- </div>
880
- ++++
881
-
882
- The `!` indicates a changed line, with:
883
-
884
- * `[<p>Hello]` and `[, world</p>]` unchanged (normal color)
885
- * `[ there]` added (green text)
886
-
887
- === Example 3: Informative attribute order
888
-
889
- This example shows how attribute reordering is treated as informative (cosmetic) by default.
890
-
891
- **Input documents:**
892
-
893
- [source,xml]
894
- ----
895
- <!-- Document 1 -->
896
- <div class="TOC" id="_toc">Content</div>
897
-
898
- <!-- Document 2 -->
899
- <div id="_toc" class="TOC">Content</div>
900
- ----
901
-
902
- **With default settings:**
903
-
904
- No diff shown - attribute order is normalized automatically, so documents are considered equivalent.
905
-
906
- **With `show_diffs: :informative`:**
907
-
908
- ++++
909
- <div style="font-family: 'Courier New', monospace; background: #f5f5f5; padding: 15px; border: 1px solid #ddd; margin: 10px 0;">
910
- <div><span style="color: #cccc00;"> 1| ~ |</span> <span style="color: #00cccc;">&lt;div░class="TOC"░id="_toc"&gt;Content&lt;/div&gt;</span></div>
911
- <div><span style="color: #cccc00;"> | 1~ |</span> <span style="color: #00cccc;">&lt;div░id="_toc"░class="TOC"&gt;Content&lt;/div&gt;</span></div>
912
- </div>
913
- ++++
914
-
915
- The `~` symbol and cyan color indicate this is an **informative difference** - it doesn't affect semantic equivalence.
916
-
917
- === Example 4: Mixed normative and informative
918
-
919
- This example combines both normative and informative differences, showing how `show_diffs` filters the output.
920
-
921
- **Input documents:**
922
-
923
- [source,xml]
924
- ----
925
- <!-- Document 1 -->
926
- <root>
927
- <p>Old text</p>
928
- <div class="x" id="1">Same content</div>
929
- </root>
930
-
931
- <!-- Document 2 -->
932
- <root>
933
- <p>New text</p>
934
- <div id="1" class="x">Same content</div>
935
- </root>
936
- ----
937
-
938
- **With `show_diffs: :all` (show everything):**
939
-
940
- ++++
941
- <div style="font-family: 'Courier New', monospace; background: #f5f5f5; padding: 15px; border: 1px solid #ddd; margin: 10px 0;">
942
- <div><span style="color: #cccc00;"> 1| 1 |</span> <span style="color: #999;">&lt;root&gt;</span></div>
943
- <div><span style="color: #cccc00;"> 2| - |</span> <span style="color: #cc0000;"> &lt;p&gt;Old░text&lt;/p&gt;</span></div>
944
- <div><span style="color: #cccc00;"> | 2+ |</span> <span style="color: #00cc00;"> &lt;p&gt;New░text&lt;/p&gt;</span></div>
945
- <div><span style="color: #cccc00;"> 3| ~ |</span> <span style="color: #00cccc;"> &lt;div░class="x"░id="1"&gt;Same░content&lt;/div&gt;</span></div>
946
- <div><span style="color: #cccc00;"> | 3~ |</span> <span style="color: #00cccc;"> &lt;div░id="1"░class="x"&gt;Same░content&lt;/div&gt;</span></div>
947
- <div><span style="color: #cccc00;"> 4| 4 |</span> <span style="color: #999;">&lt;/root&gt;</span></div>
948
- </div>
949
- ++++
950
-
951
- **With `show_diffs: :normative` (only semantic changes):**
952
-
953
- ++++
954
- <div style="font-family: 'Courier New', monospace; background: #f5f5f5; padding: 15px; border: 1px solid #ddd; margin: 10px 0;">
955
- <div><span style="color: #cccc00;"> 1| 1 |</span> <span style="color: #999;">&lt;root&gt;</span></div>
956
- <div><span style="color: #cccc00;"> 2| - |</span> <span style="color: #cc0000;"> &lt;p&gt;Old░text&lt;/p&gt;</span></div>
957
- <div><span style="color: #cccc00;"> | 2+ |</span> <span style="color: #00cc00;"> &lt;p&gt;New░text&lt;/p&gt;</span></div>
958
- <div><span style="color: #cccc00;"> 3| 3 |</span> <span style="color: #999;"> &lt;div░id="1"░class="x"&gt;Same░content&lt;/div&gt;</span></div>
959
- <div><span style="color: #cccc00;"> 4| 4 |</span> <span style="color: #999;">&lt;/root&gt;</span></div>
960
- </div>
961
- ++++
962
-
963
- The attribute order difference (line 3) is hidden because it's informative.
964
-
965
- **With `show_diffs: :informative` (only cosmetic changes):**
966
-
967
- ++++
968
- <div style="font-family: 'Courier New', monospace; background: #f5f5f5; padding: 15px; border: 1px solid #ddd; margin: 10px 0;">
969
- <div><span style="color: #cccc00;"> 1| 1 |</span> <span style="color: #999;">&lt;root&gt;</span></div>
970
- <div><span style="color: #cccc00;"> 2| 2 |</span> <span style="color: #999;"> &lt;p&gt;New░text&lt;/p&gt;</span></div>
971
- <div><span style="color: #cccc00;"> 3| ~ |</span> <span style="color: #00cccc;"> &lt;div░class="x"░id="1"&gt;Same░content&lt;/div&gt;</span></div>
972
- <div><span style="color: #cccc00;"> | 3~ |</span> <span style="color: #00cccc;"> &lt;div░id="1"░class="x"&gt;Same░content&lt;/div&gt;</span></div>
973
- <div><span style="color: #cccc00;"> 4| 4 |</span> <span style="color: #999;">&lt;/root&gt;</span></div>
974
- </div>
975
- ++++
976
-
977
- The text change (line 2) is hidden because it's normative, not informative.
978
-
979
- === Color reference
980
-
981
- For accessibility, here are the specific colors used:
982
-
983
- [cols="1,2,3"]
984
- |===
985
- | Type | Color | Usage
986
-
987
- | Line numbers/pipes
988
- | Yellow: #cccc00
989
- | All line prefixes (e.g., `1\| 1 \|`, `2\| - \|`, `\| 2+ \|`)
990
-
991
- | Removed (normative)
992
- | Red: #cc0000
993
- | Lines with `-` symbol
994
-
995
- | Added (normative)
996
- | Green: #00cc00
997
- | Lines with `+` symbol
998
-
999
- | Changed (normative)
1000
- | Red/green text
1001
- | Character ranges in `!` lines
1002
-
1003
- | Informative
1004
- | Cyan: #00cccc
1005
- | Lines with `~` symbol
1006
-
1007
- | Context
1008
- | Gray: #999999
1009
- | Unchanged lines
1010
- |===
1011
-
1012
- == See also
1013
-
1014
- * link:TREE_DIFF.adoc[Semantic Tree Diff] - For tree-level semantic operations
1015
- * link:MATCH_OPTIONS.adoc[Match Options] - For controlling match dimensions
1016
- * link:DIFF_ARCHITECTURE.adoc[Diff Architecture] - For implementation details
1017
- * link:NORMATIVE_INFORMATIVE_DIFFS.adoc[Normative/Informative Diffs] - For classification details