canon 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +163 -67
- data/README.adoc +400 -7
- data/docs/Gemfile +9 -0
- data/docs/INDEX.adoc +99 -182
- data/docs/_config.yml +100 -0
- data/docs/advanced/diff-classification.adoc +547 -0
- data/docs/advanced/diff-pipeline.adoc +358 -0
- data/docs/advanced/index.adoc +214 -0
- data/docs/advanced/semantic-diff-report.adoc +390 -0
- data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
- data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
- data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
- data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
- data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
- data/docs/features/diff-formatting/display-filtering.adoc +472 -0
- data/docs/features/diff-formatting/index.adoc +140 -0
- data/docs/features/environment-configuration/index.adoc +327 -0
- data/docs/features/environment-configuration/override-system.adoc +436 -0
- data/docs/features/environment-configuration/size-limits.adoc +273 -0
- data/docs/features/index.adoc +173 -0
- data/docs/features/input-validation/index.adoc +521 -0
- data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
- data/docs/features/match-options/html-policies.adoc +312 -0
- data/docs/features/match-options/index.adoc +621 -0
- data/docs/getting-started/index.adoc +83 -0
- data/docs/getting-started/quick-start.adoc +76 -0
- data/docs/guides/choosing-configuration.adoc +689 -0
- data/docs/guides/index.adoc +181 -0
- data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
- data/docs/interfaces/index.adoc +101 -0
- data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
- data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
- data/docs/lychee.toml +65 -0
- data/docs/reference/cli-options.adoc +418 -0
- data/docs/reference/environment-variables.adoc +375 -0
- data/docs/reference/index.adoc +204 -0
- data/docs/reference/options-across-interfaces.adoc +417 -0
- data/docs/understanding/algorithms/dom-diff.adoc +389 -0
- data/docs/understanding/algorithms/index.adoc +314 -0
- data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
- data/docs/understanding/architecture.adoc +447 -0
- data/docs/understanding/comparison-pipeline.adoc +317 -0
- data/docs/understanding/formats/html.adoc +380 -0
- data/docs/understanding/formats/index.adoc +261 -0
- data/docs/understanding/formats/json.adoc +390 -0
- data/docs/understanding/formats/xml.adoc +366 -0
- data/docs/understanding/formats/yaml.adoc +504 -0
- data/docs/understanding/index.adoc +130 -0
- data/lib/canon/cli.rb +42 -1
- data/lib/canon/commands/diff_command.rb +108 -23
- data/lib/canon/comparison/compare_profile.rb +101 -0
- data/lib/canon/comparison/comparison_result.rb +41 -2
- data/lib/canon/comparison/html_comparator.rb +292 -71
- data/lib/canon/comparison/html_compare_profile.rb +117 -0
- data/lib/canon/comparison/match_options.rb +42 -4
- data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
- data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
- data/lib/canon/comparison/xml_comparator.rb +695 -91
- data/lib/canon/comparison.rb +207 -2
- data/lib/canon/config/env_provider.rb +71 -0
- data/lib/canon/config/env_schema.rb +58 -0
- data/lib/canon/config/override_resolver.rb +55 -0
- data/lib/canon/config/type_converter.rb +59 -0
- data/lib/canon/config.rb +158 -29
- data/lib/canon/data_model.rb +29 -0
- data/lib/canon/diff/diff_classifier.rb +74 -14
- data/lib/canon/diff/diff_context_builder.rb +41 -0
- data/lib/canon/diff/diff_line.rb +18 -2
- data/lib/canon/diff/diff_node.rb +18 -3
- data/lib/canon/diff/diff_node_mapper.rb +71 -12
- data/lib/canon/diff/formatting_detector.rb +53 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
- data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
- data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
- data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
- data/lib/canon/diff_formatter/debug_output.rb +7 -1
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
- data/lib/canon/diff_formatter/legend.rb +42 -0
- data/lib/canon/diff_formatter.rb +78 -9
- data/lib/canon/errors.rb +56 -0
- data/lib/canon/formatters/html_formatter_base.rb +35 -1
- data/lib/canon/formatters/json_formatter.rb +3 -0
- data/lib/canon/formatters/yaml_formatter.rb +3 -0
- data/lib/canon/html/data_model.rb +229 -0
- data/lib/canon/html.rb +9 -0
- data/lib/canon/options/cli_generator.rb +70 -0
- data/lib/canon/options/registry.rb +234 -0
- data/lib/canon/rspec_matchers.rb +34 -13
- data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
- data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
- data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
- data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
- data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
- data/lib/canon/tree_diff/core/matching.rb +241 -0
- data/lib/canon/tree_diff/core/node_signature.rb +164 -0
- data/lib/canon/tree_diff/core/node_weight.rb +135 -0
- data/lib/canon/tree_diff/core/tree_node.rb +450 -0
- data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
- data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
- data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
- data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
- data/lib/canon/tree_diff/operation_converter.rb +631 -0
- data/lib/canon/tree_diff/operations/operation.rb +92 -0
- data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
- data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
- data/lib/canon/tree_diff.rb +33 -0
- data/lib/canon/validators/json_validator.rb +3 -1
- data/lib/canon/validators/yaml_validator.rb +3 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +22 -23
- data/lib/canon/xml/element_matcher.rb +128 -20
- data/lib/canon/xml/namespace_helper.rb +110 -0
- data/lib/canon.rb +3 -0
- metadata +81 -23
- data/_config.yml +0 -116
- data/docs/ADVANCED_TOPICS.adoc +0 -20
- data/docs/BASIC_USAGE.adoc +0 -16
- data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/docs/DIFF_FORMATTING.adoc +0 -540
- data/docs/FORMATS.adoc +0 -447
- data/docs/INPUT_VALIDATION.adoc +0 -477
- data/docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/docs/MATCH_OPTIONS.adoc +0 -719
- data/docs/MODES.adoc +0 -432
- data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/docs/OPTIONS.adoc +0 -1387
- data/docs/PREPROCESSING.adoc +0 -491
- data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
- data/docs/UNDERSTANDING_CANON.adoc +0 -17
|
@@ -0,0 +1,389 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: DOM Algorithm
|
|
3
|
+
parent: Algorithms
|
|
4
|
+
grand_parent: Understanding
|
|
5
|
+
nav_order: 1
|
|
6
|
+
---
|
|
7
|
+
= DOM Algorithm
|
|
8
|
+
:toc:
|
|
9
|
+
:toclevels: 3
|
|
10
|
+
|
|
11
|
+
== Purpose
|
|
12
|
+
|
|
13
|
+
The DOM (Document Object Model) algorithm is Canon's **default, stable algorithm** for document comparison. It provides fast, position-based comparison with traditional diff output.
|
|
14
|
+
|
|
15
|
+
This page explains when to use the DOM algorithm, how it works at a high level, and how to configure it effectively.
|
|
16
|
+
|
|
17
|
+
== When to Use
|
|
18
|
+
|
|
19
|
+
The DOM algorithm is Canon's **recommended algorithm for production use**.
|
|
20
|
+
|
|
21
|
+
=== Use DOM Algorithm When
|
|
22
|
+
|
|
23
|
+
* ✓ You need **stable, well-tested comparison** for production environments
|
|
24
|
+
* ✓ You want **traditional line-by-line diff output**
|
|
25
|
+
* ✓ Documents are **similar in structure** with minimal rearrangement
|
|
26
|
+
* ✓ You need **maximum performance** for large documents (> 10KB)
|
|
27
|
+
* ✓ You want **consistent, predictable behavior**
|
|
28
|
+
* ✓ You're comparing **any document format** (XML, HTML, JSON, YAML)
|
|
29
|
+
|
|
30
|
+
=== Characteristics
|
|
31
|
+
|
|
32
|
+
[cols="2,3"]
|
|
33
|
+
|===
|
|
34
|
+
|Feature |DOM Algorithm
|
|
35
|
+
|
|
36
|
+
|**Status**
|
|
37
|
+
|Stable, production-ready
|
|
38
|
+
|
|
39
|
+
|**Performance**
|
|
40
|
+
|Fast - O(n) linear time
|
|
41
|
+
|
|
42
|
+
|**Memory Usage**
|
|
43
|
+
|Low - line-by-line processing
|
|
44
|
+
|
|
45
|
+
|**Matching Strategy**
|
|
46
|
+
|Position-based element comparison
|
|
47
|
+
|
|
48
|
+
|**Move Detection**
|
|
49
|
+
|No (moves shown as DELETE + INSERT)
|
|
50
|
+
|
|
51
|
+
|**Output Format**
|
|
52
|
+
|Line-based differences
|
|
53
|
+
|
|
54
|
+
|**Best For**
|
|
55
|
+
|Similar documents, traditional diffs
|
|
56
|
+
|
|
57
|
+
|**Document Size**
|
|
58
|
+
|Handles large documents well (> 100KB)
|
|
59
|
+
|===
|
|
60
|
+
|
|
61
|
+
== How It Works
|
|
62
|
+
|
|
63
|
+
The DOM algorithm compares documents in a straightforward manner:
|
|
64
|
+
|
|
65
|
+
=== High-Level Process
|
|
66
|
+
|
|
67
|
+
1. **Parse Documents** - Convert both documents to DOM trees
|
|
68
|
+
2. **Compare Positions** - Compare elements at each position in the tree
|
|
69
|
+
3. **Apply Match Options** - Use configured dimensions for comparison
|
|
70
|
+
4. **Generate Differences** - Create line-based diff output
|
|
71
|
+
|
|
72
|
+
=== Position-Based Matching
|
|
73
|
+
|
|
74
|
+
The DOM algorithm compares elements **by their position** in the document:
|
|
75
|
+
|
|
76
|
+
.Position-based comparison example
|
|
77
|
+
[example]
|
|
78
|
+
====
|
|
79
|
+
[source,xml]
|
|
80
|
+
----
|
|
81
|
+
<!-- Document 1 -->
|
|
82
|
+
<book>
|
|
83
|
+
<title>Canon Guide</title>
|
|
84
|
+
<author>Alice</author>
|
|
85
|
+
</book>
|
|
86
|
+
|
|
87
|
+
<!-- Document 2 -->
|
|
88
|
+
<book>
|
|
89
|
+
<title>Canon Guide</title>
|
|
90
|
+
<author>Bob</author>
|
|
91
|
+
</book>
|
|
92
|
+
----
|
|
93
|
+
|
|
94
|
+
The algorithm compares:
|
|
95
|
+
* Position 1: `<book>` vs `<book>` ✓ Match
|
|
96
|
+
* Position 2: `<title>Canon Guide</title>` vs `<title>Canon Guide</title>` ✓ Match
|
|
97
|
+
* Position 3: `<author>Alice</author>` vs `<author>Bob</author>` ✗ Different
|
|
98
|
+
|
|
99
|
+
Result: One difference detected (author name changed)
|
|
100
|
+
====
|
|
101
|
+
|
|
102
|
+
=== No Move Detection
|
|
103
|
+
|
|
104
|
+
If elements are reordered, the DOM algorithm shows this as deletions and additions:
|
|
105
|
+
|
|
106
|
+
.Reordered elements shown as changes
|
|
107
|
+
[example]
|
|
108
|
+
====
|
|
109
|
+
[source,xml]
|
|
110
|
+
----
|
|
111
|
+
<!-- Document 1 -->
|
|
112
|
+
<book>
|
|
113
|
+
<title>Canon</title>
|
|
114
|
+
<author>Alice</author>
|
|
115
|
+
</book>
|
|
116
|
+
|
|
117
|
+
<!-- Document 2 -->
|
|
118
|
+
<book>
|
|
119
|
+
<author>Alice</author>
|
|
120
|
+
<title>Canon</title>
|
|
121
|
+
</book>
|
|
122
|
+
----
|
|
123
|
+
|
|
124
|
+
DOM algorithm output:
|
|
125
|
+
```
|
|
126
|
+
- <title>Canon</title>
|
|
127
|
+
+ <author>Alice</author>
|
|
128
|
+
- <author>Alice</author>
|
|
129
|
+
+ <title>Canon</title>
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
The same content appears as both deleted and added because positions changed.
|
|
133
|
+
====
|
|
134
|
+
|
|
135
|
+
== Configuration
|
|
136
|
+
|
|
137
|
+
=== Basic Usage
|
|
138
|
+
|
|
139
|
+
**Ruby API**:
|
|
140
|
+
[source,ruby]
|
|
141
|
+
----
|
|
142
|
+
# DOM algorithm is the default
|
|
143
|
+
Canon::Comparison.equivalent?(doc1, doc2)
|
|
144
|
+
|
|
145
|
+
# Or explicitly specify
|
|
146
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
147
|
+
diff_algorithm: :dom
|
|
148
|
+
)
|
|
149
|
+
----
|
|
150
|
+
|
|
151
|
+
**CLI**:
|
|
152
|
+
[source,bash]
|
|
153
|
+
----
|
|
154
|
+
# DOM algorithm is the default
|
|
155
|
+
canon diff file1.xml file2.xml
|
|
156
|
+
|
|
157
|
+
# Or explicitly specify
|
|
158
|
+
canon diff file1.xml file2.xml --diff-algorithm dom
|
|
159
|
+
----
|
|
160
|
+
|
|
161
|
+
=== With Match Options
|
|
162
|
+
|
|
163
|
+
DOM algorithm respects all match dimensions:
|
|
164
|
+
|
|
165
|
+
[source,ruby]
|
|
166
|
+
----
|
|
167
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
168
|
+
diff_algorithm: :dom,
|
|
169
|
+
match: {
|
|
170
|
+
text_content: :normalize,
|
|
171
|
+
structural_whitespace: :ignore,
|
|
172
|
+
attribute_order: :ignore
|
|
173
|
+
}
|
|
174
|
+
)
|
|
175
|
+
----
|
|
176
|
+
|
|
177
|
+
See link:../../features/match-options/algorithm-specific-behavior.adoc[Algorithm-Specific Behavior] for how DOM interprets match options.
|
|
178
|
+
|
|
179
|
+
=== With Diff Formatting
|
|
180
|
+
|
|
181
|
+
The DOM algorithm works with both diff modes:
|
|
182
|
+
|
|
183
|
+
[source,ruby]
|
|
184
|
+
----
|
|
185
|
+
# Line-based output (natural fit for DOM)
|
|
186
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
187
|
+
diff_algorithm: :dom,
|
|
188
|
+
diff_mode: :by_line,
|
|
189
|
+
verbose: true
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
# Tree-based output (also works)
|
|
193
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
194
|
+
diff_algorithm: :dom,
|
|
195
|
+
diff_mode: :by_object,
|
|
196
|
+
verbose: true
|
|
197
|
+
)
|
|
198
|
+
----
|
|
199
|
+
|
|
200
|
+
== Output Format
|
|
201
|
+
|
|
202
|
+
=== Line-Based Output (Default)
|
|
203
|
+
|
|
204
|
+
The DOM algorithm naturally produces line-based differences:
|
|
205
|
+
|
|
206
|
+
.Line-based diff example
|
|
207
|
+
[example]
|
|
208
|
+
====
|
|
209
|
+
```
|
|
210
|
+
1 | <book>
|
|
211
|
+
2-| <title>Old Title</title>
|
|
212
|
+
+| <title>New Title</title>
|
|
213
|
+
3 | <author>Alice</author>
|
|
214
|
+
4 | </book>
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
* Lines with `-` were removed
|
|
218
|
+
* Lines with `+` were added
|
|
219
|
+
* Unchanged lines provide context
|
|
220
|
+
====
|
|
221
|
+
|
|
222
|
+
=== Tree-Based Output
|
|
223
|
+
|
|
224
|
+
The DOM algorithm can also produce tree-based output:
|
|
225
|
+
|
|
226
|
+
.Tree-based diff example
|
|
227
|
+
[example]
|
|
228
|
+
====
|
|
229
|
+
```
|
|
230
|
+
book
|
|
231
|
+
title
|
|
232
|
+
- Old Title
|
|
233
|
+
+ New Title
|
|
234
|
+
author
|
|
235
|
+
= Alice
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
* Shows hierarchical structure
|
|
239
|
+
* Still position-based (no move operations)
|
|
240
|
+
====
|
|
241
|
+
|
|
242
|
+
== Advantages
|
|
243
|
+
|
|
244
|
+
=== Fast Performance
|
|
245
|
+
|
|
246
|
+
* **Linear time complexity** - O(n) where n is document size
|
|
247
|
+
* **Low memory usage** - Processes line-by-line
|
|
248
|
+
* **Scales well** - Handles documents > 100KB easily
|
|
249
|
+
|
|
250
|
+
.Performance comparison
|
|
251
|
+
[cols="1,1,1"]
|
|
252
|
+
|===
|
|
253
|
+
|Document Size |DOM Time |Notes
|
|
254
|
+
|
|
255
|
+
|1 KB
|
|
256
|
+
|~1 ms
|
|
257
|
+
|Very fast
|
|
258
|
+
|
|
259
|
+
|10 KB
|
|
260
|
+
|~10 ms
|
|
261
|
+
|Still fast
|
|
262
|
+
|
|
263
|
+
|100 KB
|
|
264
|
+
|~100 ms
|
|
265
|
+
|Acceptable
|
|
266
|
+
|
|
267
|
+
|1 MB
|
|
268
|
+
|~1 s
|
|
269
|
+
|Good scaling
|
|
270
|
+
|===
|
|
271
|
+
|
|
272
|
+
=== Stable and Predictable
|
|
273
|
+
|
|
274
|
+
* **Production-ready** - Well-tested, no surprises
|
|
275
|
+
* **Consistent behavior** - Same results every time
|
|
276
|
+
* **Widely used** - Default for all Canon comparisons
|
|
277
|
+
|
|
278
|
+
=== Works Everywhere
|
|
279
|
+
|
|
280
|
+
* **All formats** - XML, HTML, JSON, YAML
|
|
281
|
+
* **All diff modes** - by_line and by_object
|
|
282
|
+
* **All match profiles** - Compatible with all configurations
|
|
283
|
+
|
|
284
|
+
== Limitations
|
|
285
|
+
|
|
286
|
+
=== No Move Detection
|
|
287
|
+
|
|
288
|
+
The DOM algorithm cannot detect when content moves to a different position:
|
|
289
|
+
|
|
290
|
+
**Limitation**: Reordered content shows as DELETE + INSERT pairs
|
|
291
|
+
|
|
292
|
+
**Workaround**: Use link:semantic-tree-diff.adoc[Semantic Algorithm] for move detection
|
|
293
|
+
|
|
294
|
+
=== Position-Dependent
|
|
295
|
+
|
|
296
|
+
The algorithm assumes similar structure between documents:
|
|
297
|
+
|
|
298
|
+
**Limitation**: Heavily restructured documents produce noisy diffs
|
|
299
|
+
|
|
300
|
+
**Workaround**: Use link:semantic-tree-diff.adoc[Semantic Algorithm] for restructured documents
|
|
301
|
+
|
|
302
|
+
== Common Use Cases
|
|
303
|
+
|
|
304
|
+
=== Use Case 1: Testing XML Generation
|
|
305
|
+
|
|
306
|
+
[source,ruby]
|
|
307
|
+
----
|
|
308
|
+
# Fast, reliable test assertion
|
|
309
|
+
RSpec.describe "XML generation" do
|
|
310
|
+
it "generates correct XML" do
|
|
311
|
+
actual = MyGenerator.generate
|
|
312
|
+
expected = File.read("expected.xml")
|
|
313
|
+
|
|
314
|
+
expect(actual).to be_xml_equivalent_to(expected)
|
|
315
|
+
.with_profile(:spec_friendly)
|
|
316
|
+
# Uses DOM algorithm by default
|
|
317
|
+
end
|
|
318
|
+
end
|
|
319
|
+
----
|
|
320
|
+
|
|
321
|
+
=== Use Case 2: Code Review Diff
|
|
322
|
+
|
|
323
|
+
[source,bash]
|
|
324
|
+
----
|
|
325
|
+
# Traditional diff for reviewing changes
|
|
326
|
+
canon diff old.xml new.xml \
|
|
327
|
+
--diff-algorithm dom \
|
|
328
|
+
--diff-mode by_line \
|
|
329
|
+
--verbose \
|
|
330
|
+
--use-color
|
|
331
|
+
----
|
|
332
|
+
|
|
333
|
+
=== Use Case 3: CI/CD Validation
|
|
334
|
+
|
|
335
|
+
[source,ruby]
|
|
336
|
+
----
|
|
337
|
+
# Fast validation in CI pipeline
|
|
338
|
+
result = Canon::Comparison.equivalent?(expected, actual,
|
|
339
|
+
diff_algorithm: :dom, # Fast
|
|
340
|
+
match_profile: :spec_friendly
|
|
341
|
+
)
|
|
342
|
+
|
|
343
|
+
unless result
|
|
344
|
+
puts "Validation failed!"
|
|
345
|
+
exit 1
|
|
346
|
+
end
|
|
347
|
+
----
|
|
348
|
+
|
|
349
|
+
== Best Practices
|
|
350
|
+
|
|
351
|
+
=== Use for Most Comparisons
|
|
352
|
+
|
|
353
|
+
The DOM algorithm is the **right choice for 90% of use cases**. Only switch to Semantic when you specifically need move detection.
|
|
354
|
+
|
|
355
|
+
=== Combine with Match Profiles
|
|
356
|
+
|
|
357
|
+
Use appropriate match profiles for your use case:
|
|
358
|
+
|
|
359
|
+
[source,ruby]
|
|
360
|
+
----
|
|
361
|
+
# For tests
|
|
362
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
363
|
+
match_profile: :spec_friendly
|
|
364
|
+
)
|
|
365
|
+
|
|
366
|
+
# For exact validation
|
|
367
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
368
|
+
match_profile: :strict
|
|
369
|
+
)
|
|
370
|
+
|
|
371
|
+
# For rendered HTML
|
|
372
|
+
Canon::Comparison.equivalent?(html1, html2,
|
|
373
|
+
match_profile: :rendered
|
|
374
|
+
)
|
|
375
|
+
----
|
|
376
|
+
|
|
377
|
+
=== Use Appropriate Diff Mode
|
|
378
|
+
|
|
379
|
+
* **by_line** - Best for code review, traditional diffs
|
|
380
|
+
* **by_object** - Best for structured view, test output
|
|
381
|
+
|
|
382
|
+
== See Also
|
|
383
|
+
|
|
384
|
+
* link:index.adoc[Algorithms Overview] - Comparison of DOM vs Semantic
|
|
385
|
+
* link:semantic-tree-diff.adoc[Semantic Algorithm] - Alternative algorithm
|
|
386
|
+
* link:../../features/match-options/algorithm-specific-behavior.adoc[Algorithm-Specific Behavior] - How DOM interprets options
|
|
387
|
+
* link:../../features/diff-formatting/algorithm-specific-output.adoc[Algorithm-Specific Output] - Output format details
|
|
388
|
+
* link:../../guides/choosing-configuration.adoc[Choosing Configuration] - Complete decision guide
|
|
389
|
+
* link:../../advanced/index.adoc[Advanced Topics] - Advanced implementation details
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: Algorithms
|
|
3
|
+
parent: Understanding
|
|
4
|
+
nav_order: 4
|
|
5
|
+
has_children: true
|
|
6
|
+
---
|
|
7
|
+
= Algorithms
|
|
8
|
+
:toc:
|
|
9
|
+
:toclevels: 3
|
|
10
|
+
|
|
11
|
+
== Purpose
|
|
12
|
+
|
|
13
|
+
Canon provides two comparison algorithms, each with different strengths and use cases. This section explains how to choose between them and what to expect from each.
|
|
14
|
+
|
|
15
|
+
This corresponds to **Layer 2 (Algorithm Selection)** in Canon's 4-layer architecture. See link:../comparison-pipeline.adoc[Comparison Pipeline] for the complete flow.
|
|
16
|
+
|
|
17
|
+
== Overview
|
|
18
|
+
|
|
19
|
+
Canon supports two algorithms for document comparison:
|
|
20
|
+
|
|
21
|
+
* **DOM Algorithm** - Fast, stable, positional comparison (default)
|
|
22
|
+
* **Semantic Algorithm** - Slower, intelligent, detects moves and restructuring (experimental)
|
|
23
|
+
|
|
24
|
+
**Critical**: The algorithm choice affects how Layers 3 and 4 behave. See link:../../features/match-options/algorithm-specific-behavior.adoc[Algorithm-Specific Behavior] and link:../../features/diff-formatting/algorithm-specific-output.adoc[Algorithm-Specific Output].
|
|
25
|
+
|
|
26
|
+
== Child Pages
|
|
27
|
+
|
|
28
|
+
* link:dom-diff.adoc[DOM Algorithm] - Positional comparison details
|
|
29
|
+
* link:semantic-tree-diff.adoc[Semantic Algorithm] - Tree-based comparison details
|
|
30
|
+
|
|
31
|
+
== Algorithm Comparison
|
|
32
|
+
|
|
33
|
+
[cols="2,3,3"]
|
|
34
|
+
|===
|
|
35
|
+
|Feature |DOM Algorithm |Semantic Algorithm
|
|
36
|
+
|
|
37
|
+
|**Status**
|
|
38
|
+
|Stable, production-ready
|
|
39
|
+
|Experimental
|
|
40
|
+
|
|
41
|
+
|**Performance**
|
|
42
|
+
|Fast (linear with document size)
|
|
43
|
+
|Slower (quadratic worst case)
|
|
44
|
+
|
|
45
|
+
|**Memory**
|
|
46
|
+
|Low (line-by-line processing)
|
|
47
|
+
|Higher (tree structures in memory)
|
|
48
|
+
|
|
49
|
+
|**Matching Strategy**
|
|
50
|
+
|Position-based element matching
|
|
51
|
+
|Signature-based similarity matching
|
|
52
|
+
|
|
53
|
+
|**Move Detection**
|
|
54
|
+
|No (shows as DELETE + INSERT)
|
|
55
|
+
|Yes (shows as MOVE operation)
|
|
56
|
+
|
|
57
|
+
|**Layer 3 Behavior**
|
|
58
|
+
|Element-by-element comparison
|
|
59
|
+
|Signature calculation
|
|
60
|
+
|
|
61
|
+
|**Layer 4 Output**
|
|
62
|
+
|Line-based differences
|
|
63
|
+
|Operation-based (INSERT, DELETE, UPDATE, MOVE)
|
|
64
|
+
|
|
65
|
+
|**Natural Diff Mode**
|
|
66
|
+
|`by_line`
|
|
67
|
+
|`by_object`
|
|
68
|
+
|
|
69
|
+
|**Best For**
|
|
70
|
+
|Similar documents, traditional diffs
|
|
71
|
+
|Restructured documents, operation analysis
|
|
72
|
+
|
|
73
|
+
|**Document Size**
|
|
74
|
+
|Handles large documents (> 100KB)
|
|
75
|
+
|Best for smaller documents (< 10KB)
|
|
76
|
+
|===
|
|
77
|
+
|
|
78
|
+
== When to Use Each Algorithm
|
|
79
|
+
|
|
80
|
+
=== Use DOM Algorithm When
|
|
81
|
+
|
|
82
|
+
* ✓ Documents have similar structure
|
|
83
|
+
* ✓ Position matters in your comparison
|
|
84
|
+
* ✓ Fast performance is critical
|
|
85
|
+
* ✓ Traditional diff output is sufficient
|
|
86
|
+
* ✓ Working with large documents
|
|
87
|
+
* ✓ Stability is important (production use)
|
|
88
|
+
* ✓ You need well-tested, predictable behavior
|
|
89
|
+
|
|
90
|
+
**Example use cases**:
|
|
91
|
+
* Comparing generated vs expected XML in tests
|
|
92
|
+
* Reviewing code changes in HTML templates
|
|
93
|
+
* Fast CI/CD validation
|
|
94
|
+
* Large document comparison
|
|
95
|
+
|
|
96
|
+
=== Use Semantic Algorithm When
|
|
97
|
+
|
|
98
|
+
* ✓ Documents may be restructured
|
|
99
|
+
* ✓ Need to detect element moves/reordering
|
|
100
|
+
* ✓ Operation-level analysis is valuable
|
|
101
|
+
* ✓ Content evolution tracking is needed
|
|
102
|
+
* ✓ Willing to accept experimental status
|
|
103
|
+
* ✓ Working with smaller documents
|
|
104
|
+
|
|
105
|
+
**Example use cases**:
|
|
106
|
+
* Detecting document reorganization
|
|
107
|
+
* Analyzing content migration changes
|
|
108
|
+
* Understanding structural transformations
|
|
109
|
+
* Research and development work
|
|
110
|
+
|
|
111
|
+
== Configuration
|
|
112
|
+
|
|
113
|
+
=== Setting the Algorithm
|
|
114
|
+
|
|
115
|
+
**Ruby API**:
|
|
116
|
+
[source,ruby]
|
|
117
|
+
----
|
|
118
|
+
# DOM algorithm (default)
|
|
119
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
120
|
+
diff_algorithm: :dom
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
# Semantic algorithm
|
|
124
|
+
Canon::Comparison.equivalent?(doc1, doc2,
|
|
125
|
+
diff_algorithm: :semantic
|
|
126
|
+
)
|
|
127
|
+
----
|
|
128
|
+
|
|
129
|
+
**CLI**:
|
|
130
|
+
[source,bash]
|
|
131
|
+
----
|
|
132
|
+
# DOM algorithm (default)
|
|
133
|
+
canon diff file1.xml file2.xml --diff-algorithm dom
|
|
134
|
+
|
|
135
|
+
# Semantic algorithm
|
|
136
|
+
canon diff file1.xml file2.xml --diff-algorithm semantic
|
|
137
|
+
----
|
|
138
|
+
|
|
139
|
+
**RSpec**:
|
|
140
|
+
[source,ruby]
|
|
141
|
+
----
|
|
142
|
+
# Global configuration
|
|
143
|
+
Canon::RSpecMatchers.configure do |config|
|
|
144
|
+
config.xml.diff_algorithm = :semantic
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
# Per-test
|
|
148
|
+
expect(actual).to be_xml_equivalent_to(expected)
|
|
149
|
+
.with_options(diff_algorithm: :semantic)
|
|
150
|
+
----
|
|
151
|
+
|
|
152
|
+
== Algorithm Selection Decision Tree
|
|
153
|
+
|
|
154
|
+
[mermaid]
|
|
155
|
+
----
|
|
156
|
+
graph TD
|
|
157
|
+
Start[Choose Algorithm] --> Size{Document<br/>size?}
|
|
158
|
+
Size -->|> 100KB| DOM[Use DOM]
|
|
159
|
+
Size -->|<= 100KB| Struct{Documents<br/>restructured?}
|
|
160
|
+
|
|
161
|
+
Struct -->|Yes| Semantic[Use Semantic]
|
|
162
|
+
Struct -->|No| Speed{Need<br/>speed?}
|
|
163
|
+
|
|
164
|
+
Speed -->|Yes| DOM
|
|
165
|
+
Speed -->|No| Features{Need move<br/>detection?}
|
|
166
|
+
|
|
167
|
+
Features -->|Yes| Semantic
|
|
168
|
+
Features -->|No| Prod{Production<br/>use?}
|
|
169
|
+
|
|
170
|
+
Prod -->|Yes| DOM
|
|
171
|
+
Prod -->|No| Either[Either works,<br/>prefer DOM]
|
|
172
|
+
|
|
173
|
+
style DOM fill:#e1f5ff
|
|
174
|
+
style Semantic fill:#ffe1f5
|
|
175
|
+
style Either fill:#e1ffe1
|
|
176
|
+
----
|
|
177
|
+
|
|
178
|
+
== Performance Characteristics
|
|
179
|
+
|
|
180
|
+
=== DOM Algorithm Performance
|
|
181
|
+
|
|
182
|
+
**Time Complexity**: O(n) where n is document size
|
|
183
|
+
* Linear scaling with document size
|
|
184
|
+
* Predictable performance
|
|
185
|
+
* Handles large documents well
|
|
186
|
+
|
|
187
|
+
**Memory Usage**: Low
|
|
188
|
+
* Line-by-line processing
|
|
189
|
+
* No complex tree structures
|
|
190
|
+
* Minimal memory overhead
|
|
191
|
+
|
|
192
|
+
**Throughput**: High
|
|
193
|
+
* ~1000 comparisons/second for typical documents
|
|
194
|
+
* Suitable for batch processing
|
|
195
|
+
|
|
196
|
+
=== Semantic Algorithm Performance
|
|
197
|
+
|
|
198
|
+
**Time Complexity**: O(n²) worst case, O(n log n) typical
|
|
199
|
+
* Quadratic worst case (highly restructured)
|
|
200
|
+
* Linearithmic typical case (similar structure)
|
|
201
|
+
* Slower on large documents
|
|
202
|
+
|
|
203
|
+
**Memory Usage**: Higher
|
|
204
|
+
* Builds tree structures in memory
|
|
205
|
+
* Stores signatures for all nodes
|
|
206
|
+
* More memory per comparison
|
|
207
|
+
|
|
208
|
+
**Throughput**: Lower
|
|
209
|
+
* ~100 comparisons/second for typical documents
|
|
210
|
+
* Better for one-off comparisons
|
|
211
|
+
|
|
212
|
+
=== Performance Comparison Example
|
|
213
|
+
|
|
214
|
+
.Comparison time for different document sizes
|
|
215
|
+
[cols="1,1,1,1"]
|
|
216
|
+
|===
|
|
217
|
+
|Document Size |DOM Time |Semantic Time |Ratio
|
|
218
|
+
|
|
219
|
+
|1 KB
|
|
220
|
+
|~1 ms
|
|
221
|
+
|~10 ms
|
|
222
|
+
|10x
|
|
223
|
+
|
|
224
|
+
|10 KB
|
|
225
|
+
|~10 ms
|
|
226
|
+
|~150 ms
|
|
227
|
+
|15x
|
|
228
|
+
|
|
229
|
+
|100 KB
|
|
230
|
+
|~100 ms
|
|
231
|
+
|~3000 ms
|
|
232
|
+
|30x
|
|
233
|
+
|
|
234
|
+
|1 MB
|
|
235
|
+
|~1 s
|
|
236
|
+
|~60 s
|
|
237
|
+
|60x
|
|
238
|
+
|===
|
|
239
|
+
|
|
240
|
+
NOTE: These are approximate times. Actual performance depends on document structure and complexity.
|
|
241
|
+
|
|
242
|
+
== Migration Between Algorithms
|
|
243
|
+
|
|
244
|
+
=== Switching from DOM to Semantic
|
|
245
|
+
|
|
246
|
+
**Expected changes**:
|
|
247
|
+
1. Reordered elements detected as MOVE instead of DELETE+INSERT
|
|
248
|
+
2. `attribute_order` setting becomes irrelevant
|
|
249
|
+
3. Slower performance, but more intelligent matching
|
|
250
|
+
4. Output format changes to operation-based
|
|
251
|
+
|
|
252
|
+
See link:../../features/match-options/algorithm-specific-behavior.adoc[Algorithm-Specific Behavior] for migration details.
|
|
253
|
+
|
|
254
|
+
=== Switching from Semantic to DOM
|
|
255
|
+
|
|
256
|
+
**Expected changes**:
|
|
257
|
+
1. MOVE operations become DELETE+INSERT pairs
|
|
258
|
+
2. Reordered content shows as differences
|
|
259
|
+
3. Faster performance
|
|
260
|
+
4. Output format changes to line-based
|
|
261
|
+
|
|
262
|
+
== Common Patterns
|
|
263
|
+
|
|
264
|
+
=== Pattern 1: Fast Validation (DOM)
|
|
265
|
+
|
|
266
|
+
[source,ruby]
|
|
267
|
+
----
|
|
268
|
+
# Fast CI/CD validation
|
|
269
|
+
Canon::Comparison.equivalent?(expected, actual,
|
|
270
|
+
diff_algorithm: :dom,
|
|
271
|
+
match_profile: :spec_friendly
|
|
272
|
+
)
|
|
273
|
+
----
|
|
274
|
+
|
|
275
|
+
=== Pattern 2: Detailed Analysis (Semantic)
|
|
276
|
+
|
|
277
|
+
[source,ruby]
|
|
278
|
+
----
|
|
279
|
+
# Understand what changed
|
|
280
|
+
result = Canon::Comparison.equivalent?(old, new,
|
|
281
|
+
diff_algorithm: :semantic,
|
|
282
|
+
verbose: true,
|
|
283
|
+
diff_mode: :by_object
|
|
284
|
+
)
|
|
285
|
+
|
|
286
|
+
puts "Moves: #{result.statistics.moves}"
|
|
287
|
+
puts "Updates: #{result.statistics.updates}"
|
|
288
|
+
----
|
|
289
|
+
|
|
290
|
+
=== Pattern 3: Hybrid Approach
|
|
291
|
+
|
|
292
|
+
[source,ruby]
|
|
293
|
+
----
|
|
294
|
+
# Try fast DOM first
|
|
295
|
+
if Canon::Comparison.equivalent?(doc1, doc2, diff_algorithm: :dom)
|
|
296
|
+
puts "Documents identical"
|
|
297
|
+
else
|
|
298
|
+
# Use semantic for detailed analysis
|
|
299
|
+
result = Canon::Comparison.equivalent?(doc1, doc2,
|
|
300
|
+
diff_algorithm: :semantic,
|
|
301
|
+
verbose: true
|
|
302
|
+
)
|
|
303
|
+
analyze_operations(result.operations)
|
|
304
|
+
end
|
|
305
|
+
----
|
|
306
|
+
|
|
307
|
+
== See Also
|
|
308
|
+
|
|
309
|
+
* link:dom-diff.adoc[DOM Algorithm] - Detailed DOM algorithm documentation
|
|
310
|
+
* link:semantic-tree-diff.adoc[Semantic Algorithm] - Detailed semantic algorithm documentation
|
|
311
|
+
* link:../comparison-pipeline.adoc[Comparison Pipeline] - 4-layer architecture
|
|
312
|
+
* link:../../features/match-options/algorithm-specific-behavior.adoc[Algorithm-Specific Behavior] - How algorithms interpret options
|
|
313
|
+
* link:../../features/diff-formatting/algorithm-specific-output.adoc[Algorithm-Specific Output] - Output format differences
|
|
314
|
+
* link:../../guides/choosing-configuration.adoc[Choosing Configuration] - Complete decision guide
|