canon 0.1.8 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +83 -22
- data/docs/Gemfile +1 -0
- data/docs/_config.yml +90 -1
- data/docs/advanced/diff-classification.adoc +196 -24
- data/docs/features/match-options/index.adoc +239 -1
- data/lib/canon/comparison/format_detector.rb +2 -1
- data/lib/canon/comparison/html_comparator.rb +19 -8
- data/lib/canon/comparison/html_compare_profile.rb +8 -2
- data/lib/canon/comparison/markup_comparator.rb +109 -2
- data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
- data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +108 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
- data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
- data/lib/canon/comparison/xml_comparator.rb +240 -23
- data/lib/canon/comparison/xml_node_comparison.rb +25 -3
- data/lib/canon/diff/diff_classifier.rb +119 -5
- data/lib/canon/diff/formatting_detector.rb +1 -1
- data/lib/canon/diff/xml_serialization_formatter.rb +153 -0
- data/lib/canon/rspec_matchers.rb +37 -8
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +24 -13
- metadata +4 -78
- data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
- data/false_positive_analysis.txt +0 -0
- data/file1.html +0 -1
- data/file2.html +0 -1
- data/old-docs/ADVANCED_TOPICS.adoc +0 -20
- data/old-docs/BASIC_USAGE.adoc +0 -16
- data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
- data/old-docs/CLI.adoc +0 -497
- data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/old-docs/DIFF_FORMATTING.adoc +0 -540
- data/old-docs/DIFF_PARAMETERS.adoc +0 -261
- data/old-docs/DOM_DIFF.adoc +0 -1017
- data/old-docs/ENV_CONFIG.adoc +0 -876
- data/old-docs/FORMATS.adoc +0 -867
- data/old-docs/INPUT_VALIDATION.adoc +0 -477
- data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
- data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/old-docs/MATCH_OPTIONS.adoc +0 -912
- data/old-docs/MODES.adoc +0 -432
- data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/old-docs/OPTIONS.adoc +0 -1387
- data/old-docs/PREPROCESSING.adoc +0 -491
- data/old-docs/README.old.adoc +0 -2831
- data/old-docs/RSPEC.adoc +0 -814
- data/old-docs/RUBY_API.adoc +0 -485
- data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
- data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
- data/old-docs/STRING_COMPARE.adoc +0 -345
- data/old-docs/TMP.adoc +0 -3384
- data/old-docs/TREE_DIFF.adoc +0 -1080
- data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
- data/old-docs/VERBOSE.adoc +0 -482
- data/old-docs/VISUALIZATION_MAP.adoc +0 -625
- data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
- data/scripts/analyze_current_state.rb +0 -85
- data/scripts/analyze_false_positives.rb +0 -114
- data/scripts/analyze_remaining_failures.rb +0 -105
- data/scripts/compare_current_failures.rb +0 -95
- data/scripts/compare_dom_tree_diff.rb +0 -158
- data/scripts/compare_failures.rb +0 -151
- data/scripts/debug_attribute_extraction.rb +0 -66
- data/scripts/debug_blocks_839.rb +0 -115
- data/scripts/debug_meta_matching.rb +0 -52
- data/scripts/debug_p_matching.rb +0 -192
- data/scripts/debug_signature_matching.rb +0 -118
- data/scripts/debug_sourcecode_124.rb +0 -32
- data/scripts/debug_whitespace_sensitive.rb +0 -192
- data/scripts/extract_false_positives.rb +0 -138
- data/scripts/find_actual_false_positives.rb +0 -125
- data/scripts/investigate_all_false_positives.rb +0 -161
- data/scripts/investigate_batch1.rb +0 -127
- data/scripts/investigate_classification.rb +0 -150
- data/scripts/investigate_classification_detailed.rb +0 -190
- data/scripts/investigate_common_failures.rb +0 -342
- data/scripts/investigate_false_negative.rb +0 -80
- data/scripts/investigate_false_positive.rb +0 -83
- data/scripts/investigate_false_positives.rb +0 -227
- data/scripts/investigate_false_positives_batch.rb +0 -163
- data/scripts/investigate_mixed_content.rb +0 -125
- data/scripts/investigate_remaining_16.rb +0 -214
- data/scripts/run_single_test.rb +0 -29
- data/scripts/test_all_false_positives.rb +0 -95
- data/scripts/test_attribute_details.rb +0 -61
- data/scripts/test_both_algorithms.rb +0 -49
- data/scripts/test_both_simple.rb +0 -49
- data/scripts/test_enhanced_semantic_output.rb +0 -125
- data/scripts/test_readme_examples.rb +0 -131
- data/scripts/test_semantic_tree_diff.rb +0 -99
- data/scripts/test_semantic_ux_improvements.rb +0 -135
- data/scripts/test_single_false_positive.rb +0 -119
- data/scripts/test_size_limits.rb +0 -99
- data/test_html_1.html +0 -21
- data/test_html_2.html +0 -21
- data/test_nokogiri.rb +0 -33
- data/test_normalize.rb +0 -45
data/old-docs/ENV_CONFIG.adoc
DELETED
|
@@ -1,876 +0,0 @@
|
|
|
1
|
-
= Environment Variable Configuration
|
|
2
|
-
:toc:
|
|
3
|
-
:toclevels: 3
|
|
4
|
-
|
|
5
|
-
== General
|
|
6
|
-
|
|
7
|
-
Canon supports configuration through environment variables, allowing you to override default settings and programmatically set values without modifying code. This is particularly useful for CI/CD pipelines, containerized environments, and different deployment scenarios.
|
|
8
|
-
|
|
9
|
-
== Priority Chain
|
|
10
|
-
|
|
11
|
-
Configuration values are resolved using the following priority (highest to lowest):
|
|
12
|
-
|
|
13
|
-
. **Environment Variables** (highest priority)
|
|
14
|
-
. **Programmatic Configuration** (via `Canon::Config`)
|
|
15
|
-
. **Default Values** (lowest priority)
|
|
16
|
-
|
|
17
|
-
This means environment variables always override programmatic settings, which in turn override defaults.
|
|
18
|
-
|
|
19
|
-
== Naming Convention
|
|
20
|
-
|
|
21
|
-
Environment variables follow a consistent naming pattern:
|
|
22
|
-
|
|
23
|
-
[source]
|
|
24
|
-
----
|
|
25
|
-
CANON_{FORMAT}_{CONFIG_TYPE}_{ATTRIBUTE}
|
|
26
|
-
----
|
|
27
|
-
|
|
28
|
-
Where:
|
|
29
|
-
|
|
30
|
-
* `FORMAT`: `XML`, `HTML`, `JSON`, `YAML`, or `STRING`
|
|
31
|
-
* `CONFIG_TYPE`: `DIFF` or `MATCH`
|
|
32
|
-
* `ATTRIBUTE`: The configuration attribute name (e.g., `ALGORITHM`, `MODE`, `PROFILE`)
|
|
33
|
-
|
|
34
|
-
=== Global Variables
|
|
35
|
-
|
|
36
|
-
You can also use global environment variables that apply to all formats by omitting the format prefix:
|
|
37
|
-
|
|
38
|
-
[source]
|
|
39
|
-
----
|
|
40
|
-
CANON_{ATTRIBUTE}
|
|
41
|
-
----
|
|
42
|
-
|
|
43
|
-
Global variables are overridden by format-specific variables.
|
|
44
|
-
|
|
45
|
-
== Diff Configuration
|
|
46
|
-
|
|
47
|
-
=== Format-Specific Variables
|
|
48
|
-
|
|
49
|
-
==== Algorithm Selection
|
|
50
|
-
|
|
51
|
-
Canon supports two diff algorithms:
|
|
52
|
-
|
|
53
|
-
* **`dom`**: DOM-based tree diff (default, stable)
|
|
54
|
-
* **`semantic`**: Semantic tree diff (experimental, more sophisticated)
|
|
55
|
-
|
|
56
|
-
[source,bash]
|
|
57
|
-
----
|
|
58
|
-
# Set algorithm for XML diff
|
|
59
|
-
export CANON_XML_DIFF_ALGORITHM=semantic
|
|
60
|
-
|
|
61
|
-
# Set algorithm for HTML diff
|
|
62
|
-
export CANON_HTML_DIFF_ALGORITHM=dom
|
|
63
|
-
|
|
64
|
-
# Set globally for all formats
|
|
65
|
-
export CANON_ALGORITHM=semantic
|
|
66
|
-
----
|
|
67
|
-
|
|
68
|
-
**Testing/Debugging Use Case**:
|
|
69
|
-
|
|
70
|
-
When comparing algorithm behavior in test suites:
|
|
71
|
-
|
|
72
|
-
[source,bash]
|
|
73
|
-
----
|
|
74
|
-
# Test with DOM algorithm (baseline)
|
|
75
|
-
CANON_ALGORITHM=dom bundle exec rspec
|
|
76
|
-
|
|
77
|
-
# Test with semantic algorithm
|
|
78
|
-
CANON_ALGORITHM=semantic bundle exec rspec
|
|
79
|
-
|
|
80
|
-
# Format-specific (useful when formats behave differently)
|
|
81
|
-
CANON_HTML_DIFF_ALGORITHM=dom CANON_XML_DIFF_ALGORITHM=dom bundle exec rspec
|
|
82
|
-
----
|
|
83
|
-
|
|
84
|
-
**RSpec Configuration**:
|
|
85
|
-
|
|
86
|
-
In your `spec_helper.rb`, simply set the default algorithm:
|
|
87
|
-
|
|
88
|
-
[source,ruby]
|
|
89
|
-
----
|
|
90
|
-
# Canon's ENV variables will automatically override this
|
|
91
|
-
Canon::Config.instance.html.diff.algorithm = :semantic
|
|
92
|
-
Canon::Config.instance.xml.diff.algorithm = :semantic
|
|
93
|
-
----
|
|
94
|
-
|
|
95
|
-
Then use ENV variables to override for specific test runs without modifying the spec_helper.
|
|
96
|
-
|
|
97
|
-
Valid values: `dom`, `semantic`
|
|
98
|
-
|
|
99
|
-
==== Diff Mode
|
|
100
|
-
|
|
101
|
-
[source,bash]
|
|
102
|
-
----
|
|
103
|
-
# Set diff mode for JSON
|
|
104
|
-
export CANON_JSON_DIFF_MODE=by_object
|
|
105
|
-
|
|
106
|
-
# Set diff mode for YAML
|
|
107
|
-
export CANON_YAML_DIFF_MODE=by_line
|
|
108
|
-
----
|
|
109
|
-
|
|
110
|
-
Valid values: `by_line`, `by_object`
|
|
111
|
-
|
|
112
|
-
==== Color Output
|
|
113
|
-
|
|
114
|
-
[source,bash]
|
|
115
|
-
----
|
|
116
|
-
# Disable color output for XML
|
|
117
|
-
export CANON_XML_DIFF_USE_COLOR=false
|
|
118
|
-
|
|
119
|
-
# Enable color output for HTML
|
|
120
|
-
export CANON_HTML_DIFF_USE_COLOR=true
|
|
121
|
-
----
|
|
122
|
-
|
|
123
|
-
Valid values: `true`, `false`, `1`, `0`, `yes`, `no`
|
|
124
|
-
|
|
125
|
-
==== Context Lines
|
|
126
|
-
|
|
127
|
-
[source,bash]
|
|
128
|
-
----
|
|
129
|
-
# Set context lines for XML diff
|
|
130
|
-
export CANON_XML_DIFF_CONTEXT_LINES=5
|
|
131
|
-
----
|
|
132
|
-
|
|
133
|
-
Valid values: Any positive integer
|
|
134
|
-
|
|
135
|
-
==== Grouping Lines
|
|
136
|
-
|
|
137
|
-
[source,bash]
|
|
138
|
-
----
|
|
139
|
-
# Set grouping lines for XML diff
|
|
140
|
-
export CANON_XML_DIFF_GROUPING_LINES=20
|
|
141
|
-
----
|
|
142
|
-
|
|
143
|
-
Valid values: Any positive integer
|
|
144
|
-
|
|
145
|
-
==== Show Diffs
|
|
146
|
-
|
|
147
|
-
[source,bash]
|
|
148
|
-
----
|
|
149
|
-
# Show only informative diffs
|
|
150
|
-
export CANON_XML_DIFF_SHOW_DIFFS=informative
|
|
151
|
-
|
|
152
|
-
# Show all diffs
|
|
153
|
-
export CANON_XML_DIFF_SHOW_DIFFS=all
|
|
154
|
-
----
|
|
155
|
-
|
|
156
|
-
Valid values: `all`, `informative`, `normative`
|
|
157
|
-
|
|
158
|
-
==== Verbose Diff
|
|
159
|
-
|
|
160
|
-
[source,bash]
|
|
161
|
-
----
|
|
162
|
-
# Enable verbose diff output
|
|
163
|
-
export CANON_XML_DIFF_VERBOSE_DIFF=true
|
|
164
|
-
----
|
|
165
|
-
|
|
166
|
-
Valid values: `true`, `false`, `1`, `0`, `yes`, `no`
|
|
167
|
-
|
|
168
|
-
==== Show Compare
|
|
169
|
-
|
|
170
|
-
[source,bash]
|
|
171
|
-
----
|
|
172
|
-
# Enable side-by-side algorithm comparison
|
|
173
|
-
export CANON_XML_DIFF_SHOW_COMPARE=true
|
|
174
|
-
----
|
|
175
|
-
|
|
176
|
-
Valid values: `true`, `false`, `1`, `0`, `yes`, `no`
|
|
177
|
-
|
|
178
|
-
=== Global Diff Variables
|
|
179
|
-
|
|
180
|
-
Apply to all formats unless overridden by format-specific variables:
|
|
181
|
-
|
|
182
|
-
[source,bash]
|
|
183
|
-
----
|
|
184
|
-
# Set algorithm globally
|
|
185
|
-
export CANON_ALGORITHM=semantic
|
|
186
|
-
|
|
187
|
-
# Disable color globally
|
|
188
|
-
export CANON_USE_COLOR=false
|
|
189
|
-
|
|
190
|
-
# Set diff mode globally
|
|
191
|
-
export CANON_MODE=by_object
|
|
192
|
-
|
|
193
|
-
# Set context lines globally
|
|
194
|
-
export CANON_CONTEXT_LINES=10
|
|
195
|
-
|
|
196
|
-
# Set grouping lines globally
|
|
197
|
-
export CANON_GROUPING_LINES=15
|
|
198
|
-
|
|
199
|
-
# Set show_diffs globally
|
|
200
|
-
export CANON_SHOW_DIFFS=informative
|
|
201
|
-
|
|
202
|
-
# Enable verbose diff globally
|
|
203
|
-
export CANON_VERBOSE_DIFF=true
|
|
204
|
-
|
|
205
|
-
# Enable show_compare globally
|
|
206
|
-
export CANON_SHOW_COMPARE=true
|
|
207
|
-
----
|
|
208
|
-
|
|
209
|
-
==== Size Limits
|
|
210
|
-
|
|
211
|
-
Canon provides configurable size limits to prevent hangs or excessive resource usage when processing very large files.
|
|
212
|
-
|
|
213
|
-
===== File Size Limit
|
|
214
|
-
|
|
215
|
-
Maximum file size in bytes before comparison is rejected:
|
|
216
|
-
|
|
217
|
-
[source,bash]
|
|
218
|
-
----
|
|
219
|
-
# Set max file size to 10MB for XML
|
|
220
|
-
export CANON_XML_DIFF_MAX_FILE_SIZE=10485760
|
|
221
|
-
|
|
222
|
-
# Set globally (5MB default)
|
|
223
|
-
export CANON_MAX_FILE_SIZE=5242880
|
|
224
|
-
----
|
|
225
|
-
|
|
226
|
-
Valid values: Any positive integer (bytes)
|
|
227
|
-
|
|
228
|
-
Default: 5,242,880 bytes (5MB)
|
|
229
|
-
|
|
230
|
-
===== Node Count Limit
|
|
231
|
-
|
|
232
|
-
Maximum number of nodes in a tree structure before comparison is rejected:
|
|
233
|
-
|
|
234
|
-
[source,bash]
|
|
235
|
-
----
|
|
236
|
-
# Set max node count for XML diff
|
|
237
|
-
export CANON_XML_DIFF_MAX_NODE_COUNT=20000
|
|
238
|
-
|
|
239
|
-
# Set globally (10,000 default)
|
|
240
|
-
export CANON_MAX_NODE_COUNT=10000
|
|
241
|
-
----
|
|
242
|
-
|
|
243
|
-
Valid values: Any positive integer
|
|
244
|
-
|
|
245
|
-
Default: 10,000 nodes
|
|
246
|
-
|
|
247
|
-
===== Diff Output Lines Limit
|
|
248
|
-
|
|
249
|
-
Maximum number of lines in diff output before truncation:
|
|
250
|
-
|
|
251
|
-
[source,bash]
|
|
252
|
-
----
|
|
253
|
-
# Set max diff lines for XML
|
|
254
|
-
export CANON_XML_DIFF_MAX_DIFF_LINES=15000
|
|
255
|
-
|
|
256
|
-
# Set globally (10,000 default)
|
|
257
|
-
export CANON_MAX_DIFF_LINES=10000
|
|
258
|
-
----
|
|
259
|
-
|
|
260
|
-
Valid values: Any positive integer
|
|
261
|
-
|
|
262
|
-
Default: 10,000 lines
|
|
263
|
-
|
|
264
|
-
===== Use Case: Large SVG Files
|
|
265
|
-
|
|
266
|
-
When working with large SVG files (e.g., 3.5MB) that may cause hangs:
|
|
267
|
-
|
|
268
|
-
[source,bash]
|
|
269
|
-
----
|
|
270
|
-
# Increase limits for large SVG processing
|
|
271
|
-
export CANON_MAX_FILE_SIZE=10485760 # 10MB
|
|
272
|
-
export CANON_MAX_NODE_COUNT=50000 # 50,000 nodes
|
|
273
|
-
export CANON_MAX_DIFF_LINES=20000 # 20,000 lines
|
|
274
|
-
|
|
275
|
-
bundle exec rspec spec/test_031_spec.rb
|
|
276
|
-
----
|
|
277
|
-
|
|
278
|
-
===== Disabling Limits
|
|
279
|
-
|
|
280
|
-
To disable a limit, set it to 0 or a negative value:
|
|
281
|
-
|
|
282
|
-
[source,bash]
|
|
283
|
-
----
|
|
284
|
-
# Disable file size limit (not recommended)
|
|
285
|
-
export CANON_MAX_FILE_SIZE=0
|
|
286
|
-
|
|
287
|
-
# Disable node count limit (use with caution)
|
|
288
|
-
export CANON_MAX_NODE_COUNT=-1
|
|
289
|
-
----
|
|
290
|
-
|
|
291
|
-
WARNING: Disabling limits may cause Canon to hang or consume excessive memory on pathologically large inputs.
|
|
292
|
-
|
|
293
|
-
== Match Configuration
|
|
294
|
-
|
|
295
|
-
=== Format-Specific Match Profile
|
|
296
|
-
|
|
297
|
-
[source,bash]
|
|
298
|
-
----
|
|
299
|
-
# Set match profile for XML
|
|
300
|
-
export CANON_XML_MATCH_PROFILE=ignore_whitespace
|
|
301
|
-
|
|
302
|
-
# Set match profile for HTML
|
|
303
|
-
export CANON_HTML_MATCH_PROFILE=strict
|
|
304
|
-
----
|
|
305
|
-
|
|
306
|
-
Valid values: Any match profile name (e.g., `ignore_whitespace`, `strict`, `semantic`)
|
|
307
|
-
|
|
308
|
-
=== Global Match Profile
|
|
309
|
-
|
|
310
|
-
[source,bash]
|
|
311
|
-
----
|
|
312
|
-
# Set match profile globally
|
|
313
|
-
export CANON_PROFILE=ignore_whitespace
|
|
314
|
-
----
|
|
315
|
-
|
|
316
|
-
== Usage Examples
|
|
317
|
-
|
|
318
|
-
=== Example 1: CI/CD Environment
|
|
319
|
-
|
|
320
|
-
[source,bash]
|
|
321
|
-
----
|
|
322
|
-
# .github/workflows/test.yml or similar
|
|
323
|
-
export CANON_USE_COLOR=false
|
|
324
|
-
export CANON_ALGORITHM=semantic
|
|
325
|
-
export CANON_SHOW_COMPARE=true
|
|
326
|
-
|
|
327
|
-
bundle exec rspec
|
|
328
|
-
----
|
|
329
|
-
|
|
330
|
-
=== Example 2: Docker Container
|
|
331
|
-
|
|
332
|
-
[source,dockerfile]
|
|
333
|
-
----
|
|
334
|
-
# Dockerfile
|
|
335
|
-
ENV CANON_XML_DIFF_ALGORITHM=semantic
|
|
336
|
-
ENV CANON_USE_COLOR=false
|
|
337
|
-
ENV CANON_CONTEXT_LINES=5
|
|
338
|
-
----
|
|
339
|
-
|
|
340
|
-
=== Example 3: Different Environments
|
|
341
|
-
|
|
342
|
-
[source,bash]
|
|
343
|
-
----
|
|
344
|
-
# Development
|
|
345
|
-
export CANON_VERBOSE_DIFF=true
|
|
346
|
-
export CANON_USE_COLOR=true
|
|
347
|
-
|
|
348
|
-
# Production
|
|
349
|
-
export CANON_VERBOSE_DIFF=false
|
|
350
|
-
export CANON_USE_COLOR=false
|
|
351
|
-
export CANON_XML_MATCH_PROFILE=strict
|
|
352
|
-
----
|
|
353
|
-
|
|
354
|
-
=== Example 4: Format-Specific Configuration
|
|
355
|
-
|
|
356
|
-
[source,bash]
|
|
357
|
-
----
|
|
358
|
-
# XML uses semantic diff
|
|
359
|
-
export CANON_XML_DIFF_ALGORITHM=semantic
|
|
360
|
-
|
|
361
|
-
# HTML uses DOM diff
|
|
362
|
-
export CANON_HTML_DIFF_ALGORITHM=dom
|
|
363
|
-
|
|
364
|
-
# All formats disable color
|
|
365
|
-
export CANON_USE_COLOR=false
|
|
366
|
-
----
|
|
367
|
-
|
|
368
|
-
== Programmatic Override
|
|
369
|
-
|
|
370
|
-
Even when environment variables are set, you can check their values programmatically:
|
|
371
|
-
|
|
372
|
-
[source,ruby]
|
|
373
|
-
----
|
|
374
|
-
# Environment variable is set
|
|
375
|
-
ENV['CANON_XML_DIFF_ALGORITHM'] = 'semantic'
|
|
376
|
-
|
|
377
|
-
# Config respects ENV variable
|
|
378
|
-
config = Canon::Config.new
|
|
379
|
-
puts config.xml.diff.algorithm # => :semantic
|
|
380
|
-
|
|
381
|
-
# Programmatic setting is ignored when ENV is set
|
|
382
|
-
config.xml.diff.algorithm = :dom
|
|
383
|
-
puts config.xml.diff.algorithm # => :semantic (ENV wins)
|
|
384
|
-
----
|
|
385
|
-
|
|
386
|
-
== Type Conversion
|
|
387
|
-
|
|
388
|
-
Environment variable values are automatically converted to the appropriate Ruby types:
|
|
389
|
-
|
|
390
|
-
=== Boolean Values
|
|
391
|
-
|
|
392
|
-
Accepted values for boolean attributes:
|
|
393
|
-
|
|
394
|
-
* **True**: `true`, `1`, `yes`
|
|
395
|
-
* **False**: `false`, `0`, `no`
|
|
396
|
-
|
|
397
|
-
Case-insensitive.
|
|
398
|
-
|
|
399
|
-
=== Integer Values
|
|
400
|
-
|
|
401
|
-
Any valid integer string is converted to an integer:
|
|
402
|
-
|
|
403
|
-
[source,bash]
|
|
404
|
-
----
|
|
405
|
-
export CANON_CONTEXT_LINES=15 # Converted to Integer 15
|
|
406
|
-
----
|
|
407
|
-
|
|
408
|
-
=== Symbol Values
|
|
409
|
-
|
|
410
|
-
String values are converted to symbols:
|
|
411
|
-
|
|
412
|
-
[source,bash]
|
|
413
|
-
----
|
|
414
|
-
export CANON_ALGORITHM=semantic # Converted to Symbol :semantic
|
|
415
|
-
----
|
|
416
|
-
|
|
417
|
-
== Backward Compatibility
|
|
418
|
-
|
|
419
|
-
The ENV override system maintains full backward compatibility with existing configuration methods:
|
|
420
|
-
|
|
421
|
-
[source,ruby]
|
|
422
|
-
----
|
|
423
|
-
# Traditional configuration still works
|
|
424
|
-
Canon::Config.configure do |config|
|
|
425
|
-
config.xml.diff.algorithm = :semantic
|
|
426
|
-
end
|
|
427
|
-
|
|
428
|
-
# ENV can override
|
|
429
|
-
ENV['CANON_XML_DIFF_ALGORITHM'] = 'dom'
|
|
430
|
-
config = Canon::Config.new
|
|
431
|
-
puts config.xml.diff.algorithm # => :dom
|
|
432
|
-
----
|
|
433
|
-
|
|
434
|
-
== Complete Variable Reference
|
|
435
|
-
|
|
436
|
-
=== Diff Configuration Variables
|
|
437
|
-
|
|
438
|
-
[cols="1,2,1"]
|
|
439
|
-
|===
|
|
440
|
-
|Variable Pattern |Description |Type
|
|
441
|
-
|
|
442
|
-
|`CANON_{FORMAT}_DIFF_MODE`
|
|
443
|
-
|Diff output mode
|
|
444
|
-
|Symbol (`:by_line`, `:by_object`)
|
|
445
|
-
|
|
446
|
-
|`CANON_{FORMAT}_DIFF_USE_COLOR`
|
|
447
|
-
|Enable/disable colored output
|
|
448
|
-
|Boolean
|
|
449
|
-
|
|
450
|
-
|`CANON_{FORMAT}_DIFF_CONTEXT_LINES`
|
|
451
|
-
|Number of context lines
|
|
452
|
-
|Integer
|
|
453
|
-
|
|
454
|
-
|`CANON_{FORMAT}_DIFF_GROUPING_LINES`
|
|
455
|
-
|Number of grouping lines
|
|
456
|
-
|Integer
|
|
457
|
-
|
|
458
|
-
|`CANON_{FORMAT}_DIFF_SHOW_DIFFS`
|
|
459
|
-
|Which diffs to show
|
|
460
|
-
|Symbol (`:all`, `:informative`, `:normative`)
|
|
461
|
-
|
|
462
|
-
|`CANON_{FORMAT}_DIFF_VERBOSE_DIFF`
|
|
463
|
-
|Enable verbose output
|
|
464
|
-
|Boolean
|
|
465
|
-
|
|
466
|
-
|`CANON_{FORMAT}_DIFF_ALGORITHM`
|
|
467
|
-
|Diff algorithm to use
|
|
468
|
-
|Symbol (`:dom`, `:semantic`)
|
|
469
|
-
|
|
470
|
-
|`CANON_{FORMAT}_DIFF_SHOW_COMPARE`
|
|
471
|
-
|Show algorithm comparison
|
|
472
|
-
|Boolean
|
|
473
|
-
|
|
474
|
-
|`CANON_{FORMAT}_DIFF_MAX_FILE_SIZE`
|
|
475
|
-
|Maximum file size in bytes
|
|
476
|
-
|Integer (default: 5,242,880)
|
|
477
|
-
|
|
478
|
-
|`CANON_{FORMAT}_DIFF_MAX_NODE_COUNT`
|
|
479
|
-
|Maximum tree node count
|
|
480
|
-
|Integer (default: 10,000)
|
|
481
|
-
|
|
482
|
-
|`CANON_{FORMAT}_DIFF_MAX_DIFF_LINES`
|
|
483
|
-
|Maximum diff output lines
|
|
484
|
-
|Integer (default: 10,000)
|
|
485
|
-
|===
|
|
486
|
-
|
|
487
|
-
=== Match Configuration Variables
|
|
488
|
-
|
|
489
|
-
[cols="1,2,1"]
|
|
490
|
-
|===
|
|
491
|
-
|Variable Pattern |Description |Type
|
|
492
|
-
|
|
493
|
-
|`CANON_{FORMAT}_MATCH_PROFILE`
|
|
494
|
-
|Match profile to use
|
|
495
|
-
|Symbol
|
|
496
|
-
|===
|
|
497
|
-
|
|
498
|
-
=== Global Variables
|
|
499
|
-
|
|
500
|
-
Drop the `{FORMAT}_DIFF_` or `{FORMAT}_MATCH_` segment so that only `CANON_` followed by the attribute name remains:
|
|
501
|
-
|
|
502
|
-
[source,bash]
|
|
503
|
-
----
|
|
504
|
-
CANON_ALGORITHM=semantic
|
|
505
|
-
CANON_USE_COLOR=false
|
|
506
|
-
CANON_PROFILE=strict
|
|
507
|
-
CANON_MAX_FILE_SIZE=5242880
|
|
508
|
-
CANON_MAX_NODE_COUNT=10000
|
|
509
|
-
CANON_MAX_DIFF_LINES=10000
|
|
510
|
-
----
|
|
511
|
-
|
|
512
|
-
== Troubleshooting
|
|
513
|
-
|
|
514
|
-
=== ENV Variable Not Taking Effect
|
|
515
|
-
|
|
516
|
-
Check the priority chain. If a programmatic value seems to override ENV, verify:
|
|
517
|
-
|
|
518
|
-
. The ENV variable is set before creating the Config instance
|
|
519
|
-
. The variable name follows the correct naming convention
|
|
520
|
-
. The value is valid for the attribute type
|
|
521
|
-
|
|
522
|
-
=== Type Conversion Errors
|
|
523
|
-
|
|
524
|
-
If you encounter type conversion errors:
|
|
525
|
-
|
|
526
|
-
. Check that boolean values use accepted strings (`true`, `false`, `1`, `0`, `yes`, `no`)
|
|
527
|
-
. Ensure integer values are valid integers
|
|
528
|
-
. Verify symbol values don't contain special characters
|
|
529
|
-
|
|
530
|
-
=== Debugging
|
|
531
|
-
|
|
532
|
-
You can inspect the resolver to see which values are from ENV:
|
|
533
|
-
|
|
534
|
-
[source,ruby]
|
|
535
|
-
----
|
|
536
|
-
config = Canon::Config.new
|
|
537
|
-
resolver = config.xml.diff.instance_variable_get(:@resolver)
|
|
538
|
-
|
|
539
|
-
puts "ENV values: #{resolver.env.inspect}"
|
|
540
|
-
puts "Programmatic values: #{resolver.programmatic.inspect}"
|
|
541
|
-
puts "Defaults: #{resolver.defaults.inspect}"
|
|
542
|
-
puts "Source of algorithm: #{resolver.source_for(:algorithm)}"
----
|
|
543
|
-
|
|
544
|
-
== Troubleshooting Semantic Tree Algorithm
|
|
545
|
-
|
|
546
|
-
When using the semantic tree diff algorithm (`CANON_ALGORITHM=semantic`), you may encounter specific issues. This section provides solutions for common problems.
|
|
547
|
-
|
|
548
|
-
=== Algorithm Not Taking Effect
|
|
549
|
-
|
|
550
|
-
**Symptom:**
|
|
551
|
-
|
|
552
|
-
Output shows traditional line-by-line diff instead of operation-level analysis.
|
|
553
|
-
|
|
554
|
-
**Solutions:**
|
|
555
|
-
|
|
556
|
-
1. Verify environment variable is set correctly:
|
|
557
|
-
+
|
|
558
|
-
[source,bash]
|
|
559
|
-
----
|
|
560
|
-
echo $CANON_ALGORITHM
|
|
561
|
-
----
|
|
562
|
-
+
|
|
563
|
-
Should output: `semantic`
|
|
564
|
-
|
|
565
|
-
2. Ensure `verbose: true` is set to see operations:
|
|
566
|
-
+
|
|
567
|
-
[source,ruby]
|
|
568
|
-
----
|
|
569
|
-
result = Canon::Comparison.equivalent?(doc1, doc2,
|
|
570
|
-
verbose: true, # Required for operations
|
|
571
|
-
diff_algorithm: :semantic
|
|
572
|
-
)
|
|
573
|
-
----
|
|
574
|
-
|
|
575
|
-
3. Check that ENV variable is set before creating Config instance:
|
|
576
|
-
+
|
|
577
|
-
[source,ruby]
|
|
578
|
-
----
|
|
579
|
-
ENV['CANON_ALGORITHM'] = 'semantic' # Set before requiring Canon
|
|
580
|
-
require 'canon'
|
|
581
|
-
----
|
|
582
|
-
|
|
583
|
-
=== Performance Issues / Hangs
|
|
584
|
-
|
|
585
|
-
**Symptom:**
|
|
586
|
-
|
|
587
|
-
Comparison hangs, takes very long, or consumes excessive memory.
|
|
588
|
-
|
|
589
|
-
**Cause:**
|
|
590
|
-
|
|
591
|
-
The semantic tree diff has O(n²) complexity in the similarity matching phase. Large documents (>10,000 nodes) can be slow.
|
|
592
|
-
|
|
593
|
-
**Solutions:**
|
|
594
|
-
|
|
595
|
-
1. Increase size limits to check if document exceeds defaults:
|
|
596
|
-
+
|
|
597
|
-
[source,bash]
|
|
598
|
-
----
|
|
599
|
-
export CANON_MAX_NODE_COUNT=20000
|
|
600
|
-
export CANON_MAX_FILE_SIZE=10485760
|
|
601
|
-
bundle exec rspec
|
|
602
|
-
----
|
|
603
|
-
|
|
604
|
-
2. Disable expensive matching phases:
|
|
605
|
-
+
|
|
606
|
-
[source,ruby]
|
|
607
|
-
----
|
|
608
|
-
Canon::Comparison.equivalent?(doc1, doc2,
|
|
609
|
-
diff_algorithm: :semantic,
|
|
610
|
-
match: {
|
|
611
|
-
similarity_matching: false, # Skip if exact matches suffice
|
|
612
|
-
propagation: false # Skip propagation
|
|
613
|
-
}
|
|
614
|
-
)
|
|
615
|
-
----
|
|
616
|
-
|
|
617
|
-
3. Switch to DOM diff for large files:
|
|
618
|
-
+
|
|
619
|
-
[source,bash]
|
|
620
|
-
----
|
|
621
|
-
export CANON_ALGORITHM=dom
|
|
622
|
-
----
|
|
623
|
-
+
|
|
624
|
-
Or conditionally in code:
|
|
625
|
-
+
|
|
626
|
-
[source,ruby]
|
|
627
|
-
----
|
|
628
|
-
algorithm = node_count > 5000 ? :dom : :semantic
|
|
629
|
-
Canon::Comparison.equivalent?(doc1, doc2, diff_algorithm: algorithm)
|
|
630
|
-
----
|
|
631
|
-
|
|
632
|
-
4. Increase similarity threshold to reduce candidates:
|
|
633
|
-
+
|
|
634
|
-
[source,bash]
|
|
635
|
-
----
|
|
636
|
-
export CANON_XML_MATCH_SIMILARITY_THRESHOLD=0.98
|
|
637
|
-
----
|
|
638
|
-
|
|
639
|
-
=== Too Many False Matches
|
|
640
|
-
|
|
641
|
-
**Symptom:**
|
|
642
|
-
|
|
643
|
-
Unrelated nodes are matched together, causing incorrect UPDATE operations instead of INSERT/DELETE.
|
|
644
|
-
|
|
645
|
-
**Cause:**
|
|
646
|
-
|
|
647
|
-
Similarity threshold too low (too lenient).
|
|
648
|
-
|
|
649
|
-
**Solutions:**
|
|
650
|
-
|
|
651
|
-
1. Increase similarity threshold:
|
|
652
|
-
+
|
|
653
|
-
[source,ruby]
|
|
654
|
-
----
|
|
655
|
-
Canon::Comparison.equivalent?(doc1, doc2,
|
|
656
|
-
diff_algorithm: :semantic,
|
|
657
|
-
match: {
|
|
658
|
-
similarity_threshold: 0.98 # Was 0.95, now stricter
|
|
659
|
-
}
|
|
660
|
-
)
|
|
661
|
-
----
|
|
662
|
-
|
|
663
|
-
2. Disable similarity matching to use only exact matches:
|
|
664
|
-
+
|
|
665
|
-
[source,ruby]
|
|
666
|
-
----
|
|
667
|
-
Canon::Comparison.equivalent?(doc1, doc2,
|
|
668
|
-
diff_algorithm: :semantic,
|
|
669
|
-
match: {
|
|
670
|
-
similarity_matching: false # Only hash-based exact matches
|
|
671
|
-
}
|
|
672
|
-
)
|
|
673
|
-
----
|
|
674
|
-
|
|
675
|
-
=== Too Few Matches / Missing MOVE Operations
|
|
676
|
-
|
|
677
|
-
**Symptom:**
|
|
678
|
-
|
|
679
|
-
Similar content shows as DELETE + INSERT instead of UPDATE or MOVE. Match rate is low in statistics.
|
|
680
|
-
|
|
681
|
-
**Cause:**
|
|
682
|
-
|
|
683
|
-
Similarity threshold too high (too strict).
|
|
684
|
-
|
|
685
|
-
**Solutions:**
|
|
686
|
-
|
|
687
|
-
1. Decrease similarity threshold:
|
|
688
|
-
+
|
|
689
|
-
[source,ruby]
|
|
690
|
-
----
|
|
691
|
-
Canon::Comparison.equivalent?(doc1, doc2,
|
|
692
|
-
diff_algorithm: :semantic,
|
|
693
|
-
match: {
|
|
694
|
-
similarity_threshold: 0.85 # Was 0.95, now more lenient
|
|
695
|
-
}
|
|
696
|
-
)
|
|
697
|
-
----
|
|
698
|
-
|
|
699
|
-
2. Ensure all matching phases are enabled:
|
|
700
|
-
+
|
|
701
|
-
[source,ruby]
|
|
702
|
-
----
|
|
703
|
-
Canon::Comparison.equivalent?(doc1, doc2,
|
|
704
|
-
diff_algorithm: :semantic,
|
|
705
|
-
match: {
|
|
706
|
-
hash_matching: true,
|
|
707
|
-
similarity_matching: true,
|
|
708
|
-
propagation: true
|
|
709
|
-
}
|
|
710
|
-
)
|
|
711
|
-
----
|
|
712
|
-
|
|
713
|
-
3. Use preprocessing to normalize content:
|
|
714
|
-
+
|
|
715
|
-
[source,ruby]
|
|
716
|
-
----
|
|
717
|
-
Canon::Comparison.equivalent?(doc1, doc2,
|
|
718
|
-
diff_algorithm: :semantic,
|
|
719
|
-
preprocessing: :c14n # Normalize before comparison
|
|
720
|
-
)
|
|
721
|
-
----
|
|
722
|
-
|
|
723
|
-
=== Metadata Elements Not Treated as Informative
|
|
724
|
-
|
|
725
|
-
**Symptom:**
|
|
726
|
-
|
|
727
|
-
Changes to `semx`, `fmt-*`, `autonum` elements are marked as normative (must-fix) instead of informative.
|
|
728
|
-
|
|
729
|
-
**Cause:**
|
|
730
|
-
|
|
731
|
-
Element not in metadata elements list.
|
|
732
|
-
|
|
733
|
-
**Solutions:**
|
|
734
|
-
|
|
735
|
-
1. Verify element is in the metadata list (see link:SEMANTIC_TREE_DIFF.adoc#metadata-elements[SEMANTIC_TREE_DIFF.adoc]).
|
|
736
|
-
|
|
737
|
-
2. Use match dimensions to ignore specific changes:
|
|
738
|
-
+
|
|
739
|
-
[source,ruby]
|
|
740
|
-
----
|
|
741
|
-
Canon::Comparison.equivalent?(doc1, doc2,
|
|
742
|
-
diff_algorithm: :semantic,
|
|
743
|
-
match: {
|
|
744
|
-
text_content: :ignore # All text differences → informative
|
|
745
|
-
}
|
|
746
|
-
)
|
|
747
|
-
----
|
|
748
|
-
|
|
749
|
-
=== Whitespace Differences in `<pre>` or `<code>` Elements
|
|
750
|
-
|
|
751
|
-
**Symptom:**
|
|
752
|
-
|
|
753
|
-
Documents with identical semantic content fail due to whitespace differences in preformatted blocks.
|
|
754
|
-
|
|
755
|
-
**Cause:**
|
|
756
|
-
|
|
757
|
-
Whitespace is preserved in whitespace-sensitive elements (`pre`, `code`, `textarea`, `script`, `style`).
|
|
758
|
-
|
|
759
|
-
**Solutions:**
|
|
760
|
-
|
|
761
|
-
This is by design: whitespace in these elements is semantically significant. To ignore these differences anyway:
|
|
762
|
-
|
|
763
|
-
1. Normalize whitespace before comparison:
|
|
764
|
-
+
|
|
765
|
-
[source,ruby]
|
|
766
|
-
----
|
|
767
|
-
def normalize_pre_whitespace(xml)
|
|
768
|
-
doc = Nokogiri::XML(xml)
|
|
769
|
-
doc.css('pre, code').each do |elem|
|
|
770
|
-
elem.content = elem.content.strip.gsub(/\s+/, ' ')
|
|
771
|
-
end
|
|
772
|
-
doc.to_xml
|
|
773
|
-
end
|
|
774
|
-
|
|
775
|
-
normalized1 = normalize_pre_whitespace(doc1)
|
|
776
|
-
normalized2 = normalize_pre_whitespace(doc2)
|
|
777
|
-
|
|
778
|
-
Canon::Comparison.equivalent?(normalized1, normalized2,
|
|
779
|
-
diff_algorithm: :semantic
|
|
780
|
-
)
|
|
781
|
-
----
|
|
782
|
-
|
|
783
|
-
2. Or ignore text content differences globally:
|
|
784
|
-
+
|
|
785
|
-
[source,ruby]
|
|
786
|
-
----
|
|
787
|
-
Canon::Comparison.equivalent?(doc1, doc2,
|
|
788
|
-
diff_algorithm: :semantic,
|
|
789
|
-
match: {
|
|
790
|
-
text_content: :ignore # Ignore all text differences
|
|
791
|
-
}
|
|
792
|
-
)
|
|
793
|
-
----
|
|
794
|
-
|
|
795
|
-
=== File Size / Node Count Limit Exceeded
|
|
796
|
-
|
|
797
|
-
**Symptom:**
|
|
798
|
-
|
|
799
|
-
Error message: "File size exceeds maximum limit" or "Node count exceeds maximum limit"
|
|
800
|
-
|
|
801
|
-
**Cause:**
|
|
802
|
-
|
|
803
|
-
Document exceeds configured size limits (default: 5MB file size, 10,000 nodes).
|
|
804
|
-
|
|
805
|
-
**Solutions:**
|
|
806
|
-
|
|
807
|
-
1. Increase limits via environment variables:
|
|
808
|
-
+
|
|
809
|
-
[source,bash]
|
|
810
|
-
----
|
|
811
|
-
export CANON_MAX_FILE_SIZE=10485760 # 10MB
|
|
812
|
-
export CANON_MAX_NODE_COUNT=50000 # 50,000 nodes
|
|
813
|
-
export CANON_MAX_DIFF_LINES=20000 # 20,000 lines output
|
|
814
|
-
bundle exec rspec
|
|
815
|
-
----
|
|
816
|
-
|
|
817
|
-
2. Disable limits (not recommended):
|
|
818
|
-
+
|
|
819
|
-
[source,bash]
|
|
820
|
-
----
|
|
821
|
-
export CANON_MAX_FILE_SIZE=0
|
|
822
|
-
export CANON_MAX_NODE_COUNT=-1
|
|
823
|
-
----
|
|
824
|
-
+
|
|
825
|
-
WARNING: Disabling limits may cause hangs on pathologically large files.
|
|
826
|
-
|
|
827
|
-
3. Use DOM diff for oversized files:
|
|
828
|
-
+
|
|
829
|
-
[source,ruby]
|
|
830
|
-
----
|
|
831
|
-
algorithm = file_size > 5_242_880 ? :dom : :semantic
|
|
832
|
-
Canon::Comparison.equivalent?(doc1, doc2, diff_algorithm: algorithm)
|
|
833
|
-
----
|
|
834
|
-
|
|
835
|
-
=== Debugging Algorithm Selection
|
|
836
|
-
|
|
837
|
-
To verify which algorithm is being used:
|
|
838
|
-
|
|
839
|
-
[source,ruby]
|
|
840
|
-
----
|
|
841
|
-
config = Canon::Config.instance
|
|
842
|
-
puts "XML algorithm: #{config.xml.diff.algorithm}"
|
|
843
|
-
puts "HTML algorithm: #{config.html.diff.algorithm}"
|
|
844
|
-
|
|
845
|
-
result = Canon::Comparison.equivalent?(doc1, doc2, verbose: true)
|
|
846
|
-
puts "Used algorithm: #{result.match_options[:diff_algorithm]}"
|
|
847
|
-
puts "Tree diff enabled: #{result.match_options[:tree_diff_enabled]}"
|
|
848
|
-
----
|
|
849
|
-
|
|
850
|
-
=== Getting Help
|
|
851
|
-
|
|
852
|
-
If issues persist:
|
|
853
|
-
|
|
854
|
-
1. Check the link:SEMANTIC_TREE_DIFF.adoc[SEMANTIC_TREE_DIFF.adoc] documentation
|
|
855
|
-
2. Review link:TREE_DIFF.adoc[TREE_DIFF.adoc] for operation details
|
|
856
|
-
3. Enable verbose output to inspect operations:
|
|
857
|
-
+
|
|
858
|
-
[source,ruby]
|
|
859
|
-
----
|
|
860
|
-
result = Canon::Comparison.equivalent?(doc1, doc2,
|
|
861
|
-
verbose: true,
|
|
862
|
-
diff_algorithm: :semantic
|
|
863
|
-
)
|
|
864
|
-
|
|
865
|
-
result.operations.each do |op|
|
|
866
|
-
puts "#{op.type}: #{op.inspect}"
|
|
867
|
-
end
|
|
868
|
-
|
|
869
|
-
stats = result.match_options[:tree_diff_statistics]
|
|
870
|
-
puts "Match rate: #{stats[:match_rate]}"
|
|
871
|
-
puts "Total operations: #{result.operations.size}"
|
|
872
|
-
----
|
|
873
|
-
|
|
874
|
-
4. Compare with DOM diff output to identify algorithm-specific issues
|
|
875
|
-
5. Report bugs at https://github.com/lutaml/canon/issues
|
|
876
|
-
----
|