canon 0.1.8 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +83 -22
  3. data/docs/Gemfile +1 -0
  4. data/docs/_config.yml +90 -1
  5. data/docs/advanced/diff-classification.adoc +196 -24
  6. data/docs/features/match-options/index.adoc +239 -1
  7. data/lib/canon/comparison/format_detector.rb +2 -1
  8. data/lib/canon/comparison/html_comparator.rb +19 -8
  9. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  10. data/lib/canon/comparison/markup_comparator.rb +109 -2
  11. data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
  12. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  13. data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
  14. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +108 -0
  15. data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
  16. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
  17. data/lib/canon/comparison/xml_comparator.rb +240 -23
  18. data/lib/canon/comparison/xml_node_comparison.rb +25 -3
  19. data/lib/canon/diff/diff_classifier.rb +119 -5
  20. data/lib/canon/diff/formatting_detector.rb +1 -1
  21. data/lib/canon/diff/xml_serialization_formatter.rb +153 -0
  22. data/lib/canon/rspec_matchers.rb +37 -8
  23. data/lib/canon/version.rb +1 -1
  24. data/lib/canon/xml/data_model.rb +24 -13
  25. metadata +4 -78
  26. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
  27. data/false_positive_analysis.txt +0 -0
  28. data/file1.html +0 -1
  29. data/file2.html +0 -1
  30. data/old-docs/ADVANCED_TOPICS.adoc +0 -20
  31. data/old-docs/BASIC_USAGE.adoc +0 -16
  32. data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
  33. data/old-docs/CLI.adoc +0 -497
  34. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  35. data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
  36. data/old-docs/DIFF_FORMATTING.adoc +0 -540
  37. data/old-docs/DIFF_PARAMETERS.adoc +0 -261
  38. data/old-docs/DOM_DIFF.adoc +0 -1017
  39. data/old-docs/ENV_CONFIG.adoc +0 -876
  40. data/old-docs/FORMATS.adoc +0 -867
  41. data/old-docs/INPUT_VALIDATION.adoc +0 -477
  42. data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
  43. data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
  44. data/old-docs/MATCH_OPTIONS.adoc +0 -912
  45. data/old-docs/MODES.adoc +0 -432
  46. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  47. data/old-docs/OPTIONS.adoc +0 -1387
  48. data/old-docs/PREPROCESSING.adoc +0 -491
  49. data/old-docs/README.old.adoc +0 -2831
  50. data/old-docs/RSPEC.adoc +0 -814
  51. data/old-docs/RUBY_API.adoc +0 -485
  52. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
  53. data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
  54. data/old-docs/STRING_COMPARE.adoc +0 -345
  55. data/old-docs/TMP.adoc +0 -3384
  56. data/old-docs/TREE_DIFF.adoc +0 -1080
  57. data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
  58. data/old-docs/VERBOSE.adoc +0 -482
  59. data/old-docs/VISUALIZATION_MAP.adoc +0 -625
  60. data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
  61. data/scripts/analyze_current_state.rb +0 -85
  62. data/scripts/analyze_false_positives.rb +0 -114
  63. data/scripts/analyze_remaining_failures.rb +0 -105
  64. data/scripts/compare_current_failures.rb +0 -95
  65. data/scripts/compare_dom_tree_diff.rb +0 -158
  66. data/scripts/compare_failures.rb +0 -151
  67. data/scripts/debug_attribute_extraction.rb +0 -66
  68. data/scripts/debug_blocks_839.rb +0 -115
  69. data/scripts/debug_meta_matching.rb +0 -52
  70. data/scripts/debug_p_matching.rb +0 -192
  71. data/scripts/debug_signature_matching.rb +0 -118
  72. data/scripts/debug_sourcecode_124.rb +0 -32
  73. data/scripts/debug_whitespace_sensitive.rb +0 -192
  74. data/scripts/extract_false_positives.rb +0 -138
  75. data/scripts/find_actual_false_positives.rb +0 -125
  76. data/scripts/investigate_all_false_positives.rb +0 -161
  77. data/scripts/investigate_batch1.rb +0 -127
  78. data/scripts/investigate_classification.rb +0 -150
  79. data/scripts/investigate_classification_detailed.rb +0 -190
  80. data/scripts/investigate_common_failures.rb +0 -342
  81. data/scripts/investigate_false_negative.rb +0 -80
  82. data/scripts/investigate_false_positive.rb +0 -83
  83. data/scripts/investigate_false_positives.rb +0 -227
  84. data/scripts/investigate_false_positives_batch.rb +0 -163
  85. data/scripts/investigate_mixed_content.rb +0 -125
  86. data/scripts/investigate_remaining_16.rb +0 -214
  87. data/scripts/run_single_test.rb +0 -29
  88. data/scripts/test_all_false_positives.rb +0 -95
  89. data/scripts/test_attribute_details.rb +0 -61
  90. data/scripts/test_both_algorithms.rb +0 -49
  91. data/scripts/test_both_simple.rb +0 -49
  92. data/scripts/test_enhanced_semantic_output.rb +0 -125
  93. data/scripts/test_readme_examples.rb +0 -131
  94. data/scripts/test_semantic_tree_diff.rb +0 -99
  95. data/scripts/test_semantic_ux_improvements.rb +0 -135
  96. data/scripts/test_single_false_positive.rb +0 -119
  97. data/scripts/test_size_limits.rb +0 -99
  98. data/test_html_1.html +0 -21
  99. data/test_html_2.html +0 -21
  100. data/test_nokogiri.rb +0 -33
  101. data/test_normalize.rb +0 -45
@@ -1,876 +0,0 @@
1
- = Environment Variable Configuration
2
- :toc:
3
- :toclevels: 3
4
-
5
- == General
6
-
7
- Canon supports configuration through environment variables, allowing you to override default settings and programmatically set values without modifying code. This is particularly useful for CI/CD pipelines, containerized environments, and different deployment scenarios.
8
-
9
- == Priority Chain
10
-
11
- Configuration values are resolved using the following priority (highest to lowest):
12
-
13
- . **Environment Variables** (highest priority)
14
- . **Programmatic Configuration** (via `Canon::Config`)
15
- . **Default Values** (lowest priority)
16
-
17
- This means environment variables always override programmatic settings, which in turn override defaults.
18
-
19
- == Naming Convention
20
-
21
- Environment variables follow a consistent naming pattern:
22
-
23
- [source]
24
- ----
25
- CANON_{FORMAT}_{CONFIG_TYPE}_{ATTRIBUTE}
26
- ----
27
-
28
- Where:
29
-
30
- * `FORMAT`: `XML`, `HTML`, `JSON`, `YAML`, or `STRING`
31
- * `CONFIG_TYPE`: `DIFF` or `MATCH`
32
- * `ATTRIBUTE`: The configuration attribute name (e.g., `ALGORITHM`, `MODE`, `PROFILE`)
33
-
34
- === Global Variables
35
-
36
- You can also use global environment variables that apply to all formats by omitting the format prefix:
37
-
38
- [source]
39
- ----
40
- CANON_{ATTRIBUTE}
41
- ----
42
-
43
- Global variables are overridden by format-specific variables.
44
-
45
- == Diff Configuration
46
-
47
- === Format-Specific Variables
48
-
49
- ==== Algorithm Selection
50
-
51
- Canon supports two diff algorithms:
52
-
53
- * **`dom`**: DOM-based tree diff (default, stable)
54
- * **`semantic`**: Semantic tree diff (experimental, more sophisticated)
55
-
56
- [source,bash]
57
- ----
58
- # Set algorithm for XML diff
59
- export CANON_XML_DIFF_ALGORITHM=semantic
60
-
61
- # Set algorithm for HTML diff
62
- export CANON_HTML_DIFF_ALGORITHM=dom
63
-
64
- # Set globally for all formats
65
- export CANON_ALGORITHM=semantic
66
- ----
67
-
68
- **Testing/Debugging Use Case**:
69
-
70
- When comparing algorithm behavior in test suites:
71
-
72
- [source,bash]
73
- ----
74
- # Test with DOM algorithm (baseline)
75
- CANON_ALGORITHM=dom bundle exec rspec
76
-
77
- # Test with semantic algorithm
78
- CANON_ALGORITHM=semantic bundle exec rspec
79
-
80
- # Format-specific (useful when formats behave differently)
81
- CANON_HTML_DIFF_ALGORITHM=dom CANON_XML_DIFF_ALGORITHM=dom bundle exec rspec
82
- ----
83
-
84
- **RSpec Configuration**:
85
-
86
- In your `spec_helper.rb`, simply set the default algorithm:
87
-
88
- [source,ruby]
89
- ----
90
- # Canon's ENV variables will automatically override this
91
- Canon::Config.instance.html.diff.algorithm = :semantic
92
- Canon::Config.instance.xml.diff.algorithm = :semantic
93
- ----
94
-
95
- Then use ENV variables to override for specific test runs without modifying the spec_helper.
96
-
97
- Valid values: `dom`, `semantic`
98
-
99
- ==== Diff Mode
100
-
101
- [source,bash]
102
- ----
103
- # Set diff mode for JSON
104
- export CANON_JSON_DIFF_MODE=by_object
105
-
106
- # Set diff mode for YAML
107
- export CANON_YAML_DIFF_MODE=by_line
108
- ----
109
-
110
- Valid values: `by_line`, `by_object`
111
-
112
- ==== Color Output
113
-
114
- [source,bash]
115
- ----
116
- # Disable color output for XML
117
- export CANON_XML_DIFF_USE_COLOR=false
118
-
119
- # Enable color output for HTML
120
- export CANON_HTML_DIFF_USE_COLOR=true
121
- ----
122
-
123
- Valid values: `true`, `false`, `1`, `0`, `yes`, `no`
124
-
125
- ==== Context Lines
126
-
127
- [source,bash]
128
- ----
129
- # Set context lines for XML diff
130
- export CANON_XML_DIFF_CONTEXT_LINES=5
131
- ----
132
-
133
- Valid values: Any positive integer
134
-
135
- ==== Grouping Lines
136
-
137
- [source,bash]
138
- ----
139
- # Set grouping lines for XML diff
140
- export CANON_XML_DIFF_GROUPING_LINES=20
141
- ----
142
-
143
- Valid values: Any positive integer
144
-
145
- ==== Show Diffs
146
-
147
- [source,bash]
148
- ----
149
- # Show only informative diffs
150
- export CANON_XML_DIFF_SHOW_DIFFS=informative
151
-
152
- # Show all diffs
153
- export CANON_XML_DIFF_SHOW_DIFFS=all
154
- ----
155
-
156
- Valid values: `all`, `informative`, `normative`
157
-
158
- ==== Verbose Diff
159
-
160
- [source,bash]
161
- ----
162
- # Enable verbose diff output
163
- export CANON_XML_DIFF_VERBOSE_DIFF=true
164
- ----
165
-
166
- Valid values: `true`, `false`, `1`, `0`, `yes`, `no`
167
-
168
- ==== Show Compare
169
-
170
- [source,bash]
171
- ----
172
- # Enable side-by-side algorithm comparison
173
- export CANON_XML_DIFF_SHOW_COMPARE=true
174
- ----
175
-
176
- Valid values: `true`, `false`, `1`, `0`, `yes`, `no`
177
-
178
- === Global Diff Variables
179
-
180
- Apply to all formats unless overridden by format-specific variables:
181
-
182
- [source,bash]
183
- ----
184
- # Set algorithm globally
185
- export CANON_ALGORITHM=semantic
186
-
187
- # Disable color globally
188
- export CANON_USE_COLOR=false
189
-
190
- # Set diff mode globally
191
- export CANON_MODE=by_object
192
-
193
- # Set context lines globally
194
- export CANON_CONTEXT_LINES=10
195
-
196
- # Set grouping lines globally
197
- export CANON_GROUPING_LINES=15
198
-
199
- # Set show_diffs globally
200
- export CANON_SHOW_DIFFS=informative
201
-
202
- # Enable verbose diff globally
203
- export CANON_VERBOSE_DIFF=true
204
-
205
- # Enable show_compare globally
206
- export CANON_SHOW_COMPARE=true
207
- ----
208
-
209
- ==== Size Limits
210
-
211
- Canon provides configurable size limits to prevent hangs or excessive resource usage when processing very large files.
212
-
213
- ===== File Size Limit
214
-
215
- Maximum file size in bytes before comparison is rejected:
216
-
217
- [source,bash]
218
- ----
219
- # Set max file size to 10MB for XML
220
- export CANON_XML_DIFF_MAX_FILE_SIZE=10485760
221
-
222
- # Set globally (5MB default)
223
- export CANON_MAX_FILE_SIZE=5242880
224
- ----
225
-
226
- Valid values: Any positive integer (bytes)
227
-
228
- Default: 5,242,880 bytes (5MB)
229
-
230
- ===== Node Count Limit
231
-
232
- Maximum number of nodes in a tree structure before comparison is rejected:
233
-
234
- [source,bash]
235
- ----
236
- # Set max node count for XML diff
237
- export CANON_XML_DIFF_MAX_NODE_COUNT=20000
238
-
239
- # Set globally (10,000 default)
240
- export CANON_MAX_NODE_COUNT=10000
241
- ----
242
-
243
- Valid values: Any positive integer
244
-
245
- Default: 10,000 nodes
246
-
247
- ===== Diff Output Lines Limit
248
-
249
- Maximum number of lines in diff output before truncation:
250
-
251
- [source,bash]
252
- ----
253
- # Set max diff lines for XML
254
- export CANON_XML_DIFF_MAX_DIFF_LINES=15000
255
-
256
- # Set globally (10,000 default)
257
- export CANON_MAX_DIFF_LINES=10000
258
- ----
259
-
260
- Valid values: Any positive integer
261
-
262
- Default: 10,000 lines
263
-
264
- ===== Use Case: Large SVG Files
265
-
266
- When working with large SVG files (e.g., 3.5MB) that may cause hangs:
267
-
268
- [source,bash]
269
- ----
270
- # Increase limits for large SVG processing
271
- export CANON_MAX_FILE_SIZE=10485760 # 10MB
272
- export CANON_MAX_NODE_COUNT=50000 # 50,000 nodes
273
- export CANON_MAX_DIFF_LINES=20000 # 20,000 lines
274
-
275
- bundle exec rspec spec/test_031_spec.rb
276
- ----
277
-
278
- ===== Disabling Limits
279
-
280
- To disable a limit, set it to 0 or a negative value:
281
-
282
- [source,bash]
283
- ----
284
- # Disable file size limit (not recommended)
285
- export CANON_MAX_FILE_SIZE=0
286
-
287
- # Disable node count limit (use with caution)
288
- export CANON_MAX_NODE_COUNT=-1
289
- ----
290
-
291
- WARNING: Disabling limits may cause Canon to hang or consume excessive memory on pathologically large inputs.
292
-
293
- == Match Configuration
294
-
295
- === Format-Specific Match Profile
296
-
297
- [source,bash]
298
- ----
299
- # Set match profile for XML
300
- export CANON_XML_MATCH_PROFILE=ignore_whitespace
301
-
302
- # Set match profile for HTML
303
- export CANON_HTML_MATCH_PROFILE=strict
304
- ----
305
-
306
- Valid values: Any match profile name (e.g., `ignore_whitespace`, `strict`, `semantic`)
307
-
308
- === Global Match Profile
309
-
310
- [source,bash]
311
- ----
312
- # Set match profile globally
313
- export CANON_PROFILE=ignore_whitespace
314
- ----
315
-
316
- == Usage Examples
317
-
318
- === Example 1: CI/CD Environment
319
-
320
- [source,bash]
321
- ----
322
- # .github/workflows/test.yml or similar
323
- export CANON_USE_COLOR=false
324
- export CANON_ALGORITHM=semantic
325
- export CANON_SHOW_COMPARE=true
326
-
327
- bundle exec rspec
328
- ----
329
-
330
- === Example 2: Docker Container
331
-
332
- [source,dockerfile]
333
- ----
334
- # Dockerfile
335
- ENV CANON_XML_DIFF_ALGORITHM=semantic
336
- ENV CANON_USE_COLOR=false
337
- ENV CANON_CONTEXT_LINES=5
338
- ----
339
-
340
- === Example 3: Different Environments
341
-
342
- [source,bash]
343
- ----
344
- # Development
345
- export CANON_VERBOSE_DIFF=true
346
- export CANON_USE_COLOR=true
347
-
348
- # Production
349
- export CANON_VERBOSE_DIFF=false
350
- export CANON_USE_COLOR=false
351
- export CANON_XML_MATCH_PROFILE=strict
352
- ----
353
-
354
- === Example 4: Format-Specific Configuration
355
-
356
- [source,bash]
357
- ----
358
- # XML uses semantic diff
359
- export CANON_XML_DIFF_ALGORITHM=semantic
360
-
361
- # HTML uses DOM diff
362
- export CANON_HTML_DIFF_ALGORITHM=dom
363
-
364
- # All formats disable color
365
- export CANON_USE_COLOR=false
366
- ----
367
-
368
- == Programmatic Override
369
-
370
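- Environment variables take precedence over programmatic settings. You can still read the effective value programmatically, but assigning a different value has no effect while the corresponding environment variable is set: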
371
-
372
- [source,ruby]
373
- ----
374
- # Environment variable is set
375
- ENV['CANON_XML_DIFF_ALGORITHM'] = 'semantic'
376
-
377
- # Config respects ENV variable
378
- config = Canon::Config.new
379
- puts config.xml.diff.algorithm # => :semantic
380
-
381
- # Programmatic setting is ignored when ENV is set
382
- config.xml.diff.algorithm = :dom
383
- puts config.xml.diff.algorithm # => :semantic (ENV wins)
384
- ----
385
-
386
- == Type Conversion
387
-
388
- Environment variable values are automatically converted to the appropriate Ruby types:
389
-
390
- === Boolean Values
391
-
392
- Accepted values for boolean attributes:
393
-
394
- * **True**: `true`, `1`, `yes`
395
- * **False**: `false`, `0`, `no`
396
-
397
- Case-insensitive.
398
-
399
- === Integer Values
400
-
401
- Any valid integer string is converted to an integer:
402
-
403
- [source,bash]
404
- ----
405
- export CANON_CONTEXT_LINES=15 # Converted to Integer 15
406
- ----
407
-
408
- === Symbol Values
409
-
410
- String values are converted to symbols:
411
-
412
- [source,bash]
413
- ----
414
- export CANON_ALGORITHM=semantic # Converted to Symbol :semantic
415
- ----
416
-
417
- == Backward Compatibility
418
-
419
- The ENV override system maintains full backward compatibility with existing configuration methods:
420
-
421
- [source,ruby]
422
- ----
423
- # Traditional configuration still works
424
- Canon::Config.configure do |config|
425
- config.xml.diff.algorithm = :semantic
426
- end
427
-
428
- # ENV can override
429
- ENV['CANON_XML_DIFF_ALGORITHM'] = 'dom'
430
- config = Canon::Config.new
431
- puts config.xml.diff.algorithm # => :dom
432
- ----
433
-
434
- == Complete Variable Reference
435
-
436
- === Diff Configuration Variables
437
-
438
- [cols="1,2,1"]
439
- |===
440
- |Variable Pattern |Description |Type
441
-
442
- |`CANON_{FORMAT}_DIFF_MODE`
443
- |Diff output mode
444
- |Symbol (`:by_line`, `:by_object`)
445
-
446
- |`CANON_{FORMAT}_DIFF_USE_COLOR`
447
- |Enable/disable colored output
448
- |Boolean
449
-
450
- |`CANON_{FORMAT}_DIFF_CONTEXT_LINES`
451
- |Number of context lines
452
- |Integer
453
-
454
- |`CANON_{FORMAT}_DIFF_GROUPING_LINES`
455
- |Number of grouping lines
456
- |Integer
457
-
458
- |`CANON_{FORMAT}_DIFF_SHOW_DIFFS`
459
- |Which diffs to show
460
- |Symbol (`:all`, `:informative`, `:normative`)
461
-
462
- |`CANON_{FORMAT}_DIFF_VERBOSE_DIFF`
463
- |Enable verbose output
464
- |Boolean
465
-
466
- |`CANON_{FORMAT}_DIFF_ALGORITHM`
467
- |Diff algorithm to use
468
- |Symbol (`:dom`, `:semantic`)
469
-
470
- |`CANON_{FORMAT}_DIFF_SHOW_COMPARE`
471
- |Show algorithm comparison
472
- |Boolean
473
-
474
- |`CANON_{FORMAT}_DIFF_MAX_FILE_SIZE`
475
- |Maximum file size in bytes
476
- |Integer (default: 5,242,880)
477
-
478
- |`CANON_{FORMAT}_DIFF_MAX_NODE_COUNT`
479
- |Maximum tree node count
480
- |Integer (default: 10,000)
481
-
482
- |`CANON_{FORMAT}_DIFF_MAX_DIFF_LINES`
483
- |Maximum diff output lines
484
- |Integer (default: 10,000)
485
- |===
486
-
487
- === Match Configuration Variables
488
-
489
- [cols="1,2,1"]
490
- |===
491
- |Variable Pattern |Description |Type
492
-
493
- |`CANON_{FORMAT}_MATCH_PROFILE`
494
- |Match profile to use
495
- |Symbol
496
- |===
497
-
498
- === Global Variables
499
-
500
- Global variables omit the `{FORMAT}_DIFF_` or `{FORMAT}_MATCH_` segment, leaving `CANON_` followed by just the attribute name:
501
-
502
- [source,bash]
503
- ----
504
- CANON_ALGORITHM=semantic
505
- CANON_USE_COLOR=false
506
- CANON_PROFILE=strict
507
- CANON_MAX_FILE_SIZE=5242880
508
- CANON_MAX_NODE_COUNT=10000
509
- CANON_MAX_DIFF_LINES=10000
510
- ----
511
-
512
- == Troubleshooting
513
-
514
- === ENV Variable Not Taking Effect
515
-
516
- Check the priority chain. If a programmatic value seems to override ENV, verify:
517
-
518
- . The ENV variable is set before creating the Config instance
519
- . The variable name follows the correct naming convention
520
- . The value is valid for the attribute type
521
-
522
- === Type Conversion Errors
523
-
524
- If you encounter type conversion errors:
525
-
526
- . Check that boolean values use accepted strings (`true`, `false`, `1`, `0`, `yes`, `no`)
527
- . Ensure integer values are valid integers
528
- . Verify symbol values don't contain special characters
529
-
530
- === Debugging
531
-
532
- You can inspect the resolver to see which values are from ENV:
533
-
534
- [source,ruby]
535
- ----
536
- config = Canon::Config.new
537
- resolver = config.xml.diff.instance_variable_get(:@resolver)
538
-
539
- puts "ENV values: #{resolver.env.inspect}"
540
- puts "Programmatic values: #{resolver.programmatic.inspect}"
541
- puts "Defaults: #{resolver.defaults.inspect}"
542
- puts "Source of algorithm: #{resolver.source_for(:algorithm)}"
- ----
543
-
544
- == Troubleshooting Semantic Tree Algorithm
545
-
546
- When using the semantic tree diff algorithm (`CANON_ALGORITHM=semantic`), you may encounter specific issues. This section provides solutions for common problems.
547
-
548
- === Algorithm Not Taking Effect
549
-
550
- **Symptom:**
551
-
552
- Output shows traditional line-by-line diff instead of operation-level analysis.
553
-
554
- **Solutions:**
555
-
556
- 1. Verify environment variable is set correctly:
557
- +
558
- [source,bash]
559
- ----
560
- echo $CANON_ALGORITHM
561
- ----
562
- +
563
- Should output: `semantic`
564
-
565
- 2. Ensure `verbose: true` is set to see operations:
566
- +
567
- [source,ruby]
568
- ----
569
- result = Canon::Comparison.equivalent?(doc1, doc2,
570
- verbose: true, # Required for operations
571
- diff_algorithm: :semantic
572
- )
573
- ----
574
-
575
- 3. Check that ENV variable is set before creating Config instance:
576
- +
577
- [source,ruby]
578
- ----
579
- ENV['CANON_ALGORITHM'] = 'semantic' # Set before requiring Canon
580
- require 'canon'
581
- ----
582
-
583
- === Performance Issues / Hangs
584
-
585
- **Symptom:**
586
-
587
- Comparison hangs, takes very long, or consumes excessive memory.
588
-
589
- **Cause:**
590
-
591
- The semantic tree diff has O(n²) complexity in its similarity matching phase, so large documents (more than 10,000 nodes) can be slow.
592
-
593
- **Solutions:**
594
-
595
- 1. Increase size limits to check if document exceeds defaults:
596
- +
597
- [source,bash]
598
- ----
599
- export CANON_MAX_NODE_COUNT=20000
600
- export CANON_MAX_FILE_SIZE=10485760
601
- bundle exec rspec
602
- ----
603
-
604
- 2. Disable expensive matching phases:
605
- +
606
- [source,ruby]
607
- ----
608
- Canon::Comparison.equivalent?(doc1, doc2,
609
- diff_algorithm: :semantic,
610
- match: {
611
- similarity_matching: false, # Skip if exact matches suffice
612
- propagation: false # Skip propagation
613
- }
614
- )
615
- ----
616
-
617
- 3. Switch to DOM diff for large files:
618
- +
619
- [source,bash]
620
- ----
621
- export CANON_ALGORITHM=dom
622
- ----
623
- +
624
- Or conditionally in code:
625
- +
626
- [source,ruby]
627
- ----
628
- algorithm = node_count > 5000 ? :dom : :semantic
629
- Canon::Comparison.equivalent?(doc1, doc2, diff_algorithm: algorithm)
630
- ----
631
-
632
- 4. Increase similarity threshold to reduce candidates:
633
- +
634
- [source,bash]
635
- ----
636
- export CANON_XML_MATCH_SIMILARITY_THRESHOLD=0.98
637
- ----
638
-
639
- === Too Many False Matches
640
-
641
- **Symptom:**
642
-
643
- Unrelated nodes are matched together, causing incorrect UPDATE operations instead of INSERT/DELETE.
644
-
645
- **Cause:**
646
-
647
- Similarity threshold too low (too lenient).
648
-
649
- **Solutions:**
650
-
651
- 1. Increase similarity threshold:
652
- +
653
- [source,ruby]
654
- ----
655
- Canon::Comparison.equivalent?(doc1, doc2,
656
- diff_algorithm: :semantic,
657
- match: {
658
- similarity_threshold: 0.98 # Was 0.95, now stricter
659
- }
660
- )
661
- ----
662
-
663
- 2. Disable similarity matching to use only exact matches:
664
- +
665
- [source,ruby]
666
- ----
667
- Canon::Comparison.equivalent?(doc1, doc2,
668
- diff_algorithm: :semantic,
669
- match: {
670
- similarity_matching: false # Only hash-based exact matches
671
- }
672
- )
673
- ----
674
-
675
- === Too Few Matches / Missing MOVE Operations
676
-
677
- **Symptom:**
678
-
679
- Similar content shows as DELETE + INSERT instead of UPDATE or MOVE. Match rate is low in statistics.
680
-
681
- **Cause:**
682
-
683
- Similarity threshold too high (too strict).
684
-
685
- **Solutions:**
686
-
687
- 1. Decrease similarity threshold:
688
- +
689
- [source,ruby]
690
- ----
691
- Canon::Comparison.equivalent?(doc1, doc2,
692
- diff_algorithm: :semantic,
693
- match: {
694
- similarity_threshold: 0.85 # Was 0.95, now more lenient
695
- }
696
- )
697
- ----
698
-
699
- 2. Ensure all matching phases are enabled:
700
- +
701
- [source,ruby]
702
- ----
703
- Canon::Comparison.equivalent?(doc1, doc2,
704
- diff_algorithm: :semantic,
705
- match: {
706
- hash_matching: true,
707
- similarity_matching: true,
708
- propagation: true
709
- }
710
- )
711
- ----
712
-
713
- 3. Use preprocessing to normalize content:
714
- +
715
- [source,ruby]
716
- ----
717
- Canon::Comparison.equivalent?(doc1, doc2,
718
- diff_algorithm: :semantic,
719
- preprocessing: :c14n # Normalize before comparison
720
- )
721
- ----
722
-
723
- === Metadata Elements Not Treated as Informative
724
-
725
- **Symptom:**
726
-
727
- Changes to `semx`, `fmt-*`, and `autonum` elements are marked as normative (must-fix) instead of informative.
728
-
729
- **Cause:**
730
-
731
- Element not in metadata elements list.
732
-
733
- **Solutions:**
734
-
735
- 1. Verify element is in the metadata list (see link:SEMANTIC_TREE_DIFF.adoc#metadata-elements[SEMANTIC_TREE_DIFF.adoc]).
736
-
737
- 2. Use match dimensions to ignore specific changes:
738
- +
739
- [source,ruby]
740
- ----
741
- Canon::Comparison.equivalent?(doc1, doc2,
742
- diff_algorithm: :semantic,
743
- match: {
744
- text_content: :ignore # All text differences → informative
745
- }
746
- )
747
- ----
748
-
749
- === Whitespace Differences in `<pre>` or `<code>` Elements
750
-
751
- **Symptom:**
752
-
753
- Documents with identical semantic content fail due to whitespace differences in preformatted blocks.
754
-
755
- **Cause:**
756
-
757
- Whitespace is preserved in whitespace-sensitive elements (`pre`, `code`, `textarea`, `script`, `style`).
758
-
759
- **Solutions:**
760
-
761
- This is by design: whitespace in these elements is semantically significant. If you still need to ignore these differences:
762
-
763
- 1. Normalize whitespace before comparison:
764
- +
765
- [source,ruby]
766
- ----
767
- def normalize_pre_whitespace(xml)
768
- doc = Nokogiri::XML(xml)
769
- doc.css('pre, code').each do |elem|
770
- elem.content = elem.content.strip.gsub(/\s+/, ' ')
771
- end
772
- doc.to_xml
773
- end
774
-
775
- normalized1 = normalize_pre_whitespace(doc1)
776
- normalized2 = normalize_pre_whitespace(doc2)
777
-
778
- Canon::Comparison.equivalent?(normalized1, normalized2,
779
- diff_algorithm: :semantic
780
- )
781
- ----
782
-
783
- 2. Or ignore text content differences globally:
784
- +
785
- [source,ruby]
786
- ----
787
- Canon::Comparison.equivalent?(doc1, doc2,
788
- diff_algorithm: :semantic,
789
- match: {
790
- text_content: :ignore # Ignore all text differences
791
- }
792
- )
793
- ----
794
-
795
- === File Size / Node Count Limit Exceeded
796
-
797
- **Symptom:**
798
-
799
- Error message: "File size exceeds maximum limit" or "Node count exceeds maximum limit"
800
-
801
- **Cause:**
802
-
803
- Document exceeds configured size limits (default: 5MB file size, 10,000 nodes).
804
-
805
- **Solutions:**
806
-
807
- 1. Increase limits via environment variables:
808
- +
809
- [source,bash]
810
- ----
811
- export CANON_MAX_FILE_SIZE=10485760 # 10MB
812
- export CANON_MAX_NODE_COUNT=50000 # 50,000 nodes
813
- export CANON_MAX_DIFF_LINES=20000 # 20,000 lines output
814
- bundle exec rspec
815
- ----
816
-
817
- 2. Disable limits (not recommended):
818
- +
819
- [source,bash]
820
- ----
821
- export CANON_MAX_FILE_SIZE=0
822
- export CANON_MAX_NODE_COUNT=-1
823
- ----
824
- +
825
- WARNING: Disabling limits may cause hangs on pathologically large files.
826
-
827
- 3. Use DOM diff for oversized files:
828
- +
829
- [source,ruby]
830
- ----
831
- algorithm = file_size > 5_242_880 ? :dom : :semantic
832
- Canon::Comparison.equivalent?(doc1, doc2, diff_algorithm: algorithm)
833
- ----
834
-
835
- === Debugging Algorithm Selection
836
-
837
- To verify which algorithm is being used:
838
-
839
- [source,ruby]
840
- ----
841
- config = Canon::Config.instance
842
- puts "XML algorithm: #{config.xml.diff.algorithm}"
843
- puts "HTML algorithm: #{config.html.diff.algorithm}"
844
-
845
- result = Canon::Comparison.equivalent?(doc1, doc2, verbose: true)
846
- puts "Used algorithm: #{result.match_options[:diff_algorithm]}"
847
- puts "Tree diff enabled: #{result.match_options[:tree_diff_enabled]}"
848
- ----
849
-
850
- === Getting Help
851
-
852
- If issues persist:
853
-
854
- 1. Check the link:SEMANTIC_TREE_DIFF.adoc[SEMANTIC_TREE_DIFF.adoc] documentation
855
- 2. Review link:TREE_DIFF.adoc[TREE_DIFF.adoc] for operation details
856
- 3. Enable verbose output to inspect operations:
857
- +
858
- [source,ruby]
859
- ----
860
- result = Canon::Comparison.equivalent?(doc1, doc2,
861
- verbose: true,
862
- diff_algorithm: :semantic
863
- )
864
-
865
- result.operations.each do |op|
866
- puts "#{op.type}: #{op.inspect}"
867
- end
868
-
869
- stats = result.match_options[:tree_diff_statistics]
870
- puts "Match rate: #{stats[:match_rate]}"
871
- puts "Total operations: #{result.operations.size}"
872
- ----
873
-
874
- 4. Compare with DOM diff output to identify algorithm-specific issues
875
- 5. Report bugs at https://github.com/lutaml/canon/issues
876
- ----