canon 0.1.8 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +83 -22
  3. data/docs/Gemfile +1 -0
  4. data/docs/_config.yml +90 -1
  5. data/docs/advanced/diff-classification.adoc +196 -24
  6. data/docs/features/match-options/index.adoc +239 -1
  7. data/lib/canon/comparison/format_detector.rb +2 -1
  8. data/lib/canon/comparison/html_comparator.rb +19 -8
  9. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  10. data/lib/canon/comparison/markup_comparator.rb +109 -2
  11. data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
  12. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  13. data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
  14. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +108 -0
  15. data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
  16. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
  17. data/lib/canon/comparison/xml_comparator.rb +240 -23
  18. data/lib/canon/comparison/xml_node_comparison.rb +25 -3
  19. data/lib/canon/diff/diff_classifier.rb +119 -5
  20. data/lib/canon/diff/formatting_detector.rb +1 -1
  21. data/lib/canon/diff/xml_serialization_formatter.rb +153 -0
  22. data/lib/canon/rspec_matchers.rb +37 -8
  23. data/lib/canon/version.rb +1 -1
  24. data/lib/canon/xml/data_model.rb +24 -13
  25. metadata +4 -78
  26. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
  27. data/false_positive_analysis.txt +0 -0
  28. data/file1.html +0 -1
  29. data/file2.html +0 -1
  30. data/old-docs/ADVANCED_TOPICS.adoc +0 -20
  31. data/old-docs/BASIC_USAGE.adoc +0 -16
  32. data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
  33. data/old-docs/CLI.adoc +0 -497
  34. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  35. data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
  36. data/old-docs/DIFF_FORMATTING.adoc +0 -540
  37. data/old-docs/DIFF_PARAMETERS.adoc +0 -261
  38. data/old-docs/DOM_DIFF.adoc +0 -1017
  39. data/old-docs/ENV_CONFIG.adoc +0 -876
  40. data/old-docs/FORMATS.adoc +0 -867
  41. data/old-docs/INPUT_VALIDATION.adoc +0 -477
  42. data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
  43. data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
  44. data/old-docs/MATCH_OPTIONS.adoc +0 -912
  45. data/old-docs/MODES.adoc +0 -432
  46. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  47. data/old-docs/OPTIONS.adoc +0 -1387
  48. data/old-docs/PREPROCESSING.adoc +0 -491
  49. data/old-docs/README.old.adoc +0 -2831
  50. data/old-docs/RSPEC.adoc +0 -814
  51. data/old-docs/RUBY_API.adoc +0 -485
  52. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
  53. data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
  54. data/old-docs/STRING_COMPARE.adoc +0 -345
  55. data/old-docs/TMP.adoc +0 -3384
  56. data/old-docs/TREE_DIFF.adoc +0 -1080
  57. data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
  58. data/old-docs/VERBOSE.adoc +0 -482
  59. data/old-docs/VISUALIZATION_MAP.adoc +0 -625
  60. data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
  61. data/scripts/analyze_current_state.rb +0 -85
  62. data/scripts/analyze_false_positives.rb +0 -114
  63. data/scripts/analyze_remaining_failures.rb +0 -105
  64. data/scripts/compare_current_failures.rb +0 -95
  65. data/scripts/compare_dom_tree_diff.rb +0 -158
  66. data/scripts/compare_failures.rb +0 -151
  67. data/scripts/debug_attribute_extraction.rb +0 -66
  68. data/scripts/debug_blocks_839.rb +0 -115
  69. data/scripts/debug_meta_matching.rb +0 -52
  70. data/scripts/debug_p_matching.rb +0 -192
  71. data/scripts/debug_signature_matching.rb +0 -118
  72. data/scripts/debug_sourcecode_124.rb +0 -32
  73. data/scripts/debug_whitespace_sensitive.rb +0 -192
  74. data/scripts/extract_false_positives.rb +0 -138
  75. data/scripts/find_actual_false_positives.rb +0 -125
  76. data/scripts/investigate_all_false_positives.rb +0 -161
  77. data/scripts/investigate_batch1.rb +0 -127
  78. data/scripts/investigate_classification.rb +0 -150
  79. data/scripts/investigate_classification_detailed.rb +0 -190
  80. data/scripts/investigate_common_failures.rb +0 -342
  81. data/scripts/investigate_false_negative.rb +0 -80
  82. data/scripts/investigate_false_positive.rb +0 -83
  83. data/scripts/investigate_false_positives.rb +0 -227
  84. data/scripts/investigate_false_positives_batch.rb +0 -163
  85. data/scripts/investigate_mixed_content.rb +0 -125
  86. data/scripts/investigate_remaining_16.rb +0 -214
  87. data/scripts/run_single_test.rb +0 -29
  88. data/scripts/test_all_false_positives.rb +0 -95
  89. data/scripts/test_attribute_details.rb +0 -61
  90. data/scripts/test_both_algorithms.rb +0 -49
  91. data/scripts/test_both_simple.rb +0 -49
  92. data/scripts/test_enhanced_semantic_output.rb +0 -125
  93. data/scripts/test_readme_examples.rb +0 -131
  94. data/scripts/test_semantic_tree_diff.rb +0 -99
  95. data/scripts/test_semantic_ux_improvements.rb +0 -135
  96. data/scripts/test_single_false_positive.rb +0 -119
  97. data/scripts/test_size_limits.rb +0 -99
  98. data/test_html_1.html +0 -21
  99. data/test_html_2.html +0 -21
  100. data/test_nokogiri.rb +0 -33
  101. data/test_normalize.rb +0 -45
data/old-docs/CLI.adoc DELETED
@@ -1,497 +0,0 @@
1
- ---
2
- layout: default
3
- title: Command-Line Interface
4
- nav_order: 11
5
- parent: Basic Usage
6
- ---
7
- = Canon command-line interface
8
- :toc:
9
- :toclevels: 3
10
-
11
- == Scope
12
-
13
- This document describes Canon's command-line interface (CLI). The `canon`
14
- command provides file formatting and comparison capabilities.
15
-
16
- For Ruby API usage, see link:RUBY_API[Ruby API documentation].
17
-
18
- For RSpec testing, see link:RSPEC[RSpec documentation].
19
-
20
- == General
21
-
22
- After installing the Canon gem, the `canon` command becomes available in your
23
- shell. It provides two main commands:
24
-
25
- * `canon format` - Format files in XML, HTML, JSON, or YAML
26
- * `canon diff` - Compare files semantically
27
-
28
- == Installation
29
-
30
- [source,bash]
31
- ----
32
- $ gem install canon
33
- $ canon --help
34
- ----
35
-
36
- == Format command
37
-
38
- The `format` command formats files in canonical or pretty-print mode.
39
-
40
- === Syntax
41
-
42
- [source,bash]
43
- ----
44
- canon format FILE [OPTIONS]
45
- ----
46
-
47
- === Output modes
48
-
49
- `pretty` (default):: Human-readable output with indentation (2 spaces)
50
- `c14n`:: Canonical form without indentation (compact)
51
-
52
- === Options
53
-
54
- `-f, --format FORMAT`:: Specify format: `xml`, `html`, `json`, or `yaml`
55
- (auto-detected from extension if not specified)
56
-
57
- `-m, --mode MODE`:: Output mode: `pretty` (default) or `c14n`
58
-
59
- `-i, --indent N`:: Indentation spaces for pretty mode (default: 2)
60
-
61
- `--indent-type TYPE`:: Indentation type: `space` (default) or `tab`
62
-
63
- `-o, --output FILE`:: Write output to file instead of stdout
64
-
65
- `-c, --with-comments`:: Include comments in canonical XML output
66
-
67
- === Format detection
68
-
69
- When `--format` is not specified, Canon detects the format from file extension:
70
-
71
- [cols="1,1"]
72
- |===
73
- |File Extension |Detected Format
74
-
75
- |`.xml`
76
- |XML
77
-
78
- |`.html`, `.htm`
79
- |HTML
80
-
81
- |`.json`
82
- |JSON
83
-
84
- |`.yaml`, `.yml`
85
- |YAML
86
- |===
87
-
88
- === Examples
89
-
90
- .Pretty-print with default settings
91
- [example]
92
- ====
93
- [source,bash]
94
- ----
95
- $ canon format input.xml
96
- <?xml version="1.0" encoding="UTF-8"?>
97
- <root>
98
- <a>1</a>
99
- <b>2</b>
100
- </root>
101
- ----
102
- ====
103
-
104
- .Canonical mode (compact)
105
- [example]
106
- ====
107
- [source,bash]
108
- ----
109
- $ canon format input.xml --mode c14n
110
- <root><a>1</a><b>2</b></root>
111
- ----
112
- ====
113
-
114
- .Custom indentation
115
- [example]
116
- ====
117
- [source,bash]
118
- ----
119
- # 4-space indentation
120
- $ canon format input.xml --mode pretty --indent 4
121
-
122
- # Tab indentation
123
- $ canon format input.xml --indent-type tab
124
-
125
- # JSON with 4 spaces
126
- $ canon format data.json --indent 4
127
- ----
128
- ====
129
-
130
- .Specify format explicitly
131
- [example]
132
- ====
133
- [source,bash]
134
- ----
135
- # Format a .txt file as XML
136
- $ canon format data.txt --format xml
137
- ----
138
- ====
139
-
140
- .Save to file
141
- [example]
142
- ====
143
- [source,bash]
144
- ----
145
- $ canon format input.xml --output formatted.xml
146
-
147
- # Or use shell redirection
148
- $ canon format input.xml > formatted.xml
149
- ----
150
- ====
151
-
152
- .Include XML comments in canonical output
153
- [example]
154
- ====
155
- [source,bash]
156
- ----
157
- $ canon format doc.xml --mode c14n --with-comments
158
- ----
159
- ====
160
-
161
- .Format different file types
162
- [example]
163
- ====
164
- [source,bash]
165
- ----
166
- # HTML files
167
- $ canon format page.html
168
- $ canon format page.html --mode c14n
169
-
170
- # JSON files
171
- $ canon format config.json
172
- $ canon format config.json --indent 4
173
-
174
- # YAML files
175
- $ canon format data.yaml
176
- ----
177
- ====
178
-
179
- == Diff command
180
-
181
- The `diff` command performs semantic comparison of files.
182
-
183
- === Syntax
184
-
185
- [source,bash]
186
- ----
187
- canon diff FILE1 FILE2 [OPTIONS]
188
- ----
189
-
190
- === Diff modes
191
-
192
- Canon supports two diff modes optimized for different use cases:
193
-
194
- `by-object`:: (default for XML, JSON, and YAML) Semantic tree-based diff showing
195
- structural changes
196
-
197
- `by-line`:: (default for HTML, optional for XML) Line-by-line diff after
198
- canonicalization
199
-
200
- See link:MODES[Diff modes] for details.
201
-
202
- === Format options
203
-
204
- `-f, --format FORMAT`:: Format for both files: `xml`, `html`, `json`, or
205
- `yaml` (auto-detected from extension if not specified)
206
-
207
- `--format1 FORMAT`:: Format of first file (when comparing different formats)
208
-
209
- `--format2 FORMAT`:: Format of second file (when comparing different formats)
210
-
211
- === Comparison options
212
-
213
- `-v, --verbose`:: Show detailed differences (default: just show if files
214
- differ)
215
-
216
- `--diff-algorithm ALGORITHM`:: Diff algorithm to use: `dom` (default) or
217
- `semantic`. DOM uses positional matching, semantic uses tree-based matching
218
- with operation detection.
219
-
220
- `--by-line`:: Use line-by-line diff for XML (default: by-object mode)
221
-
222
- `--text-content BEHAVIOR`:: How to compare text content: `strict`,
223
- `normalize`, or `ignore`
224
-
225
- `--structural-whitespace BEHAVIOR`:: How to handle whitespace between
226
- elements: `strict`, `normalize`, or `ignore`
227
-
228
- `--attribute-whitespace BEHAVIOR`:: How to handle whitespace in attribute
229
- values: `strict`, `normalize`, or `ignore` (XML/HTML only)
230
-
231
- `--attribute-order BEHAVIOR`:: Whether attribute order matters: `strict` or
232
- `ignore` (XML/HTML only)
233
-
234
- `--attribute-values BEHAVIOR`:: How to compare attribute values: `strict`,
235
- `normalize`, or `ignore` (XML/HTML only)
236
-
237
- `--key-order BEHAVIOR`:: Whether key order matters: `strict` or `ignore`
238
- (JSON/YAML only)
239
-
240
- `--comments BEHAVIOR`:: How to handle comments: `strict`, `normalize`, or
241
- `ignore`
242
-
243
- `--match-profile PROFILE`:: Use predefined match profile: `strict`,
244
- `rendered`, `spec_friendly`, or `content_only`
245
-
246
- See link:MATCH_OPTIONS[Match options] for detailed dimension reference.
247
-
248
- === Output options
249
-
250
- `--color` / `--no-color`:: Enable/disable colored output (default: enabled)
251
-
252
- `--context-lines N`:: Number of context lines around changes (default: 3)
253
-
254
- `--diff-grouping-lines N`:: Group changes within N lines into blocks
255
-
256
- See link:DIFF_FORMATTING[Diff formatting] for details.
257
-
258
- === Exit codes
259
-
260
- * `0` - Files are semantically equivalent
261
- * `1` - Files are semantically different
262
- * Other - Error occurred
263
-
264
- === Examples
265
-
266
- .Basic comparison
267
- [example]
268
- ====
269
- [source,bash]
270
- ----
271
- # Compare two JSON files
272
- $ canon diff config1.json config2.json
273
- Files are semantically different
274
-
275
- # Compare two XML files
276
- $ canon diff file1.xml file2.xml
277
- ✅ Files are semantically equivalent
278
- ----
279
- ====
280
-
281
- .Verbose mode with detailed diff
282
- [example]
283
- ====
284
- [source,bash]
285
- ----
286
- $ canon diff config1.json config2.json --verbose
287
- Visual Diff:
288
- ├── settings.debug:
289
- │ ├── - true
290
- │ └── + false
291
- └── version:
292
- ├── - "1.0.0"
293
- └── + "2.0.0"
294
- ----
295
- ====
296
-
297
- .XML comparison with by-line mode
298
- [example]
299
- ====
300
- [source,bash]
301
- ----
302
- $ canon diff document1.xml document2.xml --by-line --verbose
303
- Line-by-line diff:
304
- 4 - | <foreword id="fwd">
305
- 4 + | <foreword displayorder="2" id="fwd">
306
- 5 | <p>First paragraph</p>
307
- 10 + | <p>New content</p>
308
- 11 | </clause>
309
- ----
310
- ====
311
-
312
- .HTML comparison
313
- [example]
314
- ====
315
- [source,bash]
316
- ----
317
- $ canon diff page1.html page2.html --verbose
318
- Line-by-line diff:
319
- 4 - | <title>My Page</title>
320
- 4 + | <title>My Updated Page</title>
321
- 7 - | <div class="header">
322
- 7 + | <nav class="header">
323
- ----
324
- ====
325
-
326
- .Using match profiles
327
- [example]
328
- ====
329
- [source,bash]
330
- ----
331
- # Use spec_friendly profile
332
- $ canon diff file1.xml file2.xml \
333
- --match-profile spec_friendly \
334
- --verbose
335
-
336
- # Use rendered profile for HTML
337
- $ canon diff page1.html page2.html \
338
- --match-profile rendered \
339
- --verbose
340
-
341
- # Use strict profile (exact matching)
342
- $ canon diff file1.xml file2.xml \
343
- --match-profile strict \
344
- --verbose
345
- ----
346
- ====
347
-
348
- .Customize match dimensions
349
- [example]
350
- ====
351
- [source,bash]
352
- ----
353
- # Normalize text content, ignore whitespace
354
- $ canon diff file1.xml file2.xml \
355
- --text-content normalize \
356
- --structural-whitespace ignore \
357
- --verbose
358
-
359
- # Ignore comments and attribute order
360
- $ canon diff file1.xml file2.xml \
361
- --comments ignore \
362
- --attribute-order ignore \
363
- --verbose
364
-
365
- # Multiple dimension overrides
366
- $ canon diff file1.xml file2.xml \
367
- --text-content normalize \
368
- --structural-whitespace ignore \
369
- --attribute-whitespace normalize \
370
- --comments ignore \
371
- --verbose
372
- ----
373
- ====
374
-
375
- .Combine profile with dimension overrides
376
- [example]
377
- ====
378
- [source,bash]
379
- ----
380
- # Use spec_friendly but require strict comments
381
- $ canon diff file1.xml file2.xml \
382
- --match-profile spec_friendly \
383
- --comments strict \
384
- --verbose
385
- ----
386
- ====
387
-
388
- .Customize diff output
389
- [example]
390
- ====
391
- [source,bash]
392
- ----
393
- # Show more context lines
394
- $ canon diff file1.xml file2.xml \
395
- --verbose \
396
- --context-lines 5
397
-
398
- # Group nearby changes
399
- $ canon diff file1.xml file2.xml \
400
- --verbose \
401
- --diff-grouping-lines 10
402
-
403
- # Disable colors for piping to files
404
- $ canon diff file1.xml file2.xml \
405
- --verbose \
406
- --no-color > diff.txt
407
-
408
- # Combine diff options
409
- $ canon diff file1.xml file2.xml \
410
- --verbose \
411
- --context-lines 5 \
412
- --diff-grouping-lines 2 \
413
- --no-color
414
- ----
415
- ====
416
-
417
- .Compare different formats
418
- [example]
419
- ====
420
- [source,bash]
421
- ----
422
- # Compare JSON with YAML (must have same structure)
423
- $ canon diff config.json config.yaml \
424
- --format1 json \
425
- --format2 yaml \
426
- --verbose
427
- ----
428
- ====
429
-
430
- .JSON/YAML comparison examples
431
- [example]
432
- ====
433
- [source,bash]
434
- ----
435
- # JSON comparison (uses by-object mode by default)
436
- $ canon diff config1.json config2.json --verbose
437
-
438
- # YAML comparison with key order ignored
439
- $ canon diff data1.yaml data2.yaml \
440
- --key-order ignore \
441
- --verbose
442
-
443
- # Show 10 context lines for large config files
444
- $ canon diff large-config1.json large-config2.json \
445
- --verbose \
446
- --context-lines 10
447
- ----
448
- ====
449
-
450
- .Shell integration
451
- [example]
452
- ====
453
- [source,bash]
454
- ----
455
- # Use in scripts
456
- if canon diff expected.xml actual.xml; then
457
- echo "Files match!"
458
- else
459
- echo "Files differ"
460
- canon diff expected.xml actual.xml --verbose
461
- fi
462
-
463
- # Generate diff report
464
- canon diff file1.xml file2.xml --verbose --no-color > diff-report.txt
465
-
466
- # Compare with process substitution
467
- canon diff <(curl https://example.com/api/v1) \
468
- <(curl https://example.com/api/v2) \
469
- --format json \
470
- --verbose
471
- ----
472
- ====
473
-
474
- == Help command
475
-
476
- Get help on available commands and options:
477
-
478
- [source,bash]
479
- ----
480
- # General help
481
- $ canon help
482
-
483
- # Command-specific help
484
- $ canon help format
485
- $ canon help diff
486
-
487
- # Show version
488
- $ canon --version
489
- ----
490
-
491
- == See also
492
-
493
- * link:RUBY_API[Ruby API documentation]
494
- * link:RSPEC[RSpec matchers]
495
- * link:MATCH_OPTIONS[Match options reference]
496
- * link:MODES[Diff modes]
497
- * link:DIFF_FORMATTING[Diff formatting options]
data/old-docs/CUSTOMIZING_BEHAVIOR.adoc DELETED
@@ -1,19 +0,0 @@
1
- ---
2
- layout: default
3
- title: Customizing Behavior
4
- nav_order: 4
5
- has_children: true
6
- ---
7
- = Customizing behavior
8
-
9
- Configure Canon for your specific needs:
10
-
11
- * **link:MATCH_OPTIONS[Match options]** - Match dimensions and profiles
12
- * **link:PREPROCESSING[Preprocessing]** - Document normalization options
13
- * **link:DIFF_FORMATTING[Diff formatting]** - Customizing diff output
14
- * **link:INPUT_VALIDATION[Input validation]** - Error handling
15
- * **link:CHARACTER_VISUALIZATION[Character visualization]** - Whitespace
16
- visibility
17
-
18
- These documents cover Canon's configuration options for fine-tuning comparison
19
- behavior and diff output.