canon 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -1
  3. data/.rubocop_todo.yml +276 -7
  4. data/README.adoc +203 -138
  5. data/_config.yml +116 -0
  6. data/docs/ADVANCED_TOPICS.adoc +20 -0
  7. data/docs/BASIC_USAGE.adoc +16 -0
  8. data/docs/CHARACTER_VISUALIZATION.adoc +567 -0
  9. data/docs/CLI.adoc +493 -0
  10. data/docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
  11. data/docs/DIFF_ARCHITECTURE.adoc +435 -0
  12. data/docs/DIFF_FORMATTING.adoc +540 -0
  13. data/docs/FORMATS.adoc +447 -0
  14. data/docs/INDEX.adoc +222 -0
  15. data/docs/INPUT_VALIDATION.adoc +477 -0
  16. data/docs/MATCH_ARCHITECTURE.adoc +463 -0
  17. data/docs/MATCH_OPTIONS.adoc +719 -0
  18. data/docs/MODES.adoc +432 -0
  19. data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
  20. data/docs/OPTIONS.adoc +1387 -0
  21. data/docs/PREPROCESSING.adoc +491 -0
  22. data/docs/RSPEC.adoc +605 -0
  23. data/docs/RUBY_API.adoc +478 -0
  24. data/docs/SEMANTIC_DIFF_REPORT.adoc +528 -0
  25. data/docs/UNDERSTANDING_CANON.adoc +17 -0
  26. data/docs/VERBOSE.adoc +482 -0
  27. data/exe/canon +7 -0
  28. data/lib/canon/cli.rb +179 -0
  29. data/lib/canon/commands/diff_command.rb +195 -0
  30. data/lib/canon/commands/format_command.rb +113 -0
  31. data/lib/canon/comparison/base_comparator.rb +39 -0
  32. data/lib/canon/comparison/comparison_result.rb +79 -0
  33. data/lib/canon/comparison/html_comparator.rb +410 -0
  34. data/lib/canon/comparison/json_comparator.rb +212 -0
  35. data/lib/canon/comparison/match_options.rb +616 -0
  36. data/lib/canon/comparison/xml_comparator.rb +566 -0
  37. data/lib/canon/comparison/yaml_comparator.rb +93 -0
  38. data/lib/canon/comparison.rb +239 -0
  39. data/lib/canon/config.rb +172 -0
  40. data/lib/canon/diff/diff_block.rb +71 -0
  41. data/lib/canon/diff/diff_block_builder.rb +105 -0
  42. data/lib/canon/diff/diff_classifier.rb +46 -0
  43. data/lib/canon/diff/diff_context.rb +85 -0
  44. data/lib/canon/diff/diff_context_builder.rb +107 -0
  45. data/lib/canon/diff/diff_line.rb +77 -0
  46. data/lib/canon/diff/diff_node.rb +56 -0
  47. data/lib/canon/diff/diff_node_mapper.rb +148 -0
  48. data/lib/canon/diff/diff_report.rb +133 -0
  49. data/lib/canon/diff/diff_report_builder.rb +62 -0
  50. data/lib/canon/diff_formatter/by_line/base_formatter.rb +407 -0
  51. data/lib/canon/diff_formatter/by_line/html_formatter.rb +672 -0
  52. data/lib/canon/diff_formatter/by_line/json_formatter.rb +284 -0
  53. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +190 -0
  54. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +860 -0
  55. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +292 -0
  56. data/lib/canon/diff_formatter/by_object/base_formatter.rb +199 -0
  57. data/lib/canon/diff_formatter/by_object/json_formatter.rb +305 -0
  58. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +248 -0
  59. data/lib/canon/diff_formatter/by_object/yaml_formatter.rb +17 -0
  60. data/lib/canon/diff_formatter/character_map.yml +197 -0
  61. data/lib/canon/diff_formatter/debug_output.rb +431 -0
  62. data/lib/canon/diff_formatter/diff_detail_formatter.rb +551 -0
  63. data/lib/canon/diff_formatter/legend.rb +141 -0
  64. data/lib/canon/diff_formatter.rb +520 -0
  65. data/lib/canon/errors.rb +56 -0
  66. data/lib/canon/formatters/html4_formatter.rb +17 -0
  67. data/lib/canon/formatters/html5_formatter.rb +17 -0
  68. data/lib/canon/formatters/html_formatter.rb +37 -0
  69. data/lib/canon/formatters/html_formatter_base.rb +163 -0
  70. data/lib/canon/formatters/json_formatter.rb +3 -0
  71. data/lib/canon/formatters/xml_formatter.rb +20 -55
  72. data/lib/canon/formatters/yaml_formatter.rb +4 -1
  73. data/lib/canon/pretty_printer/html.rb +57 -0
  74. data/lib/canon/pretty_printer/json.rb +25 -0
  75. data/lib/canon/pretty_printer/xml.rb +29 -0
  76. data/lib/canon/rspec_matchers.rb +222 -80
  77. data/lib/canon/validators/base_validator.rb +49 -0
  78. data/lib/canon/validators/html_validator.rb +138 -0
  79. data/lib/canon/validators/json_validator.rb +89 -0
  80. data/lib/canon/validators/xml_validator.rb +53 -0
  81. data/lib/canon/validators/yaml_validator.rb +73 -0
  82. data/lib/canon/version.rb +1 -1
  83. data/lib/canon/xml/attribute_handler.rb +80 -0
  84. data/lib/canon/xml/c14n.rb +36 -0
  85. data/lib/canon/xml/character_encoder.rb +38 -0
  86. data/lib/canon/xml/data_model.rb +225 -0
  87. data/lib/canon/xml/element_matcher.rb +196 -0
  88. data/lib/canon/xml/line_range_mapper.rb +158 -0
  89. data/lib/canon/xml/namespace_handler.rb +86 -0
  90. data/lib/canon/xml/node.rb +32 -0
  91. data/lib/canon/xml/nodes/attribute_node.rb +54 -0
  92. data/lib/canon/xml/nodes/comment_node.rb +23 -0
  93. data/lib/canon/xml/nodes/element_node.rb +56 -0
  94. data/lib/canon/xml/nodes/namespace_node.rb +38 -0
  95. data/lib/canon/xml/nodes/processing_instruction_node.rb +24 -0
  96. data/lib/canon/xml/nodes/root_node.rb +16 -0
  97. data/lib/canon/xml/nodes/text_node.rb +23 -0
  98. data/lib/canon/xml/processor.rb +151 -0
  99. data/lib/canon/xml/whitespace_normalizer.rb +72 -0
  100. data/lib/canon/xml/xml_base_handler.rb +188 -0
  101. data/lib/canon.rb +14 -3
  102. metadata +116 -21
@@ -0,0 +1,567 @@
1
+ ---
2
+ layout: default
3
+ title: Character Visualization
4
+ nav_order: 34
5
+ parent: Customizing Behavior
6
+ ---
7
+ = Canon character visualization
8
+ :toc:
9
+ :toclevels: 3
10
+
11
+ == Scope
12
+
13
+ This document describes Canon's whitespace and special character visualization
14
+ system, which makes invisible characters visible in diff output.
15
+
16
+ For diff formatting options, see link:DIFF_FORMATTING[Diff formatting].
17
+
18
+ == General
19
+
20
+ When comparing documents, invisible characters like spaces, tabs, and
21
+ zero-width characters can cause mysterious test failures. Canon's character
22
+ visualization makes these characters visible in diff output, helping you
23
+ quickly identify the exact difference.
24
+
25
+ Visualization is **CJK-safe**, using Unicode symbols that don't conflict with
26
+ Chinese, Japanese, or Korean text.
27
+
28
+ == When visualization is applied
29
+
30
+ Character visualization is applied **only to diff lines** (additions,
31
+ deletions, and changes), not to context lines (unchanged lines). This ensures:
32
+
33
+ * Context lines display content in original form
34
+ * Only actual changes show visualization
35
+ * Differences are easier to spot
36
+
37
+ Within changed lines showing token-level diffs, unchanged tokens are displayed
38
+ in the terminal's default color (not red/green) to distinguish them from
39
+ actual changes.
40
+
41
+ == Default character map
42
+
43
+ Canon provides a comprehensive CJK-safe character mapping.
44
+
45
+ === Common whitespace
46
+
47
+ [cols="1,1,1,2"]
48
+ |===
49
+ |Character |Unicode |Symbol |Description
50
+
51
+ |Regular space
52
+ |U+0020
53
+ |`░`
54
+ |Light Shade (U+2591)
55
+
56
+ |Tab
57
+ |U+0009
58
+ |`⇥`
59
+ |Rightwards Arrow to Bar (U+21E5)
60
+
61
+ |Non-breaking space
62
+ |U+00A0
63
+ |`␣`
64
+ |Open Box (U+2423)
65
+ |===
66
+
67
+ === Line endings
68
+
69
+ [cols="1,1,1,2"]
70
+ |===
71
+ |Character |Unicode |Symbol |Description
72
+
73
+ |Line feed (LF)
74
+ |U+000A
75
+ |`↵`
76
+ |Downwards Arrow with Corner Leftwards (U+21B5)
77
+
78
+ |Carriage return (CR)
79
+ |U+000D
80
+ |`⏎`
81
+ |Return Symbol (U+23CE)
82
+
83
+ |Windows line ending (CRLF)
84
+ |U+000D U+000A
85
+ |`↵`
86
+ |Downwards Arrow with Corner Leftwards (U+21B5)
87
+
88
+ |Next line (NEL)
89
+ |U+0085
90
+ |`⏎`
91
+ |Return Symbol (U+23CE)
92
+
93
+ |Line separator
94
+ |U+2028
95
+ |`⤓`
96
+ |Downwards Arrow to Bar (U+2913)
97
+
98
+ |Paragraph separator
99
+ |U+2029
100
+ |`⤓`
101
+ |Downwards Arrow to Bar (U+2913)
102
+ |===
103
+
104
+ === Unicode spaces
105
+
106
+ [cols="1,1,1,2"]
107
+ |===
108
+ |Character |Unicode |Symbol |Description
109
+
110
+ |En space
111
+ |U+2002
112
+ |`▭`
113
+ |White Rectangle (U+25AD)
114
+
115
+ |Em space
116
+ |U+2003
117
+ |`▬`
118
+ |Black Rectangle (U+25AC)
119
+
120
+ |Four-per-em space
121
+ |U+2005
122
+ |`⏓`
123
+ |Metrical Short Over Long (U+23D3)
124
+
125
+ |Six-per-em space
126
+ |U+2006
127
+ |`⏕`
128
+ |Metrical Two Shorts Over Long (U+23D5)
129
+
130
+ |Thin space
131
+ |U+2009
132
+ |`▯`
133
+ |White Vertical Rectangle (U+25AF)
134
+
135
+ |Hair space
136
+ |U+200A
137
+ |`▮`
138
+ |Black Vertical Rectangle (U+25AE)
139
+
140
+ |Figure space
141
+ |U+2007
142
+ |`□`
143
+ |White Square (U+25A1)
144
+
145
+ |Narrow no-break space
146
+ |U+202F
147
+ |`▫`
148
+ |White Small Square (U+25AB)
149
+
150
+ |Medium mathematical space
151
+ |U+205F
152
+ |`▭`
153
+ |White Rectangle (U+25AD)
154
+
155
+ |Ideographic space
156
+ |U+3000
157
+ |`⎵`
158
+ |Bottom Square Bracket (U+23B5)
159
+
160
+ |Ideographic half fill space
161
+ |U+303F
162
+ |`⏑`
163
+ |Metrical Breve (U+23D1)
164
+ |===
165
+
166
+ === Zero-width characters
167
+
168
+ [cols="1,1,1,2"]
169
+ |===
170
+ |Character |Unicode |Symbol |Description
171
+
172
+ |Zero-width space
173
+ |U+200B
174
+ |`→`
175
+ |Rightwards Arrow (U+2192)
176
+
177
+ |Zero-width non-joiner
178
+ |U+200C
179
+ |`↛`
180
+ |Rightwards Arrow with Stroke (U+219B)
181
+
182
+ |Zero-width joiner
183
+ |U+200D
184
+ |`⇢`
185
+ |Rightwards Dashed Arrow (U+21E2)
186
+
187
+ |Zero-width no-break space (BOM)
188
+ |U+FEFF
189
+ |`⇨`
190
+ |Rightwards White Arrow (U+21E8)
191
+ |===
192
+
193
+ === Bidirectional/RTL markers
194
+
195
+ [cols="1,1,1,2"]
196
+ |===
197
+ |Character |Unicode |Symbol |Description
198
+
199
+ |Left-to-right mark
200
+ |U+200E
201
+ |`⟹`
202
+ |Long Rightwards Double Arrow (U+27F9)
203
+
204
+ |Right-to-left mark
205
+ |U+200F
206
+ |`⟸`
207
+ |Long Leftwards Double Arrow (U+27F8)
208
+
209
+ |LTR embedding
210
+ |U+202A
211
+ |`⇒`
212
+ |Rightwards Double Arrow (U+21D2)
213
+
214
+ |RTL embedding
215
+ |U+202B
216
+ |`⇐`
217
+ |Leftwards Double Arrow (U+21D0)
218
+
219
+ |Pop directional formatting
220
+ |U+202C
221
+ |`↔`
222
+ |Left Right Arrow (U+2194)
223
+
224
+ |LTR override
225
+ |U+202D
226
+ |`⇉`
227
+ |Rightwards Paired Arrows (U+21C9)
228
+
229
+ |RTL override
230
+ |U+202E
231
+ |`⇇`
232
+ |Leftwards Paired Arrows (U+21C7)
233
+ |===
234
+
235
+ === Control characters
236
+
237
+ [cols="1,1,1,2"]
238
+ |===
239
+ |Character |Unicode |Symbol |Description
240
+
241
+ |Null
242
+ |U+0000
243
+ |`␀`
244
+ |Symbol for Null (U+2400)
245
+
246
+ |Soft hyphen
247
+ |U+00AD
248
+ |`­‐`
249
+ |Hyphen (U+2010)
250
+
251
+ |Backspace
252
+ |U+0008
253
+ |`␈`
254
+ |Symbol for Backspace (U+2408)
255
+
256
+ |Delete
257
+ |U+007F
258
+ |`␡`
259
+ |Symbol for Delete (U+2421)
260
+ |===
261
+
262
+ == CJK safety
263
+
264
+ The visualization characters are specifically chosen to avoid conflicts with
265
+ CJK text:
266
+
267
+ **Avoided characters**:
268
+
269
+ * **No middle dots** (`·`) - commonly used as separators in CJK
270
+ * **No bullets** (`∙`) - used in CJK lists
271
+ * **No circles** (`◌◍◎`) - look similar to CJK characters like ○ ●
272
+ * **No small dots** (`⋅`) - conflict with CJK punctuation
273
+
274
+ **Used instead**:
275
+
276
+ * Box characters (`□▭▬▯▮▫`) for various space types
277
+ * Arrow symbols (`→↛⇢⇨⟹⟸⇒⇐`) for zero-width and directional characters
278
+ * Control Pictures block symbols (`␀␈␡`) for control characters
279
+
280
+ == Examples in use
281
+
282
+ === Space added
283
+
284
+ .Regular space added
285
+ [example]
286
+ ====
287
+ [source]
288
+ ----
289
+ 10| -| <tag>Value</tag> # No space
290
+ | 10+| <tag>░Value</tag> # Space added (green light shade)
291
+ ----
292
+
293
+ The `░` symbol clearly shows a regular space was added between `<tag>` and
294
+ `Value`.
295
+ ====
296
+
297
+ === Tab vs spaces
298
+
299
+ .Tab replaced with spaces
300
+ [example]
301
+ ====
302
+ [source]
303
+ ----
304
+ 15| -| <tag>⇥Value</tag> # Tab (red arrow-to-bar)
305
+ | 15+| <tag>░░Value</tag> # Two spaces (green light shades)
306
+ ----
307
+
308
+ The difference between a tab (`⇥`) and two spaces (`░░`) is immediately
309
+ visible.
310
+ ====
311
+
312
+ === Non-breaking space
313
+
314
+ .Non-breaking space from web copy-paste
315
+ [example]
316
+ ====
317
+ Without visualization, these look identical:
318
+
319
+ [source,xml]
320
+ ----
321
+ <foreword id="fwd">
322
+ <foreword id="fwd">
323
+ ----
324
+
325
+ With visualization:
326
+
327
+ [source]
328
+ ----
329
+ 4| -| <foreword░id="fwd"> # Regular space (U+0020)
330
+ | 4+| <foreword␣id="fwd"> # Non-breaking space (U+00A0)
331
+ ----
332
+
333
+ The different symbols (`░` vs `␣`) clearly show that one uses a regular space
334
+ while the other uses a non-breaking space, likely from copying from a web page.
335
+ ====
336
+
337
+ === Zero-width space
338
+
339
+ .Zero-width space (completely invisible)
340
+ [example]
341
+ ====
342
+ Zero-width characters are invisible but affect comparison:
343
+
344
+ [source,xml]
345
+ ----
346
+ <item>Widget</item>
347
+ <item>Widget</item> <!-- Contains U+200B zero-width space after "Widget" -->
348
+ ----
349
+
350
+ The diff shows:
351
+
352
+ [source]
353
+ ----
354
+ 5| -| <item>Widget</item>
355
+ | 5+| <item>Widget→</item> # Zero-width space visualized as →
356
+ ----
357
+
358
+ The rightwards arrow (`→`) reveals the presence of a zero-width space.
359
+ ====
360
+
361
+ === Mixed invisible characters
362
+
363
+ .Multiple whitespace types
364
+ [example]
365
+ ====
366
+ [source]
367
+ ----
368
+ 30| -| <p>Text▬more</p> # Em space (red black rectangle)
369
+ | 30+| <p>Text░more</p> # Regular space (green light shade)
370
+ ----
371
+
372
+ Different space types shown with different symbols.
373
+ ====
374
+
375
+ == Real-world scenarios
376
+
377
+ === Web copy-paste
378
+
379
+ **Problem**: Text copied from web pages often contains non-breaking spaces
380
+ (U+00A0) instead of regular spaces.
381
+
382
+ .Detection example
383
+ [example]
384
+ ====
385
+ [source]
386
+ ----
387
+ 4| -| <p>Hello░world</p> # U+0020 (regular space)
388
+ | 4+| <p>Hello␣world</p> # U+00A0 (non-breaking space)
389
+ ----
390
+
391
+ The `␣` symbol immediately identifies the non-breaking space.
392
+ ====
393
+
394
+ === Smart quotes
395
+
396
+ **Problem**: Text editors may automatically convert straight quotes to curly
397
+ quotes.
398
+
399
+ .Detection example
400
+ [example]
401
+ ====
402
+ [source]
403
+ ----
404
+ 10| -| <title>John's Book</title> # Straight apostrophe
405
+ | 10+| <title>John’s Book</title> # Curly apostrophe (U+2019)
406
+ ----
407
+
408
+ The non-ASCII warning will alert you to the smart quote.
409
+ ====
410
+
411
+ === Template generation
412
+
413
+ **Problem**: Generated output has invisible character differences.
414
+
415
+ .Detection example
416
+ [example]
417
+ ====
418
+ [source]
419
+ ----
420
+ 20| -| <item>Value→</item> # Zero-width space present
421
+ | 20+| <item>Value</item> # No zero-width space
422
+ ----
423
+
424
+ The `→` symbol reveals the zero-width space in generated content.
425
+ ====
426
+
427
+ == Customizing character visualization
428
+
429
+ You can customize the visualization map for specific needs.
430
+
431
+ === Custom map
432
+
433
+ [source,ruby]
434
+ ----
435
+ require 'canon/diff_formatter'
436
+
437
+ # Create custom visualization map
438
+ custom_map = Canon::DiffFormatter.merge_visualization_map({
439
+ ' ' => '·', # Use middle dot for spaces (if not using CJK)
440
+ "\t" => '→', # Use simple arrow for tabs
441
+ "\u200B" => '⚠' # Warning symbol for zero-width space
442
+ })
443
+
444
+ # Use custom map with formatter
445
+ formatter = Canon::DiffFormatter.new(
446
+ use_color: true,
447
+ visualization_map: custom_map
448
+ )
449
+
450
+ # The custom map merges with defaults, so unspecified
451
+ # characters still use the default visualization
452
+ ----
453
+
454
+ === When to customize
455
+
456
+ **Use custom visualization when**:
457
+
458
+ * Working with non-CJK text exclusively
459
+ * Preferring simpler symbols
459
+ * Needing specific character highlighting
461
+ * Integrating with existing tools
462
+
463
+ **Keep defaults when**:
464
+
465
+ * Working with CJK text
466
+ * Maximum compatibility needed
467
+ * Standard behavior preferred
468
+
469
+ == Configuration
470
+
471
+ Character visualization is automatically enabled when `use_color: true` and
472
+ applies across all Canon interfaces.
473
+
474
+ === Enabling/disabling
475
+
476
+ Visualization is tied to color output:
477
+
478
+ [source,ruby]
479
+ ----
480
+ # Enable (visualization active)
481
+ diff: { use_color: true }
482
+
483
+ # Disable (no visualization)
484
+ diff: { use_color: false }
485
+ ----
486
+
487
+ === Interface configuration
488
+
489
+ .Ruby API
490
+ [example]
491
+ ====
492
+ [source,ruby]
493
+ ----
494
+ # Visualization enabled by default
495
+ Canon::Comparison.equivalent?(doc1, doc2,
496
+ verbose: true,
497
+ diff: { use_color: true } # Visualization active
498
+ )
499
+
500
+ # Disable for plain text
501
+ Canon::Comparison.equivalent?(doc1, doc2,
502
+ verbose: true,
503
+ diff: { use_color: false } # No visualization
504
+ )
505
+ ----
506
+ ====
507
+
508
+ .CLI
509
+ [example]
510
+ ====
511
+ [source,bash]
512
+ ----
513
+ # Enable (default)
514
+ $ canon diff file1.xml file2.xml --verbose
515
+
516
+ # Disable
517
+ $ canon diff file1.xml file2.xml --no-color --verbose
518
+ ----
519
+ ====
520
+
521
+ .RSpec
522
+ [example]
523
+ ====
524
+ [source,ruby]
525
+ ----
526
+ Canon::RSpecMatchers.configure do |config|
527
+ # Enable for local development
528
+ config.xml.diff.use_color = !ENV['CI']
529
+ end
530
+ ----
531
+ ====
532
+
533
+ == Troubleshooting
534
+
535
+ === Visualization not showing
536
+
537
+ **Problem**: Invisible characters are not visualized.
538
+
539
+ **Solutions**:
540
+
541
+ * Ensure `use_color: true` is set
541
+ * Check that the terminal supports Unicode
543
+ * Verify the characters are in diff lines (not context lines)
544
+
545
+ === Wrong symbols displayed
546
+
547
+ **Problem**: Symbols appear garbled or as boxes.
548
+
549
+ **Solutions**:
550
+
551
+ * Use a terminal with Unicode support
551
+ * Install a Unicode-compatible font
553
+ * Check terminal encoding (should be UTF-8)
554
+
555
+ === CJK text affected
556
+
557
+ **Problem**: Visualization conflicts with CJK text.
558
+
559
+ **Solution**: Canon's defaults are CJK-safe. If using a custom map, avoid the
560
+ characters listed in the "CJK safety" section.
561
+
562
+ == See also
563
+
564
+ * link:DIFF_FORMATTING[Diff formatting]
565
+ * link:MODES[Diff modes]
566
+ * link:MATCH_ARCHITECTURE[Match architecture]
567
+ * link:RUBY_API[Ruby API documentation]