canon 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +112 -25
  3. data/docs/Gemfile +1 -0
  4. data/docs/_config.yml +90 -1
  5. data/docs/advanced/diff-classification.adoc +82 -2
  6. data/docs/features/match-options/index.adoc +239 -1
  7. data/lib/canon/comparison/format_detector.rb +2 -1
  8. data/lib/canon/comparison/html_comparator.rb +19 -8
  9. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  10. data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
  11. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  12. data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
  13. data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
  14. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
  15. data/lib/canon/comparison/xml_comparator.rb +48 -23
  16. data/lib/canon/comparison/xml_node_comparison.rb +25 -3
  17. data/lib/canon/diff/diff_classifier.rb +101 -2
  18. data/lib/canon/diff/formatting_detector.rb +1 -1
  19. data/lib/canon/rspec_matchers.rb +37 -8
  20. data/lib/canon/version.rb +1 -1
  21. data/lib/canon/xml/data_model.rb +24 -13
  22. metadata +3 -78
  23. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
  24. data/false_positive_analysis.txt +0 -0
  25. data/file1.html +0 -1
  26. data/file2.html +0 -1
  27. data/old-docs/ADVANCED_TOPICS.adoc +0 -20
  28. data/old-docs/BASIC_USAGE.adoc +0 -16
  29. data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
  30. data/old-docs/CLI.adoc +0 -497
  31. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  32. data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
  33. data/old-docs/DIFF_FORMATTING.adoc +0 -540
  34. data/old-docs/DIFF_PARAMETERS.adoc +0 -261
  35. data/old-docs/DOM_DIFF.adoc +0 -1017
  36. data/old-docs/ENV_CONFIG.adoc +0 -876
  37. data/old-docs/FORMATS.adoc +0 -867
  38. data/old-docs/INPUT_VALIDATION.adoc +0 -477
  39. data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
  40. data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
  41. data/old-docs/MATCH_OPTIONS.adoc +0 -912
  42. data/old-docs/MODES.adoc +0 -432
  43. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  44. data/old-docs/OPTIONS.adoc +0 -1387
  45. data/old-docs/PREPROCESSING.adoc +0 -491
  46. data/old-docs/README.old.adoc +0 -2831
  47. data/old-docs/RSPEC.adoc +0 -814
  48. data/old-docs/RUBY_API.adoc +0 -485
  49. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
  50. data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
  51. data/old-docs/STRING_COMPARE.adoc +0 -345
  52. data/old-docs/TMP.adoc +0 -3384
  53. data/old-docs/TREE_DIFF.adoc +0 -1080
  54. data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
  55. data/old-docs/VERBOSE.adoc +0 -482
  56. data/old-docs/VISUALIZATION_MAP.adoc +0 -625
  57. data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
  58. data/scripts/analyze_current_state.rb +0 -85
  59. data/scripts/analyze_false_positives.rb +0 -114
  60. data/scripts/analyze_remaining_failures.rb +0 -105
  61. data/scripts/compare_current_failures.rb +0 -95
  62. data/scripts/compare_dom_tree_diff.rb +0 -158
  63. data/scripts/compare_failures.rb +0 -151
  64. data/scripts/debug_attribute_extraction.rb +0 -66
  65. data/scripts/debug_blocks_839.rb +0 -115
  66. data/scripts/debug_meta_matching.rb +0 -52
  67. data/scripts/debug_p_matching.rb +0 -192
  68. data/scripts/debug_signature_matching.rb +0 -118
  69. data/scripts/debug_sourcecode_124.rb +0 -32
  70. data/scripts/debug_whitespace_sensitive.rb +0 -192
  71. data/scripts/extract_false_positives.rb +0 -138
  72. data/scripts/find_actual_false_positives.rb +0 -125
  73. data/scripts/investigate_all_false_positives.rb +0 -161
  74. data/scripts/investigate_batch1.rb +0 -127
  75. data/scripts/investigate_classification.rb +0 -150
  76. data/scripts/investigate_classification_detailed.rb +0 -190
  77. data/scripts/investigate_common_failures.rb +0 -342
  78. data/scripts/investigate_false_negative.rb +0 -80
  79. data/scripts/investigate_false_positive.rb +0 -83
  80. data/scripts/investigate_false_positives.rb +0 -227
  81. data/scripts/investigate_false_positives_batch.rb +0 -163
  82. data/scripts/investigate_mixed_content.rb +0 -125
  83. data/scripts/investigate_remaining_16.rb +0 -214
  84. data/scripts/run_single_test.rb +0 -29
  85. data/scripts/test_all_false_positives.rb +0 -95
  86. data/scripts/test_attribute_details.rb +0 -61
  87. data/scripts/test_both_algorithms.rb +0 -49
  88. data/scripts/test_both_simple.rb +0 -49
  89. data/scripts/test_enhanced_semantic_output.rb +0 -125
  90. data/scripts/test_readme_examples.rb +0 -131
  91. data/scripts/test_semantic_tree_diff.rb +0 -99
  92. data/scripts/test_semantic_ux_improvements.rb +0 -135
  93. data/scripts/test_single_false_positive.rb +0 -119
  94. data/scripts/test_size_limits.rb +0 -99
  95. data/test_html_1.html +0 -21
  96. data/test_html_2.html +0 -21
  97. data/test_nokogiri.rb +0 -33
  98. data/test_normalize.rb +0 -45
@@ -1,625 +0,0 @@
1
- == Character visualization
2
-
3
- === General
4
-
5
- **Purpose**: Make invisible whitespace and special characters visible in diffs.
6
-
7
- Whitespace changes can be difficult to spot in traditional diffs because spaces,
8
- tabs, and other invisible characters don't appear in output.
9
-
10
- Canon visualizes these changes using a comprehensive set of Unicode symbols that
11
- are safe for use with CJK (Chinese, Japanese, Korean) text.
12
-
13
- **Visualization scope**: Character visualization is applied only to **diff lines**
14
- (additions, deletions, and changes), not to context lines (unchanged lines).
15
-
16
- This ensures that:
17
-
18
- * Context lines display content in its original form without substitution
19
- * Only actual changes show visualization, making differences easier to spot
20
- * Within changed lines showing token-level diffs, unchanged tokens are displayed
21
- in the terminal's default color (not red/green) to distinguish them from actual
22
- changes
23
-
24
- === How it works
25
-
26
- The character visualization feature works by substituting specific defined
27
- characters with visible symbols during diff output generation.
28
-
29
- This is especially useful for detecting character-level differences that are otherwise
30
- invisible, such as:
31
-
32
- * Extra spaces or tabs
33
- * Non-breaking spaces (U+00A0)
34
- * Zero-width spaces (U+200B)
35
- * Directional markers (e.g., LTR/RTL markers)
36
- * Control characters (e.g., soft hyphens)
37
-
38
- .Whitespace visualization examples
39
- [example]
40
- ====
41
- [source]
42
- ----
43
- # Space added between tags
44
- 10| -| <tag>Value</tag> # No space
45
- | 10+| <tag>░Value</tag> # Space added (green light shade)
46
-
47
- # Tab character
48
- 15| -| <tag>⇥Value</tag> # Tab (red arrow-to-bar)
49
- | 15+| <tag>░░Value</tag> # Two spaces (green light shades)
50
-
51
- # Non-breaking space (U+00A0)
52
- 20| -| <tag>Value</tag> # Regular space
53
- | 20+| <tag>Value␣</tag> # Non-breaking space (green open box)
54
-
55
- # Zero-width space (U+200B)
56
- 25| -| <word1><word2> # No zero-width space
57
- | 25+| <word1>→<word2> # Zero-width space (green arrow)
58
-
59
- # Mixed invisible characters
60
- 30| -| <p>Text▬more</p> # Em space (red black rectangle)
61
- | 30+| <p>Text░more</p> # Regular space (green light shade)
62
- ----
63
- ====
64
-
65
- Visualization symbols are rendered in:
66
-
67
- * Red when showing removed/deleted characters
68
- * Green when showing added/inserted characters
69
- * Bold to make them more visible
70
-
71
- **When is this useful?**
72
-
73
- 1. **Test failures due to formatting**: Your test expects compact XML but receives
74
- pretty-printed XML with different indentation
75
-
76
- 2. **Mixed whitespace**: Some parts of your code use tabs while others use spaces
77
-
78
- 3. **Non-breaking spaces**: Copy-pasted content from browsers often contains
79
- U+00A0 instead of regular spaces
80
-
81
- 4. **Zero-width characters**: Invisible Unicode characters that cause mysterious
82
- comparison failures
83
-
84
- 5. **RTL/LTR markers**: Bidirectional text markers in internationalized content
85
-
86
- 6. **Template differences**: Generated output has invisible character differences
87
-
88
-
89
- .Real-world example: Non-breaking space from web copy-paste
90
- [example]
91
- Without whitespace visualization, these two lines look identical:
92
-
93
- [source,xml]
94
- ----
95
- <foreword id="fwd">
96
- <foreword id="fwd">
97
- ----
98
-
99
- With whitespace visualization enabled, the difference is immediately visible:
100
-
101
- [source]
102
- ----
103
- 4| -| <foreword░id="fwd"> # Regular space (U+0020)
104
- | 4+| <foreword␣id="fwd"> # Non-breaking space (U+00A0)
105
- ----
106
-
107
- The different symbols (`░` vs `␣`) clearly show that one uses a regular space
108
- while the other uses a non-breaking space, likely from copying text from a web
109
- page or word processor.
110
-
111
- .Real-world example: Zero-width characters
112
- [example]
113
- Zero-width characters are completely invisible but affect comparison:
114
-
115
- [source,xml]
116
- ----
117
- <item>Widget</item>
118
- <item>Widget</item> <!-- Contains U+200B zero-width space after "Widget" -->
119
- ----
120
-
121
- The diff shows:
122
-
123
- [source]
124
- ----
125
- 5| -| <item>Widget</item>
126
- | 5+| <item>Widget→</item> # Zero-width space visualized as →
127
- ----
128
-
129
- The rightwards arrow (`→`) reveals the presence of a zero-width space that would
130
- otherwise be impossible to detect.
131
-
132
- .Real-world example: Unicode legend in action
133
- [example]
134
- When a diff contains multiple non-ASCII characters, the legend appears at the top:
135
-
136
- [source]
137
- ----
138
- Character Visualization Legend:
139
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
140
- Whitespace:
141
- '␣': U+00A0 (' ') No-Break Space
142
- '⏓': U+2005 (' ') Four-Per-Em Space
143
-
144
- Zero-Width Characters:
145
- '→': U+200B (​​) Zero-Width Space
146
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
147
-
148
- 4| -| <p>Hello world</p>
149
- | 4+| <p>Hello␣world⏓test→end</p>
150
- ----
151
-
152
- The legend tells you exactly what each symbol means, making it easy to identify
153
- and fix the issue.
154
-
155
-
156
-
157
- === Character substitution legend
158
-
159
- When a character visualization substitution is performed in a diff, Canon
160
- automatically displays a **Character Visualization Legend** at the top of the
161
- diff output.
162
-
163
- This legend explains what each visualization symbol represents.
164
-
165
- This legend applies to all interfaces that display diffs, across all Canon
166
- supported formats.
167
-
168
- .Legend format
169
- [example]
170
- ====
171
- [source]
172
- ----
173
- Character Visualization Legend:
174
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
175
- Whitespace:
176
- '␣': U+00A0 (' ') No-Break Space
177
- '⏓': U+2005 (' ') Four-Per-Em Space
178
-
179
- Zero-Width Characters:
180
- '→': U+200B (​​) Zero-Width Space
181
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
182
-
183
- [... diff output follows ...]
184
- ----
185
- ====
186
-
187
- Each legend entry uses the following format:
188
-
189
- [source]
190
- ----
191
- 'vis': U+XXXX ('orig') Character-Name
192
-
193
- Where:
194
- vis = The visualization symbol (e.g., ⏓)
195
- XXXX = Unicode codepoint in hex
196
- orig = The original character (shown only if different from visualization)
197
- Character-Name = Human-readable name from character_map.yml
198
- ----
199
-
200
- The legend is:
201
-
202
- * **Grouped by category**: Characters are organized under category headings (Whitespace, Line Endings, Zero-Width Characters, Directional Markers, Control Characters)
203
- * **Shows all three forms**: visualization symbol, Unicode codepoint, original character (when different)
204
- * **Only shows detected characters**: The legend only includes characters actually found in the diff
205
-
206
-
207
- === Character map configuration and definition
208
-
209
- Canon's character visualization mappings are defined in a central YAML
210
- configuration file:
211
-
212
- * `lib/canon/diff_formatter/character_map.yml`
213
-
214
- This file can be overridden or extended to customize how specific characters are
215
- visualized.
216
-
217
- This file contains all character definitions organized by category.
218
-
219
- The character map definitions follow this format:
220
-
221
- [source,yaml]
222
- ----
223
- characters:
224
- - character: " " # Regular space (ASCII)
225
- visualization: "░" # Symbol shown in diffs
226
- category: whitespace # Category
227
- name: "Space" # Human-readable name
228
- ----
229
-
230
- Each character entry includes:
231
-
232
- `character`:: (required unless `unicode`) The actual character (for ASCII chars
233
- or sequences like `"\n"`, `"\t"`, `"\r\n"`).
234
-
235
- `unicode`:: (required unless `character`) The Unicode hex code (e.g., `"2005"` for
236
- U+2005).
237
-
238
- `visualization`:: (required) The symbol used to represent this character in
239
- diffs. This should be a visible character.
240
-
241
- `category`:: (required) The category this character belongs to (whitespace,
242
- line_ending, zero_width, directional, control)
243
-
244
- `name`:: (required) Human-readable name of the character
245
-
246
-
247
- .Example character map entry
248
- [source,yaml]
249
- ----
250
- - unicode: "2005" # Four-Per-Em Space (U+2005)
251
- visualization: "⏓" # Symbol shown in diffs
252
- category: whitespace
253
- name: "Four-Per-Em Space"
254
-
255
- - character: "\n" # Line feed (ASCII)
256
- visualization: "↵"
257
- category: line_ending
258
- name: "Line Feed"
259
- ----
260
-
261
-
262
- === Customizing visualization
263
-
264
- ==== General
265
-
266
- You can customize the character visualization map for your specific needs
267
- by providing your own definitions in one of three ways:
268
-
269
- * A complete override map using the `visualization_map` parameter (absolute
270
- highest priority - ignores all other customization)
271
-
272
- * A custom YAML character map file using the `character_map_file` parameter
273
- (merges with defaults)
274
-
275
- * Individual character overrides using the `character_definitions` parameter
276
- (merges with defaults and file, highest priority among merging methods)
277
-
278
- **Priority order**:
279
-
280
- . If `visualization_map` is provided → Complete replacement (ignores
281
- `character_map_file` and `character_definitions`)
282
-
283
- . Otherwise, start with Canon defaults, then:
284
-
285
- .. Apply `character_map_file` customizations (if provided)
286
-
287
- .. Apply `character_definitions` customizations (if provided)
288
-
289
- This means `character_definitions` overrides `character_map_file`, which in turn
290
- overrides Canon defaults.
291
-
292
-
293
- ==== Map replacement
294
-
295
- Provide a complete visualization map that replaces all defaults.
296
-
297
- Each character entry must comply with the character definition provided above.
298
-
299
- [source,ruby]
300
- ----
301
- # Define your own complete map
302
- my_map = [
303
- { character: ' ', visualization: '·', category: 'whitespace', name: 'Space' },
304
- { character: "\t", visualization: '→', category: 'whitespace', name: 'Tab' },
305
- { unicode: "00A0", visualization: '␣', category: 'whitespace', name: 'Non-Breaking Space' },
306
- { unicode: "200B", visualization: '⚠', category: 'zero_width', name: 'Zero-Width Space' },
307
- ]
308
-
309
- formatter = Canon::DiffFormatter.new( visualization_map: my_map )
310
- ----
311
-
312
- WARNING: If `visualization_map` parameter is provided, it completely replaces the
313
- defaults and ignores `character_map_file` and `character_definitions`.
314
-
315
-
316
- ==== Individual overrides
317
-
318
- Override or add individual characters while keeping defaults:
319
-
320
- [source,ruby]
321
- ----
322
- formatter = Canon::DiffFormatter.new(
323
- character_definitions: [
324
- { unicode: "2005", visualization: "★", category: "whitespace", name: "Four-Per-Em Space" },
325
- { character: "\t", visualization: "→→", category: "whitespace", name: "Tab" },
326
- { unicode: "3042", visualization: "あ", category: "control", name: "Hiragana Letter A" },
327
- ]
328
- )
329
- ----
330
-
331
- [source,ruby]
332
- ----
333
- require 'canon/diff_formatter'
334
-
335
- # Create custom visualization map
336
- custom_map = Canon::DiffFormatter.merge_visualization_map([
337
- { unicode: '0020', visualization: '·', category: 'whitespace', name: 'Space' },
338
- { character: "\t", visualization: '→', category: 'whitespace', name: 'Tab' },
339
- { unicode: "200B", visualization: '⚠', category: 'zero_width', name: 'Zero-Width Space' },
340
- ])
341
-
342
- # Use custom map with formatter
343
- formatter = Canon::DiffFormatter.new( use_color: true, visualization_map: custom_map )
344
-
345
- # The custom map merges with defaults, so unspecified characters still use the
346
- # default visualization
347
- ----
348
-
349
- ==== Extending with a custom YAML file
350
-
351
- Provide a custom YAML file defining character visualizations.
352
-
353
- Notice that this file **extends** the default character map, allowing you to
354
- add or override specific characters without redefining the entire map.
355
-
356
- [source,ruby]
357
- ----
358
- formatter = Canon::DiffFormatter.new(
359
- character_map_file: "custom_map.yml", # Merges with defaults
360
- character_definitions: [ # Overrides file + defaults
361
- { unicode: "2005", visualization: "◆", category: "whitespace", name: "Four-Per-Em Space" }
362
- ]
363
- )
364
- ----
365
-
366
- NOTE: The `character_definitions` take precedence over `character_map_file`,
367
- which in turn takes precedence over Canon defaults. However, if you provide
368
- `visualization_map` directly, it completely replaces everything and ignores both
369
- `character_map_file` and `character_definitions`.
370
-
371
-
372
- ==== For Canon developers
373
-
374
- The official character map can be extended by contributing to
375
- `character_map.yml`:
376
-
377
- [source,yaml]
378
- ----
379
- characters:
380
- # ... existing characters ...
381
-
382
- # Add your custom character
383
- - unicode: "XXXX" # Replace with hex code
384
- visualization: "symbol" # Your chosen symbol
385
- category: category_name # One of: whitespace, line_ending, zero_width, directional, control
386
- name: "Character Name"
387
- ----
388
-
389
-
390
- === Default character map
391
-
392
- Canon provides a comprehensive CJK-safe character mapping for common non-visible
393
- characters encountered in diffs.
394
-
395
- NOTE: These visualization symbols appear **only in diff lines** (additions,
396
- deletions, and changes), not in context lines (unchanged lines).
397
-
398
- .Common whitespace characters
399
- [cols="1,1,1,2"]
400
- |===
401
- |Character |Unicode |Symbol |Description
402
-
403
- |Regular space
404
- |U+0020
405
- |`░`
406
- |Light Shade (U+2591)
407
-
408
- |Tab
409
- |U+0009
410
- |`⇥`
411
- |Rightwards Arrow to Bar (U+21E5)
412
-
413
- |Non-breaking space
414
- |U+00A0
415
- |`␣`
416
- |Open Box (U+2423)
417
- |===
418
-
419
- .Line endings
420
- [cols="1,1,1,2"]
421
- |===
422
- |Character |Unicode |Symbol |Description
423
-
424
- |Line feed (LF)
425
- |U+000A
426
- |`↵`
427
- |Downwards Arrow with Corner Leftwards (U+21B5)
428
-
429
- |Carriage return (CR)
430
- |U+000D
431
- |`⏎`
432
- |Return Symbol (U+23CE)
433
-
434
- |Windows line ending (CRLF)
435
- |U+000D U+000A
436
- |`↵`
437
- |Downwards Arrow with Corner Leftwards (U+21B5)
438
-
439
- |Next line (NEL)
440
- |U+0085
441
- |`⏎`
442
- |Return Symbol (U+23CE)
443
-
444
- |Line separator
445
- |U+2028
446
- |`⤓`
447
- |Downwards Arrow to Bar (U+2913)
448
-
449
- |Paragraph separator
450
- |U+2029
451
- |`⤓`
452
- |Downwards Arrow to Bar (U+2913)
453
- |===
454
-
455
- .Unicode spaces (various widths)
456
- [cols="1,1,1,2"]
457
- |===
458
- |Character |Unicode |Symbol |Description
459
-
460
- |En space
461
- |U+2002
462
- |`▭`
463
- |White Rectangle (U+25AD)
464
-
465
- |Em space
466
- |U+2003
467
- |`▬`
468
- |Black Rectangle (U+25AC)
469
-
470
- |Four-per-em space
471
- |U+2005
472
- |`⏓`
473
- |Metrical Short Over Long (U+23D3)
474
-
475
- |Six-per-em space
476
- |U+2006
477
- |`⏕`
478
- |Metrical Two Shorts Over Long (U+23D5)
479
-
480
- |Thin space
481
- |U+2009
482
- |`▯`
483
- |White Vertical Rectangle (U+25AF)
484
-
485
- |Hair space
486
- |U+200A
487
- |`▮`
488
- |Black Vertical Rectangle (U+25AE)
489
-
490
- |Figure space
491
- |U+2007
492
- |`□`
493
- |White Square (U+25A1)
494
-
495
- |Narrow no-break space
496
- |U+202F
497
- |`▫`
498
- |White Small Square (U+25AB)
499
-
500
- |Medium mathematical space
501
- |U+205F
502
- |`▭`
503
- |White Rectangle (U+25AD)
504
-
505
- |Ideographic space
506
- |U+3000
507
- |`⎵`
508
- |Bottom Square Bracket (U+23B5)
509
-
510
- |Ideographic half fill space
511
- |U+303F
512
- |`⏑`
513
- |Metrical Breve (U+23D1)
514
-
515
- |===
516
-
517
- .Zero-width characters (invisible troublemakers)
518
- [cols="1,1,1,2"]
519
- |===
520
- |Character |Unicode |Symbol |Description
521
-
522
- |Zero-width space
523
- |U+200B
524
- |`→`
525
- |Rightwards Arrow (U+2192)
526
-
527
- |Zero-width non-joiner
528
- |U+200C
529
- |`↛`
530
- |Rightwards Arrow with Stroke (U+219B)
531
-
532
- |Zero-width joiner
533
- |U+200D
534
- |`⇢`
535
- |Rightwards Dashed Arrow (U+21E2)
536
-
537
- |Zero-width no-break space (BOM)
538
- |U+FEFF
539
- |`⇨`
540
- |Rightwards White Arrow (U+21E8)
541
- |===
542
-
543
- .Bidirectional/RTL markers
544
- [cols="1,1,1,2"]
545
- |===
546
- |Character |Unicode |Symbol |Description
547
-
548
- |Left-to-right mark
549
- |U+200E
550
- |`⟹`
551
- |Long Rightwards Double Arrow (U+27F9)
552
-
553
- |Right-to-left mark
554
- |U+200F
555
- |`⟸`
556
- |Long Leftwards Double Arrow (U+27F8)
557
-
558
- |LTR embedding
559
- |U+202A
560
- |`⇒`
561
- |Rightwards Double Arrow (U+21D2)
562
-
563
- |RTL embedding
564
- |U+202B
565
- |`⇐`
566
- |Leftwards Double Arrow (U+21D0)
567
-
568
- |Pop directional formatting
569
- |U+202C
570
- |`↔`
571
- |Left Right Arrow (U+2194)
572
-
573
- |LTR override
574
- |U+202D
575
- |`⇉`
576
- |Rightwards Paired Arrows (U+21C9)
577
-
578
- |RTL override
579
- |U+202E
580
- |`⇇`
581
- |Leftwards Paired Arrows (U+21C7)
582
- |===
583
-
584
- .Control characters
585
- [cols="1,1,1,2"]
586
- |===
587
- |Character |Unicode |Symbol |Description
588
-
589
- |Null
590
- |U+0000
591
- |`␀`
592
- |Symbol for Null (U+2400)
593
-
594
- |Soft hyphen
595
- |U+00AD
596
- |`­‐`
597
- |Hyphen (U+2010)
598
-
599
- |Backspace
600
- |U+0008
601
- |`␈`
602
- |Symbol for Backspace (U+2408)
603
-
604
- |Delete
605
- |U+007F
606
- |`␡`
607
- |Symbol for Delete (U+2421)
608
- |===
609
-
610
- [NOTE]
611
- ====
612
- The default visualization characters are specifically chosen to avoid conflicts
613
- with CJK text:
614
-
615
- * **No middle dots** (`·`) - commonly used as separators in CJK
616
- * **No bullets** (`∙`) - used in CJK lists
617
- * **No circles** (`◌◍◎`) - look similar to CJK characters like ○ ●
618
- * **No small dots** (`⋅`) - conflict with CJK punctuation
619
-
620
- Instead, Canon uses:
621
-
622
- * Box characters (`□▭▬▯▮▫`) for various space types
623
- * Arrow symbols (`→↛⇢⇨⟹⟸⇒⇐`) for zero-width and directional characters
624
- * Control Pictures block symbols (`␀␈␡`) for control characters
625
- ====