canon 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +112 -25
  3. data/docs/Gemfile +1 -0
  4. data/docs/_config.yml +90 -1
  5. data/docs/advanced/diff-classification.adoc +82 -2
  6. data/docs/features/match-options/index.adoc +239 -1
  7. data/lib/canon/comparison/format_detector.rb +2 -1
  8. data/lib/canon/comparison/html_comparator.rb +19 -8
  9. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  10. data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
  11. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  12. data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
  13. data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
  14. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
  15. data/lib/canon/comparison/xml_comparator.rb +48 -23
  16. data/lib/canon/comparison/xml_node_comparison.rb +25 -3
  17. data/lib/canon/diff/diff_classifier.rb +101 -2
  18. data/lib/canon/diff/formatting_detector.rb +1 -1
  19. data/lib/canon/rspec_matchers.rb +37 -8
  20. data/lib/canon/version.rb +1 -1
  21. data/lib/canon/xml/data_model.rb +24 -13
  22. metadata +3 -78
  23. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
  24. data/false_positive_analysis.txt +0 -0
  25. data/file1.html +0 -1
  26. data/file2.html +0 -1
  27. data/old-docs/ADVANCED_TOPICS.adoc +0 -20
  28. data/old-docs/BASIC_USAGE.adoc +0 -16
  29. data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
  30. data/old-docs/CLI.adoc +0 -497
  31. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  32. data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
  33. data/old-docs/DIFF_FORMATTING.adoc +0 -540
  34. data/old-docs/DIFF_PARAMETERS.adoc +0 -261
  35. data/old-docs/DOM_DIFF.adoc +0 -1017
  36. data/old-docs/ENV_CONFIG.adoc +0 -876
  37. data/old-docs/FORMATS.adoc +0 -867
  38. data/old-docs/INPUT_VALIDATION.adoc +0 -477
  39. data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
  40. data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
  41. data/old-docs/MATCH_OPTIONS.adoc +0 -912
  42. data/old-docs/MODES.adoc +0 -432
  43. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  44. data/old-docs/OPTIONS.adoc +0 -1387
  45. data/old-docs/PREPROCESSING.adoc +0 -491
  46. data/old-docs/README.old.adoc +0 -2831
  47. data/old-docs/RSPEC.adoc +0 -814
  48. data/old-docs/RUBY_API.adoc +0 -485
  49. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
  50. data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
  51. data/old-docs/STRING_COMPARE.adoc +0 -345
  52. data/old-docs/TMP.adoc +0 -3384
  53. data/old-docs/TREE_DIFF.adoc +0 -1080
  54. data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
  55. data/old-docs/VERBOSE.adoc +0 -482
  56. data/old-docs/VISUALIZATION_MAP.adoc +0 -625
  57. data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
  58. data/scripts/analyze_current_state.rb +0 -85
  59. data/scripts/analyze_false_positives.rb +0 -114
  60. data/scripts/analyze_remaining_failures.rb +0 -105
  61. data/scripts/compare_current_failures.rb +0 -95
  62. data/scripts/compare_dom_tree_diff.rb +0 -158
  63. data/scripts/compare_failures.rb +0 -151
  64. data/scripts/debug_attribute_extraction.rb +0 -66
  65. data/scripts/debug_blocks_839.rb +0 -115
  66. data/scripts/debug_meta_matching.rb +0 -52
  67. data/scripts/debug_p_matching.rb +0 -192
  68. data/scripts/debug_signature_matching.rb +0 -118
  69. data/scripts/debug_sourcecode_124.rb +0 -32
  70. data/scripts/debug_whitespace_sensitive.rb +0 -192
  71. data/scripts/extract_false_positives.rb +0 -138
  72. data/scripts/find_actual_false_positives.rb +0 -125
  73. data/scripts/investigate_all_false_positives.rb +0 -161
  74. data/scripts/investigate_batch1.rb +0 -127
  75. data/scripts/investigate_classification.rb +0 -150
  76. data/scripts/investigate_classification_detailed.rb +0 -190
  77. data/scripts/investigate_common_failures.rb +0 -342
  78. data/scripts/investigate_false_negative.rb +0 -80
  79. data/scripts/investigate_false_positive.rb +0 -83
  80. data/scripts/investigate_false_positives.rb +0 -227
  81. data/scripts/investigate_false_positives_batch.rb +0 -163
  82. data/scripts/investigate_mixed_content.rb +0 -125
  83. data/scripts/investigate_remaining_16.rb +0 -214
  84. data/scripts/run_single_test.rb +0 -29
  85. data/scripts/test_all_false_positives.rb +0 -95
  86. data/scripts/test_attribute_details.rb +0 -61
  87. data/scripts/test_both_algorithms.rb +0 -49
  88. data/scripts/test_both_simple.rb +0 -49
  89. data/scripts/test_enhanced_semantic_output.rb +0 -125
  90. data/scripts/test_readme_examples.rb +0 -131
  91. data/scripts/test_semantic_tree_diff.rb +0 -99
  92. data/scripts/test_semantic_ux_improvements.rb +0 -135
  93. data/scripts/test_single_false_positive.rb +0 -119
  94. data/scripts/test_size_limits.rb +0 -99
  95. data/test_html_1.html +0 -21
  96. data/test_html_2.html +0 -21
  97. data/test_nokogiri.rb +0 -33
  98. data/test_normalize.rb +0 -45
@@ -1,261 +0,0 @@
1
- == Diff renderer
2
-
3
- === General
4
-
5
- A diff renderer is responsible for taking a diff report and producing a
6
- human-readable representation of the differences between two text files.
7
-
8
- Canon provides a built-in diff renderer that produces a colored
9
- diff output, highlighting the differences between the two files,
10
- in either a by-object or by-line format.
11
-
12
- === Concepts
13
-
14
- ==== Diff report
15
-
16
- A diff report is a structured representation of the complete differences between
17
- two text files. It is composed of multiple diff contexts.
18
-
19
- A diff report serves as the input to a diff renderer, which processes the report
20
- to generate a human-readable diff output.
21
-
22
- A diff report is generated by a comparison engine that analyzes the two text
23
- files and identifies the differences between them.
24
-
25
- ==== Diff context
26
-
27
- A diff context is a representation of a group of diff blocks with surrounding
28
- grouping lines.
29
-
30
- When the number of grouping lines is set to 0, each diff block is treated as its
31
- own context.
32
-
33
- When the number of grouping lines is greater than 0, multiple diff blocks that are close to each other can be grouped together into a single context.
34
-
35
- ==== Diff block
36
-
37
- A diff block is a representation of a contiguous block of changes.
38
-
39
- In typical line-based diffing, a diff block consists of a run of consecutive lines that have been added, removed, or modified.
40
-
41
- In Canon, which uses semantic diffing, a diff block is a representation of a
42
- contiguous block of changes, which may be a many-to-many mapping of changed
43
- lines depending on the nature of the change.
44
-
45
-
46
- === Parameters
47
-
48
- Canon diff renderers support the following parameters:
49
-
50
- ==== Parameters
51
-
52
- The following table shows all available diff formatting parameters and their
53
- availability across interfaces:
54
-
55
- [cols="1,1,1,1,2,1"]
56
- |===
57
- |Parameter |RSpec |CLI |Ruby API |Description |Default
58
-
59
- |`use_color`
60
- |✓
61
- |✓
62
- |✓
63
- |Enable/disable colored output
64
- |`true`
65
-
66
- |`diff_mode`
67
- |✓
68
- |✓
69
- |✓
70
- |Comparison mode: `:by_object` or `:by_line`
71
- |`:by_line` (RSpec), `:by_object` (XML/JSON/YAML)
72
-
73
- |`context_lines`
74
- |✓
75
- |✓
76
- |✓
77
- |Number of unchanged lines to show around each change
78
- |`3`
79
-
80
- |`diff_grouping_lines`
81
- |✓
82
- |✓
83
- |✓
84
- |Maximum line distance to group separate diffs into context blocks
85
- |`10`
86
- |===
87
-
88
- ==== Use color
89
-
90
- `use_color: <boolean>` default: `true`
91
-
92
- Specifies whether to produce colored diff output using ANSI color codes.
93
-
94
- When `use_color` is `true`, the diff output includes ANSI color codes to
95
- enhance readability by visually distinguishing different types of changes.
96
-
97
- * Type: Boolean
98
- * Default: `true`
99
- * Colors used:
100
- ** Red: Deletions/removed content
101
- ** Green: Additions/inserted content
102
- ** Yellow: Modified content
103
- ** Cyan: Element names and structure
104
-
105
- When `use_color` is `false`:
106
-
107
- * Line numbers and pipes are plain text
108
- * Whitespace is not visualized (remains invisible)
109
- * Unicode legend is still shown (but without color)
110
- * Content changes are shown without color highlighting
111
-
112
-
113
- **Purpose**: Improve readability by distinguishing structural elements from
114
- content changes.
115
-
116
- When color mode is enabled (`use_color: true`), the diff formatter uses a
117
- consistent color scheme:
118
-
119
- * **Yellow**: Line numbers and pipe separators
120
- * **Red**: Deletion markers (`-`) and removed content
121
- * **Green**: Addition markers (`+`) and inserted content
122
- * **Default terminal color**: Unchanged context lines (no ANSI codes applied)
123
-
124
- This color scheme helps differentiate between:
125
-
126
- * The diff structure (line numbers, pipes)
127
- * Content that was removed (red)
128
- * Content that was added (green)
129
- * Content that stayed the same (your terminal's default color)
130
-
131
- .Example colored diff output
132
- [example]
133
- In a colored terminal, a typical diff line appears as:
134
-
135
- [source]
136
- ----
137
- 5| 5 | <p>First paragraph</p> # Context line (yellow numbers/pipes, default text)
138
- 6| -| <old>Text</old> # Deletion (yellow numbers/pipes, red marker/content)
139
- | 6+| <new>Text</new> # Addition (yellow numbers/pipes, green marker/content)
140
- ----
141
-
142
- Where:
143
-
144
- * Line numbers (`5`, `6`) are in yellow
145
- * Pipe separators (`|`) are in yellow
146
- * Markers (`-`, `+`) are in red/green respectively
147
- * Changed content is highlighted in red (deletions) or green (additions)
148
- * Unchanged content uses your terminal's default color (no forced white/black)
149
-
150
- **Why this matters**: When running tests with RSpec, the framework initially sets
151
- output to red. Canon's diff formatter explicitly resets colors to prevent RSpec's
152
- red from bleeding into the diff output, ensuring consistent and readable diffs.
153
-
154
-
155
- ==== Diff mode
156
-
157
- `diff_mode: <string>` default: `by_line`
158
-
159
-
160
- ==== Context lines
161
-
162
- `context_lines: <number>` default: `3`
163
-
164
- Specifies the number of context lines to show before and after each diff block.
165
-
166
- Usage:
167
-
168
- [source,ruby]
169
- ----
170
- renderer = Canon::DiffFormatter::Renderer.new(context_lines: 5)
171
- diff_output = renderer.render(diff_report)
172
- ----
173
-
174
- .Example of XML line-by-line diff with context lines set to 3
175
- [example]
176
- ====
177
- There are 3 context lines before and after the diff block:
178
-
179
- [source]
180
- ----
181
- Line-by-line diff (XML mode):
182
- Character Visualization Legend:
183
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
184
- Whitespace:
185
- '␣': U+00A0 (' ') NO-Break-Space
186
-
187
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
188
-
189
- 29| 29 | <eref bibitemid="_" citeas="ISO 639-2" id="_" type="inline">
190
- 30| 30 | </eref>
191
- 31| 31 | <semx element="eref" source="_">
192
- 32| - | <fmt-eref░bibitemid="_"░citeas="ISO░639-2"░type="inline">ISO\u00a0639-2</fmt-eref>
193
- | 32+ | <fmt-eref░bibitemid="_"░citeas="ISO░639-2"░type="inline">ISO␣639-2</fmt-eref>
194
- 33| 33 | </semx>
195
- 34| 34 | </p>
196
- 35| 35 | </clause>
197
- ----
198
- ====
199
-
200
- ==== Grouping lines
201
-
202
- `diff_grouping_lines: <number>` default: `0`
203
-
204
- Specifies the number of grouping lines to coalesce nearby diff blocks into a
205
- single context.
206
-
207
- The algorithm groups diff blocks as long as the distance between the previous
208
- block's end and the next block's start is less than or equal to the grouping
209
- lines setting.
210
-
211
- The default value of `0` means that each diff block is treated as its own
212
- context.
213
-
214
- When set to `5`, for example, any two diff blocks that are within 5 lines of
215
- each other will be grouped together into a single context.
216
-
217
- .Example of XML line-by-line diff with grouping lines set to 10
218
- [example]
219
- ====
220
- Here, multiple diff blocks are grouped together into a single context because
221
- they are within 10 lines of each other.
222
-
223
- [source]
224
- ----
225
- Line-by-line diff (XML mode):
226
- Character Visualization Legend:
227
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
228
- Whitespace:
229
- '␣': U+00A0 (' ') NO-Break-Space
230
-
231
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
232
-
233
- 22| 22 | <span class="fmt-element-name">Figure</span>
234
- 23| 23 | <semx element="autonum" source="B1">1</semx>
235
- 24| 24 | </span>
236
- 25| - | <span░class="fmt-caption-delim">\u00a0—░</span>
237
- | 25+ | <span░class="fmt-caption-delim">␣—░</span>
238
- 26| 26 | <semx element="name" source="_">First</semx>
239
- 27| 27 | </fmt-name>
240
- 28| 28 | <fmt-xref-label>
241
-
242
- 57| 57 | <span class="fmt-element-name">Figure</span>
243
- 58| 58 | <semx element="autonum" source="B2">2</semx>
244
- 59| 59 | </span>
245
- 60| - | <span░class="fmt-caption-delim">\u00a0—░</span>
246
- | 60+ | <span░class="fmt-caption-delim">␣—░</span>
247
- 61| 61 | <semx element="name" source="_">Second</semx>
248
- 62| 62 | </fmt-name>
249
- 63| 63 | <fmt-xref-label>
250
-
251
- 100| 100 | <span class="fmt-element-name">Figure</span>
252
- 101| 101 | <semx element="autonum" source="B3">3</semx>
253
- 102| 102 | </span>
254
- 103| - | <span░class="fmt-caption-delim">\u00a0—░</span>
255
- | 103+ | <span░class="fmt-caption-delim">␣—░</span>
256
- 104| 104 | <semx element="name" source="_">Third</semx>
257
- 105| 105 | </fmt-name>
258
- 106| 106 | <fmt-xref-label>
259
- ----
260
- ====
261
-