canon 0.1.8 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +83 -22
  3. data/docs/Gemfile +1 -0
  4. data/docs/_config.yml +90 -1
  5. data/docs/advanced/diff-classification.adoc +196 -24
  6. data/docs/features/match-options/index.adoc +239 -1
  7. data/lib/canon/comparison/format_detector.rb +2 -1
  8. data/lib/canon/comparison/html_comparator.rb +19 -8
  9. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  10. data/lib/canon/comparison/markup_comparator.rb +109 -2
  11. data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
  12. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  13. data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
  14. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +108 -0
  15. data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
  16. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
  17. data/lib/canon/comparison/xml_comparator.rb +240 -23
  18. data/lib/canon/comparison/xml_node_comparison.rb +25 -3
  19. data/lib/canon/diff/diff_classifier.rb +119 -5
  20. data/lib/canon/diff/formatting_detector.rb +1 -1
  21. data/lib/canon/diff/xml_serialization_formatter.rb +153 -0
  22. data/lib/canon/rspec_matchers.rb +37 -8
  23. data/lib/canon/version.rb +1 -1
  24. data/lib/canon/xml/data_model.rb +24 -13
  25. metadata +4 -78
  26. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
  27. data/false_positive_analysis.txt +0 -0
  28. data/file1.html +0 -1
  29. data/file2.html +0 -1
  30. data/old-docs/ADVANCED_TOPICS.adoc +0 -20
  31. data/old-docs/BASIC_USAGE.adoc +0 -16
  32. data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
  33. data/old-docs/CLI.adoc +0 -497
  34. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  35. data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
  36. data/old-docs/DIFF_FORMATTING.adoc +0 -540
  37. data/old-docs/DIFF_PARAMETERS.adoc +0 -261
  38. data/old-docs/DOM_DIFF.adoc +0 -1017
  39. data/old-docs/ENV_CONFIG.adoc +0 -876
  40. data/old-docs/FORMATS.adoc +0 -867
  41. data/old-docs/INPUT_VALIDATION.adoc +0 -477
  42. data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
  43. data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
  44. data/old-docs/MATCH_OPTIONS.adoc +0 -912
  45. data/old-docs/MODES.adoc +0 -432
  46. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  47. data/old-docs/OPTIONS.adoc +0 -1387
  48. data/old-docs/PREPROCESSING.adoc +0 -491
  49. data/old-docs/README.old.adoc +0 -2831
  50. data/old-docs/RSPEC.adoc +0 -814
  51. data/old-docs/RUBY_API.adoc +0 -485
  52. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
  53. data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
  54. data/old-docs/STRING_COMPARE.adoc +0 -345
  55. data/old-docs/TMP.adoc +0 -3384
  56. data/old-docs/TREE_DIFF.adoc +0 -1080
  57. data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
  58. data/old-docs/VERBOSE.adoc +0 -482
  59. data/old-docs/VISUALIZATION_MAP.adoc +0 -625
  60. data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
  61. data/scripts/analyze_current_state.rb +0 -85
  62. data/scripts/analyze_false_positives.rb +0 -114
  63. data/scripts/analyze_remaining_failures.rb +0 -105
  64. data/scripts/compare_current_failures.rb +0 -95
  65. data/scripts/compare_dom_tree_diff.rb +0 -158
  66. data/scripts/compare_failures.rb +0 -151
  67. data/scripts/debug_attribute_extraction.rb +0 -66
  68. data/scripts/debug_blocks_839.rb +0 -115
  69. data/scripts/debug_meta_matching.rb +0 -52
  70. data/scripts/debug_p_matching.rb +0 -192
  71. data/scripts/debug_signature_matching.rb +0 -118
  72. data/scripts/debug_sourcecode_124.rb +0 -32
  73. data/scripts/debug_whitespace_sensitive.rb +0 -192
  74. data/scripts/extract_false_positives.rb +0 -138
  75. data/scripts/find_actual_false_positives.rb +0 -125
  76. data/scripts/investigate_all_false_positives.rb +0 -161
  77. data/scripts/investigate_batch1.rb +0 -127
  78. data/scripts/investigate_classification.rb +0 -150
  79. data/scripts/investigate_classification_detailed.rb +0 -190
  80. data/scripts/investigate_common_failures.rb +0 -342
  81. data/scripts/investigate_false_negative.rb +0 -80
  82. data/scripts/investigate_false_positive.rb +0 -83
  83. data/scripts/investigate_false_positives.rb +0 -227
  84. data/scripts/investigate_false_positives_batch.rb +0 -163
  85. data/scripts/investigate_mixed_content.rb +0 -125
  86. data/scripts/investigate_remaining_16.rb +0 -214
  87. data/scripts/run_single_test.rb +0 -29
  88. data/scripts/test_all_false_positives.rb +0 -95
  89. data/scripts/test_attribute_details.rb +0 -61
  90. data/scripts/test_both_algorithms.rb +0 -49
  91. data/scripts/test_both_simple.rb +0 -49
  92. data/scripts/test_enhanced_semantic_output.rb +0 -125
  93. data/scripts/test_readme_examples.rb +0 -131
  94. data/scripts/test_semantic_tree_diff.rb +0 -99
  95. data/scripts/test_semantic_ux_improvements.rb +0 -135
  96. data/scripts/test_single_false_positive.rb +0 -119
  97. data/scripts/test_size_limits.rb +0 -99
  98. data/test_html_1.html +0 -21
  99. data/test_html_2.html +0 -21
  100. data/test_nokogiri.rb +0 -33
  101. data/test_normalize.rb +0 -45
@@ -1,261 +0,0 @@
1
- == Diff renderer
2
-
3
- === General
4
-
5
- A diff renderer is responsible for taking a diff report and producing a
6
- human-readable representation of the differences between two text files.
7
-
8
- Canon provides a built-in diff renderer that produces a colored
9
- diff output, highlighting the differences between the two files,
10
- in either a by-object or by-line format.
11
-
12
- === Concepts
13
-
14
- ==== Diff report
15
-
16
- A diff report is a structured representation of the complete differences between
17
- two text files. It is composed of multiple diff contexts.
18
-
19
- A diff report serves as the input to a diff renderer, which processes the report
20
- to generate a human-readable diff output.
21
-
22
- A diff report is generated by a comparison engine that analyzes the two text
23
- files and identifies the differences between them.
24
-
25
- ==== Diff context
26
-
27
- A diff context is a representation of a group of diff blocks with surrounding
28
- grouping lines.
29
-
30
- When the number of grouping lines is set to 0, each diff block is treated as its
31
- own context.
32
-
33
- When the number of grouping lines is greater than 0, multiple diff blocks that are close to each other can be grouped together into a single context.
34
-
35
- ==== Diff block
36
-
37
- A diff block is a representation of a contiguous block of changes.
38
-
39
- In typical line-based diffing, a diff block consists of a run of consecutive lines that have been added, removed, or modified.
40
-
41
- In Canon, which uses semantic diffing, a diff block is a representation of a
42
- contiguous block of changes, which may be a many-to-many mapping of changed
43
- lines depending on the nature of the change.
44
-
45
-
46
- === Parameters
47
-
48
- Canon diff renderers support the following parameters:
49
-
50
- ==== Parameter summary
51
-
52
- The following table shows all available diff formatting parameters and their
53
- availability across interfaces:
54
-
55
- [cols="1,1,1,1,2,1"]
56
- |===
57
- |Parameter |RSpec |CLI |Ruby API |Description |Default
58
-
59
- |`use_color`
60
- |✓
61
- |✓
62
- |✓
63
- |Enable/disable colored output
64
- |`true`
65
-
66
- |`diff_mode`
67
- |✓
68
- |✓
69
- |✓
70
- |Comparison mode: `:by_object` or `:by_line`
71
- |`:by_line` (RSpec), `:by_object` (XML/JSON/YAML)
72
-
73
- |`context_lines`
74
- |✓
75
- |✓
76
- |✓
77
- |Number of unchanged lines to show around each change
78
- |`3`
79
-
80
- |`diff_grouping_lines`
81
- |✓
82
- |✓
83
- |✓
84
- |Maximum line distance to group separate diffs into context blocks
85
- |`10`
86
- |===
87
-
88
- ==== Use color
89
-
90
- `use_color: <boolean>` default: `true`
91
-
92
- Specifies whether to produce colored diff output using ANSI color codes.
93
-
94
- When `use_color` is `true`, the diff output includes ANSI color codes to
95
- enhance readability by visually distinguishing different types of changes.
96
-
97
- * Type: Boolean
98
- * Default: `true`
99
- * Colors used:
100
- ** Red: Deletions/removed content
101
- ** Green: Additions/inserted content
102
- ** Yellow: Modified content
103
- ** Cyan: Element names and structure
104
-
105
- When `use_color` is `false`:
106
-
107
- * Line numbers and pipes are plain text
108
- * Whitespace is not visualized (remains invisible)
109
- * Unicode legend is still shown (but without color)
110
- * Content changes are shown without color highlighting
111
-
112
-
113
- **Purpose**: Improve readability by distinguishing structural elements from
114
- content changes.
115
-
116
- When color mode is enabled (`use_color: true`), the diff formatter uses a
117
- consistent color scheme:
118
-
119
- * **Yellow**: Line numbers and pipe separators
120
- * **Red**: Deletion markers (`-`) and removed content
121
- * **Green**: Addition markers (`+`) and inserted content
122
- * **Default terminal color**: Unchanged context lines (no ANSI codes applied)
123
-
124
- This color scheme helps differentiate between:
125
-
126
- * The diff structure (line numbers, pipes)
127
- * Content that was removed (red)
128
- * Content that was added (green)
129
- * Content that stayed the same (your terminal's default color)
130
-
131
- .Example colored diff output
132
- [example]
133
- In a colored terminal, a typical diff line appears as:
134
-
135
- [source]
136
- ----
137
- 5| 5 | <p>First paragraph</p> # Context line (yellow numbers/pipes, default text)
138
- 6| -| <old>Text</old> # Deletion (yellow numbers/pipes, red marker/content)
139
- | 6+| <new>Text</new> # Addition (yellow numbers/pipes, green marker/content)
140
- ----
141
-
142
- Where:
143
-
144
- * Line numbers (`5`, `6`) are in yellow
145
- * Pipe separators (`|`) are in yellow
146
- * Markers (`-`, `+`) are in red/green respectively
147
- * Changed content is highlighted in red (deletions) or green (additions)
148
- * Unchanged content uses your terminal's default color (no forced white/black)
149
-
150
- **Why this matters**: When running tests with RSpec, the framework initially sets
151
- output to red. Canon's diff formatter explicitly resets colors to prevent RSpec's
152
- red from bleeding into the diff output, ensuring consistent and readable diffs.
153
-
154
-
155
- ==== Diff mode
156
-
157
- `diff_mode: <string>` default: `by_line`
158
-
159
-
160
- ==== Context lines
161
-
162
- `context_lines: <number>` default: `3`
163
-
164
- Specifies the number of context lines to show before and after each diff block.
165
-
166
- Usage:
167
-
168
- [source,ruby]
169
- ----
170
- renderer = Canon::DiffFormatter::Renderer.new(context_lines: 5)
171
- diff_output = renderer.render(diff_report)
172
- ----
173
-
174
- .Example of XML line-by-line diff with context lines set to 3
175
- [example]
176
- ====
177
- There are 3 context lines before and after the diff block:
178
-
179
- [source]
180
- ----
181
- Line-by-line diff (XML mode):
182
- Character Visualization Legend:
183
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
184
- Whitespace:
185
- '␣': U+00A0 (' ') NO-Break-Space
186
-
187
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
188
-
189
- 29| 29 | <eref bibitemid="_" citeas="ISO 639-2" id="_" type="inline">
190
- 30| 30 | </eref>
191
- 31| 31 | <semx element="eref" source="_">
192
- 32| - | <fmt-eref░bibitemid="_"░citeas="ISO░639-2"░type="inline">ISO\u00a0639-2</fmt-eref>
193
- | 32+ | <fmt-eref░bibitemid="_"░citeas="ISO░639-2"░type="inline">ISO␣639-2</fmt-eref>
194
- 33| 33 | </semx>
195
- 34| 34 | </p>
196
- 35| 35 | </clause>
197
- ----
198
- ====
199
-
200
- ==== Grouping lines
201
-
202
- `diff_grouping_lines: <number>` default: `0`
203
-
204
- Specifies the number of grouping lines to coalesce nearby diff blocks into a
205
- single context.
206
-
207
- The algorithm groups diff blocks as long as the distance between the previous
208
- block's end and the next block's start is less than or equal to the grouping
209
- lines setting.
210
-
211
- The default value of `0` means that each diff block is treated as its own
212
- context.
213
-
214
- When set to `5`, for example, any two diff blocks that are within 5 lines of
215
- each other will be grouped together into a single context.
216
-
217
- .Example of XML line-by-line diff with grouping lines set to 10
218
- [example]
219
- ====
220
- Here, multiple diff blocks are grouped together into a single context because
221
- they are within 10 lines of each other.
222
-
223
- [source]
224
- ----
225
- Line-by-line diff (XML mode):
226
- Character Visualization Legend:
227
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
228
- Whitespace:
229
- '␣': U+00A0 (' ') NO-Break-Space
230
-
231
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
232
-
233
- 22| 22 | <span class="fmt-element-name">Figure</span>
234
- 23| 23 | <semx element="autonum" source="B1">1</semx>
235
- 24| 24 | </span>
236
- 25| - | <span░class="fmt-caption-delim">\u00a0—░</span>
237
- | 25+ | <span░class="fmt-caption-delim">␣—░</span>
238
- 26| 26 | <semx element="name" source="_">First</semx>
239
- 27| 27 | </fmt-name>
240
- 28| 28 | <fmt-xref-label>
241
-
242
- 57| 57 | <span class="fmt-element-name">Figure</span>
243
- 58| 58 | <semx element="autonum" source="B2">2</semx>
244
- 59| 59 | </span>
245
- 60| - | <span░class="fmt-caption-delim">\u00a0—░</span>
246
- | 60+ | <span░class="fmt-caption-delim">␣—░</span>
247
- 61| 61 | <semx element="name" source="_">Second</semx>
248
- 62| 62 | </fmt-name>
249
- 63| 63 | <fmt-xref-label>
250
-
251
- 100| 100 | <span class="fmt-element-name">Figure</span>
252
- 101| 101 | <semx element="autonum" source="B3">3</semx>
253
- 102| 102 | </span>
254
- 103| - | <span░class="fmt-caption-delim">\u00a0—░</span>
255
- | 103+ | <span░class="fmt-caption-delim">␣—░</span>
256
- 104| 104 | <semx element="name" source="_">Third</semx>
257
- 105| 105 | </fmt-name>
258
- 106| 106 | <fmt-xref-label>
259
- ----
260
- ====
261
-