canon 0.1.7 → 0.1.8

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +25 -135
  3. data/README.adoc +13 -13
  4. data/docs/.lycheeignore +69 -0
  5. data/docs/advanced/extending-canon.adoc +193 -0
  6. data/docs/internals/diffnode-enrichment.adoc +611 -0
  7. data/docs/internals/index.adoc +251 -0
  8. data/docs/lychee.toml +13 -6
  9. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +250 -0
  10. data/docs/understanding/architecture.adoc +749 -33
  11. data/docs/understanding/comparison-pipeline.adoc +122 -0
  12. data/false_positive_analysis.txt +0 -0
  13. data/file1.html +1 -0
  14. data/file2.html +1 -0
  15. data/lib/canon/cache.rb +129 -0
  16. data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
  17. data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
  18. data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
  19. data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
  20. data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
  21. data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
  22. data/lib/canon/comparison/dimensions/registry.rb +77 -0
  23. data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
  24. data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
  25. data/lib/canon/comparison/dimensions.rb +54 -0
  26. data/lib/canon/comparison/format_detector.rb +86 -0
  27. data/lib/canon/comparison/html_comparator.rb +51 -18
  28. data/lib/canon/comparison/html_parser.rb +80 -0
  29. data/lib/canon/comparison/json_comparator.rb +12 -0
  30. data/lib/canon/comparison/json_parser.rb +19 -0
  31. data/lib/canon/comparison/markup_comparator.rb +293 -0
  32. data/lib/canon/comparison/match_options/base_resolver.rb +143 -0
  33. data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
  34. data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
  35. data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
  36. data/lib/canon/comparison/match_options.rb +68 -463
  37. data/lib/canon/comparison/profile_definition.rb +149 -0
  38. data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
  39. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
  40. data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
  41. data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
  42. data/lib/canon/comparison/xml_comparator/child_comparison.rb +189 -0
  43. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
  44. data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
  45. data/lib/canon/comparison/xml_comparator/node_parser.rb +74 -0
  46. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +95 -0
  47. data/lib/canon/comparison/xml_comparator.rb +52 -664
  48. data/lib/canon/comparison/xml_node_comparison.rb +297 -0
  49. data/lib/canon/comparison/xml_parser.rb +19 -0
  50. data/lib/canon/comparison/yaml_comparator.rb +3 -3
  51. data/lib/canon/comparison.rb +265 -110
  52. data/lib/canon/diff/diff_node.rb +32 -2
  53. data/lib/canon/diff/node_serializer.rb +191 -0
  54. data/lib/canon/diff/path_builder.rb +143 -0
  55. data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
  56. data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
  57. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
  58. data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
  59. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
  60. data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
  61. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
  62. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
  63. data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
  64. data/lib/canon/diff_formatter.rb +1 -1
  65. data/lib/canon/rspec_matchers.rb +1 -1
  66. data/lib/canon/tree_diff/operation_converter.rb +92 -338
  67. data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
  68. data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
  69. data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
  70. data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
  71. data/lib/canon/version.rb +1 -1
  72. data/old-docs/ADVANCED_TOPICS.adoc +20 -0
  73. data/old-docs/BASIC_USAGE.adoc +16 -0
  74. data/old-docs/CHARACTER_VISUALIZATION.adoc +567 -0
  75. data/old-docs/CLI.adoc +497 -0
  76. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
  77. data/old-docs/DIFF_ARCHITECTURE.adoc +435 -0
  78. data/old-docs/DIFF_FORMATTING.adoc +540 -0
  79. data/old-docs/DIFF_PARAMETERS.adoc +261 -0
  80. data/old-docs/DOM_DIFF.adoc +1017 -0
  81. data/old-docs/ENV_CONFIG.adoc +876 -0
  82. data/old-docs/FORMATS.adoc +867 -0
  83. data/old-docs/INPUT_VALIDATION.adoc +477 -0
  84. data/old-docs/MATCHER_BEHAVIOR.adoc +90 -0
  85. data/old-docs/MATCH_ARCHITECTURE.adoc +463 -0
  86. data/old-docs/MATCH_OPTIONS.adoc +912 -0
  87. data/old-docs/MODES.adoc +432 -0
  88. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
  89. data/old-docs/OPTIONS.adoc +1387 -0
  90. data/old-docs/PREPROCESSING.adoc +491 -0
  91. data/old-docs/README.old.adoc +2831 -0
  92. data/old-docs/RSPEC.adoc +814 -0
  93. data/old-docs/RUBY_API.adoc +485 -0
  94. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +646 -0
  95. data/old-docs/SEMANTIC_TREE_DIFF.adoc +765 -0
  96. data/old-docs/STRING_COMPARE.adoc +345 -0
  97. data/old-docs/TMP.adoc +3384 -0
  98. data/old-docs/TREE_DIFF.adoc +1080 -0
  99. data/old-docs/UNDERSTANDING_CANON.adoc +17 -0
  100. data/old-docs/VERBOSE.adoc +482 -0
  101. data/old-docs/VISUALIZATION_MAP.adoc +625 -0
  102. data/old-docs/WHITESPACE_TREATMENT.adoc +1155 -0
  103. data/scripts/analyze_current_state.rb +85 -0
  104. data/scripts/analyze_false_positives.rb +114 -0
  105. data/scripts/analyze_remaining_failures.rb +105 -0
  106. data/scripts/compare_current_failures.rb +95 -0
  107. data/scripts/compare_dom_tree_diff.rb +158 -0
  108. data/scripts/compare_failures.rb +151 -0
  109. data/scripts/debug_attribute_extraction.rb +66 -0
  110. data/scripts/debug_blocks_839.rb +115 -0
  111. data/scripts/debug_meta_matching.rb +52 -0
  112. data/scripts/debug_p_matching.rb +192 -0
  113. data/scripts/debug_signature_matching.rb +118 -0
  114. data/scripts/debug_sourcecode_124.rb +32 -0
  115. data/scripts/debug_whitespace_sensitive.rb +192 -0
  116. data/scripts/extract_false_positives.rb +138 -0
  117. data/scripts/find_actual_false_positives.rb +125 -0
  118. data/scripts/investigate_all_false_positives.rb +161 -0
  119. data/scripts/investigate_batch1.rb +127 -0
  120. data/scripts/investigate_classification.rb +150 -0
  121. data/scripts/investigate_classification_detailed.rb +190 -0
  122. data/scripts/investigate_common_failures.rb +342 -0
  123. data/scripts/investigate_false_negative.rb +80 -0
  124. data/scripts/investigate_false_positive.rb +83 -0
  125. data/scripts/investigate_false_positives.rb +227 -0
  126. data/scripts/investigate_false_positives_batch.rb +163 -0
  127. data/scripts/investigate_mixed_content.rb +125 -0
  128. data/scripts/investigate_remaining_16.rb +214 -0
  129. data/scripts/run_single_test.rb +29 -0
  130. data/scripts/test_all_false_positives.rb +95 -0
  131. data/scripts/test_attribute_details.rb +61 -0
  132. data/scripts/test_both_algorithms.rb +49 -0
  133. data/scripts/test_both_simple.rb +49 -0
  134. data/scripts/test_enhanced_semantic_output.rb +125 -0
  135. data/scripts/test_readme_examples.rb +131 -0
  136. data/scripts/test_semantic_tree_diff.rb +99 -0
  137. data/scripts/test_semantic_ux_improvements.rb +135 -0
  138. data/scripts/test_single_false_positive.rb +119 -0
  139. data/scripts/test_size_limits.rb +99 -0
  140. data/test_html_1.html +21 -0
  141. data/test_html_2.html +21 -0
  142. data/test_nokogiri.rb +33 -0
  143. data/test_normalize.rb +45 -0
  144. metadata +123 -2
data/test_normalize.rb ADDED
@@ -0,0 +1,45 @@
1
+ require "bundler/setup"
2
+ require "canon/html/data_model"
3
+
4
+ html1 = "<html><body><p>Test</p></body></html>"
5
+ html2 = "<html>\n\n<body>\n\n<p>Test</p>\n\n</body>\n\n</html>"
6
+
7
+ # Parse both without preprocessing
8
+ node1 = Canon::Html::DataModel.from_html(html1)
9
+ node2 = Canon::Html::DataModel.from_html(html2)
10
+
11
+ puts "=== Without preprocessing ==="
12
+ puts "node1 root children count: #{node1.children.count}"
13
+ node1.children.each_with_index do |child, i|
14
+ puts " Child #{i}: #{child.class}"
15
+ if child.is_a?(Canon::Xml::Nodes::ElementNode)
16
+ puts " Name: #{child.name}"
17
+ puts " Children count: #{child.children.count}"
18
+ end
19
+ end
20
+
21
+ puts "\nnode2 root children count: #{node2.children.count}"
22
+ node2.children.each_with_index do |child, i|
23
+ puts " Child #{i}: #{child.class}"
24
+ if child.is_a?(Canon::Xml::Nodes::ElementNode)
25
+ puts " Name: #{child.name}"
26
+ puts " Children count: #{child.children.count}"
27
+ end
28
+ end
29
+
30
+ # Now with normalize preprocessing
31
+ html2_norm = html2.lines.map(&:strip).reject(&:empty?).join("\n")
32
+ node2_norm = Canon::Html::DataModel.from_html(html2_norm)
33
+
34
+ puts "\n=== With :normalize preprocessing ==="
35
+ puts "html2_norm:"
36
+ puts html2_norm
37
+ puts ""
38
+ puts "node2_norm root children count: #{node2_norm.children.count}"
39
+ node2_norm.children.each_with_index do |child, i|
40
+ puts " Child #{i}: #{child.class}"
41
+ if child.is_a?(Canon::Xml::Nodes::ElementNode)
42
+ puts " Name: #{child.name}"
43
+ puts " Children count: #{child.children.count}"
44
+ end
45
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: canon
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-12-24 00:00:00.000000000 Z
11
+ date: 2026-01-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: diff-lcs
@@ -139,11 +139,13 @@ files:
139
139
  - CODE_OF_CONDUCT.md
140
140
  - README.adoc
141
141
  - Rakefile
142
+ - docs/.lycheeignore
142
143
  - docs/Gemfile
143
144
  - docs/INDEX.adoc
144
145
  - docs/_config.yml
145
146
  - docs/advanced/diff-classification.adoc
146
147
  - docs/advanced/diff-pipeline.adoc
148
+ - docs/advanced/extending-canon.adoc
147
149
  - docs/advanced/index.adoc
148
150
  - docs/advanced/semantic-diff-report.adoc
149
151
  - docs/advanced/verbose-mode-architecture.adoc
@@ -169,7 +171,10 @@ files:
169
171
  - docs/interfaces/index.adoc
170
172
  - docs/interfaces/rspec/index.adoc
171
173
  - docs/interfaces/ruby-api/index.adoc
174
+ - docs/internals/diffnode-enrichment.adoc
175
+ - docs/internals/index.adoc
172
176
  - docs/lychee.toml
177
+ - docs/plans/2025-01-17-html-parser-selection-fix.adoc
173
178
  - docs/reference/cli-options.adoc
174
179
  - docs/reference/environment-variables.adoc
175
180
  - docs/reference/index.adoc
@@ -186,7 +191,11 @@ files:
186
191
  - docs/understanding/formats/yaml.adoc
187
192
  - docs/understanding/index.adoc
188
193
  - exe/canon
194
+ - false_positive_analysis.txt
195
+ - file1.html
196
+ - file2.html
189
197
  - lib/canon.rb
198
+ - lib/canon/cache.rb
190
199
  - lib/canon/cli.rb
191
200
  - lib/canon/commands/diff_command.rb
192
201
  - lib/canon/commands/format_command.rb
@@ -194,14 +203,43 @@ files:
194
203
  - lib/canon/comparison/base_comparator.rb
195
204
  - lib/canon/comparison/compare_profile.rb
196
205
  - lib/canon/comparison/comparison_result.rb
206
+ - lib/canon/comparison/dimensions.rb
207
+ - lib/canon/comparison/dimensions/attribute_order_dimension.rb
208
+ - lib/canon/comparison/dimensions/attribute_presence_dimension.rb
209
+ - lib/canon/comparison/dimensions/attribute_values_dimension.rb
210
+ - lib/canon/comparison/dimensions/base_dimension.rb
211
+ - lib/canon/comparison/dimensions/comments_dimension.rb
212
+ - lib/canon/comparison/dimensions/element_position_dimension.rb
213
+ - lib/canon/comparison/dimensions/registry.rb
214
+ - lib/canon/comparison/dimensions/structural_whitespace_dimension.rb
215
+ - lib/canon/comparison/dimensions/text_content_dimension.rb
216
+ - lib/canon/comparison/format_detector.rb
197
217
  - lib/canon/comparison/html_comparator.rb
198
218
  - lib/canon/comparison/html_compare_profile.rb
219
+ - lib/canon/comparison/html_parser.rb
199
220
  - lib/canon/comparison/json_comparator.rb
221
+ - lib/canon/comparison/json_parser.rb
222
+ - lib/canon/comparison/markup_comparator.rb
200
223
  - lib/canon/comparison/match_options.rb
224
+ - lib/canon/comparison/match_options/base_resolver.rb
225
+ - lib/canon/comparison/match_options/json_resolver.rb
226
+ - lib/canon/comparison/match_options/xml_resolver.rb
227
+ - lib/canon/comparison/match_options/yaml_resolver.rb
228
+ - lib/canon/comparison/profile_definition.rb
229
+ - lib/canon/comparison/ruby_object_comparator.rb
201
230
  - lib/canon/comparison/strategies/base_match_strategy.rb
202
231
  - lib/canon/comparison/strategies/match_strategy_factory.rb
203
232
  - lib/canon/comparison/strategies/semantic_tree_match_strategy.rb
204
233
  - lib/canon/comparison/xml_comparator.rb
234
+ - lib/canon/comparison/xml_comparator/attribute_comparator.rb
235
+ - lib/canon/comparison/xml_comparator/attribute_filter.rb
236
+ - lib/canon/comparison/xml_comparator/child_comparison.rb
237
+ - lib/canon/comparison/xml_comparator/diff_node_builder.rb
238
+ - lib/canon/comparison/xml_comparator/namespace_comparator.rb
239
+ - lib/canon/comparison/xml_comparator/node_parser.rb
240
+ - lib/canon/comparison/xml_comparator/node_type_comparator.rb
241
+ - lib/canon/comparison/xml_node_comparison.rb
242
+ - lib/canon/comparison/xml_parser.rb
205
243
  - lib/canon/comparison/yaml_comparator.rb
206
244
  - lib/canon/config.rb
207
245
  - lib/canon/config/env_provider.rb
@@ -220,6 +258,8 @@ files:
220
258
  - lib/canon/diff/diff_report.rb
221
259
  - lib/canon/diff/diff_report_builder.rb
222
260
  - lib/canon/diff/formatting_detector.rb
261
+ - lib/canon/diff/node_serializer.rb
262
+ - lib/canon/diff/path_builder.rb
223
263
  - lib/canon/diff_formatter.rb
224
264
  - lib/canon/diff_formatter/by_line/base_formatter.rb
225
265
  - lib/canon/diff_formatter/by_line/html_formatter.rb
@@ -234,6 +274,11 @@ files:
234
274
  - lib/canon/diff_formatter/character_map.yml
235
275
  - lib/canon/diff_formatter/debug_output.rb
236
276
  - lib/canon/diff_formatter/diff_detail_formatter.rb
277
+ - lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb
278
+ - lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb
279
+ - lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb
280
+ - lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb
281
+ - lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb
237
282
  - lib/canon/diff_formatter/legend.rb
238
283
  - lib/canon/errors.rb
239
284
  - lib/canon/formatters/html4_formatter.rb
@@ -266,6 +311,10 @@ files:
266
311
  - lib/canon/tree_diff/matchers/structural_propagator.rb
267
312
  - lib/canon/tree_diff/matchers/universal_matcher.rb
268
313
  - lib/canon/tree_diff/operation_converter.rb
314
+ - lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb
315
+ - lib/canon/tree_diff/operation_converter_helpers/post_processor.rb
316
+ - lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb
317
+ - lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb
269
318
  - lib/canon/tree_diff/operations/operation.rb
270
319
  - lib/canon/tree_diff/operations/operation_detector.rb
271
320
  - lib/canon/tree_diff/tree_diff_integrator.rb
@@ -295,7 +344,79 @@ files:
295
344
  - lib/canon/xml/whitespace_normalizer.rb
296
345
  - lib/canon/xml/xml_base_handler.rb
297
346
  - lib/xml-c14n.rb
347
+ - old-docs/ADVANCED_TOPICS.adoc
348
+ - old-docs/BASIC_USAGE.adoc
349
+ - old-docs/CHARACTER_VISUALIZATION.adoc
350
+ - old-docs/CLI.adoc
351
+ - old-docs/CUSTOMIZING_BEHAVIOR.adoc
352
+ - old-docs/DIFF_ARCHITECTURE.adoc
353
+ - old-docs/DIFF_FORMATTING.adoc
354
+ - old-docs/DIFF_PARAMETERS.adoc
355
+ - old-docs/DOM_DIFF.adoc
356
+ - old-docs/ENV_CONFIG.adoc
357
+ - old-docs/FORMATS.adoc
358
+ - old-docs/INPUT_VALIDATION.adoc
359
+ - old-docs/MATCHER_BEHAVIOR.adoc
360
+ - old-docs/MATCH_ARCHITECTURE.adoc
361
+ - old-docs/MATCH_OPTIONS.adoc
362
+ - old-docs/MODES.adoc
363
+ - old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc
364
+ - old-docs/OPTIONS.adoc
365
+ - old-docs/PREPROCESSING.adoc
366
+ - old-docs/README.old.adoc
367
+ - old-docs/RSPEC.adoc
368
+ - old-docs/RUBY_API.adoc
369
+ - old-docs/SEMANTIC_DIFF_REPORT.adoc
370
+ - old-docs/SEMANTIC_TREE_DIFF.adoc
371
+ - old-docs/STRING_COMPARE.adoc
372
+ - old-docs/TMP.adoc
373
+ - old-docs/TREE_DIFF.adoc
374
+ - old-docs/UNDERSTANDING_CANON.adoc
375
+ - old-docs/VERBOSE.adoc
376
+ - old-docs/VISUALIZATION_MAP.adoc
377
+ - old-docs/WHITESPACE_TREATMENT.adoc
378
+ - scripts/analyze_current_state.rb
379
+ - scripts/analyze_false_positives.rb
380
+ - scripts/analyze_remaining_failures.rb
381
+ - scripts/compare_current_failures.rb
382
+ - scripts/compare_dom_tree_diff.rb
383
+ - scripts/compare_failures.rb
384
+ - scripts/debug_attribute_extraction.rb
385
+ - scripts/debug_blocks_839.rb
386
+ - scripts/debug_meta_matching.rb
387
+ - scripts/debug_p_matching.rb
388
+ - scripts/debug_signature_matching.rb
389
+ - scripts/debug_sourcecode_124.rb
390
+ - scripts/debug_whitespace_sensitive.rb
391
+ - scripts/extract_false_positives.rb
392
+ - scripts/find_actual_false_positives.rb
393
+ - scripts/investigate_all_false_positives.rb
394
+ - scripts/investigate_batch1.rb
395
+ - scripts/investigate_classification.rb
396
+ - scripts/investigate_classification_detailed.rb
397
+ - scripts/investigate_common_failures.rb
398
+ - scripts/investigate_false_negative.rb
399
+ - scripts/investigate_false_positive.rb
400
+ - scripts/investigate_false_positives.rb
401
+ - scripts/investigate_false_positives_batch.rb
402
+ - scripts/investigate_mixed_content.rb
403
+ - scripts/investigate_remaining_16.rb
404
+ - scripts/run_single_test.rb
405
+ - scripts/test_all_false_positives.rb
406
+ - scripts/test_attribute_details.rb
407
+ - scripts/test_both_algorithms.rb
408
+ - scripts/test_both_simple.rb
409
+ - scripts/test_enhanced_semantic_output.rb
410
+ - scripts/test_readme_examples.rb
411
+ - scripts/test_semantic_tree_diff.rb
412
+ - scripts/test_semantic_ux_improvements.rb
413
+ - scripts/test_single_false_positive.rb
414
+ - scripts/test_size_limits.rb
298
415
  - sig/xml/c14n.rbs
416
+ - test_html_1.html
417
+ - test_html_2.html
418
+ - test_nokogiri.rb
419
+ - test_normalize.rb
299
420
  homepage: https://github.com/lutaml/canon
300
421
  licenses:
301
422
  - BSD-2-Clause