canon 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +112 -25
  3. data/docs/Gemfile +1 -0
  4. data/docs/_config.yml +90 -1
  5. data/docs/advanced/diff-classification.adoc +82 -2
  6. data/docs/features/match-options/index.adoc +239 -1
  7. data/lib/canon/comparison/format_detector.rb +2 -1
  8. data/lib/canon/comparison/html_comparator.rb +19 -8
  9. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  10. data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
  11. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  12. data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
  13. data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
  14. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
  15. data/lib/canon/comparison/xml_comparator.rb +48 -23
  16. data/lib/canon/comparison/xml_node_comparison.rb +25 -3
  17. data/lib/canon/diff/diff_classifier.rb +101 -2
  18. data/lib/canon/diff/formatting_detector.rb +1 -1
  19. data/lib/canon/rspec_matchers.rb +37 -8
  20. data/lib/canon/version.rb +1 -1
  21. data/lib/canon/xml/data_model.rb +24 -13
  22. metadata +3 -78
  23. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
  24. data/false_positive_analysis.txt +0 -0
  25. data/file1.html +0 -1
  26. data/file2.html +0 -1
  27. data/old-docs/ADVANCED_TOPICS.adoc +0 -20
  28. data/old-docs/BASIC_USAGE.adoc +0 -16
  29. data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
  30. data/old-docs/CLI.adoc +0 -497
  31. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  32. data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
  33. data/old-docs/DIFF_FORMATTING.adoc +0 -540
  34. data/old-docs/DIFF_PARAMETERS.adoc +0 -261
  35. data/old-docs/DOM_DIFF.adoc +0 -1017
  36. data/old-docs/ENV_CONFIG.adoc +0 -876
  37. data/old-docs/FORMATS.adoc +0 -867
  38. data/old-docs/INPUT_VALIDATION.adoc +0 -477
  39. data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
  40. data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
  41. data/old-docs/MATCH_OPTIONS.adoc +0 -912
  42. data/old-docs/MODES.adoc +0 -432
  43. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  44. data/old-docs/OPTIONS.adoc +0 -1387
  45. data/old-docs/PREPROCESSING.adoc +0 -491
  46. data/old-docs/README.old.adoc +0 -2831
  47. data/old-docs/RSPEC.adoc +0 -814
  48. data/old-docs/RUBY_API.adoc +0 -485
  49. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
  50. data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
  51. data/old-docs/STRING_COMPARE.adoc +0 -345
  52. data/old-docs/TMP.adoc +0 -3384
  53. data/old-docs/TREE_DIFF.adoc +0 -1080
  54. data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
  55. data/old-docs/VERBOSE.adoc +0 -482
  56. data/old-docs/VISUALIZATION_MAP.adoc +0 -625
  57. data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
  58. data/scripts/analyze_current_state.rb +0 -85
  59. data/scripts/analyze_false_positives.rb +0 -114
  60. data/scripts/analyze_remaining_failures.rb +0 -105
  61. data/scripts/compare_current_failures.rb +0 -95
  62. data/scripts/compare_dom_tree_diff.rb +0 -158
  63. data/scripts/compare_failures.rb +0 -151
  64. data/scripts/debug_attribute_extraction.rb +0 -66
  65. data/scripts/debug_blocks_839.rb +0 -115
  66. data/scripts/debug_meta_matching.rb +0 -52
  67. data/scripts/debug_p_matching.rb +0 -192
  68. data/scripts/debug_signature_matching.rb +0 -118
  69. data/scripts/debug_sourcecode_124.rb +0 -32
  70. data/scripts/debug_whitespace_sensitive.rb +0 -192
  71. data/scripts/extract_false_positives.rb +0 -138
  72. data/scripts/find_actual_false_positives.rb +0 -125
  73. data/scripts/investigate_all_false_positives.rb +0 -161
  74. data/scripts/investigate_batch1.rb +0 -127
  75. data/scripts/investigate_classification.rb +0 -150
  76. data/scripts/investigate_classification_detailed.rb +0 -190
  77. data/scripts/investigate_common_failures.rb +0 -342
  78. data/scripts/investigate_false_negative.rb +0 -80
  79. data/scripts/investigate_false_positive.rb +0 -83
  80. data/scripts/investigate_false_positives.rb +0 -227
  81. data/scripts/investigate_false_positives_batch.rb +0 -163
  82. data/scripts/investigate_mixed_content.rb +0 -125
  83. data/scripts/investigate_remaining_16.rb +0 -214
  84. data/scripts/run_single_test.rb +0 -29
  85. data/scripts/test_all_false_positives.rb +0 -95
  86. data/scripts/test_attribute_details.rb +0 -61
  87. data/scripts/test_both_algorithms.rb +0 -49
  88. data/scripts/test_both_simple.rb +0 -49
  89. data/scripts/test_enhanced_semantic_output.rb +0 -125
  90. data/scripts/test_readme_examples.rb +0 -131
  91. data/scripts/test_semantic_tree_diff.rb +0 -99
  92. data/scripts/test_semantic_ux_improvements.rb +0 -135
  93. data/scripts/test_single_false_positive.rb +0 -119
  94. data/scripts/test_size_limits.rb +0 -99
  95. data/test_html_1.html +0 -21
  96. data/test_html_2.html +0 -21
  97. data/test_nokogiri.rb +0 -33
  98. data/test_normalize.rb +0 -45
@@ -1,151 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- # Compare semantic and DOM algorithm failures to identify false positives/negatives
5
- # Usage: ruby scripts/compare_failures.rb /tmp/semantic_current.txt DOM_DIFF_RESULTS.md
6
-
7
- require "set"
8
-
9
- def parse_semantic_failures(file)
10
- failures = []
11
- File.readlines(file).each do |line|
12
- # Format: "rspec ./spec/isodoc/blocks_notes_spec.rb:494 # ..."
13
- if line =~ %r{rspec \./spec/isodoc/([a-z_]+_spec\.rb):(\d+)}
14
- failures << { file: $1, line: $2.to_i }
15
- end
16
- end
17
- failures
18
- end
19
-
20
- def parse_dom_failures(file)
21
- failures = []
22
- in_failures = false
23
-
24
- File.readlines(file).each do |line|
25
- if line.include?("Failed examples:")
26
- in_failures = true
27
- next
28
- end
29
-
30
- next unless in_failures
31
-
32
- # Stop at Coverage report
33
- break if line.include?("Coverage report")
34
-
35
- # Format: "rspec ./spec/isodoc/blocks_notes_spec.rb:494"
36
- if line =~ %r{rspec \./spec/isodoc/([a-z_]+_spec\.rb):(\d+)}
37
- failures << { file: $1, line: $2.to_i }
38
- end
39
- end
40
-
41
- failures
42
- end
43
-
44
- def categorize_failures(semantic, dom)
45
- semantic_set = Set.new(semantic.map { |f| "#{f[:file]}:#{f[:line]}" })
46
- dom_set = Set.new(dom.map { |f| "#{f[:file]}:#{f[:line]}" })
47
-
48
- {
49
- false_positives: semantic_set - dom_set, # Semantic fails, DOM passes
50
- false_negatives: dom_set - semantic_set, # DOM fails, Semantic passes
51
- common: semantic_set & dom_set, # Both fail (real failures)
52
- }
53
- end
54
-
55
- def group_by_spec(failures)
56
- failures.group_by { |f| f.split(":").first }.transform_values(&:count)
57
- end
58
-
59
- def main
60
- semantic_file = ARGV[0] || "/tmp/semantic_current.txt"
61
- dom_file = ARGV[1] || "DOM_DIFF_RESULTS.md"
62
-
63
- puts "Parsing semantic failures from: #{semantic_file}"
64
- semantic = parse_semantic_failures(semantic_file)
65
-
66
- puts "Parsing DOM failures from: #{dom_file}"
67
- dom = parse_dom_failures(dom_file)
68
-
69
- puts "\n#{'=' * 80}"
70
- puts "FAILURE COMPARISON SUMMARY"
71
- puts "=" * 80
72
-
73
- puts "\nTotal failures:"
74
- puts " Semantic: #{semantic.size}"
75
- puts " DOM: #{dom.size}"
76
-
77
- categories = categorize_failures(semantic, dom)
78
-
79
- puts "\n#{'-' * 80}"
80
- puts "FALSE POSITIVES (Semantic fails, DOM passes) - #{categories[:false_positives].size}"
81
- puts "-" * 80
82
- puts "\nBy spec file:"
83
- group_by_spec(categories[:false_positives].to_a).sort_by do |_, v|
84
- -v
85
- end.each do |file, count|
86
- puts " #{file}: #{count}"
87
- end
88
-
89
- puts "\nDetailed list:"
90
- categories[:false_positives].sort.each do |failure|
91
- puts " #{failure}"
92
- end
93
-
94
- puts "\n#{'-' * 80}"
95
- puts "FALSE NEGATIVES (DOM fails, Semantic passes) - #{categories[:false_negatives].size}"
96
- puts "-" * 80
97
- puts "\nBy spec file:"
98
- group_by_spec(categories[:false_negatives].to_a).sort_by do |_, v|
99
- -v
100
- end.each do |file, count|
101
- puts " #{file}: #{count}"
102
- end
103
-
104
- puts "\nDetailed list:"
105
- categories[:false_negatives].sort.each do |failure|
106
- puts " #{failure}"
107
- end
108
-
109
- puts "\n#{'-' * 80}"
110
- puts "COMMON FAILURES (Both algorithms fail) - #{categories[:common].size}"
111
- puts "-" * 80
112
- puts "\nBy spec file:"
113
- group_by_spec(categories[:common].to_a).sort_by do |_, v|
114
- -v
115
- end.each do |file, count|
116
- puts " #{file}: #{count}"
117
- end
118
-
119
- puts "\n#{'=' * 80}"
120
- puts "NEXT STEPS"
121
- puts "=" * 80
122
- puts "\n1. Fix false positives (#{categories[:false_positives].size} tests):"
123
- puts " - These are cases where semantic is too strict"
124
- puts " - DOM passes but semantic fails"
125
- puts " - Fix these to reduce semantic failures"
126
-
127
- puts "\n2. Fix false negatives (#{categories[:false_negatives].size} tests):"
128
- puts " - These are cases where semantic is too lenient"
129
- puts " - Semantic passes but DOM fails"
130
- puts " - Fix these to maintain correctness"
131
-
132
- puts "\n3. Common failures (#{categories[:common].size} tests):"
133
- puts " - These are real test failures in both algorithms"
134
- puts " - Will remain after parity is achieved"
135
- puts " - May indicate actual test/code issues"
136
-
137
- # Save detailed results
138
- output_file = "/tmp/failure_comparison.txt"
139
- File.open(output_file, "w") do |f|
140
- f.puts "FALSE POSITIVES (#{categories[:false_positives].size}):"
141
- categories[:false_positives].sort.each { |fp| f.puts fp }
142
- f.puts "\nFALSE NEGATIVES (#{categories[:false_negatives].size}):"
143
- categories[:false_negatives].sort.each { |fn| f.puts fn }
144
- f.puts "\nCOMMON FAILURES (#{categories[:common].size}):"
145
- categories[:common].sort.each { |cf| f.puts cf }
146
- end
147
-
148
- puts "\nDetailed results saved to: #{output_file}"
149
- end
150
-
151
- main if __FILE__ == $PROGRAM_NAME
@@ -1,66 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- require_relative "../lib/canon"
5
- require_relative "../lib/canon/diff_formatter"
6
- require_relative "../lib/canon/diff_formatter/diff_detail_formatter"
7
-
8
- # Test attribute values formatting
9
- html1 = '<table id="T1" class="MsoNormalTable" border="1"></table>'
10
- html2 = '<table id="T2" class="MsoNormalTable" border="2"></table>'
11
-
12
- result = Canon::Comparison.equivalent?(
13
- html1,
14
- html2,
15
- match_algorithm: :semantic_tree,
16
- ignore_attr_order: true,
17
- verbose: true,
18
- )
19
-
20
- puts "Number of differences: #{result.differences.length}"
21
- puts
22
-
23
- result.differences.each_with_index do |diff, i|
24
- puts "=" * 70
25
- puts "Difference ##{i + 1}"
26
- puts "=" * 70
27
- puts "Class: #{diff.class}"
28
- puts "Dimension: #{diff.dimension if diff.respond_to?(:dimension)}"
29
-
30
- if diff.respond_to?(:node1) && diff.respond_to?(:node2)
31
- node1 = diff.node1
32
- node2 = diff.node2
33
-
34
- puts "\nNode1:"
35
- puts " Class: #{node1.class}"
36
- puts " Name: #{node1.name if node1.respond_to?(:name)}"
37
- if node1.respond_to?(:attributes)
38
- puts " Attributes: #{node1.attributes.inspect}"
39
- puts " Attributes class: #{node1.attributes.class}"
40
- puts " Attributes keys: #{node1.attributes.keys.inspect}"
41
- node1.attributes.each do |key, val|
42
- puts " #{key.inspect} (#{key.class}) => #{val.inspect} (#{val.class})"
43
- if val.respond_to?(:value)
44
- puts " val.value = #{val.value.inspect}"
45
- end
46
- end
47
- end
48
-
49
- puts "\nNode2:"
50
- puts " Class: #{node2.class}"
51
- puts " Name: #{node2.name if node2.respond_to?(:name)}"
52
- if node2.respond_to?(:attributes)
53
- puts " Attributes: #{node2.attributes.inspect}"
54
- puts " Attributes class: #{node2.attributes.class}"
55
- puts " Attributes keys: #{node2.attributes.keys.inspect}"
56
- node2.attributes.each do |key, val|
57
- puts " #{key.inspect} (#{key.class}) => #{val.inspect} (#{val.class})"
58
- if val.respond_to?(:value)
59
- puts " val.value = #{val.value.inspect}"
60
- end
61
- end
62
- end
63
- end
64
-
65
- puts
66
- end
@@ -1,115 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # Debug script for blocks_spec.rb:839
3
-
4
- require "bundler/setup"
5
- require "nokogiri"
6
-
7
- HTML_HDR = <<~HEADER.freeze
8
- <html lang="en">
9
- <head/>
10
- <body lang="en">
11
- <div class="title-section">
12
- <p>\u00a0</p>
13
- </div>
14
- <br/>
15
- <div class="prefatory-section">
16
- <p>\u00a0</p>
17
- </div>
18
- <br/>
19
- <div class="main-section">
20
- <br/>
21
- <div class="TOC" id="_">
22
- <h1 class="IntroTitle">Table of contents</h1>
23
- </div>
24
- HEADER
25
-
26
- WORD_HDR = <<~HEADER.freeze
27
- <html xmlns:epub="http://www.idpf.org/2007/ops" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" lang="en">
28
- <head>
29
- <style>
30
- <!--
31
- -->
32
- </style>
33
- </head>
34
- <body lang="EN-US" link="blue" vlink="#954F72">
35
- <div class="WordSection1">
36
- <p>\u00a0</p>
37
- </div>
38
- HEADER
39
-
40
- html5_doc = <<~HTML
41
- #{HTML_HDR}
42
- <br/>
43
- <div id="_">
44
- <h1 class="ForewordTitle">Foreword</h1>
45
- <p id="_" style="text-align:left;">Test</p>
46
- </div>
47
- </div>
48
- </body>
49
- </html>
50
- HTML
51
-
52
- html4_doc = <<~HTML
53
- #{WORD_HDR}
54
- <p class="page-break">
55
- <br clear="all" style="mso-special-character:line-break;page-break-before:always"/>
56
- </p>
57
- <div class="TOC" id="_">
58
- <h1 class="IntroTitle">Table of contents</h1>
59
- </div>
60
- <p class="page-break">
61
- <br clear="all" style="mso-special-character:line-break;page-break-before:always"/>
62
- </p>
63
- <div id="_">
64
- <h1 class="ForewordTitle">Foreword</h1>
65
- <p id="_" align="left" style="text-align:left;">Test</p>
66
- </div>
67
- <p>\u00a0</p>
68
- </div>
69
- </body>
70
- </html>
71
- HTML
72
-
73
- puts "=" * 80
74
- puts "HTML5 PARSING"
75
- puts "=" * 80
76
-
77
- doc5 = Nokogiri::HTML5(html5_doc)
78
- head5 = doc5.at("//head")
79
- puts "HEAD element:"
80
- puts head5.to_html
81
- puts "\nCHILDREN:"
82
- head5.children.each_with_index do |child, i|
83
- puts " #{i}: #{child.name} - #{child.attributes.inspect}"
84
- end
85
-
86
- puts "\n#{'=' * 80}"
87
- puts "HTML4 PARSING"
88
- puts "=" * 80
89
-
90
- doc4 = Nokogiri::HTML4(html4_doc)
91
- head4 = doc4.at("//head")
92
- puts "HEAD element:"
93
- puts head4.to_html
94
- puts "\nCHILDREN:"
95
- head4.children.each_with_index do |child, i|
96
- puts " #{i}: #{child.name} - #{child.attributes.inspect}"
97
- end
98
-
99
- puts "\n#{'=' * 80}"
100
- puts "COMPARISON"
101
- puts "=" * 80
102
- puts "HTML5 head children: #{head5.children.size}"
103
- puts "HTML4 head children: #{head4.children.size}"
104
-
105
- meta5 = head5.xpath(".//meta")
106
- meta4 = head4.xpath(".//meta")
107
- puts "\nMETA elements:"
108
- puts "HTML5: #{meta5.size} meta elements"
109
- meta5.each_with_index do |m, i|
110
- puts " #{i}: #{m.to_html}"
111
- end
112
- puts "HTML4: #{meta4.size} meta elements"
113
- meta4.each_with_index do |m, i|
114
- puts " #{i}: #{m.to_html}"
115
- end
@@ -1,52 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- require "bundler/setup"
5
- require "canon"
6
-
7
- # Test case: Meta element with attributes should match
8
- expected = <<~HTML
9
- <html>
10
- <head>
11
- <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
12
- </head>
13
- <body>
14
- <p>Test</p>
15
- </body>
16
- </html>
17
- HTML
18
-
19
- actual = <<~HTML
20
- <html>
21
- <head>
22
- <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
23
- </head>
24
- <body>
25
- <p>Test</p>
26
- </body>
27
- </html>
28
- HTML
29
-
30
- puts "=" * 80
31
- puts "Testing Meta Element Matching"
32
- puts "=" * 80
33
-
34
- result = Canon::Comparison.equivalent?(expected, actual,
35
- format: :html4,
36
- diff_algorithm: :semantic,
37
- verbose: true)
38
-
39
- if result.is_a?(Canon::Comparison::ComparisonResult)
40
- puts "\nResult: #{result.equivalent? ? 'PASS ✅' : 'FAIL ❌'}"
41
- puts "Normative diffs: #{result.normative_differences.count}"
42
- puts "Total diffs: #{result.differences.count}"
43
-
44
- unless result.equivalent?
45
- puts "\nDifferences:"
46
- result.differences.each_with_index do |diff, i|
47
- puts "\n #{i + 1}. #{diff.inspect}"
48
- end
49
- end
50
- else
51
- puts "Result: #{result}"
52
- end
@@ -1,192 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- require "bundler/setup"
5
- require "canon"
6
- require "nokogiri"
7
-
8
- # Read the test files
9
- expected_file = "/Users/mulgogi/src/mn/isodoc/spec/fixtures/html/isodoc-section-names-expected.html"
10
- actual_file = "/Users/mulgogi/src/mn/isodoc/spec/fixtures/html/isodoc-section-names-actual.html"
11
-
12
- expected = File.read(expected_file)
13
- actual = File.read(actual_file)
14
-
15
- puts "=" * 80
16
- puts "ANALYZING <p> ELEMENT MATCHING"
17
- puts "=" * 80
18
-
19
- # Parse with Nokogiri to see what we have
20
- doc1 = Nokogiri::HTML4(expected)
21
- doc2 = Nokogiri::HTML4(actual)
22
-
23
- # Find all <p> elements
24
- p_elements1 = doc1.css("p")
25
- p_elements2 = doc2.css("p")
26
-
27
- puts "\nFile 1 has #{p_elements1.size} <p> elements"
28
- puts "File 2 has #{p_elements2.size} <p> elements"
29
-
30
- # Group by class attribute
31
- p_by_class1 = p_elements1.group_by { |p| p["class"] }
32
- p_by_class2 = p_elements2.group_by { |p| p["class"] }
33
-
34
- puts "\nFile 1 <p> elements by class:"
35
- p_by_class1.each do |klass, elements|
36
- puts " #{klass.inspect}: #{elements.size} elements"
37
- elements.first(3).each do |el|
38
- content = el.text.strip
39
- content = "#{content[0..50]}..." if content.length > 50
40
- puts " - #{content.inspect}"
41
- end
42
- end
43
-
44
- puts "\nFile 2 <p> elements by class:"
45
- p_by_class2.each do |klass, elements|
46
- puts " #{klass.inspect}: #{elements.size} elements"
47
- elements.first(3).each do |el|
48
- content = el.text.strip
49
- content = "#{content[0..50]}..." if content.length > 50
50
- puts " - #{content.inspect}"
51
- end
52
- end
53
-
54
- # Now run Canon's tree diff to see what happens
55
- puts "\n#{'=' * 80}"
56
- puts "RUNNING CANON TREE DIFF"
57
- puts "=" * 80
58
-
59
- require_relative "../lib/canon/tree_diff/adapters/html_adapter"
60
- require_relative "../lib/canon/tree_diff/matchers/hash_matcher"
61
- require_relative "../lib/canon/tree_diff/matchers/similarity_matcher"
62
- require_relative "../lib/canon/tree_diff/operations/operation_detector"
63
-
64
- # Create trees
65
- adapter = Canon::TreeDiff::Adapters::HtmlAdapter.new
66
- tree1 = adapter.parse(expected)
67
- tree2 = adapter.parse(actual)
68
-
69
- puts "\nTree 1 has #{tree1.descendants.size} total nodes"
70
- puts "Tree 2 has #{tree2.descendants.size} total nodes"
71
-
72
- # Find <p> nodes in tree
73
- p_nodes1 = tree1.descendants.select { |n| n.label == "p" }
74
- p_nodes2 = tree2.descendants.select { |n| n.label == "p" }
75
-
76
- puts "\nTree 1 has #{p_nodes1.size} <p> nodes"
77
- puts "Tree 2 has #{p_nodes2.size} <p> nodes"
78
-
79
- # Group by attributes
80
- p_by_attrs1 = p_nodes1.group_by(&:attributes)
81
- p_by_attrs2 = p_nodes2.group_by(&:attributes)
82
-
83
- puts "\nTree 1 <p> nodes by attributes:"
84
- p_by_attrs1.each do |attrs, nodes|
85
- puts " #{attrs.inspect}: #{nodes.size} nodes"
86
- end
87
-
88
- puts "\nTree 2 <p> nodes by attributes:"
89
- p_by_attrs2.each do |attrs, nodes|
90
- puts " #{attrs.inspect}: #{nodes.size} nodes"
91
- end
92
-
93
- # Look at signatures
94
- require_relative "../lib/canon/tree_diff/core/node_signature"
95
-
96
- puts "\n#{'=' * 80}"
97
- puts "ANALYZING SIGNATURES"
98
- puts "=" * 80
99
-
100
- # Get page-break <p> nodes
101
- page_break_p1 = p_nodes1.select { |n| n.attributes["class"] == "page-break" }
102
- page_break_p2 = p_nodes2.select { |n| n.attributes["class"] == "page-break" }
103
-
104
- puts "\nFile 1 has #{page_break_p1.size} <p class=\"page-break\"> nodes"
105
- puts "File 2 has #{page_break_p2.size} <p class=\"page-break\"> nodes"
106
-
107
- if page_break_p1.any?
108
- puts "\nFirst 3 signatures from File 1:"
109
- page_break_p1.first(3).each_with_index do |node, i|
110
- sig = Canon::TreeDiff::Core::NodeSignature.for(node)
111
- puts " #{i + 1}. #{sig}"
112
- puts " Children: #{node.children.size}"
113
- if node.children.any?
114
- node.children.each do |child|
115
- child_sig = Canon::TreeDiff::Core::NodeSignature.for(child)
116
- puts " - #{child.label}: #{child_sig}"
117
- end
118
- end
119
- end
120
- end
121
-
122
- if page_break_p2.any?
123
- puts "\nFirst 3 signatures from File 2:"
124
- page_break_p2.first(3).each_with_index do |node, i|
125
- sig = Canon::TreeDiff::Core::NodeSignature.for(node)
126
- puts " #{i + 1}. #{sig}"
127
- puts " Children: #{node.children.size}"
128
- if node.children.any?
129
- node.children.each do |child|
130
- child_sig = Canon::TreeDiff::Core::NodeSignature.for(child)
131
- puts " - #{child.label}: #{child_sig}"
132
- end
133
- end
134
- end
135
- end
136
-
137
- # Run hash matcher
138
- puts "\n#{'=' * 80}"
139
- puts "RUNNING HASH MATCHER"
140
- puts "=" * 80
141
-
142
- options = {
143
- attribute_order: :ignore,
144
- text_content: :normalize,
145
- }
146
-
147
- matcher = Canon::TreeDiff::Matchers::HashMatcher.new(tree1, tree2, options)
148
- matching = matcher.match
149
-
150
- puts "\nTotal matched pairs: #{matching.size}"
151
-
152
- # Check how many <p> nodes were matched
153
- matched_p1 = p_nodes1.count { |n| matching.matched1?(n) }
154
- matched_p2 = p_nodes2.count { |n| matching.matched2?(n) }
155
-
156
- puts "Matched <p> from tree1: #{matched_p1}/#{p_nodes1.size}"
157
- puts "Matched <p> from tree2: #{matched_p2}/#{p_nodes2.size}"
158
-
159
- matched_page_break_p1 = page_break_p1.count { |n| matching.matched1?(n) }
160
- matched_page_break_p2 = page_break_p2.count { |n| matching.matched2?(n) }
161
-
162
- puts "Matched <p class=\"page-break\"> from tree1: #{matched_page_break_p1}/#{page_break_p1.size}"
163
- puts "Matched <p class=\"page-break\"> from tree2: #{matched_page_break_p2}/#{page_break_p2.size}"
164
-
165
- # Check unmatched page-break <p> nodes
166
- unmatched_p1 = page_break_p1.reject { |n| matching.matched1?(n) }
167
- unmatched_p2 = page_break_p2.reject { |n| matching.matched2?(n) }
168
-
169
- puts "\nUnmatched <p class=\"page-break\"> from tree1: #{unmatched_p1.size}"
170
- puts "Unmatched <p class=\"page-break\"> from tree2: #{unmatched_p2.size}"
171
-
172
- if unmatched_p1.any?
173
- puts "\nFirst unmatched from tree1:"
174
- node = unmatched_p1.first
175
- puts " Path: #{node.xpath}"
176
- puts " Signature: #{Canon::TreeDiff::Core::NodeSignature.for(node)}"
177
- puts " Children: #{node.children.size}"
178
- node.children.each do |child|
179
- puts " - #{child.label}: value=#{child.value.inspect}, attrs=#{child.attributes.inspect}"
180
- end
181
- end
182
-
183
- if unmatched_p2.any?
184
- puts "\nFirst unmatched from tree2:"
185
- node = unmatched_p2.first
186
- puts " Path: #{node.xpath}"
187
- puts " Signature: #{Canon::TreeDiff::Core::NodeSignature.for(node)}"
188
- puts " Children: #{node.children.size}"
189
- node.children.each do |child|
190
- puts " - #{child.label}: value=#{child.value.inspect}, attrs=#{child.attributes.inspect}"
191
- end
192
- end
@@ -1,118 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- # Debug signature matching to understand why elements aren't matching
5
- # Usage: ruby scripts/debug_signature_matching.rb
6
-
7
- require_relative "../lib/canon"
8
- require "nokogiri"
9
-
10
- # Sample XML with semx elements that should match
11
- xml1 = <<~XML
12
- <p>
13
- <fmt-concept>
14
- <semx element="title" source="_">word</semx>
15
- </fmt-concept>
16
- </p>
17
- XML
18
-
19
- xml2 = <<~XML
20
- <p>
21
- <fmt-concept>
22
- <semx element="concept" source="_">word</semx>
23
- </fmt-concept>
24
- </p>
25
- XML
26
-
27
- puts "=" * 80
28
- puts "SIGNATURE MATCHING DEBUG"
29
- puts "=" * 80
30
-
31
- # Parse both
32
- doc1 = Nokogiri::XML(xml1)
33
- doc2 = Nokogiri::XML(xml2)
34
-
35
- # Create adapter
36
- adapter = Canon::TreeDiff::Adapters::XMLAdapter.new
37
-
38
- # Convert to tree
39
- tree1 = adapter.to_tree(doc1.root)
40
- tree2 = adapter.to_tree(doc2.root)
41
-
42
- puts "\nTree 1 structure:"
43
- def print_tree(node, indent = 0)
44
- prefix = " " * indent
45
- if node.text?
46
- puts "#{prefix}#text: #{node.value.inspect}"
47
- else
48
- attrs = node.attributes.empty? ? "" : " {#{node.attributes.inspect}}"
49
- puts "#{prefix}<#{node.label}>#{attrs}"
50
- node.children.each { |c| print_tree(c, indent + 1) }
51
- end
52
- end
53
-
54
- print_tree(tree1)
55
-
56
- puts "\nTree 2 structure:"
57
- print_tree(tree2)
58
-
59
- # Get semx nodes
60
- semx1 = tree1.descendants.find { |n| n.label == "semx" }
61
- semx2 = tree2.descendants.find { |n| n.label == "semx" }
62
-
63
- puts "\n#{'-' * 80}"
64
- puts "SEMX NODE COMPARISON"
65
- puts "-" * 80
66
-
67
- puts "\nSemx 1:"
68
- puts " Label: #{semx1.label}"
69
- puts " Value: #{semx1.value.inspect}"
70
- puts " Attributes: #{semx1.attributes.inspect}"
71
-
72
- puts "\nSemx 2:"
73
- puts " Label: #{semx2.label}"
74
- puts " Value: #{semx2.value.inspect}"
75
- puts " Attributes: #{semx2.attributes.inspect}"
76
-
77
- # Compute signatures
78
- sig1_strict = Canon::TreeDiff::Core::NodeSignature.for(semx1,
79
- include_attributes: true)
80
- sig2_strict = Canon::TreeDiff::Core::NodeSignature.for(semx2,
81
- include_attributes: true)
82
-
83
- sig1_loose = Canon::TreeDiff::Core::NodeSignature.for(semx1,
84
- include_attributes: false)
85
- sig2_loose = Canon::TreeDiff::Core::NodeSignature.for(semx2,
86
- include_attributes: false)
87
-
88
- puts "\n#{'-' * 80}"
89
- puts "SIGNATURE COMPARISON"
90
- puts "-" * 80
91
-
92
- puts "\nStrict signatures (with attributes):"
93
- puts " Semx 1: #{sig1_strict.signature_string}"
94
- puts " Semx 2: #{sig2_strict.signature_string}"
95
- puts " Match? #{sig1_strict == sig2_strict}"
96
-
97
- puts "\nLoose signatures (without attributes):"
98
- puts " Semx 1: #{sig1_loose.signature_string}"
99
- puts " Semx 2: #{sig2_loose.signature_string}"
100
- puts " Match? #{sig1_loose == sig2_loose}"
101
-
102
- puts "\n#{'-' * 80}"
103
- puts "ANALYSIS"
104
- puts "-" * 80
105
-
106
- if sig1_strict != sig2_strict
107
- puts "\n⚠️ ISSUE FOUND:"
108
- puts "Strict signatures don't match due to attribute differences!"
109
- puts "This prevents HashMatcher from considering these nodes as candidates."
110
- puts "\nDifference:"
111
- puts " File 1: element='title'"
112
- puts " File 2: element='concept'"
113
- puts "\nSOLUTION:"
114
- puts "HashMatcher should use LOOSE signatures (no attributes) to find candidates,"
115
- puts "then check attributes separately during matching."
116
- end
117
-
118
- puts "\n#{'=' * 80}"