canon 0.1.8 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +83 -22
- data/docs/Gemfile +1 -0
- data/docs/_config.yml +90 -1
- data/docs/advanced/diff-classification.adoc +196 -24
- data/docs/features/match-options/index.adoc +239 -1
- data/lib/canon/comparison/format_detector.rb +2 -1
- data/lib/canon/comparison/html_comparator.rb +19 -8
- data/lib/canon/comparison/html_compare_profile.rb +8 -2
- data/lib/canon/comparison/markup_comparator.rb +109 -2
- data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
- data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +108 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
- data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
- data/lib/canon/comparison/xml_comparator.rb +240 -23
- data/lib/canon/comparison/xml_node_comparison.rb +25 -3
- data/lib/canon/diff/diff_classifier.rb +119 -5
- data/lib/canon/diff/formatting_detector.rb +1 -1
- data/lib/canon/diff/xml_serialization_formatter.rb +153 -0
- data/lib/canon/rspec_matchers.rb +37 -8
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +24 -13
- metadata +4 -78
- data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
- data/false_positive_analysis.txt +0 -0
- data/file1.html +0 -1
- data/file2.html +0 -1
- data/old-docs/ADVANCED_TOPICS.adoc +0 -20
- data/old-docs/BASIC_USAGE.adoc +0 -16
- data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
- data/old-docs/CLI.adoc +0 -497
- data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/old-docs/DIFF_FORMATTING.adoc +0 -540
- data/old-docs/DIFF_PARAMETERS.adoc +0 -261
- data/old-docs/DOM_DIFF.adoc +0 -1017
- data/old-docs/ENV_CONFIG.adoc +0 -876
- data/old-docs/FORMATS.adoc +0 -867
- data/old-docs/INPUT_VALIDATION.adoc +0 -477
- data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
- data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/old-docs/MATCH_OPTIONS.adoc +0 -912
- data/old-docs/MODES.adoc +0 -432
- data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/old-docs/OPTIONS.adoc +0 -1387
- data/old-docs/PREPROCESSING.adoc +0 -491
- data/old-docs/README.old.adoc +0 -2831
- data/old-docs/RSPEC.adoc +0 -814
- data/old-docs/RUBY_API.adoc +0 -485
- data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
- data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
- data/old-docs/STRING_COMPARE.adoc +0 -345
- data/old-docs/TMP.adoc +0 -3384
- data/old-docs/TREE_DIFF.adoc +0 -1080
- data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
- data/old-docs/VERBOSE.adoc +0 -482
- data/old-docs/VISUALIZATION_MAP.adoc +0 -625
- data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
- data/scripts/analyze_current_state.rb +0 -85
- data/scripts/analyze_false_positives.rb +0 -114
- data/scripts/analyze_remaining_failures.rb +0 -105
- data/scripts/compare_current_failures.rb +0 -95
- data/scripts/compare_dom_tree_diff.rb +0 -158
- data/scripts/compare_failures.rb +0 -151
- data/scripts/debug_attribute_extraction.rb +0 -66
- data/scripts/debug_blocks_839.rb +0 -115
- data/scripts/debug_meta_matching.rb +0 -52
- data/scripts/debug_p_matching.rb +0 -192
- data/scripts/debug_signature_matching.rb +0 -118
- data/scripts/debug_sourcecode_124.rb +0 -32
- data/scripts/debug_whitespace_sensitive.rb +0 -192
- data/scripts/extract_false_positives.rb +0 -138
- data/scripts/find_actual_false_positives.rb +0 -125
- data/scripts/investigate_all_false_positives.rb +0 -161
- data/scripts/investigate_batch1.rb +0 -127
- data/scripts/investigate_classification.rb +0 -150
- data/scripts/investigate_classification_detailed.rb +0 -190
- data/scripts/investigate_common_failures.rb +0 -342
- data/scripts/investigate_false_negative.rb +0 -80
- data/scripts/investigate_false_positive.rb +0 -83
- data/scripts/investigate_false_positives.rb +0 -227
- data/scripts/investigate_false_positives_batch.rb +0 -163
- data/scripts/investigate_mixed_content.rb +0 -125
- data/scripts/investigate_remaining_16.rb +0 -214
- data/scripts/run_single_test.rb +0 -29
- data/scripts/test_all_false_positives.rb +0 -95
- data/scripts/test_attribute_details.rb +0 -61
- data/scripts/test_both_algorithms.rb +0 -49
- data/scripts/test_both_simple.rb +0 -49
- data/scripts/test_enhanced_semantic_output.rb +0 -125
- data/scripts/test_readme_examples.rb +0 -131
- data/scripts/test_semantic_tree_diff.rb +0 -99
- data/scripts/test_semantic_ux_improvements.rb +0 -135
- data/scripts/test_single_false_positive.rb +0 -119
- data/scripts/test_size_limits.rb +0 -99
- data/test_html_1.html +0 -21
- data/test_html_2.html +0 -21
- data/test_nokogiri.rb +0 -33
- data/test_normalize.rb +0 -45
data/scripts/compare_failures.rb
DELETED
|
@@ -1,151 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
# frozen_string_literal: true
|
|
3
|
-
|
|
4
|
-
# Compare semantic and DOM algorithm failures to identify false positives/negatives
|
|
5
|
-
# Usage: ruby scripts/compare_failures.rb /tmp/semantic_current.txt DOM_DIFF_RESULTS.md
|
|
6
|
-
|
|
7
|
-
require "set"
|
|
8
|
-
|
|
9
|
-
def parse_semantic_failures(file)
|
|
10
|
-
failures = []
|
|
11
|
-
File.readlines(file).each do |line|
|
|
12
|
-
# Format: "rspec ./spec/isodoc/blocks_notes_spec.rb:494 # ..."
|
|
13
|
-
if line =~ %r{rspec \./spec/isodoc/([a-z_]+_spec\.rb):(\d+)}
|
|
14
|
-
failures << { file: $1, line: $2.to_i }
|
|
15
|
-
end
|
|
16
|
-
end
|
|
17
|
-
failures
|
|
18
|
-
end
|
|
19
|
-
|
|
20
|
-
def parse_dom_failures(file)
|
|
21
|
-
failures = []
|
|
22
|
-
in_failures = false
|
|
23
|
-
|
|
24
|
-
File.readlines(file).each do |line|
|
|
25
|
-
if line.include?("Failed examples:")
|
|
26
|
-
in_failures = true
|
|
27
|
-
next
|
|
28
|
-
end
|
|
29
|
-
|
|
30
|
-
next unless in_failures
|
|
31
|
-
|
|
32
|
-
# Stop at Coverage report
|
|
33
|
-
break if line.include?("Coverage report")
|
|
34
|
-
|
|
35
|
-
# Format: "rspec ./spec/isodoc/blocks_notes_spec.rb:494"
|
|
36
|
-
if line =~ %r{rspec \./spec/isodoc/([a-z_]+_spec\.rb):(\d+)}
|
|
37
|
-
failures << { file: $1, line: $2.to_i }
|
|
38
|
-
end
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
failures
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
def categorize_failures(semantic, dom)
|
|
45
|
-
semantic_set = Set.new(semantic.map { |f| "#{f[:file]}:#{f[:line]}" })
|
|
46
|
-
dom_set = Set.new(dom.map { |f| "#{f[:file]}:#{f[:line]}" })
|
|
47
|
-
|
|
48
|
-
{
|
|
49
|
-
false_positives: semantic_set - dom_set, # Semantic fails, DOM passes
|
|
50
|
-
false_negatives: dom_set - semantic_set, # DOM fails, Semantic passes
|
|
51
|
-
common: semantic_set & dom_set, # Both fail (real failures)
|
|
52
|
-
}
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
def group_by_spec(failures)
|
|
56
|
-
failures.group_by { |f| f.split(":").first }.transform_values(&:count)
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
def main
|
|
60
|
-
semantic_file = ARGV[0] || "/tmp/semantic_current.txt"
|
|
61
|
-
dom_file = ARGV[1] || "DOM_DIFF_RESULTS.md"
|
|
62
|
-
|
|
63
|
-
puts "Parsing semantic failures from: #{semantic_file}"
|
|
64
|
-
semantic = parse_semantic_failures(semantic_file)
|
|
65
|
-
|
|
66
|
-
puts "Parsing DOM failures from: #{dom_file}"
|
|
67
|
-
dom = parse_dom_failures(dom_file)
|
|
68
|
-
|
|
69
|
-
puts "\n#{'=' * 80}"
|
|
70
|
-
puts "FAILURE COMPARISON SUMMARY"
|
|
71
|
-
puts "=" * 80
|
|
72
|
-
|
|
73
|
-
puts "\nTotal failures:"
|
|
74
|
-
puts " Semantic: #{semantic.size}"
|
|
75
|
-
puts " DOM: #{dom.size}"
|
|
76
|
-
|
|
77
|
-
categories = categorize_failures(semantic, dom)
|
|
78
|
-
|
|
79
|
-
puts "\n#{'-' * 80}"
|
|
80
|
-
puts "FALSE POSITIVES (Semantic fails, DOM passes) - #{categories[:false_positives].size}"
|
|
81
|
-
puts "-" * 80
|
|
82
|
-
puts "\nBy spec file:"
|
|
83
|
-
group_by_spec(categories[:false_positives].to_a).sort_by do |_, v|
|
|
84
|
-
-v
|
|
85
|
-
end.each do |file, count|
|
|
86
|
-
puts " #{file}: #{count}"
|
|
87
|
-
end
|
|
88
|
-
|
|
89
|
-
puts "\nDetailed list:"
|
|
90
|
-
categories[:false_positives].sort.each do |failure|
|
|
91
|
-
puts " #{failure}"
|
|
92
|
-
end
|
|
93
|
-
|
|
94
|
-
puts "\n#{'-' * 80}"
|
|
95
|
-
puts "FALSE NEGATIVES (DOM fails, Semantic passes) - #{categories[:false_negatives].size}"
|
|
96
|
-
puts "-" * 80
|
|
97
|
-
puts "\nBy spec file:"
|
|
98
|
-
group_by_spec(categories[:false_negatives].to_a).sort_by do |_, v|
|
|
99
|
-
-v
|
|
100
|
-
end.each do |file, count|
|
|
101
|
-
puts " #{file}: #{count}"
|
|
102
|
-
end
|
|
103
|
-
|
|
104
|
-
puts "\nDetailed list:"
|
|
105
|
-
categories[:false_negatives].sort.each do |failure|
|
|
106
|
-
puts " #{failure}"
|
|
107
|
-
end
|
|
108
|
-
|
|
109
|
-
puts "\n#{'-' * 80}"
|
|
110
|
-
puts "COMMON FAILURES (Both algorithms fail) - #{categories[:common].size}"
|
|
111
|
-
puts "-" * 80
|
|
112
|
-
puts "\nBy spec file:"
|
|
113
|
-
group_by_spec(categories[:common].to_a).sort_by do |_, v|
|
|
114
|
-
-v
|
|
115
|
-
end.each do |file, count|
|
|
116
|
-
puts " #{file}: #{count}"
|
|
117
|
-
end
|
|
118
|
-
|
|
119
|
-
puts "\n#{'=' * 80}"
|
|
120
|
-
puts "NEXT STEPS"
|
|
121
|
-
puts "=" * 80
|
|
122
|
-
puts "\n1. Fix false positives (#{categories[:false_positives].size} tests):"
|
|
123
|
-
puts " - These are cases where semantic is too strict"
|
|
124
|
-
puts " - DOM passes but semantic fails"
|
|
125
|
-
puts " - Fix these to reduce semantic failures"
|
|
126
|
-
|
|
127
|
-
puts "\n2. Fix false negatives (#{categories[:false_negatives].size} tests):"
|
|
128
|
-
puts " - These are cases where semantic is too lenient"
|
|
129
|
-
puts " - Semantic passes but DOM fails"
|
|
130
|
-
puts " - Fix these to maintain correctness"
|
|
131
|
-
|
|
132
|
-
puts "\n3. Common failures (#{categories[:common].size} tests):"
|
|
133
|
-
puts " - These are real test failures in both algorithms"
|
|
134
|
-
puts " - Will remain after parity is achieved"
|
|
135
|
-
puts " - May indicate actual test/code issues"
|
|
136
|
-
|
|
137
|
-
# Save detailed results
|
|
138
|
-
output_file = "/tmp/failure_comparison.txt"
|
|
139
|
-
File.open(output_file, "w") do |f|
|
|
140
|
-
f.puts "FALSE POSITIVES (#{categories[:false_positives].size}):"
|
|
141
|
-
categories[:false_positives].sort.each { |fp| f.puts fp }
|
|
142
|
-
f.puts "\nFALSE NEGATIVES (#{categories[:false_negatives].size}):"
|
|
143
|
-
categories[:false_negatives].sort.each { |fn| f.puts fn }
|
|
144
|
-
f.puts "\nCOMMON FAILURES (#{categories[:common].size}):"
|
|
145
|
-
categories[:common].sort.each { |cf| f.puts cf }
|
|
146
|
-
end
|
|
147
|
-
|
|
148
|
-
puts "\nDetailed results saved to: #{output_file}"
|
|
149
|
-
end
|
|
150
|
-
|
|
151
|
-
main if __FILE__ == $PROGRAM_NAME
|
|
@@ -1,66 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
# frozen_string_literal: true
|
|
3
|
-
|
|
4
|
-
require_relative "../lib/canon"
|
|
5
|
-
require_relative "../lib/canon/diff_formatter"
|
|
6
|
-
require_relative "../lib/canon/diff_formatter/diff_detail_formatter"
|
|
7
|
-
|
|
8
|
-
# Test attribute values formatting
|
|
9
|
-
html1 = '<table id="T1" class="MsoNormalTable" border="1"></table>'
|
|
10
|
-
html2 = '<table id="T2" class="MsoNormalTable" border="2"></table>'
|
|
11
|
-
|
|
12
|
-
result = Canon::Comparison.equivalent?(
|
|
13
|
-
html1,
|
|
14
|
-
html2,
|
|
15
|
-
match_algorithm: :semantic_tree,
|
|
16
|
-
ignore_attr_order: true,
|
|
17
|
-
verbose: true,
|
|
18
|
-
)
|
|
19
|
-
|
|
20
|
-
puts "Number of differences: #{result.differences.length}"
|
|
21
|
-
puts
|
|
22
|
-
|
|
23
|
-
result.differences.each_with_index do |diff, i|
|
|
24
|
-
puts "=" * 70
|
|
25
|
-
puts "Difference ##{i + 1}"
|
|
26
|
-
puts "=" * 70
|
|
27
|
-
puts "Class: #{diff.class}"
|
|
28
|
-
puts "Dimension: #{diff.dimension if diff.respond_to?(:dimension)}"
|
|
29
|
-
|
|
30
|
-
if diff.respond_to?(:node1) && diff.respond_to?(:node2)
|
|
31
|
-
node1 = diff.node1
|
|
32
|
-
node2 = diff.node2
|
|
33
|
-
|
|
34
|
-
puts "\nNode1:"
|
|
35
|
-
puts " Class: #{node1.class}"
|
|
36
|
-
puts " Name: #{node1.name if node1.respond_to?(:name)}"
|
|
37
|
-
if node1.respond_to?(:attributes)
|
|
38
|
-
puts " Attributes: #{node1.attributes.inspect}"
|
|
39
|
-
puts " Attributes class: #{node1.attributes.class}"
|
|
40
|
-
puts " Attributes keys: #{node1.attributes.keys.inspect}"
|
|
41
|
-
node1.attributes.each do |key, val|
|
|
42
|
-
puts " #{key.inspect} (#{key.class}) => #{val.inspect} (#{val.class})"
|
|
43
|
-
if val.respond_to?(:value)
|
|
44
|
-
puts " val.value = #{val.value.inspect}"
|
|
45
|
-
end
|
|
46
|
-
end
|
|
47
|
-
end
|
|
48
|
-
|
|
49
|
-
puts "\nNode2:"
|
|
50
|
-
puts " Class: #{node2.class}"
|
|
51
|
-
puts " Name: #{node2.name if node2.respond_to?(:name)}"
|
|
52
|
-
if node2.respond_to?(:attributes)
|
|
53
|
-
puts " Attributes: #{node2.attributes.inspect}"
|
|
54
|
-
puts " Attributes class: #{node2.attributes.class}"
|
|
55
|
-
puts " Attributes keys: #{node2.attributes.keys.inspect}"
|
|
56
|
-
node2.attributes.each do |key, val|
|
|
57
|
-
puts " #{key.inspect} (#{key.class}) => #{val.inspect} (#{val.class})"
|
|
58
|
-
if val.respond_to?(:value)
|
|
59
|
-
puts " val.value = #{val.value.inspect}"
|
|
60
|
-
end
|
|
61
|
-
end
|
|
62
|
-
end
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
puts
|
|
66
|
-
end
|
data/scripts/debug_blocks_839.rb
DELETED
|
@@ -1,115 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
# Debug script for blocks_spec.rb:839
|
|
3
|
-
|
|
4
|
-
require "bundler/setup"
|
|
5
|
-
require "nokogiri"
|
|
6
|
-
|
|
7
|
-
HTML_HDR = <<~HEADER.freeze
|
|
8
|
-
<html lang="en">
|
|
9
|
-
<head/>
|
|
10
|
-
<body lang="en">
|
|
11
|
-
<div class="title-section">
|
|
12
|
-
<p>\u00a0</p>
|
|
13
|
-
</div>
|
|
14
|
-
<br/>
|
|
15
|
-
<div class="prefatory-section">
|
|
16
|
-
<p>\u00a0</p>
|
|
17
|
-
</div>
|
|
18
|
-
<br/>
|
|
19
|
-
<div class="main-section">
|
|
20
|
-
<br/>
|
|
21
|
-
<div class="TOC" id="_">
|
|
22
|
-
<h1 class="IntroTitle">Table of contents</h1>
|
|
23
|
-
</div>
|
|
24
|
-
HEADER
|
|
25
|
-
|
|
26
|
-
WORD_HDR = <<~HEADER.freeze
|
|
27
|
-
<html xmlns:epub="http://www.idpf.org/2007/ops" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" lang="en">
|
|
28
|
-
<head>
|
|
29
|
-
<style>
|
|
30
|
-
<!--
|
|
31
|
-
-->
|
|
32
|
-
</style>
|
|
33
|
-
</head>
|
|
34
|
-
<body lang="EN-US" link="blue" vlink="#954F72">
|
|
35
|
-
<div class="WordSection1">
|
|
36
|
-
<p>\u00a0</p>
|
|
37
|
-
</div>
|
|
38
|
-
HEADER
|
|
39
|
-
|
|
40
|
-
html5_doc = <<~HTML
|
|
41
|
-
#{HTML_HDR}
|
|
42
|
-
<br/>
|
|
43
|
-
<div id="_">
|
|
44
|
-
<h1 class="ForewordTitle">Foreword</h1>
|
|
45
|
-
<p id="_" style="text-align:left;">Test</p>
|
|
46
|
-
</div>
|
|
47
|
-
</div>
|
|
48
|
-
</body>
|
|
49
|
-
</html>
|
|
50
|
-
HTML
|
|
51
|
-
|
|
52
|
-
html4_doc = <<~HTML
|
|
53
|
-
#{WORD_HDR}
|
|
54
|
-
<p class="page-break">
|
|
55
|
-
<br clear="all" style="mso-special-character:line-break;page-break-before:always"/>
|
|
56
|
-
</p>
|
|
57
|
-
<div class="TOC" id="_">
|
|
58
|
-
<h1 class="IntroTitle">Table of contents</h1>
|
|
59
|
-
</div>
|
|
60
|
-
<p class="page-break">
|
|
61
|
-
<br clear="all" style="mso-special-character:line-break;page-break-before:always"/>
|
|
62
|
-
</p>
|
|
63
|
-
<div id="_">
|
|
64
|
-
<h1 class="ForewordTitle">Foreword</h1>
|
|
65
|
-
<p id="_" align="left" style="text-align:left;">Test</p>
|
|
66
|
-
</div>
|
|
67
|
-
<p>\u00a0</p>
|
|
68
|
-
</div>
|
|
69
|
-
</body>
|
|
70
|
-
</html>
|
|
71
|
-
HTML
|
|
72
|
-
|
|
73
|
-
puts "=" * 80
|
|
74
|
-
puts "HTML5 PARSING"
|
|
75
|
-
puts "=" * 80
|
|
76
|
-
|
|
77
|
-
doc5 = Nokogiri::HTML5(html5_doc)
|
|
78
|
-
head5 = doc5.at("//head")
|
|
79
|
-
puts "HEAD element:"
|
|
80
|
-
puts head5.to_html
|
|
81
|
-
puts "\nCHILDREN:"
|
|
82
|
-
head5.children.each_with_index do |child, i|
|
|
83
|
-
puts " #{i}: #{child.name} - #{child.attributes.inspect}"
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
puts "\n#{'=' * 80}"
|
|
87
|
-
puts "HTML4 PARSING"
|
|
88
|
-
puts "=" * 80
|
|
89
|
-
|
|
90
|
-
doc4 = Nokogiri::HTML4(html4_doc)
|
|
91
|
-
head4 = doc4.at("//head")
|
|
92
|
-
puts "HEAD element:"
|
|
93
|
-
puts head4.to_html
|
|
94
|
-
puts "\nCHILDREN:"
|
|
95
|
-
head4.children.each_with_index do |child, i|
|
|
96
|
-
puts " #{i}: #{child.name} - #{child.attributes.inspect}"
|
|
97
|
-
end
|
|
98
|
-
|
|
99
|
-
puts "\n#{'=' * 80}"
|
|
100
|
-
puts "COMPARISON"
|
|
101
|
-
puts "=" * 80
|
|
102
|
-
puts "HTML5 head children: #{head5.children.size}"
|
|
103
|
-
puts "HTML4 head children: #{head4.children.size}"
|
|
104
|
-
|
|
105
|
-
meta5 = head5.xpath(".//meta")
|
|
106
|
-
meta4 = head4.xpath(".//meta")
|
|
107
|
-
puts "\nMETA elements:"
|
|
108
|
-
puts "HTML5: #{meta5.size} meta elements"
|
|
109
|
-
meta5.each_with_index do |m, i|
|
|
110
|
-
puts " #{i}: #{m.to_html}"
|
|
111
|
-
end
|
|
112
|
-
puts "HTML4: #{meta4.size} meta elements"
|
|
113
|
-
meta4.each_with_index do |m, i|
|
|
114
|
-
puts " #{i}: #{m.to_html}"
|
|
115
|
-
end
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
# frozen_string_literal: true
|
|
3
|
-
|
|
4
|
-
require "bundler/setup"
|
|
5
|
-
require "canon"
|
|
6
|
-
|
|
7
|
-
# Test case: Meta element with attributes should match
|
|
8
|
-
expected = <<~HTML
|
|
9
|
-
<html>
|
|
10
|
-
<head>
|
|
11
|
-
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
|
12
|
-
</head>
|
|
13
|
-
<body>
|
|
14
|
-
<p>Test</p>
|
|
15
|
-
</body>
|
|
16
|
-
</html>
|
|
17
|
-
HTML
|
|
18
|
-
|
|
19
|
-
actual = <<~HTML
|
|
20
|
-
<html>
|
|
21
|
-
<head>
|
|
22
|
-
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
|
23
|
-
</head>
|
|
24
|
-
<body>
|
|
25
|
-
<p>Test</p>
|
|
26
|
-
</body>
|
|
27
|
-
</html>
|
|
28
|
-
HTML
|
|
29
|
-
|
|
30
|
-
puts "=" * 80
|
|
31
|
-
puts "Testing Meta Element Matching"
|
|
32
|
-
puts "=" * 80
|
|
33
|
-
|
|
34
|
-
result = Canon::Comparison.equivalent?(expected, actual,
|
|
35
|
-
format: :html4,
|
|
36
|
-
diff_algorithm: :semantic,
|
|
37
|
-
verbose: true)
|
|
38
|
-
|
|
39
|
-
if result.is_a?(Canon::Comparison::ComparisonResult)
|
|
40
|
-
puts "\nResult: #{result.equivalent? ? 'PASS ✅' : 'FAIL ❌'}"
|
|
41
|
-
puts "Normative diffs: #{result.normative_differences.count}"
|
|
42
|
-
puts "Total diffs: #{result.differences.count}"
|
|
43
|
-
|
|
44
|
-
unless result.equivalent?
|
|
45
|
-
puts "\nDifferences:"
|
|
46
|
-
result.differences.each_with_index do |diff, i|
|
|
47
|
-
puts "\n #{i + 1}. #{diff.inspect}"
|
|
48
|
-
end
|
|
49
|
-
end
|
|
50
|
-
else
|
|
51
|
-
puts "Result: #{result}"
|
|
52
|
-
end
|
data/scripts/debug_p_matching.rb
DELETED
|
@@ -1,192 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
# frozen_string_literal: true
|
|
3
|
-
|
|
4
|
-
require "bundler/setup"
|
|
5
|
-
require "canon"
|
|
6
|
-
require "nokogiri"
|
|
7
|
-
|
|
8
|
-
# Read the test files
|
|
9
|
-
expected_file = "/Users/mulgogi/src/mn/isodoc/spec/fixtures/html/isodoc-section-names-expected.html"
|
|
10
|
-
actual_file = "/Users/mulgogi/src/mn/isodoc/spec/fixtures/html/isodoc-section-names-actual.html"
|
|
11
|
-
|
|
12
|
-
expected = File.read(expected_file)
|
|
13
|
-
actual = File.read(actual_file)
|
|
14
|
-
|
|
15
|
-
puts "=" * 80
|
|
16
|
-
puts "ANALYZING <p> ELEMENT MATCHING"
|
|
17
|
-
puts "=" * 80
|
|
18
|
-
|
|
19
|
-
# Parse with Nokogiri to see what we have
|
|
20
|
-
doc1 = Nokogiri::HTML4(expected)
|
|
21
|
-
doc2 = Nokogiri::HTML4(actual)
|
|
22
|
-
|
|
23
|
-
# Find all <p> elements
|
|
24
|
-
p_elements1 = doc1.css("p")
|
|
25
|
-
p_elements2 = doc2.css("p")
|
|
26
|
-
|
|
27
|
-
puts "\nFile 1 has #{p_elements1.size} <p> elements"
|
|
28
|
-
puts "File 2 has #{p_elements2.size} <p> elements"
|
|
29
|
-
|
|
30
|
-
# Group by class attribute
|
|
31
|
-
p_by_class1 = p_elements1.group_by { |p| p["class"] }
|
|
32
|
-
p_by_class2 = p_elements2.group_by { |p| p["class"] }
|
|
33
|
-
|
|
34
|
-
puts "\nFile 1 <p> elements by class:"
|
|
35
|
-
p_by_class1.each do |klass, elements|
|
|
36
|
-
puts " #{klass.inspect}: #{elements.size} elements"
|
|
37
|
-
elements.first(3).each do |el|
|
|
38
|
-
content = el.text.strip
|
|
39
|
-
content = "#{content[0..50]}..." if content.length > 50
|
|
40
|
-
puts " - #{content.inspect}"
|
|
41
|
-
end
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
puts "\nFile 2 <p> elements by class:"
|
|
45
|
-
p_by_class2.each do |klass, elements|
|
|
46
|
-
puts " #{klass.inspect}: #{elements.size} elements"
|
|
47
|
-
elements.first(3).each do |el|
|
|
48
|
-
content = el.text.strip
|
|
49
|
-
content = "#{content[0..50]}..." if content.length > 50
|
|
50
|
-
puts " - #{content.inspect}"
|
|
51
|
-
end
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
# Now run Canon's tree diff to see what happens
|
|
55
|
-
puts "\n#{'=' * 80}"
|
|
56
|
-
puts "RUNNING CANON TREE DIFF"
|
|
57
|
-
puts "=" * 80
|
|
58
|
-
|
|
59
|
-
require_relative "../lib/canon/tree_diff/adapters/html_adapter"
|
|
60
|
-
require_relative "../lib/canon/tree_diff/matchers/hash_matcher"
|
|
61
|
-
require_relative "../lib/canon/tree_diff/matchers/similarity_matcher"
|
|
62
|
-
require_relative "../lib/canon/tree_diff/operations/operation_detector"
|
|
63
|
-
|
|
64
|
-
# Create trees
|
|
65
|
-
adapter = Canon::TreeDiff::Adapters::HtmlAdapter.new
|
|
66
|
-
tree1 = adapter.parse(expected)
|
|
67
|
-
tree2 = adapter.parse(actual)
|
|
68
|
-
|
|
69
|
-
puts "\nTree 1 has #{tree1.descendants.size} total nodes"
|
|
70
|
-
puts "Tree 2 has #{tree2.descendants.size} total nodes"
|
|
71
|
-
|
|
72
|
-
# Find <p> nodes in tree
|
|
73
|
-
p_nodes1 = tree1.descendants.select { |n| n.label == "p" }
|
|
74
|
-
p_nodes2 = tree2.descendants.select { |n| n.label == "p" }
|
|
75
|
-
|
|
76
|
-
puts "\nTree 1 has #{p_nodes1.size} <p> nodes"
|
|
77
|
-
puts "Tree 2 has #{p_nodes2.size} <p> nodes"
|
|
78
|
-
|
|
79
|
-
# Group by attributes
|
|
80
|
-
p_by_attrs1 = p_nodes1.group_by(&:attributes)
|
|
81
|
-
p_by_attrs2 = p_nodes2.group_by(&:attributes)
|
|
82
|
-
|
|
83
|
-
puts "\nTree 1 <p> nodes by attributes:"
|
|
84
|
-
p_by_attrs1.each do |attrs, nodes|
|
|
85
|
-
puts " #{attrs.inspect}: #{nodes.size} nodes"
|
|
86
|
-
end
|
|
87
|
-
|
|
88
|
-
puts "\nTree 2 <p> nodes by attributes:"
|
|
89
|
-
p_by_attrs2.each do |attrs, nodes|
|
|
90
|
-
puts " #{attrs.inspect}: #{nodes.size} nodes"
|
|
91
|
-
end
|
|
92
|
-
|
|
93
|
-
# Look at signatures
|
|
94
|
-
require_relative "../lib/canon/tree_diff/core/node_signature"
|
|
95
|
-
|
|
96
|
-
puts "\n#{'=' * 80}"
|
|
97
|
-
puts "ANALYZING SIGNATURES"
|
|
98
|
-
puts "=" * 80
|
|
99
|
-
|
|
100
|
-
# Get page-break <p> nodes
|
|
101
|
-
page_break_p1 = p_nodes1.select { |n| n.attributes["class"] == "page-break" }
|
|
102
|
-
page_break_p2 = p_nodes2.select { |n| n.attributes["class"] == "page-break" }
|
|
103
|
-
|
|
104
|
-
puts "\nFile 1 has #{page_break_p1.size} <p class=\"page-break\"> nodes"
|
|
105
|
-
puts "File 2 has #{page_break_p2.size} <p class=\"page-break\"> nodes"
|
|
106
|
-
|
|
107
|
-
if page_break_p1.any?
|
|
108
|
-
puts "\nFirst 3 signatures from File 1:"
|
|
109
|
-
page_break_p1.first(3).each_with_index do |node, i|
|
|
110
|
-
sig = Canon::TreeDiff::Core::NodeSignature.for(node)
|
|
111
|
-
puts " #{i + 1}. #{sig}"
|
|
112
|
-
puts " Children: #{node.children.size}"
|
|
113
|
-
if node.children.any?
|
|
114
|
-
node.children.each do |child|
|
|
115
|
-
child_sig = Canon::TreeDiff::Core::NodeSignature.for(child)
|
|
116
|
-
puts " - #{child.label}: #{child_sig}"
|
|
117
|
-
end
|
|
118
|
-
end
|
|
119
|
-
end
|
|
120
|
-
end
|
|
121
|
-
|
|
122
|
-
if page_break_p2.any?
|
|
123
|
-
puts "\nFirst 3 signatures from File 2:"
|
|
124
|
-
page_break_p2.first(3).each_with_index do |node, i|
|
|
125
|
-
sig = Canon::TreeDiff::Core::NodeSignature.for(node)
|
|
126
|
-
puts " #{i + 1}. #{sig}"
|
|
127
|
-
puts " Children: #{node.children.size}"
|
|
128
|
-
if node.children.any?
|
|
129
|
-
node.children.each do |child|
|
|
130
|
-
child_sig = Canon::TreeDiff::Core::NodeSignature.for(child)
|
|
131
|
-
puts " - #{child.label}: #{child_sig}"
|
|
132
|
-
end
|
|
133
|
-
end
|
|
134
|
-
end
|
|
135
|
-
end
|
|
136
|
-
|
|
137
|
-
# Run hash matcher
|
|
138
|
-
puts "\n#{'=' * 80}"
|
|
139
|
-
puts "RUNNING HASH MATCHER"
|
|
140
|
-
puts "=" * 80
|
|
141
|
-
|
|
142
|
-
options = {
|
|
143
|
-
attribute_order: :ignore,
|
|
144
|
-
text_content: :normalize,
|
|
145
|
-
}
|
|
146
|
-
|
|
147
|
-
matcher = Canon::TreeDiff::Matchers::HashMatcher.new(tree1, tree2, options)
|
|
148
|
-
matching = matcher.match
|
|
149
|
-
|
|
150
|
-
puts "\nTotal matched pairs: #{matching.size}"
|
|
151
|
-
|
|
152
|
-
# Check how many <p> nodes were matched
|
|
153
|
-
matched_p1 = p_nodes1.count { |n| matching.matched1?(n) }
|
|
154
|
-
matched_p2 = p_nodes2.count { |n| matching.matched2?(n) }
|
|
155
|
-
|
|
156
|
-
puts "Matched <p> from tree1: #{matched_p1}/#{p_nodes1.size}"
|
|
157
|
-
puts "Matched <p> from tree2: #{matched_p2}/#{p_nodes2.size}"
|
|
158
|
-
|
|
159
|
-
matched_page_break_p1 = page_break_p1.count { |n| matching.matched1?(n) }
|
|
160
|
-
matched_page_break_p2 = page_break_p2.count { |n| matching.matched2?(n) }
|
|
161
|
-
|
|
162
|
-
puts "Matched <p class=\"page-break\"> from tree1: #{matched_page_break_p1}/#{page_break_p1.size}"
|
|
163
|
-
puts "Matched <p class=\"page-break\"> from tree2: #{matched_page_break_p2}/#{page_break_p2.size}"
|
|
164
|
-
|
|
165
|
-
# Check unmatched page-break <p> nodes
|
|
166
|
-
unmatched_p1 = page_break_p1.reject { |n| matching.matched1?(n) }
|
|
167
|
-
unmatched_p2 = page_break_p2.reject { |n| matching.matched2?(n) }
|
|
168
|
-
|
|
169
|
-
puts "\nUnmatched <p class=\"page-break\"> from tree1: #{unmatched_p1.size}"
|
|
170
|
-
puts "Unmatched <p class=\"page-break\"> from tree2: #{unmatched_p2.size}"
|
|
171
|
-
|
|
172
|
-
if unmatched_p1.any?
|
|
173
|
-
puts "\nFirst unmatched from tree1:"
|
|
174
|
-
node = unmatched_p1.first
|
|
175
|
-
puts " Path: #{node.xpath}"
|
|
176
|
-
puts " Signature: #{Canon::TreeDiff::Core::NodeSignature.for(node)}"
|
|
177
|
-
puts " Children: #{node.children.size}"
|
|
178
|
-
node.children.each do |child|
|
|
179
|
-
puts " - #{child.label}: value=#{child.value.inspect}, attrs=#{child.attributes.inspect}"
|
|
180
|
-
end
|
|
181
|
-
end
|
|
182
|
-
|
|
183
|
-
if unmatched_p2.any?
|
|
184
|
-
puts "\nFirst unmatched from tree2:"
|
|
185
|
-
node = unmatched_p2.first
|
|
186
|
-
puts " Path: #{node.xpath}"
|
|
187
|
-
puts " Signature: #{Canon::TreeDiff::Core::NodeSignature.for(node)}"
|
|
188
|
-
puts " Children: #{node.children.size}"
|
|
189
|
-
node.children.each do |child|
|
|
190
|
-
puts " - #{child.label}: value=#{child.value.inspect}, attrs=#{child.attributes.inspect}"
|
|
191
|
-
end
|
|
192
|
-
end
|
|
@@ -1,118 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
# frozen_string_literal: true
|
|
3
|
-
|
|
4
|
-
# Debug signature matching to understand why elements aren't matching
|
|
5
|
-
# Usage: ruby scripts/debug_signature_matching.rb
|
|
6
|
-
|
|
7
|
-
require_relative "../lib/canon"
|
|
8
|
-
require "nokogiri"
|
|
9
|
-
|
|
10
|
-
# Sample XML with semx elements that should match
|
|
11
|
-
xml1 = <<~XML
|
|
12
|
-
<p>
|
|
13
|
-
<fmt-concept>
|
|
14
|
-
<semx element="title" source="_">word</semx>
|
|
15
|
-
</fmt-concept>
|
|
16
|
-
</p>
|
|
17
|
-
XML
|
|
18
|
-
|
|
19
|
-
xml2 = <<~XML
|
|
20
|
-
<p>
|
|
21
|
-
<fmt-concept>
|
|
22
|
-
<semx element="concept" source="_">word</semx>
|
|
23
|
-
</fmt-concept>
|
|
24
|
-
</p>
|
|
25
|
-
XML
|
|
26
|
-
|
|
27
|
-
puts "=" * 80
|
|
28
|
-
puts "SIGNATURE MATCHING DEBUG"
|
|
29
|
-
puts "=" * 80
|
|
30
|
-
|
|
31
|
-
# Parse both
|
|
32
|
-
doc1 = Nokogiri::XML(xml1)
|
|
33
|
-
doc2 = Nokogiri::XML(xml2)
|
|
34
|
-
|
|
35
|
-
# Create adapter
|
|
36
|
-
adapter = Canon::TreeDiff::Adapters::XMLAdapter.new
|
|
37
|
-
|
|
38
|
-
# Convert to tree
|
|
39
|
-
tree1 = adapter.to_tree(doc1.root)
|
|
40
|
-
tree2 = adapter.to_tree(doc2.root)
|
|
41
|
-
|
|
42
|
-
puts "\nTree 1 structure:"
|
|
43
|
-
def print_tree(node, indent = 0)
|
|
44
|
-
prefix = " " * indent
|
|
45
|
-
if node.text?
|
|
46
|
-
puts "#{prefix}#text: #{node.value.inspect}"
|
|
47
|
-
else
|
|
48
|
-
attrs = node.attributes.empty? ? "" : " {#{node.attributes.inspect}}"
|
|
49
|
-
puts "#{prefix}<#{node.label}>#{attrs}"
|
|
50
|
-
node.children.each { |c| print_tree(c, indent + 1) }
|
|
51
|
-
end
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
print_tree(tree1)
|
|
55
|
-
|
|
56
|
-
puts "\nTree 2 structure:"
|
|
57
|
-
print_tree(tree2)
|
|
58
|
-
|
|
59
|
-
# Get semx nodes
|
|
60
|
-
semx1 = tree1.descendants.find { |n| n.label == "semx" }
|
|
61
|
-
semx2 = tree2.descendants.find { |n| n.label == "semx" }
|
|
62
|
-
|
|
63
|
-
puts "\n#{'-' * 80}"
|
|
64
|
-
puts "SEMX NODE COMPARISON"
|
|
65
|
-
puts "-" * 80
|
|
66
|
-
|
|
67
|
-
puts "\nSemx 1:"
|
|
68
|
-
puts " Label: #{semx1.label}"
|
|
69
|
-
puts " Value: #{semx1.value.inspect}"
|
|
70
|
-
puts " Attributes: #{semx1.attributes.inspect}"
|
|
71
|
-
|
|
72
|
-
puts "\nSemx 2:"
|
|
73
|
-
puts " Label: #{semx2.label}"
|
|
74
|
-
puts " Value: #{semx2.value.inspect}"
|
|
75
|
-
puts " Attributes: #{semx2.attributes.inspect}"
|
|
76
|
-
|
|
77
|
-
# Compute signatures
|
|
78
|
-
sig1_strict = Canon::TreeDiff::Core::NodeSignature.for(semx1,
|
|
79
|
-
include_attributes: true)
|
|
80
|
-
sig2_strict = Canon::TreeDiff::Core::NodeSignature.for(semx2,
|
|
81
|
-
include_attributes: true)
|
|
82
|
-
|
|
83
|
-
sig1_loose = Canon::TreeDiff::Core::NodeSignature.for(semx1,
|
|
84
|
-
include_attributes: false)
|
|
85
|
-
sig2_loose = Canon::TreeDiff::Core::NodeSignature.for(semx2,
|
|
86
|
-
include_attributes: false)
|
|
87
|
-
|
|
88
|
-
puts "\n#{'-' * 80}"
|
|
89
|
-
puts "SIGNATURE COMPARISON"
|
|
90
|
-
puts "-" * 80
|
|
91
|
-
|
|
92
|
-
puts "\nStrict signatures (with attributes):"
|
|
93
|
-
puts " Semx 1: #{sig1_strict.signature_string}"
|
|
94
|
-
puts " Semx 2: #{sig2_strict.signature_string}"
|
|
95
|
-
puts " Match? #{sig1_strict == sig2_strict}"
|
|
96
|
-
|
|
97
|
-
puts "\nLoose signatures (without attributes):"
|
|
98
|
-
puts " Semx 1: #{sig1_loose.signature_string}"
|
|
99
|
-
puts " Semx 2: #{sig2_loose.signature_string}"
|
|
100
|
-
puts " Match? #{sig1_loose == sig2_loose}"
|
|
101
|
-
|
|
102
|
-
puts "\n#{'-' * 80}"
|
|
103
|
-
puts "ANALYSIS"
|
|
104
|
-
puts "-" * 80
|
|
105
|
-
|
|
106
|
-
if sig1_strict != sig2_strict
|
|
107
|
-
puts "\n⚠️ ISSUE FOUND:"
|
|
108
|
-
puts "Strict signatures don't match due to attribute differences!"
|
|
109
|
-
puts "This prevents HashMatcher from considering these nodes as candidates."
|
|
110
|
-
puts "\nDifference:"
|
|
111
|
-
puts " File 1: element='title'"
|
|
112
|
-
puts " File 2: element='concept'"
|
|
113
|
-
puts "\nSOLUTION:"
|
|
114
|
-
puts "HashMatcher should use LOOSE signatures (no attributes) to find candidates,"
|
|
115
|
-
puts "then check attributes separately during matching."
|
|
116
|
-
end
|
|
117
|
-
|
|
118
|
-
puts "\n#{'=' * 80}"
|