canon 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +112 -25
  3. data/docs/Gemfile +1 -0
  4. data/docs/_config.yml +90 -1
  5. data/docs/advanced/diff-classification.adoc +82 -2
  6. data/docs/features/match-options/index.adoc +239 -1
  7. data/lib/canon/comparison/format_detector.rb +2 -1
  8. data/lib/canon/comparison/html_comparator.rb +19 -8
  9. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  10. data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
  11. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  12. data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
  13. data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
  14. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
  15. data/lib/canon/comparison/xml_comparator.rb +48 -23
  16. data/lib/canon/comparison/xml_node_comparison.rb +25 -3
  17. data/lib/canon/diff/diff_classifier.rb +101 -2
  18. data/lib/canon/diff/formatting_detector.rb +1 -1
  19. data/lib/canon/rspec_matchers.rb +37 -8
  20. data/lib/canon/version.rb +1 -1
  21. data/lib/canon/xml/data_model.rb +24 -13
  22. metadata +3 -78
  23. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
  24. data/false_positive_analysis.txt +0 -0
  25. data/file1.html +0 -1
  26. data/file2.html +0 -1
  27. data/old-docs/ADVANCED_TOPICS.adoc +0 -20
  28. data/old-docs/BASIC_USAGE.adoc +0 -16
  29. data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
  30. data/old-docs/CLI.adoc +0 -497
  31. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  32. data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
  33. data/old-docs/DIFF_FORMATTING.adoc +0 -540
  34. data/old-docs/DIFF_PARAMETERS.adoc +0 -261
  35. data/old-docs/DOM_DIFF.adoc +0 -1017
  36. data/old-docs/ENV_CONFIG.adoc +0 -876
  37. data/old-docs/FORMATS.adoc +0 -867
  38. data/old-docs/INPUT_VALIDATION.adoc +0 -477
  39. data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
  40. data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
  41. data/old-docs/MATCH_OPTIONS.adoc +0 -912
  42. data/old-docs/MODES.adoc +0 -432
  43. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  44. data/old-docs/OPTIONS.adoc +0 -1387
  45. data/old-docs/PREPROCESSING.adoc +0 -491
  46. data/old-docs/README.old.adoc +0 -2831
  47. data/old-docs/RSPEC.adoc +0 -814
  48. data/old-docs/RUBY_API.adoc +0 -485
  49. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
  50. data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
  51. data/old-docs/STRING_COMPARE.adoc +0 -345
  52. data/old-docs/TMP.adoc +0 -3384
  53. data/old-docs/TREE_DIFF.adoc +0 -1080
  54. data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
  55. data/old-docs/VERBOSE.adoc +0 -482
  56. data/old-docs/VISUALIZATION_MAP.adoc +0 -625
  57. data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
  58. data/scripts/analyze_current_state.rb +0 -85
  59. data/scripts/analyze_false_positives.rb +0 -114
  60. data/scripts/analyze_remaining_failures.rb +0 -105
  61. data/scripts/compare_current_failures.rb +0 -95
  62. data/scripts/compare_dom_tree_diff.rb +0 -158
  63. data/scripts/compare_failures.rb +0 -151
  64. data/scripts/debug_attribute_extraction.rb +0 -66
  65. data/scripts/debug_blocks_839.rb +0 -115
  66. data/scripts/debug_meta_matching.rb +0 -52
  67. data/scripts/debug_p_matching.rb +0 -192
  68. data/scripts/debug_signature_matching.rb +0 -118
  69. data/scripts/debug_sourcecode_124.rb +0 -32
  70. data/scripts/debug_whitespace_sensitive.rb +0 -192
  71. data/scripts/extract_false_positives.rb +0 -138
  72. data/scripts/find_actual_false_positives.rb +0 -125
  73. data/scripts/investigate_all_false_positives.rb +0 -161
  74. data/scripts/investigate_batch1.rb +0 -127
  75. data/scripts/investigate_classification.rb +0 -150
  76. data/scripts/investigate_classification_detailed.rb +0 -190
  77. data/scripts/investigate_common_failures.rb +0 -342
  78. data/scripts/investigate_false_negative.rb +0 -80
  79. data/scripts/investigate_false_positive.rb +0 -83
  80. data/scripts/investigate_false_positives.rb +0 -227
  81. data/scripts/investigate_false_positives_batch.rb +0 -163
  82. data/scripts/investigate_mixed_content.rb +0 -125
  83. data/scripts/investigate_remaining_16.rb +0 -214
  84. data/scripts/run_single_test.rb +0 -29
  85. data/scripts/test_all_false_positives.rb +0 -95
  86. data/scripts/test_attribute_details.rb +0 -61
  87. data/scripts/test_both_algorithms.rb +0 -49
  88. data/scripts/test_both_simple.rb +0 -49
  89. data/scripts/test_enhanced_semantic_output.rb +0 -125
  90. data/scripts/test_readme_examples.rb +0 -131
  91. data/scripts/test_semantic_tree_diff.rb +0 -99
  92. data/scripts/test_semantic_ux_improvements.rb +0 -135
  93. data/scripts/test_single_false_positive.rb +0 -119
  94. data/scripts/test_size_limits.rb +0 -99
  95. data/test_html_1.html +0 -21
  96. data/test_html_2.html +0 -21
  97. data/test_nokogiri.rb +0 -33
  98. data/test_normalize.rb +0 -45
@@ -1,32 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- # Debug sourcecode_spec.rb:124 to understand the false positive pattern
5
- # Usage: ruby scripts/debug_sourcecode_124.rb
6
-
7
- require "bundler/setup"
8
- require_relative "../../src/mn/isodoc/spec/spec_helper"
9
-
10
- # Run the specific test with verbose output to capture expected/actual
11
- puts "=" * 80
12
- puts "DEBUGGING: sourcecode_spec.rb:124"
13
- puts "=" * 80
14
-
15
- # Run test with DOM (should pass)
16
- puts "\n1. Running with DOM algorithm (should PASS):"
17
- puts "-" * 60
18
- ENV["CANON_HTML_DIFF_ALGORITHM"] = "dom"
19
- ENV["CANON_XML_DIFF_ALGORITHM"] = "dom"
20
- ENV["CANON_HTML_DIFF_VERBOSE"] = "true"
21
- ENV["CANON_XML_DIFF_VERBOSE"] = "true"
22
- system("cd /Users/mulgogi/src/mn/isodoc && bundle exec rspec spec/isodoc/sourcecode_spec.rb:124 --format documentation 2>&1")
23
-
24
- puts "\n#{'=' * 80}"
25
- # Run test with Semantic (should fail - false positive)
26
- puts "\n2. Running with Semantic algorithm (should FAIL):"
27
- puts "-" * 60
28
- ENV["CANON_HTML_DIFF_ALGORITHM"] = "semantic"
29
- ENV["CANON_XML_DIFF_ALGORITHM"] = "semantic"
30
- ENV["CANON_HTML_DIFF_VERBOSE"] = "true"
31
- ENV["CANON_XML_DIFF_VERBOSE"] = "true"
32
- system("cd /Users/mulgogi/src/mn/isodoc && bundle exec rspec spec/isodoc/sourcecode_spec.rb:124 --format documentation 2>&1")
@@ -1,192 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- require_relative "../lib/canon"
5
- require "nokogiri"
6
-
7
- expected = <<~HTML
8
- <pre>
9
-
10
- </pre>
11
- HTML
12
-
13
- actual = <<~HTML
14
- <pre> </pre>
15
- HTML
16
-
17
- # Parse and inspect the trees directly
18
- puts "=== Tree Inspection ==="
19
- doc1 = Nokogiri::HTML(expected)
20
- doc2 = Nokogiri::HTML(actual)
21
-
22
- pre1 = doc1.at_css("pre")
23
- pre2 = doc2.at_css("pre")
24
-
25
- puts "Pre1 text: #{pre1.text.inspect}"
26
- puts "Pre1 text length: #{pre1.text.length}"
27
- puts "Pre1 text bytes: #{pre1.text.bytes.inspect}"
28
-
29
- puts "\nPre2 text: #{pre2.text.inspect}"
30
- puts "Pre2 text length: #{pre2.text.length}"
31
- puts "Pre2 text bytes: #{pre2.text.bytes.inspect}"
32
-
33
- # Now build trees using the adapter
34
- adapter = Canon::TreeDiff::Adapters::HTMLAdapter.new
35
- tree1 = adapter.to_tree(doc1)
36
- tree2 = adapter.to_tree(doc2)
37
-
38
- # Check signatures
39
- # tree1 structure: html -> body -> pre
40
- body1 = tree1.children.first
41
- pre1_node = body1.children.find { |c| c.label == "pre" }
42
- body2 = tree2.children.first
43
- pre2_node = body2.children.find { |c| c.label == "pre" }
44
-
45
- puts "\n=== Signatures ==="
46
- if pre1_node && pre2_node
47
- sig1 = Canon::TreeDiff::Core::NodeSignature.for(pre1_node)
48
- sig2 = Canon::TreeDiff::Core::NodeSignature.for(pre2_node)
49
- puts "Pre1 label: #{pre1_node.label}, value: #{pre1_node.value.inspect}"
50
- puts "Pre1 signature: #{sig1}"
51
- puts "Pre2 label: #{pre2_node.label}, value: #{pre2_node.value.inspect}"
52
- puts "Pre2 signature: #{sig2}"
53
- puts "Signatures equal: #{sig1 == sig2}"
54
- else
55
- puts "ERROR: Could not find <pre> nodes"
56
- puts "Body1 children: #{body1.children.map(&:label)}"
57
- puts "Body2 children: #{body2.children.map(&:label)}"
58
- end
59
-
60
- def print_tree(node, indent = 0)
61
- prefix = " " * indent
62
- puts "#{prefix}<#{node.label}>"
63
- puts "#{prefix} value: #{node.value.inspect}" if node.value
64
- puts "#{prefix} attrs: #{node.attributes}" unless node.attributes.empty?
65
- node.children.each { |child| print_tree(child, indent + 1) }
66
- end
67
-
68
- puts "\n=== Tree 1 ==="
69
- print_tree(tree1)
70
-
71
- puts "\n=== Tree 2 ==="
72
- print_tree(tree2)
73
-
74
- # Now test comparison
75
- puts "\n=== Comparison ==="
76
-
77
- # Test using TreeDiff directly
78
- require_relative "../lib/canon/tree_diff/tree_diff_integrator"
79
- integrator = Canon::TreeDiff::TreeDiffIntegrator.new(
80
- format: :html,
81
- options: {},
82
- )
83
-
84
- puts "\n=== Direct TreeDiff Test ==="
85
- diff_result = integrator.diff(doc1, doc2)
86
- puts "Operations count: #{diff_result[:operations].size}"
87
- diff_result[:operations].each_with_index do |op, idx|
88
- puts "\nOperation #{idx + 1}:"
89
- puts " Type: #{op.type}"
90
- puts " Node: #{begin
91
- op[:node]&.label
92
- rescue StandardError
93
- 'N/A'
94
- end}"
95
- if op[:node]
96
- puts " Value: #{begin
97
- op[:node]&.value.inspect
98
- rescue StandardError
99
- 'N/A'
100
- end}"
101
- end
102
- end
103
-
104
- # Convert operations to DiffNodes
105
- puts "\n=== Operation Conversion ==="
106
- converter = Canon::TreeDiff::OperationConverter.new(
107
- format: :html,
108
- match_options: {},
109
- )
110
- diff_nodes = converter.convert(diff_result[:operations])
111
- puts "Converted DiffNodes count: #{diff_nodes.size}"
112
- diff_nodes.each_with_index do |dn, idx|
113
- puts "\nDiffNode #{idx + 1}:"
114
- puts " Dimension: #{dn.dimension}"
115
- puts " Normative: #{dn.normative?}"
116
- puts " Reason: #{dn.reason}"
117
- puts " Node1: #{begin
118
- dn.node1.inspect
119
- rescue StandardError
120
- 'nil'
121
- end}"
122
- puts " Node2: #{begin
123
- dn.node2.inspect
124
- rescue StandardError
125
- 'nil'
126
- end}"
127
- end
128
-
129
- # Now test via Canon::Comparison
130
- puts "\n=== Canon::Comparison Result (with :semantic) ==="
131
- result = Canon::Comparison.equivalent?(
132
- expected,
133
- actual,
134
- format: :html,
135
- diff_algorithm: :semantic,
136
- verbose: true,
137
- )
138
-
139
- puts "Result class: #{result.class}"
140
- puts "Equivalent: #{result.equivalent?}"
141
- puts "Differences count: #{result.differences.size}"
142
- puts "Has normative diffs: #{result.has_normative_diffs?}"
143
-
144
- puts "\n=== Trying with :semantic_tree ==="
145
- result2 = Canon::Comparison.equivalent?(
146
- expected,
147
- actual,
148
- format: :html,
149
- diff_algorithm: :semantic_tree,
150
- verbose: true,
151
- )
152
-
153
- puts "Result class: #{result2.class}"
154
- if result2.is_a?(Canon::Comparison::ComparisonResult)
155
- puts "Equivalent: #{result2.equivalent?}"
156
- puts "Differences count: #{result2.differences.size}"
157
- else
158
- puts "Result: #{result2.inspect}"
159
- end
160
-
161
- result.differences.each_with_index do |diff, idx|
162
- puts "\n--- Diff #{idx + 1} ---"
163
- puts "Dimension: #{diff.dimension}"
164
- puts "Normative: #{diff.normative?}"
165
- puts "Reason: #{diff.reason}"
166
- if diff.node1
167
- puts "Node1 type: #{diff.node1.class}"
168
- puts "Node1 name: #{begin
169
- diff.node1.name
170
- rescue StandardError
171
- 'N/A'
172
- end}"
173
- puts "Node1 text: #{begin
174
- diff.node1.text.inspect
175
- rescue StandardError
176
- 'N/A'
177
- end}"
178
- end
179
- if diff.node2
180
- puts "Node2 type: #{diff.node2.class}"
181
- puts "Node2 name: #{begin
182
- diff.node2.name
183
- rescue StandardError
184
- 'N/A'
185
- end}"
186
- puts "Node2 text: #{begin
187
- diff.node2.text.inspect
188
- rescue StandardError
189
- 'N/A'
190
- end}"
191
- end
192
- end
@@ -1,138 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- # Script to extract and analyze the 16 false positive test cases
5
- # Usage: ruby scripts/extract_false_positives.rb
6
-
7
- require "bundler/setup"
8
- require "canon"
9
-
10
- # Map of test files to line numbers
11
- FALSE_POSITIVES = {
12
- "blocks_spec.rb" => [352],
13
- "footnotes_spec.rb" => [740],
14
- "inline_spec.rb" => [1012, 1251],
15
- "postproc_spec.rb" => [948],
16
- "postproc_word_spec.rb" => [372, 576],
17
- "presentation_xml_numbers_override_spec.rb" => [2095],
18
- "presentation_xml_spec.rb" => [1288, 1500],
19
- "ref_spec.rb" => [906],
20
- "sourcecode_spec.rb" => [124, 610],
21
- "terms_spec.rb" => [1445],
22
- "xref_format_spec.rb" => [628],
23
- "xref_spec.rb" => [315],
24
- }.freeze
25
-
26
- ISODOC_SPEC_DIR = File.expand_path("../../mn/isodoc/spec/isodoc", __dir__)
27
-
28
- def extract_test_context(file_path, line_number)
29
- return nil unless File.exist?(file_path)
30
-
31
- lines = File.readlines(file_path)
32
-
33
- # Find the start of the test block (looking backward for 'it "')
34
- start_line = line_number - 1
35
- while start_line.positive?
36
- break if /^\s*it\s+["']/.match?(lines[start_line])
37
-
38
- start_line -= 1
39
- end
40
-
41
- # Find the end of the test block (looking forward for matching 'end')
42
- end_line = line_number - 1
43
- depth = 0
44
- while end_line < lines.length
45
- line = lines[end_line]
46
- depth += 1 if /\b(do|begin)\b/.match?(line)
47
- depth -= 1 if /\bend\b/.match?(line)
48
- break if depth <= 0 && end_line > start_line
49
-
50
- end_line += 1
51
- end
52
-
53
- # Extract test description
54
- test_desc = lines[start_line].match(/it\s+["'](.+?)["']/)&.captures&.first || "Unknown test"
55
-
56
- {
57
- file: File.basename(file_path),
58
- line: line_number,
59
- description: test_desc,
60
- content: lines[start_line..end_line].join,
61
- }
62
- end
63
-
64
- def analyze_test_for_patterns(test_info)
65
- content = test_info[:content]
66
-
67
- patterns = []
68
-
69
- # Check for various patterns
70
- patterns << "whitespace_in_pre" if /<pre[^>]*>.*?<\/pre>/m.match?(content)
71
- patterns << "sourcecode_element" if /sourcecode/i.match?(content)
72
- patterns << "metadata_elements" if /<(bookmark|span|meta|a name=)/.match?(content)
73
- patterns << "mixed_content" if /<[^>]+>[^<]*<[^>]+>/.match?(content)
74
- patterns << "attribute_order" if /\s+\w+=["'][^"']*["']\s+\w+=["'][^"']*["']/.match?(content)
75
- patterns << "nested_formatting" if /<(strong|em|i|b|u)[^>]*>.*?<(strong|em|i|b|u)/m.match?(content)
76
-
77
- patterns
78
- end
79
-
80
- puts "=" * 80
81
- puts "EXTRACTING FALSE POSITIVE TEST CASES"
82
- puts "=" * 80
83
-
84
- all_tests = []
85
- pattern_summary = Hash.new(0)
86
-
87
- FALSE_POSITIVES.each do |file, line_numbers|
88
- file_path = File.join(ISODOC_SPEC_DIR, file)
89
-
90
- puts "\n#{file}:"
91
-
92
- line_numbers.each do |line|
93
- test_info = extract_test_context(file_path, line)
94
-
95
- if test_info
96
- patterns = analyze_test_for_patterns(test_info)
97
- test_info[:patterns] = patterns
98
- all_tests << test_info
99
-
100
- patterns.each { |p| pattern_summary[p] += 1 }
101
-
102
- puts " Line #{line}: #{test_info[:description]}"
103
- puts " Patterns: #{patterns.join(', ')}" unless patterns.empty?
104
- else
105
- puts " Line #{line}: ⚠️ Could not extract test"
106
- end
107
- end
108
- end
109
-
110
- puts "\n#{'=' * 80}"
111
- puts "PATTERN SUMMARY"
112
- puts "=" * 80
113
-
114
- pattern_summary.sort_by { |_, count| -count }.each do |pattern, count|
115
- puts " #{pattern}: #{count} occurrences"
116
- end
117
-
118
- puts "\n#{'=' * 80}"
119
- puts "DETAILED TEST EXTRACTION"
120
- puts "=" * 80
121
-
122
- # Save detailed output
123
- output_file = "false_positive_analysis.txt"
124
- File.open(output_file, "w") do |f|
125
- all_tests.each_with_index do |test, i|
126
- f.puts "\n#{'=' * 80}"
127
- f.puts "TEST #{i + 1}: #{test[:file]}:#{test[:line]}"
128
- f.puts "=" * 80
129
- f.puts "Description: #{test[:description]}"
130
- f.puts "Patterns: #{test[:patterns].join(', ')}"
131
- f.puts "\nTest Code:"
132
- f.puts "-" * 80
133
- f.puts test[:content]
134
- end
135
- end
136
-
137
- puts "\nDetailed analysis saved to: #{output_file}"
138
- puts "\nTotal false positives analyzed: #{all_tests.length}"
@@ -1,125 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- # Find the actual false positives by comparing DOM vs semantic failures
5
-
6
- require "json"
7
- require "set"
8
-
9
- ISODOC_DIR = File.expand_path("../../../mn/isodoc", __dir__)
10
-
11
- def run_tests(algorithm)
12
- puts "Running with CANON_ALGORITHM=#{algorithm}..."
13
- output_file = "/tmp/rspec_#{algorithm}_#{Process.pid}.json"
14
- cmd = "cd #{ISODOC_DIR} && CANON_ALGORITHM=#{algorithm} bundle exec rspec --format json --out #{output_file} 2>&1 >/dev/null"
15
- system(cmd)
16
-
17
- if File.exist?(output_file)
18
- content = File.read(output_file)
19
- File.delete(output_file)
20
- begin
21
- JSON.parse(content)
22
- rescue JSON::ParserError => e
23
- puts "Failed to parse JSON for #{algorithm}: #{e.message}"
24
- puts "First 200 chars: #{content[0..200]}"
25
- nil
26
- end
27
- else
28
- puts "Output file not created for #{algorithm}"
29
- nil
30
- end
31
- end
32
-
33
- def extract_failures(results)
34
- return [] unless results && results["examples"]
35
-
36
- results["examples"].select { |ex| ex["status"] == "failed" }.map do |ex|
37
- # Extract file and line from id
38
- # Format: "./spec/isodoc/blocks_spec.rb[1:1:1]"
39
- if ex["id"] =~ %r{\./(spec/isodoc/[^\[]+)\[}
40
- file = $1
41
- line = ex["line_number"]
42
- "#{file}:#{line}"
43
- end
44
- end.compact
45
- end
46
-
47
- puts "=" * 80
48
- puts "FINDING ACTUAL FALSE POSITIVES"
49
- puts "=" * 80
50
- puts
51
-
52
- # Run with both algorithms
53
- dom_results = run_tests("dom")
54
- semantic_results = run_tests("semantic")
55
-
56
- if dom_results.nil? || semantic_results.nil?
57
- puts "ERROR: Failed to get test results"
58
- exit 1
59
- end
60
-
61
- # Extract failure lists
62
- dom_failures = Set.new(extract_failures(dom_results))
63
- semantic_failures = Set.new(extract_failures(semantic_results))
64
-
65
- puts "DOM failures: #{dom_failures.size}"
66
- puts "Semantic failures: #{semantic_failures.size}"
67
- puts
68
-
69
- # Find false positives (pass with DOM, fail with semantic)
70
- false_positives = semantic_failures - dom_failures
71
-
72
- puts "=" * 80
73
- puts "FALSE POSITIVES (#{false_positives.size})"
74
- puts "Tests that PASS with DOM but FAIL with semantic:"
75
- puts "=" * 80
76
-
77
- if false_positives.empty?
78
- puts "✅ NO FALSE POSITIVES FOUND!"
79
- puts "DOM and semantic algorithms have perfect parity!"
80
- else
81
- false_positives.sort.each_with_index do |test, idx|
82
- puts "#{idx + 1}. #{test}"
83
- end
84
- end
85
-
86
- # Find false negatives (fail with DOM, pass with semantic)
87
- false_negatives = dom_failures - semantic_failures
88
-
89
- puts
90
- puts "=" * 80
91
- puts "FALSE NEGATIVES (#{false_negatives.size})"
92
- puts "Tests that FAIL with DOM but PASS with semantic:"
93
- puts "=" * 80
94
-
95
- if false_negatives.empty?
96
- puts "✅ NO FALSE NEGATIVES FOUND!"
97
- else
98
- false_negatives.sort.each_with_index do |test, idx|
99
- puts "#{idx + 1}. #{test}"
100
- end
101
- end
102
-
103
- # Common failures
104
- common_failures = dom_failures & semantic_failures
105
-
106
- puts
107
- puts "=" * 80
108
- puts "COMMON FAILURES (#{common_failures.size})"
109
- puts "Tests that FAIL with BOTH algorithms:"
110
- puts "=" * 80
111
- puts "#{common_failures.size} tests fail with both algorithms"
112
-
113
- # Summary
114
- puts
115
- puts "=" * 80
116
- puts "SUMMARY"
117
- puts "=" * 80
118
- puts "Total tests: #{dom_results['examples'].size}"
119
- puts "DOM failures: #{dom_failures.size}"
120
- puts "Semantic failures: #{semantic_failures.size}"
121
- puts "Common failures: #{common_failures.size}"
122
- puts "False positives: #{false_positives.size} (semantic fails, DOM passes)"
123
- puts "False negatives: #{false_negatives.size} (DOM fails, semantic passes)"
124
- puts "Gap: #{(semantic_failures.size - dom_failures.size).abs}"
125
- puts "=" * 80
@@ -1,161 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- # Systematically investigate all false positive failures
5
- # to identify patterns in why semantic fails but DOM passes
6
-
7
- require "fileutils"
8
-
9
- FALSE_POSITIVES = [
10
- "blocks_spec.rb:352",
11
- "footnotes_spec.rb:740",
12
- "inline_spec.rb:1012",
13
- "inline_spec.rb:1251",
14
- "postproc_spec.rb:948",
15
- "postproc_word_spec.rb:372",
16
- "postproc_word_spec.rb:576",
17
- "presentation_xml_numbers_override_spec.rb:2095",
18
- "presentation_xml_spec.rb:1288",
19
- "presentation_xml_spec.rb:1500",
20
- "ref_spec.rb:906",
21
- "sourcecode_spec.rb:124",
22
- "sourcecode_spec.rb:610",
23
- "terms_spec.rb:1445",
24
- "xref_format_spec.rb:628",
25
- "xref_spec.rb:315",
26
- ].freeze
27
-
28
- ISODOC_PATH = "/Users/mulgogi/src/mn/isodoc"
29
-
30
- def run_test(spec_file, line, algorithm)
31
- cmd = "cd #{ISODOC_PATH} && CANON_ALGORITHM=#{algorithm} bundle exec rspec ./spec/isodoc/#{spec_file}:#{line} 2>&1"
32
- output = `#{cmd}`
33
- {
34
- passed: $?.success?,
35
- output: output,
36
- }
37
- end
38
-
39
- def extract_diff_type(output)
40
- # Look for dimension in diff report
41
- if output =~ /Dimension:[^\n]*\n[^\n]*Location:[^\n]*([^\n]+)/
42
- location = $1.strip
43
- end
44
-
45
- if output =~ /Dimension:\s*([^\n]+)/
46
- dimension = $1.strip
47
- end
48
-
49
- # Look for changes description
50
- changes = []
51
- output.scan(/✨ Changes:\s*([^\n]+)/) do |match|
52
- changes << match[0].strip
53
- end
54
-
55
- {
56
- dimension: dimension,
57
- location: location,
58
- changes: changes,
59
- }
60
- end
61
-
62
- def analyze_false_positive(fp)
63
- file, line = fp.split(":")
64
- puts "\n#{'=' * 80}"
65
- puts "Analyzing: #{fp}"
66
- puts "=" * 80
67
-
68
- # Run with both algorithms
69
- puts "\nRunning with DOM algorithm..."
70
- dom_result = run_test(file, line, "dom")
71
-
72
- puts "Running with SEMANTIC algorithm..."
73
- semantic_result = run_test(file, line, "semantic")
74
-
75
- # Verify it's actually a false positive
76
- unless dom_result[:passed] && !semantic_result[:passed]
77
- puts "⚠️ WARNING: Not a false positive!"
78
- puts " DOM passed: #{dom_result[:passed]}"
79
- puts " Semantic passed: #{semantic_result[:passed]}"
80
- return nil
81
- end
82
-
83
- puts "✓ Confirmed false positive (DOM passes, Semantic fails)"
84
-
85
- # Extract diff details from semantic output
86
- diff_info = extract_diff_type(semantic_result[:output])
87
-
88
- puts "\nDiff Details:"
89
- puts " Dimension: #{diff_info[:dimension] || 'unknown'}"
90
- puts " Location: #{diff_info[:location] || 'unknown'}"
91
- puts " Changes: #{diff_info[:changes].join(', ')}" unless diff_info[:changes].empty?
92
-
93
- # Save full output for detailed analysis
94
- output_dir = "/tmp/false_positives"
95
- FileUtils.mkdir_p(output_dir)
96
-
97
- File.write("#{output_dir}/#{file.gsub('.rb', '')}_#{line}.txt",
98
- semantic_result[:output])
99
- puts "\nFull output saved to: #{output_dir}/#{file.gsub('.rb',
100
- '')}_#{line}.txt"
101
-
102
- diff_info.merge(spec: fp, file: file, line: line)
103
- end
104
-
105
- def main
106
- puts "Investigating #{FALSE_POSITIVES.size} false positives..."
107
- puts "This will take several minutes..."
108
-
109
- results = []
110
-
111
- FALSE_POSITIVES.each_with_index do |fp, idx|
112
- puts "\n[#{idx + 1}/#{FALSE_POSITIVES.size}]"
113
- result = analyze_false_positive(fp)
114
- results << result if result
115
- sleep 0.5 # Brief pause between tests
116
- end
117
-
118
- # Summarize patterns
119
- puts "\n#{'=' * 80}"
120
- puts "PATTERN ANALYSIS"
121
- puts "=" * 80
122
-
123
- puts "\nBy Dimension:"
124
- dimension_groups = results.compact.group_by { |r| r[:dimension] }
125
- dimension_groups.each do |dim, group|
126
- puts " #{dim}: #{group.size} cases"
127
- group.each { |r| puts " - #{r[:spec]}" }
128
- end
129
-
130
- puts "\nBy Changes:"
131
- changes_groups = results.compact.group_by { |r| r[:changes].join(", ") }
132
- changes_groups.each do |change, group|
133
- puts " #{change}: #{group.size} cases"
134
- group.each { |r| puts " - #{r[:spec]}" }
135
- end
136
-
137
- # Save summary
138
- summary_file = "/tmp/false_positive_patterns.txt"
139
- File.open(summary_file, "w") do |f|
140
- f.puts "FALSE POSITIVE PATTERN ANALYSIS"
141
- f.puts "=" * 80
142
- f.puts "\nTotal: #{results.compact.size} false positives analyzed"
143
-
144
- f.puts "\n\nBy Dimension:"
145
- dimension_groups.each do |dim, group|
146
- f.puts " #{dim}: #{group.size}"
147
- group.each { |r| f.puts " #{r[:spec]}" }
148
- end
149
-
150
- f.puts "\n\nBy Changes:"
151
- changes_groups.each do |change, group|
152
- f.puts " #{change}: #{group.size}"
153
- group.each { |r| f.puts " #{r[:spec]}" }
154
- end
155
- end
156
-
157
- puts "\nSummary saved to: #{summary_file}"
158
- puts "\nDetailed outputs in: /tmp/false_positives/"
159
- end
160
-
161
- main if __FILE__ == $PROGRAM_NAME