RubyGems - canon - Versions diffs - 0.1.8 → 0.1.10 - Mend

canon 0.1.8 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (101)

checksums.yaml +4 -4
data/.rubocop_todo.yml +83 -22
data/docs/Gemfile +1 -0
data/docs/_config.yml +90 -1
data/docs/advanced/diff-classification.adoc +196 -24
data/docs/features/match-options/index.adoc +239 -1
data/lib/canon/comparison/format_detector.rb +2 -1
data/lib/canon/comparison/html_comparator.rb +19 -8
data/lib/canon/comparison/html_compare_profile.rb +8 -2
data/lib/canon/comparison/markup_comparator.rb +109 -2
data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +108 -0
data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
data/lib/canon/comparison/xml_comparator.rb +240 -23
data/lib/canon/comparison/xml_node_comparison.rb +25 -3
data/lib/canon/diff/diff_classifier.rb +119 -5
data/lib/canon/diff/formatting_detector.rb +1 -1
data/lib/canon/diff/xml_serialization_formatter.rb +153 -0
data/lib/canon/rspec_matchers.rb +37 -8
data/lib/canon/version.rb +1 -1
data/lib/canon/xml/data_model.rb +24 -13
metadata +4 -78
data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
data/false_positive_analysis.txt +0 -0
data/file1.html +0 -1
data/file2.html +0 -1
data/old-docs/ADVANCED_TOPICS.adoc +0 -20
data/old-docs/BASIC_USAGE.adoc +0 -16
data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
data/old-docs/CLI.adoc +0 -497
data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
data/old-docs/DIFF_FORMATTING.adoc +0 -540
data/old-docs/DIFF_PARAMETERS.adoc +0 -261
data/old-docs/DOM_DIFF.adoc +0 -1017
data/old-docs/ENV_CONFIG.adoc +0 -876
data/old-docs/FORMATS.adoc +0 -867
data/old-docs/INPUT_VALIDATION.adoc +0 -477
data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
data/old-docs/MATCH_OPTIONS.adoc +0 -912
data/old-docs/MODES.adoc +0 -432
data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
data/old-docs/OPTIONS.adoc +0 -1387
data/old-docs/PREPROCESSING.adoc +0 -491
data/old-docs/README.old.adoc +0 -2831
data/old-docs/RSPEC.adoc +0 -814
data/old-docs/RUBY_API.adoc +0 -485
data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
data/old-docs/STRING_COMPARE.adoc +0 -345
data/old-docs/TMP.adoc +0 -3384
data/old-docs/TREE_DIFF.adoc +0 -1080
data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
data/old-docs/VERBOSE.adoc +0 -482
data/old-docs/VISUALIZATION_MAP.adoc +0 -625
data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
data/scripts/analyze_current_state.rb +0 -85
data/scripts/analyze_false_positives.rb +0 -114
data/scripts/analyze_remaining_failures.rb +0 -105
data/scripts/compare_current_failures.rb +0 -95
data/scripts/compare_dom_tree_diff.rb +0 -158
data/scripts/compare_failures.rb +0 -151
data/scripts/debug_attribute_extraction.rb +0 -66
data/scripts/debug_blocks_839.rb +0 -115
data/scripts/debug_meta_matching.rb +0 -52
data/scripts/debug_p_matching.rb +0 -192
data/scripts/debug_signature_matching.rb +0 -118
data/scripts/debug_sourcecode_124.rb +0 -32
data/scripts/debug_whitespace_sensitive.rb +0 -192
data/scripts/extract_false_positives.rb +0 -138
data/scripts/find_actual_false_positives.rb +0 -125
data/scripts/investigate_all_false_positives.rb +0 -161
data/scripts/investigate_batch1.rb +0 -127
data/scripts/investigate_classification.rb +0 -150
data/scripts/investigate_classification_detailed.rb +0 -190
data/scripts/investigate_common_failures.rb +0 -342
data/scripts/investigate_false_negative.rb +0 -80
data/scripts/investigate_false_positive.rb +0 -83
data/scripts/investigate_false_positives.rb +0 -227
data/scripts/investigate_false_positives_batch.rb +0 -163
data/scripts/investigate_mixed_content.rb +0 -125
data/scripts/investigate_remaining_16.rb +0 -214
data/scripts/run_single_test.rb +0 -29
data/scripts/test_all_false_positives.rb +0 -95
data/scripts/test_attribute_details.rb +0 -61
data/scripts/test_both_algorithms.rb +0 -49
data/scripts/test_both_simple.rb +0 -49
data/scripts/test_enhanced_semantic_output.rb +0 -125
data/scripts/test_readme_examples.rb +0 -131
data/scripts/test_semantic_tree_diff.rb +0 -99
data/scripts/test_semantic_ux_improvements.rb +0 -135
data/scripts/test_single_false_positive.rb +0 -119
data/scripts/test_size_limits.rb +0 -99
data/test_html_1.html +0 -21
data/test_html_2.html +0 -21
data/test_nokogiri.rb +0 -33
data/test_normalize.rb +0 -45

data/scripts/compare_failures.rb DELETED

@@ -1,151 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-# Compare semantic and DOM algorithm failures to identify false positives/negatives
-# Usage: ruby scripts/compare_failures.rb /tmp/semantic_current.txt DOM_DIFF_RESULTS.md
-require "set"
-def parse_semantic_failures(file)
-  failures = []
-  File.readlines(file).each do |line|
-    # Format: "rspec ./spec/isodoc/blocks_notes_spec.rb:494 # ..."
-    if line =~ %r{rspec \./spec/isodoc/([a-z_]+_spec\.rb):(\d+)}
-      failures << { file: $1, line: $2.to_i }
-    end
-  end
-  failures
-end
-def parse_dom_failures(file)
-  failures = []
-  in_failures = false
-  File.readlines(file).each do |line|
-    if line.include?("Failed examples:")
-      in_failures = true
-      next
-    end
-    next unless in_failures
-    # Stop at Coverage report
-    break if line.include?("Coverage report")
-    # Format: "rspec ./spec/isodoc/blocks_notes_spec.rb:494"
-    if line =~ %r{rspec \./spec/isodoc/([a-z_]+_spec\.rb):(\d+)}
-      failures << { file: $1, line: $2.to_i }
-    end
-  end
-  failures
-end
-def categorize_failures(semantic, dom)
-  semantic_set = Set.new(semantic.map { |f| "#{f[:file]}:#{f[:line]}" })
-  dom_set = Set.new(dom.map { |f| "#{f[:file]}:#{f[:line]}" })
-  {
-    false_positives: semantic_set - dom_set, # Semantic fails, DOM passes
-    false_negatives: dom_set - semantic_set, # DOM fails, Semantic passes
-    common: semantic_set & dom_set, # Both fail (real failures)
-  }
-end
-def group_by_spec(failures)
-  failures.group_by { |f| f.split(":").first }.transform_values(&:count)
-end
-def main
-  semantic_file = ARGV[0] || "/tmp/semantic_current.txt"
-  dom_file = ARGV[1] || "DOM_DIFF_RESULTS.md"
-  puts "Parsing semantic failures from: #{semantic_file}"
-  semantic = parse_semantic_failures(semantic_file)
-  puts "Parsing DOM failures from: #{dom_file}"
-  dom = parse_dom_failures(dom_file)
-  puts "\n#{'=' * 80}"
-  puts "FAILURE COMPARISON SUMMARY"
-  puts "=" * 80
-  puts "\nTotal failures:"
-  puts "  Semantic: #{semantic.size}"
-  puts "  DOM:      #{dom.size}"
-  categories = categorize_failures(semantic, dom)
-  puts "\n#{'-' * 80}"
-  puts "FALSE POSITIVES (Semantic fails, DOM passes) - #{categories[:false_positives].size}"
-  puts "-" * 80
-  puts "\nBy spec file:"
-  group_by_spec(categories[:false_positives].to_a).sort_by do |_, v|
-    -v
-  end.each do |file, count|
-    puts "  #{file}: #{count}"
-  end
-  puts "\nDetailed list:"
-  categories[:false_positives].sort.each do |failure|
-    puts "  #{failure}"
-  end
-  puts "\n#{'-' * 80}"
-  puts "FALSE NEGATIVES (DOM fails, Semantic passes) - #{categories[:false_negatives].size}"
-  puts "-" * 80
-  puts "\nBy spec file:"
-  group_by_spec(categories[:false_negatives].to_a).sort_by do |_, v|
-    -v
-  end.each do |file, count|
-    puts "  #{file}: #{count}"
-  end
-  puts "\nDetailed list:"
-  categories[:false_negatives].sort.each do |failure|
-    puts "  #{failure}"
-  end
-  puts "\n#{'-' * 80}"
-  puts "COMMON FAILURES (Both algorithms fail) - #{categories[:common].size}"
-  puts "-" * 80
-  puts "\nBy spec file:"
-  group_by_spec(categories[:common].to_a).sort_by do |_, v|
-    -v
-  end.each do |file, count|
-    puts "  #{file}: #{count}"
-  end
-  puts "\n#{'=' * 80}"
-  puts "NEXT STEPS"
-  puts "=" * 80
-  puts "\n1. Fix false positives (#{categories[:false_positives].size} tests):"
-  puts "   - These are cases where semantic is too strict"
-  puts "   - DOM passes but semantic fails"
-  puts "   - Fix these to reduce semantic failures"
-  puts "\n2. Fix false negatives (#{categories[:false_negatives].size} tests):"
-  puts "   - These are cases where semantic is too lenient"
-  puts "   - Semantic passes but DOM fails"
-  puts "   - Fix these to maintain correctness"
-  puts "\n3. Common failures (#{categories[:common].size} tests):"
-  puts "   - These are real test failures in both algorithms"
-  puts "   - Will remain after parity is achieved"
-  puts "   - May indicate actual test/code issues"
-  # Save detailed results
-  output_file = "/tmp/failure_comparison.txt"
-  File.open(output_file, "w") do |f|
-    f.puts "FALSE POSITIVES (#{categories[:false_positives].size}):"
-    categories[:false_positives].sort.each { |fp| f.puts fp }
-    f.puts "\nFALSE NEGATIVES (#{categories[:false_negatives].size}):"
-    categories[:false_negatives].sort.each { |fn| f.puts fn }
-    f.puts "\nCOMMON FAILURES (#{categories[:common].size}):"
-    categories[:common].sort.each { |cf| f.puts cf }
-  end
-  puts "\nDetailed results saved to: #{output_file}"
-end
-main if __FILE__ == $PROGRAM_NAME

data/scripts/debug_attribute_extraction.rb DELETED

@@ -1,66 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-require_relative "../lib/canon"
-require_relative "../lib/canon/diff_formatter"
-require_relative "../lib/canon/diff_formatter/diff_detail_formatter"
-# Test attribute values formatting
-html1 = '<table id="T1" class="MsoNormalTable" border="1"></table>'
-html2 = '<table id="T2" class="MsoNormalTable" border="2"></table>'
-result = Canon::Comparison.equivalent?(
-  html1,
-  html2,
-  match_algorithm: :semantic_tree,
-  ignore_attr_order: true,
-  verbose: true,
-)
-puts "Number of differences: #{result.differences.length}"
-puts
-result.differences.each_with_index do |diff, i|
-  puts "=" * 70
-  puts "Difference ##{i + 1}"
-  puts "=" * 70
-  puts "Class: #{diff.class}"
-  puts "Dimension: #{diff.dimension if diff.respond_to?(:dimension)}"
-  if diff.respond_to?(:node1) && diff.respond_to?(:node2)
-    node1 = diff.node1
-    node2 = diff.node2
-    puts "\nNode1:"
-    puts "  Class: #{node1.class}"
-    puts "  Name: #{node1.name if node1.respond_to?(:name)}"
-    if node1.respond_to?(:attributes)
-      puts "  Attributes: #{node1.attributes.inspect}"
-      puts "  Attributes class: #{node1.attributes.class}"
-      puts "  Attributes keys: #{node1.attributes.keys.inspect}"
-      node1.attributes.each do |key, val|
-        puts "    #{key.inspect} (#{key.class}) => #{val.inspect} (#{val.class})"
-        if val.respond_to?(:value)
-          puts "      val.value = #{val.value.inspect}"
-        end
-      end
-    end
-    puts "\nNode2:"
-    puts "  Class: #{node2.class}"
-    puts "  Name: #{node2.name if node2.respond_to?(:name)}"
-    if node2.respond_to?(:attributes)
-      puts "  Attributes: #{node2.attributes.inspect}"
-      puts "  Attributes class: #{node2.attributes.class}"
-      puts "  Attributes keys: #{node2.attributes.keys.inspect}"
-      node2.attributes.each do |key, val|
-        puts "    #{key.inspect} (#{key.class}) => #{val.inspect} (#{val.class})"
-        if val.respond_to?(:value)
-          puts "      val.value = #{val.value.inspect}"
-        end
-      end
-    end
-  end
-  puts
-end

data/scripts/debug_blocks_839.rb DELETED

@@ -1,115 +0,0 @@
-#!/usr/bin/env ruby
-# Debug script for blocks_spec.rb:839
-require "bundler/setup"
-require "nokogiri"
-HTML_HDR = <<~HEADER.freeze
-  <html lang="en">
-    <head/>
-    <body lang="en">
-      <div class="title-section">
-        <p>\u00a0</p>
-      </div>
-      <br/>
-      <div class="prefatory-section">
-        <p>\u00a0</p>
-      </div>
-      <br/>
-      <div class="main-section">
-         <br/>
-            <div class="TOC" id="_">
-        <h1 class="IntroTitle">Table of contents</h1>
-      </div>
-HEADER
-WORD_HDR = <<~HEADER.freeze
-       <html xmlns:epub="http://www.idpf.org/2007/ops" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" lang="en">
-           <head>
-    <style>
-      <!--
-      -->
-    </style>
-  </head>
-         <body lang="EN-US" link="blue" vlink="#954F72">
-           <div class="WordSection1">
-             <p>\u00a0</p>
-           </div>
-HEADER
-html5_doc = <<~HTML
-  #{HTML_HDR}
-              <br/>
-              <div id="_">
-                <h1 class="ForewordTitle">Foreword</h1>
-                <p id="_" style="text-align:left;">Test</p>
-              </div>
-            </div>
-          </body>
-      </html>
-HTML
-html4_doc = <<~HTML
-  #{WORD_HDR}
-  <p class="page-break">
-    <br clear="all" style="mso-special-character:line-break;page-break-before:always"/>
-  </p>
-          <div class="TOC" id="_">
-    <h1 class="IntroTitle">Table of contents</h1>
-  </div>
-  <p class="page-break">
-    <br clear="all" style="mso-special-character:line-break;page-break-before:always"/>
-  </p>
-          <div id="_">
-            <h1 class="ForewordTitle">Foreword</h1>
-            <p id="_" align="left" style="text-align:left;">Test</p>
-          </div>
-          <p>\u00a0</p>
-        </div>
-      </body>
-  </html>
-HTML
-puts "=" * 80
-puts "HTML5 PARSING"
-puts "=" * 80
-doc5 = Nokogiri::HTML5(html5_doc)
-head5 = doc5.at("//head")
-puts "HEAD element:"
-puts head5.to_html
-puts "\nCHILDREN:"
-head5.children.each_with_index do |child, i|
-  puts "  #{i}: #{child.name} - #{child.attributes.inspect}"
-end
-puts "\n#{'=' * 80}"
-puts "HTML4 PARSING"
-puts "=" * 80
-doc4 = Nokogiri::HTML4(html4_doc)
-head4 = doc4.at("//head")
-puts "HEAD element:"
-puts head4.to_html
-puts "\nCHILDREN:"
-head4.children.each_with_index do |child, i|
-  puts "  #{i}: #{child.name} - #{child.attributes.inspect}"
-end
-puts "\n#{'=' * 80}"
-puts "COMPARISON"
-puts "=" * 80
-puts "HTML5 head children: #{head5.children.size}"
-puts "HTML4 head children: #{head4.children.size}"
-meta5 = head5.xpath(".//meta")
-meta4 = head4.xpath(".//meta")
-puts "\nMETA elements:"
-puts "HTML5: #{meta5.size} meta elements"
-meta5.each_with_index do |m, i|
-  puts "  #{i}: #{m.to_html}"
-end
-puts "HTML4: #{meta4.size} meta elements"
-meta4.each_with_index do |m, i|
-  puts "  #{i}: #{m.to_html}"
-end

data/scripts/debug_meta_matching.rb DELETED

@@ -1,52 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-require "bundler/setup"
-require "canon"
-# Test case: Meta element with attributes should match
-expected = <<~HTML
-  <html>
-    <head>
-      <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
-    </head>
-    <body>
-      <p>Test</p>
-    </body>
-  </html>
-HTML
-actual = <<~HTML
-  <html>
-    <head>
-      <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
-    </head>
-    <body>
-      <p>Test</p>
-    </body>
-  </html>
-HTML
-puts "=" * 80
-puts "Testing Meta Element Matching"
-puts "=" * 80
-result = Canon::Comparison.equivalent?(expected, actual,
-                                       format: :html4,
-                                       diff_algorithm: :semantic,
-                                       verbose: true)
-if result.is_a?(Canon::Comparison::ComparisonResult)
-  puts "\nResult: #{result.equivalent? ? 'PASS ✅' : 'FAIL ❌'}"
-  puts "Normative diffs: #{result.normative_differences.count}"
-  puts "Total diffs: #{result.differences.count}"
-  unless result.equivalent?
-    puts "\nDifferences:"
-    result.differences.each_with_index do |diff, i|
-      puts "\n  #{i + 1}. #{diff.inspect}"
-    end
-  end
-else
-  puts "Result: #{result}"
-end

data/scripts/debug_p_matching.rb DELETED

@@ -1,192 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-require "bundler/setup"
-require "canon"
-require "nokogiri"
-# Read the test files
-expected_file = "/Users/mulgogi/src/mn/isodoc/spec/fixtures/html/isodoc-section-names-expected.html"
-actual_file = "/Users/mulgogi/src/mn/isodoc/spec/fixtures/html/isodoc-section-names-actual.html"
-expected = File.read(expected_file)
-actual = File.read(actual_file)
-puts "=" * 80
-puts "ANALYZING <p> ELEMENT MATCHING"
-puts "=" * 80
-# Parse with Nokogiri to see what we have
-doc1 = Nokogiri::HTML4(expected)
-doc2 = Nokogiri::HTML4(actual)
-# Find all <p> elements
-p_elements1 = doc1.css("p")
-p_elements2 = doc2.css("p")
-puts "\nFile 1 has #{p_elements1.size} <p> elements"
-puts "File 2 has #{p_elements2.size} <p> elements"
-# Group by class attribute
-p_by_class1 = p_elements1.group_by { |p| p["class"] }
-p_by_class2 = p_elements2.group_by { |p| p["class"] }
-puts "\nFile 1 <p> elements by class:"
-p_by_class1.each do |klass, elements|
-  puts "  #{klass.inspect}: #{elements.size} elements"
-  elements.first(3).each do |el|
-    content = el.text.strip
-    content = "#{content[0..50]}..." if content.length > 50
-    puts "    - #{content.inspect}"
-  end
-end
-puts "\nFile 2 <p> elements by class:"
-p_by_class2.each do |klass, elements|
-  puts "  #{klass.inspect}: #{elements.size} elements"
-  elements.first(3).each do |el|
-    content = el.text.strip
-    content = "#{content[0..50]}..." if content.length > 50
-    puts "    - #{content.inspect}"
-  end
-end
-# Now run Canon's tree diff to see what happens
-puts "\n#{'=' * 80}"
-puts "RUNNING CANON TREE DIFF"
-puts "=" * 80
-require_relative "../lib/canon/tree_diff/adapters/html_adapter"
-require_relative "../lib/canon/tree_diff/matchers/hash_matcher"
-require_relative "../lib/canon/tree_diff/matchers/similarity_matcher"
-require_relative "../lib/canon/tree_diff/operations/operation_detector"
-# Create trees
-adapter = Canon::TreeDiff::Adapters::HtmlAdapter.new
-tree1 = adapter.parse(expected)
-tree2 = adapter.parse(actual)
-puts "\nTree 1 has #{tree1.descendants.size} total nodes"
-puts "Tree 2 has #{tree2.descendants.size} total nodes"
-# Find <p> nodes in tree
-p_nodes1 = tree1.descendants.select { |n| n.label == "p" }
-p_nodes2 = tree2.descendants.select { |n| n.label == "p" }
-puts "\nTree 1 has #{p_nodes1.size} <p> nodes"
-puts "Tree 2 has #{p_nodes2.size} <p> nodes"
-# Group by attributes
-p_by_attrs1 = p_nodes1.group_by(&:attributes)
-p_by_attrs2 = p_nodes2.group_by(&:attributes)
-puts "\nTree 1 <p> nodes by attributes:"
-p_by_attrs1.each do |attrs, nodes|
-  puts "  #{attrs.inspect}: #{nodes.size} nodes"
-end
-puts "\nTree 2 <p> nodes by attributes:"
-p_by_attrs2.each do |attrs, nodes|
-  puts "  #{attrs.inspect}: #{nodes.size} nodes"
-end
-# Look at signatures
-require_relative "../lib/canon/tree_diff/core/node_signature"
-puts "\n#{'=' * 80}"
-puts "ANALYZING SIGNATURES"
-puts "=" * 80
-# Get page-break <p> nodes
-page_break_p1 = p_nodes1.select { |n| n.attributes["class"] == "page-break" }
-page_break_p2 = p_nodes2.select { |n| n.attributes["class"] == "page-break" }
-puts "\nFile 1 has #{page_break_p1.size} <p class=\"page-break\"> nodes"
-puts "File 2 has #{page_break_p2.size} <p class=\"page-break\"> nodes"
-if page_break_p1.any?
-  puts "\nFirst 3 signatures from File 1:"
-  page_break_p1.first(3).each_with_index do |node, i|
-    sig = Canon::TreeDiff::Core::NodeSignature.for(node)
-    puts "  #{i + 1}. #{sig}"
-    puts "     Children: #{node.children.size}"
-    if node.children.any?
-      node.children.each do |child|
-        child_sig = Canon::TreeDiff::Core::NodeSignature.for(child)
-        puts "       - #{child.label}: #{child_sig}"
-      end
-    end
-  end
-end
-if page_break_p2.any?
-  puts "\nFirst 3 signatures from File 2:"
-  page_break_p2.first(3).each_with_index do |node, i|
-    sig = Canon::TreeDiff::Core::NodeSignature.for(node)
-    puts "  #{i + 1}. #{sig}"
-    puts "     Children: #{node.children.size}"
-    if node.children.any?
-      node.children.each do |child|
-        child_sig = Canon::TreeDiff::Core::NodeSignature.for(child)
-        puts "       - #{child.label}: #{child_sig}"
-      end
-    end
-  end
-end
-# Run hash matcher
-puts "\n#{'=' * 80}"
-puts "RUNNING HASH MATCHER"
-puts "=" * 80
-options = {
-  attribute_order: :ignore,
-  text_content: :normalize,
-}
-matcher = Canon::TreeDiff::Matchers::HashMatcher.new(tree1, tree2, options)
-matching = matcher.match
-puts "\nTotal matched pairs: #{matching.size}"
-# Check how many <p> nodes were matched
-matched_p1 = p_nodes1.count { |n| matching.matched1?(n) }
-matched_p2 = p_nodes2.count { |n| matching.matched2?(n) }
-puts "Matched <p> from tree1: #{matched_p1}/#{p_nodes1.size}"
-puts "Matched <p> from tree2: #{matched_p2}/#{p_nodes2.size}"
-matched_page_break_p1 = page_break_p1.count { |n| matching.matched1?(n) }
-matched_page_break_p2 = page_break_p2.count { |n| matching.matched2?(n) }
-puts "Matched <p class=\"page-break\"> from tree1: #{matched_page_break_p1}/#{page_break_p1.size}"
-puts "Matched <p class=\"page-break\"> from tree2: #{matched_page_break_p2}/#{page_break_p2.size}"
-# Check unmatched page-break <p> nodes
-unmatched_p1 = page_break_p1.reject { |n| matching.matched1?(n) }
-unmatched_p2 = page_break_p2.reject { |n| matching.matched2?(n) }
-puts "\nUnmatched <p class=\"page-break\"> from tree1: #{unmatched_p1.size}"
-puts "Unmatched <p class=\"page-break\"> from tree2: #{unmatched_p2.size}"
-if unmatched_p1.any?
-  puts "\nFirst unmatched from tree1:"
-  node = unmatched_p1.first
-  puts "  Path: #{node.xpath}"
-  puts "  Signature: #{Canon::TreeDiff::Core::NodeSignature.for(node)}"
-  puts "  Children: #{node.children.size}"
-  node.children.each do |child|
-    puts "    - #{child.label}: value=#{child.value.inspect}, attrs=#{child.attributes.inspect}"
-  end
-end
-if unmatched_p2.any?
-  puts "\nFirst unmatched from tree2:"
-  node = unmatched_p2.first
-  puts "  Path: #{node.xpath}"
-  puts "  Signature: #{Canon::TreeDiff::Core::NodeSignature.for(node)}"
-  puts "  Children: #{node.children.size}"
-  node.children.each do |child|
-    puts "    - #{child.label}: value=#{child.value.inspect}, attrs=#{child.attributes.inspect}"
-  end
-end

data/scripts/debug_signature_matching.rb DELETED

@@ -1,118 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-# Debug signature matching to understand why elements aren't matching
-# Usage: ruby scripts/debug_signature_matching.rb
-require_relative "../lib/canon"
-require "nokogiri"
-# Sample XML with semx elements that should match
-xml1 = <<~XML
-  <p>
-    <fmt-concept>
-      <semx element="title" source="_">word</semx>
-    </fmt-concept>
-  </p>
-XML
-xml2 = <<~XML
-  <p>
-    <fmt-concept>
-      <semx element="concept" source="_">word</semx>
-    </fmt-concept>
-  </p>
-XML
-puts "=" * 80
-puts "SIGNATURE MATCHING DEBUG"
-puts "=" * 80
-# Parse both
-doc1 = Nokogiri::XML(xml1)
-doc2 = Nokogiri::XML(xml2)
-# Create adapter
-adapter = Canon::TreeDiff::Adapters::XMLAdapter.new
-# Convert to tree
-tree1 = adapter.to_tree(doc1.root)
-tree2 = adapter.to_tree(doc2.root)
-puts "\nTree 1 structure:"
-def print_tree(node, indent = 0)
-  prefix = "  " * indent
-  if node.text?
-    puts "#{prefix}#text: #{node.value.inspect}"
-  else
-    attrs = node.attributes.empty? ? "" : " {#{node.attributes.inspect}}"
-    puts "#{prefix}<#{node.label}>#{attrs}"
-    node.children.each { |c| print_tree(c, indent + 1) }
-  end
-end
-print_tree(tree1)
-puts "\nTree 2 structure:"
-print_tree(tree2)
-# Get semx nodes
-semx1 = tree1.descendants.find { |n| n.label == "semx" }
-semx2 = tree2.descendants.find { |n| n.label == "semx" }
-puts "\n#{'-' * 80}"
-puts "SEMX NODE COMPARISON"
-puts "-" * 80
-puts "\nSemx 1:"
-puts "  Label: #{semx1.label}"
-puts "  Value: #{semx1.value.inspect}"
-puts "  Attributes: #{semx1.attributes.inspect}"
-puts "\nSemx 2:"
-puts "  Label: #{semx2.label}"
-puts "  Value: #{semx2.value.inspect}"
-puts "  Attributes: #{semx2.attributes.inspect}"
-# Compute signatures
-sig1_strict = Canon::TreeDiff::Core::NodeSignature.for(semx1,
-                                                       include_attributes: true)
-sig2_strict = Canon::TreeDiff::Core::NodeSignature.for(semx2,
-                                                       include_attributes: true)
-sig1_loose = Canon::TreeDiff::Core::NodeSignature.for(semx1,
-                                                      include_attributes: false)
-sig2_loose = Canon::TreeDiff::Core::NodeSignature.for(semx2,
-                                                      include_attributes: false)
-puts "\n#{'-' * 80}"
-puts "SIGNATURE COMPARISON"
-puts "-" * 80
-puts "\nStrict signatures (with attributes):"
-puts "  Semx 1: #{sig1_strict.signature_string}"
-puts "  Semx 2: #{sig2_strict.signature_string}"
-puts "  Match? #{sig1_strict == sig2_strict}"
-puts "\nLoose signatures (without attributes):"
-puts "  Semx 1: #{sig1_loose.signature_string}"
-puts "  Semx 2: #{sig2_loose.signature_string}"
-puts "  Match? #{sig1_loose == sig2_loose}"
-puts "\n#{'-' * 80}"
-puts "ANALYSIS"
-puts "-" * 80
-if sig1_strict != sig2_strict
-  puts "\n⚠️  ISSUE FOUND:"
-  puts "Strict signatures don't match due to attribute differences!"
-  puts "This prevents HashMatcher from considering these nodes as candidates."
-  puts "\nDifference:"
-  puts "  File 1: element='title'"
-  puts "  File 2: element='concept'"
-  puts "\nSOLUTION:"
-  puts "HashMatcher should use LOOSE signatures (no attributes) to find candidates,"
-  puts "then check attributes separately during matching."
-end
-puts "\n#{'=' * 80}"