RubyGems - canon - Versions diffs - 0.1.8 → 0.1.9 - Mend

canon 0.1.8 → 0.1.9

This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (98)

checksums.yaml +4 -4
data/.rubocop_todo.yml +112 -25
data/docs/Gemfile +1 -0
data/docs/_config.yml +90 -1
data/docs/advanced/diff-classification.adoc +82 -2
data/docs/features/match-options/index.adoc +239 -1
data/lib/canon/comparison/format_detector.rb +2 -1
data/lib/canon/comparison/html_comparator.rb +19 -8
data/lib/canon/comparison/html_compare_profile.rb +8 -2
data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
data/lib/canon/comparison/xml_comparator.rb +48 -23
data/lib/canon/comparison/xml_node_comparison.rb +25 -3
data/lib/canon/diff/diff_classifier.rb +101 -2
data/lib/canon/diff/formatting_detector.rb +1 -1
data/lib/canon/rspec_matchers.rb +37 -8
data/lib/canon/version.rb +1 -1
data/lib/canon/xml/data_model.rb +24 -13
metadata +3 -78
data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
data/false_positive_analysis.txt +0 -0
data/file1.html +0 -1
data/file2.html +0 -1
data/old-docs/ADVANCED_TOPICS.adoc +0 -20
data/old-docs/BASIC_USAGE.adoc +0 -16
data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
data/old-docs/CLI.adoc +0 -497
data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
data/old-docs/DIFF_FORMATTING.adoc +0 -540
data/old-docs/DIFF_PARAMETERS.adoc +0 -261
data/old-docs/DOM_DIFF.adoc +0 -1017
data/old-docs/ENV_CONFIG.adoc +0 -876
data/old-docs/FORMATS.adoc +0 -867
data/old-docs/INPUT_VALIDATION.adoc +0 -477
data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
data/old-docs/MATCH_OPTIONS.adoc +0 -912
data/old-docs/MODES.adoc +0 -432
data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
data/old-docs/OPTIONS.adoc +0 -1387
data/old-docs/PREPROCESSING.adoc +0 -491
data/old-docs/README.old.adoc +0 -2831
data/old-docs/RSPEC.adoc +0 -814
data/old-docs/RUBY_API.adoc +0 -485
data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
data/old-docs/STRING_COMPARE.adoc +0 -345
data/old-docs/TMP.adoc +0 -3384
data/old-docs/TREE_DIFF.adoc +0 -1080
data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
data/old-docs/VERBOSE.adoc +0 -482
data/old-docs/VISUALIZATION_MAP.adoc +0 -625
data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
data/scripts/analyze_current_state.rb +0 -85
data/scripts/analyze_false_positives.rb +0 -114
data/scripts/analyze_remaining_failures.rb +0 -105
data/scripts/compare_current_failures.rb +0 -95
data/scripts/compare_dom_tree_diff.rb +0 -158
data/scripts/compare_failures.rb +0 -151
data/scripts/debug_attribute_extraction.rb +0 -66
data/scripts/debug_blocks_839.rb +0 -115
data/scripts/debug_meta_matching.rb +0 -52
data/scripts/debug_p_matching.rb +0 -192
data/scripts/debug_signature_matching.rb +0 -118
data/scripts/debug_sourcecode_124.rb +0 -32
data/scripts/debug_whitespace_sensitive.rb +0 -192
data/scripts/extract_false_positives.rb +0 -138
data/scripts/find_actual_false_positives.rb +0 -125
data/scripts/investigate_all_false_positives.rb +0 -161
data/scripts/investigate_batch1.rb +0 -127
data/scripts/investigate_classification.rb +0 -150
data/scripts/investigate_classification_detailed.rb +0 -190
data/scripts/investigate_common_failures.rb +0 -342
data/scripts/investigate_false_negative.rb +0 -80
data/scripts/investigate_false_positive.rb +0 -83
data/scripts/investigate_false_positives.rb +0 -227
data/scripts/investigate_false_positives_batch.rb +0 -163
data/scripts/investigate_mixed_content.rb +0 -125
data/scripts/investigate_remaining_16.rb +0 -214
data/scripts/run_single_test.rb +0 -29
data/scripts/test_all_false_positives.rb +0 -95
data/scripts/test_attribute_details.rb +0 -61
data/scripts/test_both_algorithms.rb +0 -49
data/scripts/test_both_simple.rb +0 -49
data/scripts/test_enhanced_semantic_output.rb +0 -125
data/scripts/test_readme_examples.rb +0 -131
data/scripts/test_semantic_tree_diff.rb +0 -99
data/scripts/test_semantic_ux_improvements.rb +0 -135
data/scripts/test_single_false_positive.rb +0 -119
data/scripts/test_size_limits.rb +0 -99
data/test_html_1.html +0 -21
data/test_html_2.html +0 -21
data/test_nokogiri.rb +0 -33
data/test_normalize.rb +0 -45

data/scripts/investigate_false_positives.rb DELETED Viewed

@@ -1,227 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-# Script to systematically investigate false positives
-# where semantic algorithm fails but DOM algorithm passes
-require "fileutils"
-require "json"
-# False positives to investigate
-FALSE_POSITIVES = [
-  { file: "blocks_spec.rb", line: 352 },
-  { file: "footnotes_spec.rb", line: 740 },
-  { file: "inline_spec.rb", line: 1012 },
-  { file: "inline_spec.rb", line: 1251 },
-  { file: "postproc_spec.rb", line: 948 },
-  { file: "postproc_word_spec.rb", line: 372 },
-  { file: "postproc_word_spec.rb", line: 576 },
-  { file: "presentation_xml_numbers_override_spec.rb", line: 2095 },
-  { file: "presentation_xml_spec.rb", line: 1288 },
-  { file: "presentation_xml_spec.rb", line: 1500 },
-  { file: "ref_spec.rb", line: 906 },
-  { file: "sourcecode_spec.rb", line: 124 },
-  { file: "sourcecode_spec.rb", line: 610 },
-  { file: "terms_spec.rb", line: 1445 },
-  { file: "xref_format_spec.rb", line: 628 },
-  { file: "xref_spec.rb", line: 315 },
-].freeze
-ISODOC_SPEC_DIR = File.expand_path("../../../mn/isodoc/spec/isodoc", __dir__)
-class FalsePositiveInvestigator
-  attr_reader :results
-  def initialize
-    @results = []
-  end
-  def investigate_all
-    puts "=" * 80
-    puts "INVESTIGATING 16 FALSE POSITIVES"
-    puts "=" * 80
-    puts
-    FALSE_POSITIVES.each_with_index do |test, idx|
-      puts "\n#{idx + 1}/#{FALSE_POSITIVES.size}: #{test[:file]}:#{test[:line]}"
-      puts "-" * 80
-      result = investigate_test(test)
-      @results << result
-      display_result(result)
-    end
-    summarize_results
-  end
-  def investigate_test(test)
-    file_path = File.join(ISODOC_SPEC_DIR, test[:file])
-    unless File.exist?(file_path)
-      return {
-        test: test,
-        error: "File not found: #{file_path}",
-        dom_passes: nil,
-        semantic_passes: nil,
-      }
-    end
-    result = {
-      test: test,
-      file_path: file_path,
-      dom_passes: nil,
-      semantic_passes: nil,
-      semantic_output: nil,
-      error: nil,
-    }
-    # Test with DOM algorithm
-    puts "  Testing with DOM algorithm..."
-    dom_output = run_test(file_path, test[:line], "dom")
-    result[:dom_passes] = dom_output[:success]
-    result[:dom_output] = dom_output[:output]
-    # Test with semantic algorithm
-    puts "  Testing with semantic algorithm..."
-    semantic_output = run_test(file_path, test[:line], "semantic")
-    result[:semantic_passes] = semantic_output[:success]
-    result[:semantic_output] = semantic_output[:output]
-    result
-  rescue StandardError => e
-    {
-      test: test,
-      error: "Exception: #{e.message}",
-      dom_passes: nil,
-      semantic_passes: nil,
-    }
-  end
-  def run_test(file_path, line, algorithm)
-    cmd = "cd #{ISODOC_SPEC_DIR}/.. && CANON_ALGORITHM=#{algorithm} bundle exec rspec #{file_path}:#{line} 2>&1"
-    output = `#{cmd}`
-    success = $?.success?
-    {
-      success: success,
-      output: output,
-      exit_code: $?.exitstatus,
-    }
-  end
-  def display_result(result)
-    if result[:error]
-      puts "  ❌ ERROR: #{result[:error]}"
-      return
-    end
-    dom_status = result[:dom_passes] ? "✅ PASS" : "❌ FAIL"
-    sem_status = result[:semantic_passes] ? "✅ PASS" : "❌ FAIL"
-    puts "  DOM:      #{dom_status}"
-    puts "  Semantic: #{sem_status}"
-    if result[:dom_passes] && !result[:semantic_passes]
-      puts "  ⚠️  CONFIRMED FALSE POSITIVE"
-      analyze_failure(result)
-    elsif !result[:dom_passes] && result[:semantic_passes]
-      puts "  ⚠️  UNEXPECTED: DOM fails but semantic passes!"
-    elsif !result[:dom_passes] && !result[:semantic_passes]
-      puts "  ℹ️  Both algorithms fail (not a false positive)"
-    else
-      puts "  ✅ Both algorithms pass (false positive may be fixed)"
-    end
-  end
-  def analyze_failure(result)
-    output = result[:semantic_output]
-    # Look for diff patterns
-    if output.include?("Expected XML to be equivalent")
-      puts "  📋 Failure type: XML equivalence check"
-    elsif output.include?("Expected HTML to be equivalent")
-      puts "  📋 Failure type: HTML equivalence check"
-    end
-    # Extract key diff lines
-    diff_lines = output.lines.select { |l| l.match?(/^\s*[+-]/) }.take(10)
-    if diff_lines.any?
-      puts "  📊 Sample diff:"
-      diff_lines.each { |l| puts "    #{l.strip}" }
-    end
-    # Look for specific patterns
-    if output.include?("whitespace")
-      puts "  🔍 Involves: whitespace differences"
-    end
-    if output.include?("attribute")
-      puts "  🔍 Involves: attribute differences"
-    end
-    if output.include?("text content")
-      puts "  🔍 Involves: text content differences"
-    end
-  end
-  def summarize_results
-    puts "\n#{'=' * 80}"
-    puts "SUMMARY"
-    puts "=" * 80
-    confirmed_fps = @results.count do |r|
-      r[:dom_passes] && !r[:semantic_passes]
-    end
-    fixed = @results.count { |r| r[:dom_passes] && r[:semantic_passes] }
-    errors = @results.count { |r| r[:error] }
-    both_fail = @results.count { |r| !r[:dom_passes] && !r[:semantic_passes] }
-    puts "Confirmed false positives: #{confirmed_fps}/16"
-    puts "Already fixed:             #{fixed}/16"
-    puts "Both fail (not FP):        #{both_fail}/16"
-    puts "Errors:                    #{errors}/16"
-    puts
-    if confirmed_fps.positive?
-      puts "FALSE POSITIVES TO FIX:"
-      @results.each do |r|
-        next unless r[:dom_passes] && !r[:semantic_passes]
-        puts "  - #{r[:test][:file]}:#{r[:test][:line]}"
-      end
-    end
-    puts "\n#{'=' * 80}"
-  end
-  def save_detailed_output(output_dir = "tmp/false_positive_investigation")
-    FileUtils.mkdir_p(output_dir)
-    @results.each_with_index do |result, idx|
-      next if result[:error]
-      test = result[:test]
-      filename = "#{idx + 1}_#{test[:file].gsub('.rb', '')}_#{test[:line]}.txt"
-      filepath = File.join(output_dir, filename)
-      File.write(filepath, <<~OUTPUT)
-        Test: #{test[:file]}:#{test[:line]}
-        DOM passes: #{result[:dom_passes]}
-        Semantic passes: #{result[:semantic_passes]}
-        ========================================
-        SEMANTIC OUTPUT:
-        ========================================
-        #{result[:semantic_output]}
-      OUTPUT
-    end
-    puts "\nDetailed output saved to: #{output_dir}/"
-  end
-end
-# Run investigation
-investigator = FalsePositiveInvestigator.new
-investigator.investigate_all
-investigator.save_detailed_output
-puts "\nInvestigation complete!"

data/scripts/investigate_false_positives_batch.rb DELETED Viewed

@@ -1,163 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-# Systematically investigate false positive failures
-# Usage: ruby scripts/investigate_false_positives_batch.rb <spec_file:line> [<spec_file:line> ...]
-require "bundler/setup"
-require "fileutils"
-# False positive test cases from XMLNS_FIX_VALIDATION.md
-FALSE_POSITIVES = [
-  "blocks_spec.rb:352",
-  "footnotes_spec.rb:740",
-  "inline_spec.rb:1012",
-  "inline_spec.rb:1251",
-  "postproc_spec.rb:948",
-  "postproc_word_spec.rb:372",
-  "postproc_word_spec.rb:576",
-  "presentation_xml_numbers_override_spec.rb:2095",
-  "presentation_xml_spec.rb:1288",
-  "presentation_xml_spec.rb:1500",
-  "ref_spec.rb:906",
-  "sourcecode_spec.rb:124",
-  "sourcecode_spec.rb:610",
-  "terms_spec.rb:1445",
-  "xref_format_spec.rb:628",
-  "xref_spec.rb:315",
-].freeze
-def run_test(spec_file, line, algorithm)
-  spec_path = File.expand_path("../../../mn/isodoc/spec/isodoc/#{spec_file}",
-                               __dir__)
-  unless File.exist?(spec_path)
-    puts "  ❌ File not found: #{spec_path}"
-    return nil
-  end
-  # Run with specific algorithm
-  { "CANON_ALGORITHM" => algorithm }
-  cmd = "cd #{File.dirname(spec_path)} && bundle exec rspec #{spec_path}:#{line} 2>&1"
-  output = `#{cmd}`
-  success = $?.success?
-  { success: success, output: output }
-end
-def analyze_test(test_case)
-  spec_file, line = test_case.split(":")
-  puts "\n#{'=' * 80}"
-  puts "ANALYZING: #{test_case}"
-  puts "=" * 80
-  # Run with DOM algorithm
-  puts "\n1. Testing with DOM algorithm..."
-  dom_result = run_test(spec_file, line, "dom")
-  return unless dom_result
-  dom_pass = dom_result[:success]
-  puts "   Result: #{dom_pass ? '✅ PASS' : '❌ FAIL'}"
-  # Run with Semantic algorithm
-  puts "\n2. Testing with Semantic algorithm..."
-  semantic_result = run_test(spec_file, line, "semantic")
-  return unless semantic_result
-  semantic_pass = semantic_result[:success]
-  puts "   Result: #{semantic_pass ? '✅ PASS' : '❌ FAIL'}"
-  # Analysis
-  puts "\n#{'-' * 80}"
-  puts "ANALYSIS:"
-  puts "-" * 80
-  if dom_pass && !semantic_pass
-    puts "✅ CONFIRMED FALSE POSITIVE: DOM passes, Semantic fails"
-    puts "\nThis test should be investigated to understand why semantic is too strict."
-    # Extract failure details from semantic output
-    if semantic_result[:output] =~ /Failure\/Error:(.+?)(?=\n\n|\z)/m
-      failure_section = $1
-      puts "\nFailure details:"
-      puts failure_section.lines.take(20).join
-    end
-    :false_positive
-  elsif !dom_pass && semantic_pass
-    puts "⚠️  UNEXPECTED: This was listed as false positive but DOM fails, Semantic passes"
-    puts "This is actually a false NEGATIVE, not a false positive!"
-    :false_negative
-  elsif dom_pass && semantic_pass
-    puts "✅ BOTH PASS: This is no longer a false positive!"
-    :fixed
-  else
-    puts "❌ BOTH FAIL: This is a common failure, not a false positive"
-    :common_failure
-  end
-end
-def main
-  # Get test cases from arguments or use all false positives
-  test_cases = if ARGV.empty?
-                 FALSE_POSITIVES
-               else
-                 ARGV
-               end
-  puts "Investigating #{test_cases.size} false positive test cases..."
-  results = {
-    false_positive: [],
-    false_negative: [],
-    fixed: [],
-    common_failure: [],
-    error: [],
-  }
-  test_cases.each do |test_case|
-    result = analyze_test(test_case)
-    results[result || :error] << test_case
-  end
-  # Summary
-  puts "\n#{'=' * 80}"
-  puts "SUMMARY"
-  puts "=" * 80
-  puts "\n✅ Confirmed False Positives (need fixing): #{results[:false_positive].size}"
-  results[:false_positive].each { |tc| puts "   - #{tc}" }
-  puts "\n🎉 Already Fixed: #{results[:fixed].size}"
-  results[:fixed].each { |tc| puts "   - #{tc}" }
-  puts "\n⚠️  Misclassified (actually false negatives): #{results[:false_negative].size}"
-  results[:false_negative].each { |tc| puts "   - #{tc}" }
-  puts "\n❌ Common Failures: #{results[:common_failure].size}"
-  results[:common_failure].each { |tc| puts "   - #{tc}" }
-  puts "\n💥 Errors: #{results[:error].size}"
-  results[:error].each { |tc| puts "   - #{tc}" }
-  # Save detailed results
-  output_file = "/tmp/false_positive_investigation.txt"
-  File.open(output_file, "w") do |f|
-    f.puts "FALSE POSITIVE INVESTIGATION RESULTS"
-    f.puts "=" * 80
-    f.puts "\nConfirmed False Positives (#{results[:false_positive].size}):"
-    results[:false_positive].each { |tc| f.puts tc }
-    f.puts "\nAlready Fixed (#{results[:fixed].size}):"
-    results[:fixed].each { |tc| f.puts tc }
-    f.puts "\nMisclassified (#{results[:false_negative].size}):"
-    results[:false_negative].each { |tc| f.puts tc }
-    f.puts "\nCommon Failures (#{results[:common_failure].size}):"
-    results[:common_failure].each { |tc| f.puts tc }
-  end
-  puts "\nDetailed results saved to: #{output_file}"
-end
-main if __FILE__ == $PROGRAM_NAME

data/scripts/investigate_mixed_content.rb DELETED Viewed

@@ -1,125 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-require "bundler/setup"
-require "canon"
-require "nokogiri"
-# Test mixed content element extraction
-def test_mixed_content_extraction
-  puts "=" * 80
-  puts "Testing Mixed Content Text Extraction"
-  puts "=" * 80
-  # Create test XML with mixed content
-  xml_str = <<~XML
-    <root>
-      <formattedAddress>123 Main St<br/>Springfield, IL<br/>62701</formattedAddress>
-      <normalText>Just plain text</normalText>
-      <withSpaces>  Text with   spaces  </withSpaces>
-      <withNewlines>Text
-      with
-      newlines</withNewlines>
-    </root>
-  XML
-  doc = Nokogiri::XML(xml_str)
-  # Test each element
-  doc.root.element_children.each do |elem|
-    puts "\nElement: <#{elem.name}>"
-    puts "  Content: #{elem.content.inspect}"
-    # Extract text nodes
-    text_nodes = elem.children.select(&:text?)
-    puts "  Text nodes count: #{text_nodes.size}"
-    text_nodes.each_with_index do |node, i|
-      puts "    [#{i}]: #{node.text.inspect}"
-    end
-    # Join text
-    joined = text_nodes.map(&:text).join
-    puts "  Joined text: #{joined.inspect}"
-    # Show normalization
-    normalized = joined.gsub(/\s+/, " ").strip
-    puts "  Normalized: #{normalized.inspect}"
-    # Element children
-    elem_children = elem.element_children
-    puts "  Element children: #{elem_children.map(&:name).inspect}"
-  end
-end
-# Test with Canon adapter
-def test_with_adapter
-  puts "\n#{'=' * 80}"
-  puts "Testing with Canon XML Adapter"
-  puts "=" * 80
-  xml_str = <<~XML
-    <root>
-      <formattedAddress>123 Main St<br/>Springfield, IL<br/>62701</formattedAddress>
-    </root>
-  XML
-  doc = Nokogiri::XML(xml_str)
-  adapter = Canon::TreeDiff::Adapters::XMLAdapter.new
-  tree = adapter.to_tree(doc)
-  # Find the formattedAddress node
-  address_node = tree.children.first
-  puts "\nTreeNode for formattedAddress:"
-  puts "  Label: #{address_node.label}"
-  puts "  Value: #{address_node.value.inspect}"
-  puts "  Children count: #{address_node.children.size}"
-  address_node.children.each do |child|
-    puts "    Child: #{child.label} = #{child.value.inspect}"
-  end
-end
-# Test normalization in operation detector
-def test_normalization_comparison
-  puts "\n#{'=' * 80}"
-  puts "Testing Normalization in Comparison"
-  puts "=" * 80
-  # Two versions with different whitespace in mixed content
-  xml1 = <<~XML
-    <root>
-      <address>123 Main St<br/>Springfield, IL<br/>62701</address>
-    </root>
-  XML
-  xml2 = <<~XML
-    <root>
-      <address>123 Main St<br/>Springfield,  IL<br/>62701</address>
-    </root>
-  XML
-  # Compare with whitespace_sensitive: false
-  result = Canon.semantic_tree_diff(xml1, xml2,
-                                    whitespace_sensitive: false,
-                                    verbose: true)
-  puts "\nComparison result:"
-  puts "  Identical: #{result.identical?}"
-  puts "  Normative differences: #{result.normative_differences?}"
-  puts "  Informative differences: #{result.informative_differences?}"
-  if result.operations.any?
-    puts "\nOperations:"
-    result.operations.each do |op|
-      puts "  #{op.type}: #{op.path} - #{op.classification}"
-      puts "    Old: #{op.old_value.inspect}" if op.old_value
-      puts "    New: #{op.new_value.inspect}" if op.new_value
-    end
-  end
-end
-# Run all tests
-test_mixed_content_extraction
-test_with_adapter
-test_normalization_comparison