RubyGems - canon - Versions diffs - 0.1.8 → 0.1.9 - Mend

canon 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (98) hide show

checksums.yaml +4 -4
data/.rubocop_todo.yml +112 -25
data/docs/Gemfile +1 -0
data/docs/_config.yml +90 -1
data/docs/advanced/diff-classification.adoc +82 -2
data/docs/features/match-options/index.adoc +239 -1
data/lib/canon/comparison/format_detector.rb +2 -1
data/lib/canon/comparison/html_comparator.rb +19 -8
data/lib/canon/comparison/html_compare_profile.rb +8 -2
data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
data/lib/canon/comparison/xml_comparator.rb +48 -23
data/lib/canon/comparison/xml_node_comparison.rb +25 -3
data/lib/canon/diff/diff_classifier.rb +101 -2
data/lib/canon/diff/formatting_detector.rb +1 -1
data/lib/canon/rspec_matchers.rb +37 -8
data/lib/canon/version.rb +1 -1
data/lib/canon/xml/data_model.rb +24 -13
metadata +3 -78
data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
data/false_positive_analysis.txt +0 -0
data/file1.html +0 -1
data/file2.html +0 -1
data/old-docs/ADVANCED_TOPICS.adoc +0 -20
data/old-docs/BASIC_USAGE.adoc +0 -16
data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
data/old-docs/CLI.adoc +0 -497
data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
data/old-docs/DIFF_FORMATTING.adoc +0 -540
data/old-docs/DIFF_PARAMETERS.adoc +0 -261
data/old-docs/DOM_DIFF.adoc +0 -1017
data/old-docs/ENV_CONFIG.adoc +0 -876
data/old-docs/FORMATS.adoc +0 -867
data/old-docs/INPUT_VALIDATION.adoc +0 -477
data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
data/old-docs/MATCH_OPTIONS.adoc +0 -912
data/old-docs/MODES.adoc +0 -432
data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
data/old-docs/OPTIONS.adoc +0 -1387
data/old-docs/PREPROCESSING.adoc +0 -491
data/old-docs/README.old.adoc +0 -2831
data/old-docs/RSPEC.adoc +0 -814
data/old-docs/RUBY_API.adoc +0 -485
data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
data/old-docs/STRING_COMPARE.adoc +0 -345
data/old-docs/TMP.adoc +0 -3384
data/old-docs/TREE_DIFF.adoc +0 -1080
data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
data/old-docs/VERBOSE.adoc +0 -482
data/old-docs/VISUALIZATION_MAP.adoc +0 -625
data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
data/scripts/analyze_current_state.rb +0 -85
data/scripts/analyze_false_positives.rb +0 -114
data/scripts/analyze_remaining_failures.rb +0 -105
data/scripts/compare_current_failures.rb +0 -95
data/scripts/compare_dom_tree_diff.rb +0 -158
data/scripts/compare_failures.rb +0 -151
data/scripts/debug_attribute_extraction.rb +0 -66
data/scripts/debug_blocks_839.rb +0 -115
data/scripts/debug_meta_matching.rb +0 -52
data/scripts/debug_p_matching.rb +0 -192
data/scripts/debug_signature_matching.rb +0 -118
data/scripts/debug_sourcecode_124.rb +0 -32
data/scripts/debug_whitespace_sensitive.rb +0 -192
data/scripts/extract_false_positives.rb +0 -138
data/scripts/find_actual_false_positives.rb +0 -125
data/scripts/investigate_all_false_positives.rb +0 -161
data/scripts/investigate_batch1.rb +0 -127
data/scripts/investigate_classification.rb +0 -150
data/scripts/investigate_classification_detailed.rb +0 -190
data/scripts/investigate_common_failures.rb +0 -342
data/scripts/investigate_false_negative.rb +0 -80
data/scripts/investigate_false_positive.rb +0 -83
data/scripts/investigate_false_positives.rb +0 -227
data/scripts/investigate_false_positives_batch.rb +0 -163
data/scripts/investigate_mixed_content.rb +0 -125
data/scripts/investigate_remaining_16.rb +0 -214
data/scripts/run_single_test.rb +0 -29
data/scripts/test_all_false_positives.rb +0 -95
data/scripts/test_attribute_details.rb +0 -61
data/scripts/test_both_algorithms.rb +0 -49
data/scripts/test_both_simple.rb +0 -49
data/scripts/test_enhanced_semantic_output.rb +0 -125
data/scripts/test_readme_examples.rb +0 -131
data/scripts/test_semantic_tree_diff.rb +0 -99
data/scripts/test_semantic_ux_improvements.rb +0 -135
data/scripts/test_single_false_positive.rb +0 -119
data/scripts/test_size_limits.rb +0 -99
data/test_html_1.html +0 -21
data/test_html_2.html +0 -21
data/test_nokogiri.rb +0 -33
data/test_normalize.rb +0 -45

data/scripts/investigate_common_failures.rb DELETED Viewed

@@ -1,342 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-# Investigate the 43 common failures to determine if they represent Canon classification bugs
-# Both DOM and semantic algorithms agree these tests fail, but they might BOTH be wrong
-require "json"
-require "fileutils"
-# Sample tests from the 43 common failures
-SAMPLE_TESTS = [
-  { file: "blocks_notes_spec.rb", line: 494 },
-  { file: "blocks_provisions_spec.rb", line: 4 },
-  { file: "cleanup_spec.rb", line: 180 },
-  { file: "figures_spec.rb", line: 5 },
-  { file: "tables_spec.rb", line: 4 },
-  # Add more samples for thorough investigation
-  { file: "blocks_notes_spec.rb", line: 12 },
-  { file: "blocks_notes_spec.rb", line: 15 },
-  { file: "blocks_notes_spec.rb", line: 18 },
-  { file: "blocks_notes_spec.rb", line: 21 },
-  { file: "cleanup_spec.rb", line: 126 },
-].freeze
-ISODOC_PATH = "/Users/mulgogi/src/mn/isodoc"
-OUTPUT_DIR = "/tmp/common_failure_investigation"
-FileUtils.mkdir_p(OUTPUT_DIR)
-class CommonFailureInvestigator
-  def initialize
-    @findings = []
-    @bugs_found = []
-  end
-  def investigate_all
-    puts "=" * 80
-    puts "INVESTIGATING 43 COMMON FAILURES"
-    puts "Hypothesis: Both algorithms might incorrectly classify differences"
-    puts "=" * 80
-    puts
-    SAMPLE_TESTS.each_with_index do |test, idx|
-      puts "\n#{'-' * 80}"
-      puts "Test #{idx + 1}/#{SAMPLE_TESTS.size}: #{test[:file]}:#{test[:line]}"
-      puts "-" * 80
-      investigate_test(test)
-    end
-    generate_report
-  end
-  private
-  def investigate_test(test)
-    spec_file = "spec/isodoc/#{test[:file]}"
-    line = test[:line]
-    # Run with DOM algorithm first
-    puts "\n1. Running with DOM algorithm (verbose)..."
-    dom_result = run_test_verbose(spec_file, line, "dom")
-    # Run with Semantic algorithm
-    puts "\n2. Running with Semantic algorithm (verbose)..."
-    semantic_result = run_test_verbose(spec_file, line, "semantic")
-    # Analyze both results
-    finding = analyze_results(test, dom_result, semantic_result)
-    @findings << finding
-    if finding[:bug_suspected]
-      @bugs_found << finding
-      puts "\n⚠️  POTENTIAL BUG FOUND!"
-      puts "   #{finding[:bug_description]}"
-    else
-      puts "\n✓ Classification appears correct"
-    end
-  end
-  def run_test_verbose(spec_file, line, algorithm)
-    output_file = "#{OUTPUT_DIR}/#{algorithm}_#{spec_file.gsub('/',
-                                                               '_')}_#{line}.txt"
-    cmd = <<~CMD
-      cd #{ISODOC_PATH} && \
-      CANON_ALGORITHM=#{algorithm} \
-      CANON_VERBOSE=true \
-      bundle exec rspec #{spec_file}:#{line} 2>&1 | tee #{output_file}
-    CMD
-    system(cmd)
-    parse_test_output(output_file)
-  end
-  def parse_test_output(file)
-    return nil unless File.exist?(file)
-    content = File.read(file)
-    {
-      passed: content.include?("0 failures"),
-      differences: extract_differences(content),
-      match_options: extract_match_options(content),
-      dimensions: extract_dimensions(content),
-      raw_output: content,
-    }
-  end
-  def extract_differences(content)
-    diffs = []
-    current_diff = nil
-    content.each_line do |line|
-      if line =~ /DIFFERENCE #(\d+):/
-        current_diff = { number: $1.to_i, lines: [line] }
-        diffs << current_diff
-      elsif current_diff && line =~ /^\s*[│┌└├]/
-        current_diff[:lines] << line
-      elsif current_diff && line.strip.empty?
-        current_diff = nil
-      elsif current_diff
-        current_diff[:lines] << line
-      end
-    end
-    diffs
-  end
-  def extract_match_options(content)
-    options = {}
-    if content =~ /Match Options:\s*\{([^}]+)\}/
-      options_str = $1
-      options_str.scan(/(\w+):\s*:?(\w+)/) do |key, value|
-        options[key.to_sym] = value.to_sym
-      end
-    end
-    options
-  end
-  def extract_dimensions(content)
-    dimensions = []
-    content.scan(/Dimension:\s*(\w+)/) do |match|
-      dimensions << match[0]
-    end
-    dimensions.uniq
-  end
-  def analyze_results(test, dom_result, semantic_result)
-    finding = {
-      test: test,
-      bug_suspected: false,
-      bug_description: nil,
-      dom_analysis: analyze_single_result(dom_result),
-      semantic_analysis: analyze_single_result(semantic_result),
-    }
-    # Check for classification bugs
-    bugs = check_for_bugs(dom_result, semantic_result)
-    if bugs.any?
-      finding[:bug_suspected] = true
-      finding[:bug_description] = bugs.join("; ")
-    end
-    finding
-  end
-  def analyze_single_result(result)
-    return nil unless result
-    {
-      passed: result[:passed],
-      match_options: result[:match_options],
-      dimensions: result[:dimensions],
-      diff_count: result[:differences].size,
-    }
-  end
-  def check_for_bugs(dom_result, semantic_result)
-    bugs = []
-    return bugs unless dom_result && semantic_result
-    # Check DOM result for bugs
-    bugs.concat(check_result_for_bugs(dom_result, "DOM"))
-    # Check Semantic result for bugs
-    bugs.concat(check_result_for_bugs(semantic_result, "Semantic"))
-    bugs
-  end
-  def check_result_for_bugs(result, algorithm)
-    bugs = []
-    return bugs if result[:passed]
-    options = result[:match_options]
-    dimensions = result[:dimensions]
-    # Bug 1: attribute_order: ignore but order diffs are NORMATIVE
-    if options[:attribute_order] == :ignore && dimensions.include?("attribute_order")
-      bugs << "#{algorithm}: attribute_order:ignore but order diffs reported as NORMATIVE"
-    end
-    # Bug 2: text_content normalization issues
-    if options[:text_content] == :normalize && dimensions.include?("text_content")
-      # Check if the diff is about normalized-equivalent text
-      result[:differences].each do |diff|
-        diff_text = diff[:lines].join
-        if diff_text.include?("whitespace") || diff_text.include?("spacing")
-          bugs << "#{algorithm}: text_content:normalize but whitespace diffs reported"
-        end
-      end
-    end
-    # Bug 3: comments: ignore but comments cause failure
-    if options[:comments] == :ignore && dimensions.include?("comment")
-      bugs << "#{algorithm}: comments:ignore but comment diffs reported as NORMATIVE"
-    end
-    # Bug 4: whitespace_only: ignore but whitespace-only changes fail
-    if options[:whitespace_only] == :ignore
-      result[:differences].each do |diff|
-        diff_text = diff[:lines].join
-        if diff_text.match?(/only.*whitespace/i)
-          bugs << "#{algorithm}: whitespace_only:ignore but whitespace-only diffs reported"
-        end
-      end
-    end
-    bugs
-  end
-  def generate_report
-    report_file = "#{OUTPUT_DIR}/CLASSIFICATION_INVESTIGATION_REPORT.md"
-    File.open(report_file, "w") do |f|
-      f.puts "# Classification Investigation Report"
-      f.puts
-      f.puts "Investigation of the 43 common failures where both DOM and semantic algorithms agree."
-      f.puts
-      f.puts "**Date:** #{Time.now.strftime('%Y-%m-%d %H:%M:%S')}"
-      f.puts
-      f.puts "## Executive Summary"
-      f.puts
-      f.puts "- Tests investigated: #{@findings.size}"
-      f.puts "- Potential bugs found: #{@bugs_found.size}"
-      f.puts
-      if @bugs_found.any?
-        f.puts "## ⚠️ Bugs Found"
-        f.puts
-        @bugs_found.each_with_index do |bug, idx|
-          f.puts "### Bug #{idx + 1}: #{bug[:test][:file]}:#{bug[:test][:line]}"
-          f.puts
-          f.puts "**Description:** #{bug[:bug_description]}"
-          f.puts
-          f.puts "**DOM Analysis:**"
-          f.puts "```"
-          f.puts bug[:dom_analysis].inspect
-          f.puts "```"
-          f.puts
-          f.puts "**Semantic Analysis:**"
-          f.puts "```"
-          f.puts bug[:semantic_analysis].inspect
-          f.puts "```"
-          f.puts
-        end
-      else
-        f.puts "## ✅ No Classification Bugs Found"
-        f.puts
-        f.puts "All investigated failures appear to be legitimate test failures,"
-        f.puts "not bugs in Canon's classification logic."
-        f.puts
-      end
-      f.puts "## Detailed Findings"
-      f.puts
-      @findings.each_with_index do |finding, idx|
-        f.puts "### Test #{idx + 1}: #{finding[:test][:file]}:#{finding[:test][:line]}"
-        f.puts
-        f.puts "**Bug Suspected:** #{finding[:bug_suspected] ? 'YES ⚠️' : 'No'}"
-        f.puts
-        if finding[:bug_description]
-          f.puts "**Issue:** #{finding[:bug_description]}"
-          f.puts
-        end
-        f.puts "**DOM:**"
-        f.puts "- Match Options: #{finding[:dom_analysis][:match_options]}"
-        f.puts "- Dimensions: #{finding[:dom_analysis][:dimensions].join(', ')}"
-        f.puts "- Diff Count: #{finding[:dom_analysis][:diff_count]}"
-        f.puts
-        f.puts "**Semantic:**"
-        f.puts "- Match Options: #{finding[:semantic_analysis][:match_options]}"
-        f.puts "- Dimensions: #{finding[:semantic_analysis][:dimensions].join(', ')}"
-        f.puts "- Diff Count: #{finding[:semantic_analysis][:diff_count]}"
-        f.puts
-        f.puts "---"
-        f.puts
-      end
-      f.puts "## Next Steps"
-      f.puts
-      if @bugs_found.any?
-        f.puts "1. Review the bugs found above"
-        f.puts "2. Fix classification logic in:"
-        f.puts "   - `lib/canon/diff/diff_classifier.rb`"
-        f.puts "   - `lib/canon/tree_diff/operation_converter.rb`"
-        f.puts "3. Re-run tests to verify fixes"
-      else
-        f.puts "The 43 common failures appear to be legitimate test failures,"
-        f.puts "not Canon classification bugs. These tests fail correctly in both algorithms."
-      end
-    end
-    puts "\n\n#{'=' * 80}"
-    puts "INVESTIGATION COMPLETE"
-    puts "=" * 80
-    puts
-    puts "Report saved to: #{report_file}"
-    puts
-    puts "Summary:"
-    puts "  Tests investigated: #{@findings.size}"
-    puts "  Bugs found: #{@bugs_found.size}"
-    puts
-    if @bugs_found.any?
-      puts "⚠️  Classification bugs detected! See report for details."
-    else
-      puts "✓ No classification bugs found. Failures are legitimate."
-    end
-    puts
-  end
-end
-# Run investigation
-investigator = CommonFailureInvestigator.new
-investigator.investigate_all

data/scripts/investigate_false_negative.rb DELETED Viewed

@@ -1,80 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-require "bundler/setup"
-require "canon"
-require "nokogiri"
-# Test one of the false negative cases
-# These are passing semantic but failing DOM
-# Test case: Check if the space insertion is causing problems
-def test_space_insertion_edge_cases
-  puts "=" * 80
-  puts "Testing Space Insertion Edge Cases"
-  puts "=" * 80
-  # Case 1: Elements without any child elements should not get spaces
-  xml1 = "<root><text>Hello World</text></root>"
-  xml2 = "<root><text>Hello  World</text></root>"
-  puts "\nCase 1: Simple text (no child elements)"
-  puts "  XML1: #{xml1}"
-  puts "  XML2: #{xml2}"
-  doc1 = Nokogiri::XML(xml1)
-  doc2 = Nokogiri::XML(xml2)
-  adapter = Canon::TreeDiff::Adapters::XMLAdapter.new
-  tree1 = adapter.to_tree(doc1)
-  tree2 = adapter.to_tree(doc2)
-  text_node1 = tree1.children.first
-  text_node2 = tree2.children.first
-  puts "  Tree1 text value: #{text_node1.value.inspect}"
-  puts "  Tree2 text value: #{text_node2.value.inspect}"
-  puts "  Should be different: #{text_node1.value != text_node2.value}"
-  # Case 2: Mixed content WITH br
-  xml3 = "<root><text>A<br/>B</text></root>"
-  xml4 = "<root><text>A<br/>C</text></root>"
-  puts "\nCase 2: Mixed content with <br/>"
-  puts "  XML3: #{xml3}"
-  puts "  XML4: #{xml4}"
-  doc3 = Nokogiri::XML(xml3)
-  doc4 = Nokogiri::XML(xml4)
-  tree3 = adapter.to_tree(doc3)
-  tree4 = adapter.to_tree(doc4)
-  text_node3 = tree3.children.first
-  text_node4 = tree4.children.first
-  puts "  Tree3 text value: #{text_node3.value.inspect}"
-  puts "  Tree4 text value: #{text_node4.value.inspect}"
-  puts "  Should be different: #{text_node3.value != text_node4.value}"
-  # Case 3: Text nodes that are just whitespace between elements
-  xml5 = "<root><a>X</a> <b>Y</b></root>"
-  xml6 = "<root><a>X</a><b>Y</b></root>"
-  puts "\nCase 3: Whitespace between elements"
-  puts "  XML5: #{xml5}"
-  puts "  XML6: #{xml6}"
-  doc5 = Nokogiri::XML(xml5)
-  doc6 = Nokogiri::XML(xml6)
-  tree5 = adapter.to_tree(doc5)
-  tree6 = adapter.to_tree(doc6)
-  puts "  Tree5 root value: #{tree5.value.inspect}"
-  puts "  Tree6 root value: #{tree6.value.inspect}"
-  puts "  Tree5 root has #{tree5.children.size} children"
-  puts "  Tree6 root has #{tree6.children.size} children"
-end
-test_space_insertion_edge_cases

data/scripts/investigate_false_positive.rb DELETED Viewed

@@ -1,83 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-# Script to investigate false positives in semantic tree algorithm
-# Usage: ruby scripts/investigate_false_positive.rb
-require "bundler/setup"
-require "canon"
-# Simple test case demonstrating whitespace in sourcecode
-expected = <<~XML
-  <div>
-    <pre class="sourcecode">
-      Hey
-      Que?
-    </pre>
-  </div>
-XML
-actual = <<~XML
-    <div>
-      <pre class="sourcecode">Hey
-  Que?</pre>
-    </div>
-XML
-puts "=" * 80
-puts "Testing Whitespace Handling in <pre> Elements"
-puts "=" * 80
-# Test with DOM diff algorithm
-puts "\n1. DOM DIFF ALGORITHM:"
-puts "-" * 80
-result_dom = Canon::Comparison.equivalent?(expected, actual,
-                                           format: :html,
-                                           diff_algorithm: :dom,
-                                           verbose: true)
-dom_match = result_dom.is_a?(Canon::Comparison::ComparisonResult) ? result_dom.equivalent? : result_dom
-puts "Match: #{dom_match}"
-if result_dom.is_a?(Canon::Comparison::ComparisonResult)
-  puts "Normative diffs: #{result_dom.normative_differences.count}"
-  puts "Total diffs: #{result_dom.differences.count}"
-end
-# Test with Semantic Tree diff algorithm
-puts "\n2. SEMANTIC TREE ALGORITHM:"
-puts "-" * 80
-result_semantic = Canon::Comparison.equivalent?(expected, actual,
-                                                format: :html,
-                                                diff_algorithm: :semantic,
-                                                verbose: true)
-semantic_match = result_semantic.is_a?(Canon::Comparison::ComparisonResult) ? result_semantic.equivalent? : result_semantic
-puts "Match: #{semantic_match}"
-if result_semantic.is_a?(Canon::Comparison::ComparisonResult)
-  puts "Normative diffs: #{result_semantic.normative_differences.count}"
-  puts "Total diffs: #{result_semantic.differences.count}"
-end
-puts "\n#{'=' * 80}"
-puts "ANALYSIS:"
-puts "=" * 80
-if dom_match && !semantic_match
-  puts "❌ FALSE POSITIVE: Semantic tree incorrectly reports difference"
-  if result_semantic.is_a?(Canon::Comparison::ComparisonResult)
-    puts "\nDifferences found by semantic tree:"
-    result_semantic.differences.each_with_index do |diff, i|
-      puts "\n  Diff #{i + 1}:"
-      puts "    Type: #{diff.class}"
-      puts "    Normative: #{diff.normative?}" if diff.respond_to?(:normative?)
-      puts "    Details: #{diff.inspect}"
-    end
-  end
-elsif !dom_match && semantic_match
-  puts "❌ FALSE NEGATIVE: Semantic tree misses real difference"
-elsif dom_match && semantic_match
-  puts "✅ BOTH AGREE: No difference (correct)"
-else
-  puts "✅ BOTH AGREE: Difference exists (correct)"
-end