RubyGems - canon - Versions diffs - 0.1.8 → 0.1.10 - Mend

canon 0.1.8 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (101) hide show

checksums.yaml +4 -4
data/.rubocop_todo.yml +83 -22
data/docs/Gemfile +1 -0
data/docs/_config.yml +90 -1
data/docs/advanced/diff-classification.adoc +196 -24
data/docs/features/match-options/index.adoc +239 -1
data/lib/canon/comparison/format_detector.rb +2 -1
data/lib/canon/comparison/html_comparator.rb +19 -8
data/lib/canon/comparison/html_compare_profile.rb +8 -2
data/lib/canon/comparison/markup_comparator.rb +109 -2
data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +108 -0
data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
data/lib/canon/comparison/xml_comparator.rb +240 -23
data/lib/canon/comparison/xml_node_comparison.rb +25 -3
data/lib/canon/diff/diff_classifier.rb +119 -5
data/lib/canon/diff/formatting_detector.rb +1 -1
data/lib/canon/diff/xml_serialization_formatter.rb +153 -0
data/lib/canon/rspec_matchers.rb +37 -8
data/lib/canon/version.rb +1 -1
data/lib/canon/xml/data_model.rb +24 -13
metadata +4 -78
data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
data/false_positive_analysis.txt +0 -0
data/file1.html +0 -1
data/file2.html +0 -1
data/old-docs/ADVANCED_TOPICS.adoc +0 -20
data/old-docs/BASIC_USAGE.adoc +0 -16
data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
data/old-docs/CLI.adoc +0 -497
data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
data/old-docs/DIFF_FORMATTING.adoc +0 -540
data/old-docs/DIFF_PARAMETERS.adoc +0 -261
data/old-docs/DOM_DIFF.adoc +0 -1017
data/old-docs/ENV_CONFIG.adoc +0 -876
data/old-docs/FORMATS.adoc +0 -867
data/old-docs/INPUT_VALIDATION.adoc +0 -477
data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
data/old-docs/MATCH_OPTIONS.adoc +0 -912
data/old-docs/MODES.adoc +0 -432
data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
data/old-docs/OPTIONS.adoc +0 -1387
data/old-docs/PREPROCESSING.adoc +0 -491
data/old-docs/README.old.adoc +0 -2831
data/old-docs/RSPEC.adoc +0 -814
data/old-docs/RUBY_API.adoc +0 -485
data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
data/old-docs/STRING_COMPARE.adoc +0 -345
data/old-docs/TMP.adoc +0 -3384
data/old-docs/TREE_DIFF.adoc +0 -1080
data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
data/old-docs/VERBOSE.adoc +0 -482
data/old-docs/VISUALIZATION_MAP.adoc +0 -625
data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
data/scripts/analyze_current_state.rb +0 -85
data/scripts/analyze_false_positives.rb +0 -114
data/scripts/analyze_remaining_failures.rb +0 -105
data/scripts/compare_current_failures.rb +0 -95
data/scripts/compare_dom_tree_diff.rb +0 -158
data/scripts/compare_failures.rb +0 -151
data/scripts/debug_attribute_extraction.rb +0 -66
data/scripts/debug_blocks_839.rb +0 -115
data/scripts/debug_meta_matching.rb +0 -52
data/scripts/debug_p_matching.rb +0 -192
data/scripts/debug_signature_matching.rb +0 -118
data/scripts/debug_sourcecode_124.rb +0 -32
data/scripts/debug_whitespace_sensitive.rb +0 -192
data/scripts/extract_false_positives.rb +0 -138
data/scripts/find_actual_false_positives.rb +0 -125
data/scripts/investigate_all_false_positives.rb +0 -161
data/scripts/investigate_batch1.rb +0 -127
data/scripts/investigate_classification.rb +0 -150
data/scripts/investigate_classification_detailed.rb +0 -190
data/scripts/investigate_common_failures.rb +0 -342
data/scripts/investigate_false_negative.rb +0 -80
data/scripts/investigate_false_positive.rb +0 -83
data/scripts/investigate_false_positives.rb +0 -227
data/scripts/investigate_false_positives_batch.rb +0 -163
data/scripts/investigate_mixed_content.rb +0 -125
data/scripts/investigate_remaining_16.rb +0 -214
data/scripts/run_single_test.rb +0 -29
data/scripts/test_all_false_positives.rb +0 -95
data/scripts/test_attribute_details.rb +0 -61
data/scripts/test_both_algorithms.rb +0 -49
data/scripts/test_both_simple.rb +0 -49
data/scripts/test_enhanced_semantic_output.rb +0 -125
data/scripts/test_readme_examples.rb +0 -131
data/scripts/test_semantic_tree_diff.rb +0 -99
data/scripts/test_semantic_ux_improvements.rb +0 -135
data/scripts/test_single_false_positive.rb +0 -119
data/scripts/test_size_limits.rb +0 -99
data/test_html_1.html +0 -21
data/test_html_2.html +0 -21
data/test_nokogiri.rb +0 -33
data/test_normalize.rb +0 -45

data/scripts/investigate_batch1.rb DELETED Viewed

@@ -1,127 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-# Investigate first batch of false positives in detail
-ISODOC_DIR = File.expand_path("../../../mn/isodoc", __dir__)
-BATCH_1 = [
-  { file: "spec/isodoc/blocks_spec.rb", line: 352 },
-  { file: "spec/isodoc/footnotes_spec.rb", line: 740 },
-  { file: "spec/isodoc/inline_spec.rb", line: 1012 },
-  { file: "spec/isodoc/inline_spec.rb", line: 1251 },
-  { file: "spec/isodoc/postproc_spec.rb", line: 948 },
-].freeze
-def run_test(test, algorithm)
-  file = File.join(ISODOC_DIR, test[:file])
-  cmd = "cd #{ISODOC_DIR} && CANON_ALGORITHM=#{algorithm} bundle exec rspec #{file}:#{test[:line]} 2>&1"
-  output = `#{cmd}`
-  success = $?.success?
-  {
-    success: success,
-    output: output,
-  }
-end
-def extract_diff(output)
-  lines = output.lines
-  # Find the diff section
-  diff_start = lines.index { |l| l.include?("Diff:") }
-  return nil unless diff_start
-  # Extract lines after "Diff:" until we hit a blank line or end
-  diff_lines = []
-  (diff_start + 1...lines.size).each do |i|
-    line = lines[i]
-    break if line.strip.empty? && diff_lines.size > 5
-    diff_lines << line
-  end
-  diff_lines.join
-end
-def analyze_diff(diff)
-  return {} unless diff
-  analysis = {
-    whitespace: diff.match?(/\s+/) && diff.match?(/^\s*[-+]/),
-    attributes: diff.match?(/\sattr|attribute/i),
-    text_content: diff.match?(/text|content/i),
-    elements: diff.match?(/element|tag|node/i),
-    line_count: diff.lines.size,
-  }
-  # Sample key differences
-  added = diff.lines.select do |l|
-    l.start_with?("+") && !l.start_with?("+++")
-  end.take(3)
-  removed = diff.lines.select do |l|
-    l.start_with?("-") && !l.start_with?("---")
-  end.take(3)
-  analysis[:sample_added] = added
-  analysis[:sample_removed] = removed
-  analysis
-end
-puts "=" * 80
-puts "BATCH 1 INVESTIGATION: 5 False Positives"
-puts "=" * 80
-puts
-BATCH_1.each_with_index do |test, idx|
-  puts "\n#{idx + 1}/5: #{test[:file].sub('spec/isodoc/', '')}:#{test[:line]}"
-  puts "-" * 80
-  # Run with semantic (should fail)
-  sem_result = run_test(test, "semantic")
-  if sem_result[:success]
-    puts "⚠️  UNEXPECTED: Test passes with semantic (may have been fixed)"
-    next
-  end
-  puts "✓ Confirmed: Fails with semantic as expected"
-  # Extract and analyze diff
-  diff = extract_diff(sem_result[:output])
-  if diff
-    puts "\n📊 Diff Analysis:"
-    analysis = analyze_diff(diff)
-    puts "  Diff size: #{analysis[:line_count]} lines"
-    puts "  Involves whitespace: #{analysis[:whitespace]}" if analysis[:whitespace]
-    puts "  Involves attributes: #{analysis[:attributes]}" if analysis[:attributes]
-    puts "  Involves text content: #{analysis[:text_content]}" if analysis[:text_content]
-    puts "  Involves elements: #{analysis[:elements]}" if analysis[:elements]
-    if analysis[:sample_removed].any?
-      puts "\n  Sample lines REMOVED (semantic sees but DOM doesn't):"
-      analysis[:sample_removed].each { |l| puts "    #{l.strip}" }
-    end
-    if analysis[:sample_added].any?
-      puts "\n  Sample lines ADDED (semantic missing but DOM has):"
-      analysis[:sample_added].each { |l| puts "    #{l.strip}" }
-    end
-    # Show first 30 lines of actual diff
-    puts "\n  📋 First 30 lines of diff:"
-    diff.lines.take(30).each do |line|
-      puts "    #{line.rstrip}"
-    end
-  else
-    puts "\n⚠️  Could not extract diff from output"
-    puts "\nFirst 50 lines of output:"
-    sem_result[:output].lines.take(50).each { |l| puts "  #{l.rstrip}" }
-  end
-end
-puts "\n#{'=' * 80}"
-puts "BATCH 1 INVESTIGATION COMPLETE"
-puts "=" * 80

data/scripts/investigate_classification.rb DELETED Viewed

@@ -1,150 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-# Investigation script to check if Canon is properly classifying
-# differences as normative vs informative according to match options
-require "bundler/setup"
-require_relative "../lib/canon"
-# Sample tests from the 43 common failures
-SAMPLE_TESTS = [
-  { file: "blocks_notes_spec.rb", line: 494, desc: "blocks with notes" },
-  { file: "blocks_provisions_spec.rb", line: 4, desc: "block provisions" },
-  { file: "cleanup_spec.rb", line: 180, desc: "cleanup processing" },
-  { file: "figures_spec.rb", line: 5, desc: "figure handling" },
-  { file: "tables_spec.rb", line: 4, desc: "table structure" },
-].freeze
-def run_single_test(test_info)
-  puts "\n#{'=' * 80}"
-  puts "Test: #{test_info[:file]}:#{test_info[:line]}"
-  puts "Description: #{test_info[:desc]}"
-  puts "=" * 80
-  # Run the test with both algorithms to capture output
-  isodoc_path = File.expand_path("~/src/mn/isodoc")
-  test_path = File.join(isodoc_path, "spec/isodoc", test_info[:file])
-  unless File.exist?(test_path)
-    puts "⚠️  Test file not found: #{test_path}"
-    return
-  end
-  %w[dom semantic].each do |algorithm|
-    puts "\n--- #{algorithm.upcase} Algorithm ---"
-    # Run test and capture output
-    cmd = "cd #{isodoc_path} && " \
-          "CANON_DIFF_ALGORITHM=#{algorithm} " \
-          "bundle exec rspec #{test_path}:#{test_info[:line]} 2>&1"
-    output = `#{cmd}`
-    # Check if test passed or failed
-    if output.include?("0 failures")
-      puts "✅ PASSED"
-      next
-    elsif output.include?("1 failure")
-      puts "❌ FAILED"
-    else
-      puts "⚠️  Unexpected output"
-      next
-    end
-    # Extract diff information
-    extract_diff_info(output)
-  end
-end
-def extract_diff_info(output)
-  # Look for dimension information in the output
-  dimensions_found = []
-  # Common patterns in Canon output
-  dimension_patterns = [
-    /DIFFERENCE.*dimension:\s*(\w+)/i,
-    /Dimension:\s*(\w+)/i,
-    /\[(\w+)\]/,
-  ]
-  dimension_patterns.each do |pattern|
-    output.scan(pattern) do |match|
-      dimension = match[0].downcase.to_sym
-      dimensions_found << dimension unless dimensions_found.include?(dimension)
-    end
-  end
-  if dimensions_found.any?
-    puts "\n📊 Dimensions detected:"
-    dimensions_found.each do |dim|
-      puts "   - #{dim}"
-    end
-  else
-    puts "\n⚠️  No dimension information found in output"
-  end
-  # Look for normative/informative classification
-  if output.match?(/normative/i)
-    puts "\n📝 Normative differences found"
-  end
-  if output.match?(/informative/i)
-    puts "\n📝 Informative differences found"
-  end
-  # Count differences
-  diff_count = output.scan(/DIFFERENCE|difference/i).length
-  puts "\n📈 Approximate difference count: #{diff_count}"
-end
-def check_match_options_config
-  puts "\n#{'=' * 80}"
-  puts "Match Options Configuration Check"
-  puts "=" * 80
-  # Check HTML match options (most common format in isodoc tests)
-  puts "\nHTML Default Match Options:"
-  html_defaults = Canon::Comparison::MatchOptions::Xml::FORMAT_DEFAULTS[:html]
-  html_defaults.each do |dimension, behavior|
-    normative = behavior != :ignore
-    status = normative ? "NORMATIVE" : "INFORMATIVE"
-    puts "  #{dimension.to_s.ljust(25)} : #{behavior.to_s.ljust(12)} → #{status}"
-  end
-  puts "\nKey classifications:"
-  puts "  - attribute_order: #{html_defaults[:attribute_order]} " \
-       "→ #{html_defaults[:attribute_order] == :ignore ? 'INFORMATIVE ✓' : 'NORMATIVE ✗'}"
-  puts "  - text_content: #{html_defaults[:text_content]} " \
-       "→ NORMATIVE (but normalized during matching)"
-  puts "  - structural_whitespace: #{html_defaults[:structural_whitespace]} " \
-       "→ NORMATIVE (but normalized during matching)"
-  puts "  - comments: #{html_defaults[:comments]} " \
-       "→ #{html_defaults[:comments] == :ignore ? 'INFORMATIVE ✓' : 'NORMATIVE ✗'}"
-end
-def main
-  puts "Canon Classification Investigation"
-  puts "Checking if differences are properly classified as normative vs informative"
-  puts "based on match options in effect"
-  # First, show the match options configuration
-  check_match_options_config
-  # Then run sample tests
-  puts "\n\nRunning sample tests to examine actual behavior..."
-  SAMPLE_TESTS.each do |test_info|
-    run_single_test(test_info)
-  end
-  puts "\n#{'=' * 80}"
-  puts "Investigation complete"
-  puts "=" * 80
-  puts "\nKey Questions:"
-  puts "1. Are both algorithms reporting the same dimensions?"
-  puts "2. Are dimensions correctly classified per match options?"
-  puts "3. Are ignored dimensions being treated as informative?"
-  puts "4. Are normalized dimensions still showing as normative when they differ?"
-end
-main if __FILE__ == $PROGRAM_NAME

data/scripts/investigate_classification_detailed.rb DELETED Viewed

@@ -1,190 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-# Detailed investigation of classification in actual failing tests
-require "bundler/setup"
-# Actual failing tests from the 43 common failures
-FAILING_TESTS = [
-  { file: "blocks_spec.rb", line: 352, desc: "examples" },
-  { file: "cleanup_spec.rb", line: 180, desc: "tables with tfoot" },
-  { file: "figures_spec.rb", line: 5, desc: "figures" },
-  { file: "inline_spec.rb", line: 1012, desc: "inline formatting" },
-  { file: "sourcecode_spec.rb", line: 124, desc: "sourcecode" },
-].freeze
-def run_test_with_verbose(test_info, algorithm)
-  isodoc_path = File.expand_path("~/src/mn/isodoc")
-  test_path = "spec/isodoc/#{test_info[:file]}:#{test_info[:line]}"
-  puts "\n--- #{algorithm.upcase} Algorithm ---"
-  # Run test and capture full output
-  cmd = "cd #{isodoc_path} && " \
-        "CANON_DIFF_ALGORITHM=#{algorithm} " \
-        "CANON_VERBOSE=true " \
-        "bundle exec rspec #{test_path} 2>&1"
-  output = `#{cmd}`
-  # Check result
-  if output.include?("0 failures")
-    puts "✅ PASSED - No classification to check"
-    return nil
-  elsif !output.include?("1 failure")
-    puts "⚠️  Unexpected result"
-    return nil
-  end
-  puts "❌ FAILED - Analyzing diff output..."
-  # Extract and analyze dimensions
-  analyze_dimensions(output)
-  output
-end
-def analyze_dimensions(output)
-  # Look for dimension mentions in various formats
-  dimensions = {}
-  # Pattern 1: DIFFERENCE blocks
-  output.scan(/DIFFERENCE.*?dimension:\s*(\w+).*?normative:\s*(\w+)/mi) do |dim, norm|
-    dim_sym = dim.downcase.to_sym
-    is_normative = norm.downcase == "true"
-    dimensions[dim_sym] ||= { normative: 0, informative: 0 }
-    if is_normative
-      dimensions[dim_sym][:normative] += 1
-    else
-      dimensions[dim_sym][:informative] += 1
-    end
-  end
-  # Pattern 2: Simple dimension mentions
-  output.scan(/(?:dimension|Dimension):\s*(\w+)/i) do |match|
-    dim_sym = match[0].downcase.to_sym
-    dimensions[dim_sym] ||= { normative: 0, informative: 0, unknown: 0 }
-    dimensions[dim_sym][:unknown] ||= 0
-    dimensions[dim_sym][:unknown] += 1
-  end
-  if dimensions.any?
-    puts "\n📊 Dimensions found:"
-    dimensions.each do |dim, counts|
-      puts "   #{dim}:"
-      counts.each do |type, count|
-        puts "      #{type}: #{count}" if count.positive?
-      end
-    end
-  else
-    puts "\n⚠️  No dimension information extracted"
-    # Try to find any diff-related output
-    if /expected.*to eq/mi.match?(output)
-      puts "   Found RSpec expectation failure"
-    end
-    if /differ/i.match?(output)
-      puts "   Found 'differ' mentions: #{output.scan(/differ/i).length}"
-    end
-  end
-  # Check for specific match option mentions
-  check_match_options_usage(output)
-end
-def check_match_options_usage(output)
-  puts "\n🔧 Match Options Application:"
-  # Check if attribute_order is mentioned
-  if /attribute.order/i.match?(output)
-    attr_order_count = output.scan(/attribute.order/i).length
-    puts "   ✓ attribute_order mentioned (#{attr_order_count} times)"
-    puts "     Expected: INFORMATIVE (match option: ignore)"
-  end
-  # Check if text normalization is mentioned
-  if /text.*normaliz/i.match?(output)
-    puts "   ✓ text normalization mentioned"
-    puts "     Expected: differences after normalization = NORMATIVE"
-  end
-  # Check if whitespace is mentioned
-  if /whitespace/i.match?(output)
-    ws_count = output.scan(/whitespace/i).length
-    puts "   ✓ whitespace mentioned (#{ws_count} times)"
-    puts "     Expected: structural_whitespace = NORMATIVE (normalized)"
-  end
-  # Check if comments are mentioned
-  if /comment/i.match?(output)
-    comment_count = output.scan(/comment/i).length
-    puts "   ✓ comments mentioned (#{comment_count} times)"
-    puts "     Expected: INFORMATIVE (match option: ignore for HTML)"
-  end
-end
-def compare_algorithms(test_info)
-  puts "\n#{'=' * 80}"
-  puts "Test: #{test_info[:file]}:#{test_info[:line]}"
-  puts "Description: #{test_info[:desc]}"
-  puts "=" * 80
-  dom_output = run_test_with_verbose(test_info, "dom")
-  semantic_output = run_test_with_verbose(test_info, "semantic")
-  if dom_output && semantic_output
-    puts "\n🔍 Comparing algorithm outputs:"
-    # Extract dimension info from both
-    dom_dims = extract_dimension_list(dom_output)
-    sem_dims = extract_dimension_list(semantic_output)
-    if dom_dims == sem_dims
-      puts "   ✅ Both algorithms report same dimensions: #{dom_dims.sort.join(', ')}"
-    else
-      puts "   ⚠️  Algorithms report different dimensions:"
-      puts "      DOM:      #{dom_dims.sort.join(', ')}"
-      puts "      Semantic: #{sem_dims.sort.join(', ')}"
-      puts "      Only in DOM: #{(dom_dims - sem_dims).sort.join(', ')}" if (dom_dims - sem_dims).any?
-      puts "      Only in Semantic: #{(sem_dims - dom_dims).sort.join(', ')}" if (sem_dims - dom_dims).any?
-    end
-  end
-end
-def extract_dimension_list(output)
-  dimensions = []
-  output.scan(/(?:dimension|Dimension):\s*(\w+)/i) do |match|
-    dim = match[0].downcase.to_sym
-    dimensions << dim unless dimensions.include?(dim)
-  end
-  dimensions
-end
-def main
-  puts "Detailed Canon Classification Investigation"
-  puts "Examining actual failing tests to verify correct classification"
-  puts "\nMatch Options for HTML (default isodoc format):"
-  puts "  - attribute_order: ignore → INFORMATIVE ✓"
-  puts "  - text_content: normalize → NORMATIVE (after normalization)"
-  puts "  - structural_whitespace: normalize → NORMATIVE (after normalization)"
-  puts "  - comments: ignore → INFORMATIVE ✓"
-  puts "  - attribute_values: strict → NORMATIVE ✓"
-  puts "  - attribute_presence: strict → NORMATIVE ✓"
-  FAILING_TESTS.each do |test_info|
-    compare_algorithms(test_info)
-    puts "\n"
-  end
-  puts "=" * 80
-  puts "Investigation Complete"
-  puts "=" * 80
-  puts "\nKey Findings to Check:"
-  puts "1. Do both algorithms classify the same dimensions?"
-  puts "2. Are 'ignore' dimensions (attribute_order, comments) INFORMATIVE?"
-  puts "3. Are 'normalize' dimensions NORMATIVE when differences persist?"
-  puts "4. Are 'strict' dimensions always NORMATIVE?"
-end
-main if __FILE__ == $PROGRAM_NAME