canon 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -1
- data/.rubocop_todo.yml +276 -7
- data/README.adoc +203 -138
- data/_config.yml +116 -0
- data/docs/ADVANCED_TOPICS.adoc +20 -0
- data/docs/BASIC_USAGE.adoc +16 -0
- data/docs/CHARACTER_VISUALIZATION.adoc +567 -0
- data/docs/CLI.adoc +493 -0
- data/docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
- data/docs/DIFF_ARCHITECTURE.adoc +435 -0
- data/docs/DIFF_FORMATTING.adoc +540 -0
- data/docs/FORMATS.adoc +447 -0
- data/docs/INDEX.adoc +222 -0
- data/docs/INPUT_VALIDATION.adoc +477 -0
- data/docs/MATCH_ARCHITECTURE.adoc +463 -0
- data/docs/MATCH_OPTIONS.adoc +719 -0
- data/docs/MODES.adoc +432 -0
- data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
- data/docs/OPTIONS.adoc +1387 -0
- data/docs/PREPROCESSING.adoc +491 -0
- data/docs/RSPEC.adoc +605 -0
- data/docs/RUBY_API.adoc +478 -0
- data/docs/SEMANTIC_DIFF_REPORT.adoc +528 -0
- data/docs/UNDERSTANDING_CANON.adoc +17 -0
- data/docs/VERBOSE.adoc +482 -0
- data/exe/canon +7 -0
- data/lib/canon/cli.rb +179 -0
- data/lib/canon/commands/diff_command.rb +195 -0
- data/lib/canon/commands/format_command.rb +113 -0
- data/lib/canon/comparison/base_comparator.rb +39 -0
- data/lib/canon/comparison/comparison_result.rb +79 -0
- data/lib/canon/comparison/html_comparator.rb +410 -0
- data/lib/canon/comparison/json_comparator.rb +212 -0
- data/lib/canon/comparison/match_options.rb +616 -0
- data/lib/canon/comparison/xml_comparator.rb +566 -0
- data/lib/canon/comparison/yaml_comparator.rb +93 -0
- data/lib/canon/comparison.rb +239 -0
- data/lib/canon/config.rb +172 -0
- data/lib/canon/diff/diff_block.rb +71 -0
- data/lib/canon/diff/diff_block_builder.rb +105 -0
- data/lib/canon/diff/diff_classifier.rb +46 -0
- data/lib/canon/diff/diff_context.rb +85 -0
- data/lib/canon/diff/diff_context_builder.rb +107 -0
- data/lib/canon/diff/diff_line.rb +77 -0
- data/lib/canon/diff/diff_node.rb +56 -0
- data/lib/canon/diff/diff_node_mapper.rb +148 -0
- data/lib/canon/diff/diff_report.rb +133 -0
- data/lib/canon/diff/diff_report_builder.rb +62 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +407 -0
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +672 -0
- data/lib/canon/diff_formatter/by_line/json_formatter.rb +284 -0
- data/lib/canon/diff_formatter/by_line/simple_formatter.rb +190 -0
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +860 -0
- data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +292 -0
- data/lib/canon/diff_formatter/by_object/base_formatter.rb +199 -0
- data/lib/canon/diff_formatter/by_object/json_formatter.rb +305 -0
- data/lib/canon/diff_formatter/by_object/xml_formatter.rb +248 -0
- data/lib/canon/diff_formatter/by_object/yaml_formatter.rb +17 -0
- data/lib/canon/diff_formatter/character_map.yml +197 -0
- data/lib/canon/diff_formatter/debug_output.rb +431 -0
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +551 -0
- data/lib/canon/diff_formatter/legend.rb +141 -0
- data/lib/canon/diff_formatter.rb +520 -0
- data/lib/canon/errors.rb +56 -0
- data/lib/canon/formatters/html4_formatter.rb +17 -0
- data/lib/canon/formatters/html5_formatter.rb +17 -0
- data/lib/canon/formatters/html_formatter.rb +37 -0
- data/lib/canon/formatters/html_formatter_base.rb +163 -0
- data/lib/canon/formatters/json_formatter.rb +3 -0
- data/lib/canon/formatters/xml_formatter.rb +20 -55
- data/lib/canon/formatters/yaml_formatter.rb +4 -1
- data/lib/canon/pretty_printer/html.rb +57 -0
- data/lib/canon/pretty_printer/json.rb +25 -0
- data/lib/canon/pretty_printer/xml.rb +29 -0
- data/lib/canon/rspec_matchers.rb +222 -80
- data/lib/canon/validators/base_validator.rb +49 -0
- data/lib/canon/validators/html_validator.rb +138 -0
- data/lib/canon/validators/json_validator.rb +89 -0
- data/lib/canon/validators/xml_validator.rb +53 -0
- data/lib/canon/validators/yaml_validator.rb +73 -0
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/attribute_handler.rb +80 -0
- data/lib/canon/xml/c14n.rb +36 -0
- data/lib/canon/xml/character_encoder.rb +38 -0
- data/lib/canon/xml/data_model.rb +225 -0
- data/lib/canon/xml/element_matcher.rb +196 -0
- data/lib/canon/xml/line_range_mapper.rb +158 -0
- data/lib/canon/xml/namespace_handler.rb +86 -0
- data/lib/canon/xml/node.rb +32 -0
- data/lib/canon/xml/nodes/attribute_node.rb +54 -0
- data/lib/canon/xml/nodes/comment_node.rb +23 -0
- data/lib/canon/xml/nodes/element_node.rb +56 -0
- data/lib/canon/xml/nodes/namespace_node.rb +38 -0
- data/lib/canon/xml/nodes/processing_instruction_node.rb +24 -0
- data/lib/canon/xml/nodes/root_node.rb +16 -0
- data/lib/canon/xml/nodes/text_node.rb +23 -0
- data/lib/canon/xml/processor.rb +151 -0
- data/lib/canon/xml/whitespace_normalizer.rb +72 -0
- data/lib/canon/xml/xml_base_handler.rb +188 -0
- data/lib/canon.rb +14 -3
- metadata +116 -21
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "yaml"
|
|
4
|
+
require "date"
|
|
5
|
+
require "time"
|
|
6
|
+
require_relative "base_validator"
|
|
7
|
+
|
|
8
|
+
module Canon
  module Validators
    # Checks that a string is well-formed YAML.
    #
    # Parsing is delegated to Ruby's YAML safe loader. A syntax failure is
    # re-raised as Canon::ValidationError carrying the reported line and
    # column plus a snippet of the offending input line.
    class YamlValidator < BaseValidator
      class << self
        # Validate YAML input.
        #
        # nil, empty and whitespace-only input is accepted without parsing.
        #
        # @param input [String] The YAML string to validate
        # @raise [Canon::ValidationError] If YAML is malformed
        # @return [void]
        def validate!(input)
          return if input.nil?
          return if input.strip.empty?

          begin
            YAML.safe_load(input, permitted_classes: [Symbol, Date, Time])
          rescue Psych::SyntaxError => e
            position = extract_location(e)
            summary = clean_error_message(e.message)
            snippet = extract_context(input, e)

            raise Canon::ValidationError.new(
              summary,
              format: :yaml,
              line: position[:line],
              column: position[:column],
              details: snippet,
            )
          end
        end

        private

        # Reduce a parser message to its first line, without the
        # "(<unknown>):" source marker.
        #
        # @param message [String] The raw error message
        # @return [String] Cleaned error message
        def clean_error_message(message)
          without_source = message.gsub(/\(<unknown>\):\s*/, "")
          first_line = without_source.split("\n").first
          first_line.strip
        end

        # Quote the input line the error points at, followed by a caret line
        # when the error also reports a column.
        #
        # @param input [String] The input YAML string
        # @param error [Psych::SyntaxError] The syntax error
        # @return [String, nil] Context snippet, or nil when the error has no
        #   line or the line lies outside the input
        def extract_context(input, error)
          reported_line = error.line
          return nil unless reported_line

          source_lines = input.split("\n")
          index = reported_line - 1
          return nil unless index >= 0 && index < source_lines.size

          quoted = "Line content: #{source_lines[index]}"
          reported_column = error.column
          return quoted unless reported_column

          "#{quoted}\n#{' ' * (reported_column - 1)}^"
        end
      end
    end
  end
end
|
data/lib/canon/version.rb
CHANGED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
  module Xml
    # Attribute handler for C14N 1.1.
    #
    # Writes the attribute axis of an element to an output buffer and, when
    # ancestors were omitted from the node set, also emits the simple
    # inheritable attributes found on those ancestors.
    class AttributeHandler
      # @param encoder [#encode_attribute] object used to escape attribute
      #   values before they are written
      def initialize(encoder)
        @encoder = encoder
      end

      # Process the attribute axis of an element.
      #
      # Nothing is written when the element itself is outside the node set.
      # Each attribute is appended as ` qname="encoded value"`.
      #
      # @param element [#in_node_set?, #sorted_attribute_nodes, #attribute_nodes]
      #   element whose attributes are serialized
      # @param output [#<<] buffer the serialized attributes are appended to
      # @param omitted_ancestors [Array] ancestors left out of the node set,
      #   in the order supplied by the caller; they are scanned last-to-first
      # @return [Array, nil] the attributes written, or nil when skipped
      # rubocop:disable Metrics/MethodLength
      def process_attributes(element, output, omitted_ancestors = [])
        return unless element.in_node_set?

        # Already ordered: either by the element itself or by merge_attributes.
        attributes = collect_attributes(element, omitted_ancestors)

        attributes.each do |attr|
          output << " "
          output << attr.qname
          output << '="'
          output << @encoder.encode_attribute(attr.value)
          output << '"'
        end
      end
      # rubocop:enable Metrics/MethodLength

      private

      # Collect the element's own in-node-set attributes and, when there are
      # omitted ancestors, merge in the attributes inherited from them.
      def collect_attributes(element, omitted_ancestors)
        attributes = element.sorted_attribute_nodes.select(&:in_node_set?)
        return attributes unless omitted_ancestors.any?

        inherited = collect_inherited_attributes(element, omitted_ancestors)
        merge_attributes(attributes, inherited)
      end

      # Collect simple inheritable attributes from omitted ancestors.
      #
      # An attribute name is taken at most once: names already present on the
      # element win, then the first ancestor (scanning from the end of
      # omitted_ancestors backwards) that carries the name.
      # rubocop:disable Metrics/MethodLength
      def collect_inherited_attributes(element, omitted_ancestors)
        inherited = []
        seen = Set.new

        # Names the element already defines must not be inherited again.
        element.attribute_nodes.each do |attr|
          seen.add(attr.name) if attr.simple_inheritable?
        end

        # reverse_each walks the list backwards without copying it.
        omitted_ancestors.reverse_each do |ancestor|
          ancestor.attribute_nodes.each do |attr|
            next unless attr.simple_inheritable?
            next if seen.include?(attr.name)

            inherited << attr
            seen.add(attr.name)
          end
        end

        inherited
      end
      # rubocop:enable Metrics/MethodLength

      # Combine own and inherited attributes, ordered by namespace URI
      # (nil sorts as the empty string) and then by local name.
      def merge_attributes(element_attrs, inherited_attrs)
        (element_attrs + inherited_attrs).sort_by do |attr|
          [attr.namespace_uri.to_s, attr.local_name]
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "data_model"
|
|
4
|
+
require_relative "processor"
|
|
5
|
+
|
|
6
|
+
module Canon
  module Xml
    # Entry point for XML Canonicalization 1.1
    # (W3C Recommendation: https://www.w3.org/TR/xml-c14n11/).
    class C14n
      class << self
        # Produce the canonical form of a whole XML document.
        #
        # The input is first turned into a data model, which is then handed
        # to a Processor for serialization.
        #
        # @param xml [String] XML document as string
        # @param with_comments [Boolean] Include comments in canonical form
        # @return [String] Canonical form in UTF-8
        def canonicalize(xml, with_comments: false)
          data_model = DataModel.from_xml(xml)
          Processor.new(with_comments: with_comments).process(data_model)
        end

        # Canonicalize a document subset (for future implementation).
        #
        # TODO: Implement XPath-based subset selection. Until then the XPath
        # argument is ignored and the complete document is canonicalized.
        #
        # @param xml [String] XML document as string
        # @param _xpath [String] XPath expression for subset selection (unused)
        # @param with_comments [Boolean] Include comments in canonical form
        # @return [String] Canonical form in UTF-8
        def canonicalize_subset(xml, _xpath, with_comments: false)
          canonicalize(xml, with_comments: with_comments)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
  module Xml
    # Character encoder for C14N 1.1.
    #
    # Replaces the characters that Canonical XML requires to be written as
    # entity or character references. All other characters pass through
    # unchanged.
    class CharacterEncoder
      # Replacements applied to text node content.
      TEXT_ESCAPES = {
        "&" => "&amp;",
        "<" => "&lt;",
        ">" => "&gt;",
        "\r" => "&#xD;",
      }.freeze

      # Replacements applied to attribute values. ">" is deliberately absent:
      # it is only escaped in text nodes.
      ATTRIBUTE_ESCAPES = {
        "&" => "&amp;",
        "<" => "&lt;",
        '"' => "&quot;",
        "\t" => "&#x9;",
        "\n" => "&#xA;",
        "\r" => "&#xD;",
      }.freeze

      # Encode text node content.
      # Replace: & → &amp;, < → &lt;, > → &gt;, #xD → &#xD;
      #
      # @param text [String] raw text node content
      # @return [String] a new string with the replacements applied
      def encode_text(text)
        # gsub with a Hash looks each matched character up in the table.
        text.gsub(/[&<>\r]/, TEXT_ESCAPES)
      end

      # Encode attribute value.
      # Replace: & → &amp;, < → &lt;, " → &quot;,
      # #x9 → &#x9;, #xA → &#xA;, #xD → &#xD;
      #
      # @param value [String] raw attribute value
      # @return [String] a new string with the replacements applied
      def encode_attribute(value)
        value.gsub(/[&<"\t\n\r]/, ATTRIBUTE_ESCAPES)
      end
    end
  end
end
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
require "set"
|
|
5
|
+
require_relative "nodes/root_node"
|
|
6
|
+
require_relative "nodes/element_node"
|
|
7
|
+
require_relative "nodes/namespace_node"
|
|
8
|
+
require_relative "nodes/attribute_node"
|
|
9
|
+
require_relative "nodes/text_node"
|
|
10
|
+
require_relative "nodes/comment_node"
|
|
11
|
+
require_relative "nodes/processing_instruction_node"
|
|
12
|
+
|
|
13
|
+
module Canon
  module Xml
    # Builds the XPath-style data model (Nodes::*) from parsed markup.
    #
    # Input is parsed with Nokogiri and converted into a tree rooted at a
    # Nodes::RootNode containing element, namespace, attribute, text,
    # comment and processing-instruction nodes.
    class DataModel
      # A leading URI scheme such as "http:" or "urn:". Anchored with \A so
      # that a scheme appearing on a later line of the value does not count.
      URI_SCHEME = /\A[a-zA-Z][a-zA-Z0-9+.-]*:/.freeze

      # URI bound to the reserved "xml" prefix.
      XML_NAMESPACE_URI = "http://www.w3.org/XML/1998/namespace"

      # Build XPath data model from XML string.
      #
      # @param xml_string [String] XML document to parse (strict, no network)
      # @raise [Canon::Error] if a namespace declaration uses a relative URI
      # @return [Nodes::RootNode] Root of the data model tree
      def self.from_xml(xml_string)
        doc = Nokogiri::XML(xml_string) do |config|
          config.nonet # Disable network access
          config.strict # Strict parsing
        end

        # Relative namespace URIs are prohibited by C14N 1.1
        check_for_relative_namespace_uris(doc)

        build_from_nokogiri(doc)
      end

      # Build XPath data model from HTML string.
      #
      # The input is parsed as a fragment. The relative namespace URI check
      # done for XML is intentionally skipped for HTML.
      #
      # @param html_string [String] HTML content to parse
      # @param version [Symbol] HTML version (:html5 selects the HTML5
      #   parser; any other value uses the HTML4 parser)
      # @return [Nodes::RootNode] Root of the data model tree
      def self.from_html(html_string, version: :html4)
        doc = if version == :html5
                Nokogiri::HTML5.fragment(html_string)
              else
                Nokogiri::HTML4.fragment(html_string)
              end

        build_from_nokogiri(doc)
      end

      # Raise if any element declares a namespace with a relative URI
      # (prohibited by C14N 1.1). Empty or missing hrefs are ignored.
      #
      # @param doc [Nokogiri::XML::Document] parsed document to inspect
      # @raise [Canon::Error] on the first relative namespace URI found
      # rubocop:disable Metrics/MethodLength
      def self.check_for_relative_namespace_uris(doc)
        doc.traverse do |node|
          next unless node.is_a?(Nokogiri::XML::Element)

          node.namespace_definitions.each do |ns|
            href = ns.href
            next if href.nil? || href.empty?
            next unless relative_uri?(href)

            raise Canon::Error,
                  "Relative namespace URI not allowed: #{href}"
          end
        end
      end
      # rubocop:enable Metrics/MethodLength

      # Check if a URI is relative, i.e. does not start with a scheme.
      #
      # @param uri [String, nil] URI to test (nil counts as relative)
      # @return [Boolean]
      def self.relative_uri?(uri)
        !URI_SCHEME.match?(uri)
      end

      # Build XPath data model from a Nokogiri document or fragment.
      #
      # When the input has a root element it is added first, followed by the
      # remaining top-level nodes; DTD nodes and node types without a
      # data-model counterpart are skipped.
      #
      # NOTE(review): because the document element is appended first,
      # comments/PIs that precede it in the source end up after it in
      # root.children — confirm consumers do not rely on document order here.
      #
      # @param nokogiri_doc [Nokogiri::XML::Document, Nokogiri::XML::DocumentFragment]
      # @return [Nodes::RootNode]
      # rubocop:disable Metrics/MethodLength
      def self.build_from_nokogiri(nokogiri_doc)
        root = Nodes::RootNode.new

        # Documents expose a single root element; fragments do not and are
        # handled purely through their children below.
        document_element = nokogiri_doc.root if nokogiri_doc.respond_to?(:root)
        root.add_child(build_element_node(document_element)) if document_element

        nokogiri_doc.children.each do |child|
          next if document_element && child == document_element
          next if child.is_a?(Nokogiri::XML::DTD)

          node = build_node_from_nokogiri(child)
          root.add_child(node) if node
        end

        root
      end
      # rubocop:enable Metrics/MethodLength

      # Convert one Nokogiri node into its data-model counterpart.
      #
      # @param nokogiri_node [Nokogiri::XML::Node]
      # @return [Object, nil] the built node, or nil for unsupported node
      #   types and for text nodes dropped by build_text_node
      def self.build_node_from_nokogiri(nokogiri_node)
        case nokogiri_node
        when Nokogiri::XML::Element
          build_element_node(nokogiri_node)
        when Nokogiri::XML::Text
          build_text_node(nokogiri_node)
        when Nokogiri::XML::Comment
          build_comment_node(nokogiri_node)
        when Nokogiri::XML::ProcessingInstruction
          build_pi_node(nokogiri_node)
        end
      end

      # Build an element node, including its namespace nodes, attribute
      # nodes and (recursively) its children.
      #
      # @param nokogiri_element [Nokogiri::XML::Element]
      # @return [Nodes::ElementNode]
      # rubocop:disable Metrics/MethodLength
      def self.build_element_node(nokogiri_element)
        element = Nodes::ElementNode.new(
          name: nokogiri_element.name,
          namespace_uri: nokogiri_element.namespace&.href,
          prefix: nokogiri_element.namespace&.prefix,
        )

        # Namespace nodes include declarations inherited from ancestors
        build_namespace_nodes(nokogiri_element, element)
        build_attribute_nodes(nokogiri_element, element)

        nokogiri_element.children.each do |child|
          node = build_node_from_nokogiri(child)
          element.add_child(node) if node
        end

        element
      end
      # rubocop:enable Metrics/MethodLength

      # Attach one namespace node per in-scope namespace to the element.
      #
      # @param nokogiri_element [Nokogiri::XML::Element] source element
      # @param element [Nodes::ElementNode] node receiving the namespaces
      def self.build_namespace_nodes(nokogiri_element, element)
        collect_in_scope_namespaces(nokogiri_element).each do |prefix, uri|
          element.add_namespace(Nodes::NamespaceNode.new(prefix: prefix, uri: uri))
        end
      end

      # Collect all in-scope namespaces for an element by walking from the
      # element up through its ancestors. The default namespace is stored
      # under the empty-string prefix.
      #
      # @param nokogiri_element [Nokogiri::XML::Element]
      # @return [Hash{String => String}] prefix => namespace URI
      # rubocop:disable Metrics/MethodLength
      def self.collect_in_scope_namespaces(nokogiri_element)
        namespaces = {}

        current = nokogiri_element
        while current && !current.is_a?(Nokogiri::XML::Document)
          if current.is_a?(Nokogiri::XML::Element)
            current.namespace_definitions.each do |ns|
              prefix = ns.prefix || ""
              # Nearer declarations were recorded first and win over
              # declarations further up the tree.
              namespaces[prefix] = ns.href unless namespaces.key?(prefix)
            end
          end
          current = current.parent
        end

        # The xml prefix is always in scope
        namespaces["xml"] ||= XML_NAMESPACE_URI

        namespaces
      end
      # rubocop:enable Metrics/MethodLength

      # Build attribute nodes for an element.
      #
      # Iterates attribute_nodes rather than the attributes Hash: the Hash is
      # keyed by un-prefixed name, so attributes sharing a local name in
      # different namespaces would overwrite each other. Only genuine
      # namespace declarations ("xmlns" / "xmlns:*") are skipped.
      #
      # @param nokogiri_element [#attribute_nodes] source element
      # @param element [#add_attribute] node receiving the attributes
      def self.build_attribute_nodes(nokogiri_element, element)
        nokogiri_element.attribute_nodes.each do |attr|
          next if namespace_declaration?(attr.name)

          attr_node = Nodes::AttributeNode.new(
            name: attr.name,
            value: attr.value,
            namespace_uri: attr.namespace&.href,
            prefix: attr.namespace&.prefix,
          )
          element.add_attribute(attr_node)
        end
      end

      # Whether an attribute name is a namespace declaration.
      #
      # @param name [String]
      # @return [Boolean]
      def self.namespace_declaration?(name)
        name == "xmlns" || name.start_with?("xmlns:")
      end
      private_class_method :namespace_declaration?

      # Build a text node from a Nokogiri text node.
      #
      # Whitespace-only text whose parent is an element is dropped (returns
      # nil). The content is used as delivered by Nokogiri.
      #
      # @param nokogiri_text [Nokogiri::XML::Text]
      # @return [Nodes::TextNode, nil]
      def self.build_text_node(nokogiri_text)
        content = nokogiri_text.content
        return nil if content.strip.empty? && nokogiri_text.parent.is_a?(Nokogiri::XML::Element)

        Nodes::TextNode.new(value: content)
      end

      # Build a comment node from a Nokogiri comment.
      #
      # @param nokogiri_comment [Nokogiri::XML::Comment]
      # @return [Nodes::CommentNode]
      def self.build_comment_node(nokogiri_comment)
        Nodes::CommentNode.new(value: nokogiri_comment.content)
      end

      # Build a processing-instruction node from a Nokogiri PI.
      #
      # @param nokogiri_pi [Nokogiri::XML::ProcessingInstruction]
      # @return [Nodes::ProcessingInstructionNode]
      def self.build_pi_node(nokogiri_pi)
        Nodes::ProcessingInstructionNode.new(
          target: nokogiri_pi.name,
          data: nokogiri_pi.content,
        )
      end
    end
  end
end
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
  module Xml
    # Pairs up XML elements of two DOM trees by meaning rather than by line.
    #
    # == Matching Strategy
    #
    # For every pair of sibling lists, two passes are made:
    #
    # 1. **Identity attribute matching**: elements with the same name and the
    #    same identity attribute value are paired (e.g. id="foo" with id="foo")
    # 2. **Position-based matching**: what is left is paired by element name
    #    and order of appearance
    #
    # Paired elements have their children matched recursively. Elements left
    # unpaired are reported as deleted (first tree only) or inserted (second
    # tree only).
    #
    # == Identity Attributes
    #
    # Defaults: id, ref, name, key. The first of these present on an element
    # supplies its identity. A custom list can be given to the constructor.
    #
    # == Usage
    #
    #   matcher = ElementMatcher.new
    #   root1 = Canon::Xml::DataModel.from_xml(xml1)
    #   root2 = Canon::Xml::DataModel.from_xml(xml2)
    #
    #   matcher.match_trees(root1, root2).each do |match|
    #     case match.status
    #     when :matched  then # element present in both trees
    #     when :deleted  then # element only in first tree
    #     when :inserted then # element only in second tree
    #     end
    #   end
    #
    class ElementMatcher
      # Default attributes used to identify elements
      DEFAULT_IDENTITY_ATTRS = %w[id ref name key].freeze

      # Outcome for one element: its status, the element from each tree
      # (nil on the side where it is absent) and its path of element names.
      MatchResult = Struct.new(:status, :elem1, :elem2, :path) do
        %i[matched inserted deleted].each do |kind|
          define_method(:"#{kind}?") { status == kind }
        end
      end

      # @param identity_attrs [Array<String>] attribute names that identify
      #   an element, in order of preference
      def initialize(identity_attrs: DEFAULT_IDENTITY_ATTRS)
        @identity_attrs = identity_attrs
        @matches = []
      end

      # Match elements between two DOM trees.
      #
      # Results from any earlier call are discarded.
      #
      # @param root1 [Canon::Xml::Nodes::RootNode] First DOM tree
      # @param root2 [Canon::Xml::Nodes::RootNode] Second DOM tree
      # @return [Array<MatchResult>] Array of match results
      def match_trees(root1, root2)
        @matches = []
        match_children(root1.children, root2.children, [])
        @matches
      end

      private

      # Match two sibling lists, recursing into every pair that is found.
      def match_children(children1, children2, path)
        left = children1.select { |node| node.node_type == :element }
        right = children2.select { |node| node.node_type == :element }

        left_by_identity = build_identity_map(left)
        right_by_identity = build_identity_map(right)

        paired_left = Set.new
        paired_right = Set.new

        # Pass 1: same element name and identity value on both sides.
        left_by_identity.each do |identity, candidate|
          next unless right_by_identity.key?(identity)

          pair(candidate, right_by_identity[identity], path, paired_left, paired_right)
        end

        # Pass 2: whatever is left over, by name and order of appearance.
        rest_left = left.reject { |elem| paired_left.include?(elem) }
        rest_right = right.reject { |elem| paired_right.include?(elem) }
        match_by_position(rest_left, rest_right, path, paired_left, paired_right)

        # Anything still unpaired exists in one tree only.
        record_unmatched(:deleted, rest_left, paired_left, path)
        record_unmatched(:inserted, rest_right, paired_right, path)
      end

      # Pair same-named elements in order of appearance, up to the length of
      # the shorter list for each name.
      def match_by_position(elems1, elems2, path, matched1, matched2)
        grouped1 = elems1.group_by(&:name)
        grouped2 = elems2.group_by(&:name)

        grouped1.each do |name, list1|
          next unless grouped2.key?(name)

          list2 = grouped2[name]
          list1.take(list2.length).zip(list2).each do |elem1, elem2|
            next if matched1.include?(elem1) || matched2.include?(elem2)

            pair(elem1, elem2, path, matched1, matched2)
          end
        end
      end

      # Record a matched pair, mark both sides as used and descend into
      # their children.
      def pair(elem1, elem2, path, matched1, matched2)
        elem_path = path + [elem1.name]
        @matches << MatchResult.new(:matched, elem1, elem2, elem_path)
        matched1.add(elem1)
        matched2.add(elem2)

        match_children(elem1.children, elem2.children, elem_path)
      end

      # Record every element not yet paired as :deleted or :inserted.
      def record_unmatched(status, elements, paired, path)
        elements.each do |elem|
          next if paired.include?(elem)

          elem_path = path + [elem.name]
          @matches << if status == :deleted
                        MatchResult.new(:deleted, elem, nil, elem_path)
                      else
                        MatchResult.new(:inserted, nil, elem, elem_path)
                      end
        end
      end

      # Build map of "name#identity" → element. Elements without an identity
      # are left out; a later element with the same key replaces an earlier one.
      def build_identity_map(elements)
        elements.each_with_object({}) do |elem, index|
          identity = extract_identity(elem)
          next unless identity

          # The element name is part of the key so that different element
          # types sharing an identity value stay apart.
          index["#{elem.name}##{identity}"] = elem
        end
      end

      # Value of the first identity attribute present on the element, or nil
      # when it has none of them.
      def extract_identity(elem)
        @identity_attrs.each do |wanted|
          hit = elem.attribute_nodes.find { |candidate| candidate.name == wanted }
          return hit.value if hit
        end

        nil
      end
    end
  end
end
|