canon 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -1
  3. data/.rubocop_todo.yml +276 -7
  4. data/README.adoc +203 -138
  5. data/_config.yml +116 -0
  6. data/docs/ADVANCED_TOPICS.adoc +20 -0
  7. data/docs/BASIC_USAGE.adoc +16 -0
  8. data/docs/CHARACTER_VISUALIZATION.adoc +567 -0
  9. data/docs/CLI.adoc +493 -0
  10. data/docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
  11. data/docs/DIFF_ARCHITECTURE.adoc +435 -0
  12. data/docs/DIFF_FORMATTING.adoc +540 -0
  13. data/docs/FORMATS.adoc +447 -0
  14. data/docs/INDEX.adoc +222 -0
  15. data/docs/INPUT_VALIDATION.adoc +477 -0
  16. data/docs/MATCH_ARCHITECTURE.adoc +463 -0
  17. data/docs/MATCH_OPTIONS.adoc +719 -0
  18. data/docs/MODES.adoc +432 -0
  19. data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
  20. data/docs/OPTIONS.adoc +1387 -0
  21. data/docs/PREPROCESSING.adoc +491 -0
  22. data/docs/RSPEC.adoc +605 -0
  23. data/docs/RUBY_API.adoc +478 -0
  24. data/docs/SEMANTIC_DIFF_REPORT.adoc +528 -0
  25. data/docs/UNDERSTANDING_CANON.adoc +17 -0
  26. data/docs/VERBOSE.adoc +482 -0
  27. data/exe/canon +7 -0
  28. data/lib/canon/cli.rb +179 -0
  29. data/lib/canon/commands/diff_command.rb +195 -0
  30. data/lib/canon/commands/format_command.rb +113 -0
  31. data/lib/canon/comparison/base_comparator.rb +39 -0
  32. data/lib/canon/comparison/comparison_result.rb +79 -0
  33. data/lib/canon/comparison/html_comparator.rb +410 -0
  34. data/lib/canon/comparison/json_comparator.rb +212 -0
  35. data/lib/canon/comparison/match_options.rb +616 -0
  36. data/lib/canon/comparison/xml_comparator.rb +566 -0
  37. data/lib/canon/comparison/yaml_comparator.rb +93 -0
  38. data/lib/canon/comparison.rb +239 -0
  39. data/lib/canon/config.rb +172 -0
  40. data/lib/canon/diff/diff_block.rb +71 -0
  41. data/lib/canon/diff/diff_block_builder.rb +105 -0
  42. data/lib/canon/diff/diff_classifier.rb +46 -0
  43. data/lib/canon/diff/diff_context.rb +85 -0
  44. data/lib/canon/diff/diff_context_builder.rb +107 -0
  45. data/lib/canon/diff/diff_line.rb +77 -0
  46. data/lib/canon/diff/diff_node.rb +56 -0
  47. data/lib/canon/diff/diff_node_mapper.rb +148 -0
  48. data/lib/canon/diff/diff_report.rb +133 -0
  49. data/lib/canon/diff/diff_report_builder.rb +62 -0
  50. data/lib/canon/diff_formatter/by_line/base_formatter.rb +407 -0
  51. data/lib/canon/diff_formatter/by_line/html_formatter.rb +672 -0
  52. data/lib/canon/diff_formatter/by_line/json_formatter.rb +284 -0
  53. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +190 -0
  54. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +860 -0
  55. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +292 -0
  56. data/lib/canon/diff_formatter/by_object/base_formatter.rb +199 -0
  57. data/lib/canon/diff_formatter/by_object/json_formatter.rb +305 -0
  58. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +248 -0
  59. data/lib/canon/diff_formatter/by_object/yaml_formatter.rb +17 -0
  60. data/lib/canon/diff_formatter/character_map.yml +197 -0
  61. data/lib/canon/diff_formatter/debug_output.rb +431 -0
  62. data/lib/canon/diff_formatter/diff_detail_formatter.rb +551 -0
  63. data/lib/canon/diff_formatter/legend.rb +141 -0
  64. data/lib/canon/diff_formatter.rb +520 -0
  65. data/lib/canon/errors.rb +56 -0
  66. data/lib/canon/formatters/html4_formatter.rb +17 -0
  67. data/lib/canon/formatters/html5_formatter.rb +17 -0
  68. data/lib/canon/formatters/html_formatter.rb +37 -0
  69. data/lib/canon/formatters/html_formatter_base.rb +163 -0
  70. data/lib/canon/formatters/json_formatter.rb +3 -0
  71. data/lib/canon/formatters/xml_formatter.rb +20 -55
  72. data/lib/canon/formatters/yaml_formatter.rb +4 -1
  73. data/lib/canon/pretty_printer/html.rb +57 -0
  74. data/lib/canon/pretty_printer/json.rb +25 -0
  75. data/lib/canon/pretty_printer/xml.rb +29 -0
  76. data/lib/canon/rspec_matchers.rb +222 -80
  77. data/lib/canon/validators/base_validator.rb +49 -0
  78. data/lib/canon/validators/html_validator.rb +138 -0
  79. data/lib/canon/validators/json_validator.rb +89 -0
  80. data/lib/canon/validators/xml_validator.rb +53 -0
  81. data/lib/canon/validators/yaml_validator.rb +73 -0
  82. data/lib/canon/version.rb +1 -1
  83. data/lib/canon/xml/attribute_handler.rb +80 -0
  84. data/lib/canon/xml/c14n.rb +36 -0
  85. data/lib/canon/xml/character_encoder.rb +38 -0
  86. data/lib/canon/xml/data_model.rb +225 -0
  87. data/lib/canon/xml/element_matcher.rb +196 -0
  88. data/lib/canon/xml/line_range_mapper.rb +158 -0
  89. data/lib/canon/xml/namespace_handler.rb +86 -0
  90. data/lib/canon/xml/node.rb +32 -0
  91. data/lib/canon/xml/nodes/attribute_node.rb +54 -0
  92. data/lib/canon/xml/nodes/comment_node.rb +23 -0
  93. data/lib/canon/xml/nodes/element_node.rb +56 -0
  94. data/lib/canon/xml/nodes/namespace_node.rb +38 -0
  95. data/lib/canon/xml/nodes/processing_instruction_node.rb +24 -0
  96. data/lib/canon/xml/nodes/root_node.rb +16 -0
  97. data/lib/canon/xml/nodes/text_node.rb +23 -0
  98. data/lib/canon/xml/processor.rb +151 -0
  99. data/lib/canon/xml/whitespace_normalizer.rb +72 -0
  100. data/lib/canon/xml/xml_base_handler.rb +188 -0
  101. data/lib/canon.rb +14 -3
  102. metadata +116 -21
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "yaml"
4
+ require "date"
5
+ require "time"
6
+ require_relative "base_validator"
7
+
8
+ module Canon
9
+ module Validators
10
# Validator for YAML input
#
# Validates YAML input using Ruby's YAML parser.
# Raises detailed ValidationError with position information
# when malformed YAML is detected.
class YamlValidator < BaseValidator
  # Label placed in front of the offending line inside the error details.
  # The column indicator has to be shifted by this label's width so the
  # caret ends up under the right character of the line shown above it.
  LINE_CONTENT_PREFIX = "Line content: "
  private_constant :LINE_CONTENT_PREFIX

  # Validate YAML input
  #
  # Blank input (nil, empty or whitespace-only) is accepted without parsing.
  # Anything else is parsed with YAML.safe_load, permitting Symbol, Date and
  # Time in addition to the default safe types.
  #
  # NOTE(review): only Psych::SyntaxError is translated into a
  # Canon::ValidationError here; any other exception raised by safe_load
  # (for example for disallowed classes or aliases) propagates unchanged.
  # Confirm that this is what callers expect.
  #
  # @param input [String, nil] The YAML string to validate
  # @raise [Canon::ValidationError] If YAML is malformed
  # @return [void]
  def self.validate!(input)
    return if input.nil? || input.strip.empty?

    YAML.safe_load(input, permitted_classes: [Symbol, Date, Time])
  rescue Psych::SyntaxError => e
    # extract_location is not defined in this class; presumably inherited
    # from BaseValidator.
    location = extract_location(e)

    raise Canon::ValidationError.new(
      clean_error_message(e.message),
      format: :yaml,
      line: location[:line],
      column: location[:column],
      details: extract_context(input, e),
    )
  end

  # Clean error message by removing the "(<unknown>):" source marker and
  # keeping only the first line.
  #
  # @param message [String] The raw error message
  # @return [String] Cleaned error message ("" when the message is empty)
  def self.clean_error_message(message)
    first_line = message.gsub(/\(<unknown>\):\s*/, "").split("\n").first

    # split returns [] for an empty message, so first_line may be nil
    first_line.to_s.strip
  end

  # Extract context around the error
  #
  # @param input [String] The input YAML string
  # @param error [Psych::SyntaxError] The syntax error
  # @return [String, nil] The offending line, followed (when a column is
  #   known) by a second line with a "^" under the reported column; nil when
  #   the error carries no line or the line is outside the input
  def self.extract_context(input, error)
    return nil unless error.line

    lines = input.split("\n")
    line_idx = error.line - 1
    return nil if line_idx.negative? || line_idx >= lines.size

    context = "#{LINE_CONTENT_PREFIX}#{lines[line_idx]}"
    return context unless error.column

    # The column is treated as 1-based; clamp at zero so an unexpected
    # column of 0 cannot turn into a negative repeat count.
    offset = LINE_CONTENT_PREFIX.length + [error.column - 1, 0].max
    "#{context}\n#{' ' * offset}^"
  end

  private_class_method :clean_error_message, :extract_context
end
72
+ end
73
+ end
data/lib/canon/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Canon
4
- VERSION = "0.1.3"
4
+ VERSION = "0.1.5"
5
5
  end
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Canon
4
+ module Xml
5
# Attribute handler for C14N 1.1
# Handles attribute processing per spec
class AttributeHandler
  # @param encoder [#encode_attribute] escapes attribute values for output
  def initialize(encoder)
    @encoder = encoder
  end

  # Process attribute axis of an element
  # Includes handling of simple inheritable attributes for document subsets
  #
  # Appends ` qname="escaped value"` to +output+ for every attribute to be
  # rendered. Nothing is written when the element is not in the node set.
  # Ordering is decided by the collectors below, not by this method.
  #
  # @param element [#in_node_set?, #sorted_attribute_nodes, #attribute_nodes]
  # @param output [#<<] sink receiving the serialized attributes
  # @param omitted_ancestors [Array] ancestors left out of the node set whose
  #   simple inheritable attributes may be inherited by +element+
  def process_attributes(element, output, omitted_ancestors = [])
    return unless element.in_node_set?

    collect_attributes(element, omitted_ancestors).each do |attr|
      output << " "
      output << attr.qname
      output << '="'
      output << @encoder.encode_attribute(attr.value)
      output << '"'
    end
  end

  private

  # Collect attributes including inherited simple inheritable attributes
  #
  # Starts from the element's own (already sorted) attributes that are in
  # the node set; inherited attributes are merged in only when there are
  # omitted ancestors to inherit from.
  def collect_attributes(element, omitted_ancestors)
    own = element.sorted_attribute_nodes.select(&:in_node_set?)
    return own unless omitted_ancestors.any?

    merge_attributes(own, collect_inherited_attributes(element, omitted_ancestors))
  end

  # Collect simple inheritable attributes from omitted ancestors
  #
  # Ancestors are visited from the end of the list towards the start and the
  # first occurrence of each attribute name wins; names already present on
  # the element itself are never inherited.
  # NOTE(review): presumably the list is ordered outermost-first so that the
  # nearest ancestor takes precedence - confirm against the caller.
  def collect_inherited_attributes(element, omitted_ancestors)
    # A plain Hash serves as the "seen" set, so this file does not depend on
    # the Set library having been required elsewhere.
    seen = {}
    element.attribute_nodes.each do |attr|
      seen[attr.name] = true if attr.simple_inheritable?
    end

    inherited = []
    omitted_ancestors.reverse_each do |ancestor|
      ancestor.attribute_nodes.each do |attr|
        next unless attr.simple_inheritable?
        next if seen.key?(attr.name)

        seen[attr.name] = true
        inherited << attr
      end
    end

    inherited
  end

  # Merge and sort attributes by namespace URI (nil sorts as ""), then by
  # local name
  def merge_attributes(element_attrs, inherited_attrs)
    (element_attrs + inherited_attrs).sort_by do |attr|
      [attr.namespace_uri.to_s, attr.local_name]
    end
  end
end
79
+ end
80
+ end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "data_model"
4
+ require_relative "processor"
5
+
6
+ module Canon
7
+ module Xml
8
# XML Canonicalization 1.1 implementation
# Per W3C Recommendation: https://www.w3.org/TR/xml-c14n11/
class C14n
  class << self
    # Canonicalize an XML document
    #
    # Builds the data model with DataModel.from_xml and hands it to a
    # Processor configured with the comment setting.
    #
    # @param xml [String] XML document as string
    # @param with_comments [Boolean] Include comments in canonical form
    # @return [String] Canonical form in UTF-8 (as produced by the Processor)
    def canonicalize(xml, with_comments: false)
      data_model = DataModel.from_xml(xml)
      Processor.new(with_comments: with_comments).process(data_model)
    end

    # Canonicalize a document subset (for future implementation)
    #
    # TODO: Implement XPath-based subset selection. Until then the XPath
    # argument is ignored and the whole document is canonicalized.
    #
    # @param xml [String] XML document as string
    # @param _xpath [String] XPath expression for subset selection (unused)
    # @param with_comments [Boolean] Include comments in canonical form
    # @return [String] Canonical form in UTF-8 (as produced by the Processor)
    def canonicalize_subset(xml, _xpath, with_comments: false)
      canonicalize(xml, with_comments: with_comments)
    end
  end
end
35
+ end
36
+ end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Canon
4
+ module Xml
5
# Character encoder for C14N 1.1
# Handles UTF-8 encoding and character reference encoding per spec
class CharacterEncoder
  # Characters escaped inside text nodes and their replacements.
  TEXT_REPLACEMENTS = {
    "&" => "&amp;",
    "<" => "&lt;",
    ">" => "&gt;",
    "\r" => "&#xD;",
  }.freeze

  # Characters escaped inside attribute values and their replacements.
  ATTRIBUTE_REPLACEMENTS = {
    "&" => "&amp;",
    "<" => "&lt;",
    '"' => "&quot;",
    "\t" => "&#x9;",
    "\n" => "&#xA;",
    "\r" => "&#xD;",
  }.freeze

  private_constant :TEXT_REPLACEMENTS, :ATTRIBUTE_REPLACEMENTS

  # Encode text node content
  # Replace: & → &amp;, < → &lt;, > → &gt;, #xD → &#xD;
  #
  # @param text [String] raw text node content
  # @return [String] a new string with the characters above escaped
  def encode_text(text)
    text.gsub(/[&<>\r]/, TEXT_REPLACEMENTS)
  end

  # Encode attribute value
  # Replace: & → &amp;, < → &lt;, " → &quot;,
  # #x9 → &#x9;, #xA → &#xA;, #xD → &#xD;
  #
  # @param value [String] raw attribute value
  # @return [String] a new string with the characters above escaped
  def encode_attribute(value)
    value.gsub(/[&<"\t\n\r]/, ATTRIBUTE_REPLACEMENTS)
  end
end
37
+ end
38
+ end
@@ -0,0 +1,225 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+ require "set"
5
+ require_relative "nodes/root_node"
6
+ require_relative "nodes/element_node"
7
+ require_relative "nodes/namespace_node"
8
+ require_relative "nodes/attribute_node"
9
+ require_relative "nodes/text_node"
10
+ require_relative "nodes/comment_node"
11
+ require_relative "nodes/processing_instruction_node"
12
+
13
+ module Canon
14
+ module Xml
15
# Builds XPath data model from XML
class DataModel
  # A URI is treated as absolute when it starts with a scheme followed by a
  # colon. \A (not ^) is used so that only the very beginning of the string
  # counts, not the beginning of any later line.
  URI_SCHEME_PATTERN = /\A[a-zA-Z][a-zA-Z0-9+.-]*:/.freeze
  private_constant :URI_SCHEME_PATTERN

  # Build XPath data model from XML string
  #
  # @param xml_string [String] XML content to parse
  # @return [Nodes::RootNode] Root of the data model tree
  # @raise [Canon::Error] if a namespace declaration uses a relative URI
  def self.from_xml(xml_string)
    # Parse with Nokogiri
    doc = Nokogiri::XML(xml_string) do |config|
      config.nonet # Disable network access
      config.strict # Strict parsing
    end

    # Check for relative namespace URIs (prohibited by C14N 1.1)
    check_for_relative_namespace_uris(doc)

    # Convert to XPath data model
    build_from_nokogiri(doc)
  end

  # Build XPath data model from HTML string
  #
  # @param html_string [String] HTML content to parse
  # @param version [Symbol] HTML version (:html5 selects the HTML5 fragment
  #   parser; any other value uses the HTML4 fragment parser)
  # @return [Nodes::RootNode] Root of the data model tree
  def self.from_html(html_string, version: :html4)
    # Parse with Nokogiri using appropriate HTML parser
    doc = if version == :html5
            Nokogiri::HTML5.fragment(html_string)
          else
            Nokogiri::HTML4.fragment(html_string)
          end

    # HTML doesn't have strict namespace requirements like XML,
    # so skip the relative namespace URI check

    # Convert to XPath data model (reuse XML infrastructure)
    build_from_nokogiri(doc)
  end

  # Check for relative namespace URIs (prohibited by C14N 1.1)
  #
  # Walks every element of the document and inspects its namespace
  # declarations; declarations with a nil or empty href are ignored.
  #
  # @raise [Canon::Error] on the first relative namespace URI found
  def self.check_for_relative_namespace_uris(doc)
    doc.traverse do |node|
      next unless node.is_a?(Nokogiri::XML::Element)

      node.namespace_definitions.each do |ns|
        href = ns.href
        next if href.nil? || href.empty?
        next unless relative_uri?(href)

        raise Canon::Error,
              "Relative namespace URI not allowed: #{href}"
      end
    end
  end

  # Check if a URI is relative
  #
  # @param uri [String] URI to inspect
  # @return [Boolean] true when the URI does not start with a scheme
  def self.relative_uri?(uri)
    !URI_SCHEME_PATTERN.match?(uri)
  end

  # Build XPath data model from Nokogiri document or fragment
  #
  # For documents (anything responding to #root with a non-nil root) the
  # document element is added first, followed by the remaining top-level
  # nodes. For fragments, which have no single root, all top-level nodes
  # are added in order. DTD nodes are always skipped.
  #
  # NOTE(review): because the document element is added first, comments and
  # processing instructions that precede it in the source end up after it
  # in the root's children - confirm that consumers do not rely on document
  # order here.
  def self.build_from_nokogiri(nokogiri_doc)
    root = Nodes::RootNode.new

    document_element = nokogiri_doc.root if nokogiri_doc.respond_to?(:root)
    root.add_child(build_element_node(document_element)) if document_element

    nokogiri_doc.children.each do |child|
      # The document element (if any) has already been added above
      next if document_element && child == document_element
      next if child.is_a?(Nokogiri::XML::DTD)

      node = build_node_from_nokogiri(child)
      root.add_child(node) if node
    end

    root
  end

  # Build node from Nokogiri node
  #
  # @return [Object, nil] the converted node, or nil for node kinds that are
  #   not represented in the data model (and for skipped text nodes)
  def self.build_node_from_nokogiri(nokogiri_node)
    case nokogiri_node
    when Nokogiri::XML::Element
      build_element_node(nokogiri_node)
    when Nokogiri::XML::Text
      build_text_node(nokogiri_node)
    when Nokogiri::XML::Comment
      build_comment_node(nokogiri_node)
    when Nokogiri::XML::ProcessingInstruction
      build_pi_node(nokogiri_node)
    end
  end

  # Build element node from Nokogiri element, including its namespace
  # nodes, attribute nodes and (recursively) its children
  def self.build_element_node(nokogiri_element)
    element = Nodes::ElementNode.new(
      name: nokogiri_element.name,
      namespace_uri: nokogiri_element.namespace&.href,
      prefix: nokogiri_element.namespace&.prefix,
    )

    # Build namespace nodes (includes inherited namespaces)
    build_namespace_nodes(nokogiri_element, element)

    # Build attribute nodes
    build_attribute_nodes(nokogiri_element, element)

    # Build child nodes
    nokogiri_element.children.each do |child|
      node = build_node_from_nokogiri(child)
      element.add_child(node) if node
    end

    element
  end

  # Build namespace nodes for an element from all in-scope namespaces
  def self.build_namespace_nodes(nokogiri_element, element)
    collect_in_scope_namespaces(nokogiri_element).each do |prefix, uri|
      element.add_namespace(Nodes::NamespaceNode.new(prefix: prefix, uri: uri))
    end
  end

  # Collect all in-scope namespaces for an element
  #
  # @return [Hash{String => String}] prefix ("" for the default namespace)
  #   to namespace URI; always contains the "xml" prefix
  def self.collect_in_scope_namespaces(nokogiri_element)
    namespaces = {}

    # Walk up the tree to collect all namespace declarations
    current = nokogiri_element
    while current && !current.is_a?(Nokogiri::XML::Document)
      if current.is_a?(Nokogiri::XML::Element)
        current.namespace_definitions.each do |ns|
          prefix = ns.prefix || ""
          # Only add if not already defined (child overrides parent)
          namespaces[prefix] = ns.href unless namespaces.key?(prefix)
        end
      end
      current = current.parent
    end

    # Always include xml namespace
    namespaces["xml"] ||= "http://www.w3.org/XML/1998/namespace"

    namespaces
  end

  # Build attribute nodes for an element
  #
  # Iterates attribute_nodes rather than the attributes Hash: the Hash is
  # keyed by unqualified name, so attributes that share a local name but
  # live in different namespaces would overwrite each other there.
  # Namespace declarations that surface as plain attributes ("xmlns" or
  # "xmlns:prefix") are skipped; other names that merely begin with the
  # letters "xmlns" are ordinary attributes and are kept.
  def self.build_attribute_nodes(nokogiri_element, element)
    nokogiri_element.attribute_nodes.each do |attr|
      name = attr.name
      next if name == "xmlns" || name.start_with?("xmlns:")

      attr_node = Nodes::AttributeNode.new(
        name: name,
        value: attr.value,
        namespace_uri: attr.namespace&.href,
        prefix: attr.namespace&.prefix,
      )
      element.add_attribute(attr_node)
    end
  end

  # Build text node from Nokogiri text node
  #
  # Whitespace-only text directly inside an element is dropped (returns
  # nil); whitespace-only text whose parent is not an element is kept.
  def self.build_text_node(nokogiri_text)
    content = nokogiri_text.content
    return nil if content.strip.empty? && nokogiri_text.parent.is_a?(Nokogiri::XML::Element)

    # Nokogiri already handles CDATA conversion and entity resolution
    Nodes::TextNode.new(value: content)
  end

  # Build comment node from Nokogiri comment
  def self.build_comment_node(nokogiri_comment)
    Nodes::CommentNode.new(value: nokogiri_comment.content)
  end

  # Build PI node from Nokogiri PI
  def self.build_pi_node(nokogiri_pi)
    Nodes::ProcessingInstructionNode.new(
      target: nokogiri_pi.name,
      data: nokogiri_pi.content,
    )
  end
end
224
+ end
225
+ end
@@ -0,0 +1,196 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Canon
4
+ module Xml
5
# Matches XML elements semantically across two DOM trees
#
# This class implements intelligent element matching for XML diffs.
# Instead of naive line-by-line comparison, it semantically matches
# elements across documents using identity attributes and structural
# position.
#
# == Matching Strategy
#
# Sibling elements are matched in two passes:
#
# 1. **Identity attribute matching**: Elements with the same name and the
#    same identity attribute value are matched (e.g., id="foo" matches
#    id="foo")
# 2. **Position-based matching**: Remaining elements are matched by name
#    and by their order among the remaining same-named siblings
#
# This allows detecting when elements:
# - Move to different positions (matched by ID)
# - Have content changes (matched, diff shows changes)
# - Are added/deleted (no match found)
#
# == Identity Attributes
#
# By default, these attributes identify elements (first one present wins):
# - id
# - ref
# - name
# - key
#
# Custom identity attributes can be provided to the constructor.
#
# == Usage
#
#   matcher = ElementMatcher.new
#   root1 = Canon::Xml::DataModel.from_xml(xml1)
#   root2 = Canon::Xml::DataModel.from_xml(xml2)
#   matches = matcher.match_trees(root1, root2)
#
#   matches.each do |match|
#     case match.status
#     when :matched
#       # Elements found in both trees
#     when :deleted
#       # Element only in first tree
#     when :inserted
#       # Element only in second tree
#     end
#   end
#
class ElementMatcher
  # Default attributes used to identify elements
  DEFAULT_IDENTITY_ATTRS = %w[id ref name key].freeze

  # Match result for an element
  #
  # status is :matched, :deleted or :inserted; elem1/elem2 are the elements
  # from the first/second tree (nil when absent); path is the list of
  # element names from the top-level element down to this one.
  MatchResult = Struct.new(:status, :elem1, :elem2, :path) do
    def matched?
      status == :matched
    end

    def inserted?
      status == :inserted
    end

    def deleted?
      status == :deleted
    end
  end

  # @param identity_attrs [Array<String>] attribute names checked, in
  #   order, to find an element's identity
  def initialize(identity_attrs: DEFAULT_IDENTITY_ATTRS)
    @identity_attrs = identity_attrs
    @matches = []
  end

  # Match elements between two DOM trees
  #
  # Results from a previous call are discarded. For each sibling group the
  # results are emitted in this order: identity matches, positional
  # matches, deletions, insertions; the results for a matched pair's
  # descendants follow that pair immediately.
  #
  # @param root1 [Canon::Xml::Nodes::RootNode] First DOM tree
  # @param root2 [Canon::Xml::Nodes::RootNode] Second DOM tree
  # @return [Array<MatchResult>] Array of match results
  def match_trees(root1, root2)
    @matches = []
    match_children(root1.children, root2.children, [])
    @matches
  end

  private

  # Match children recursively
  #
  # Only nodes whose node_type is :element take part in matching.
  def match_children(children1, children2, path)
    elems1 = children1.select { |n| n.node_type == :element }
    elems2 = children2.select { |n| n.node_type == :element }

    # Hashes used as sets of already-paired elements. A plain Hash keeps
    # the usual hash/eql? membership semantics without relying on the Set
    # library having been required elsewhere.
    matched1 = {}
    matched2 = {}

    match_by_identity(elems1, elems2, path, matched1, matched2)

    # Match remaining elements by name and position
    unmatched1 = elems1.reject { |e| matched1.key?(e) }
    unmatched2 = elems2.reject { |e| matched2.key?(e) }
    match_by_position(unmatched1, unmatched2, path, matched1, matched2)

    # Whatever is still unpaired exists in only one of the trees
    unmatched1.each do |elem1|
      next if matched1.key?(elem1)

      @matches << MatchResult.new(:deleted, elem1, nil, path + [elem1.name])
    end

    unmatched2.each do |elem2|
      next if matched2.key?(elem2)

      @matches << MatchResult.new(:inserted, nil, elem2, path + [elem2.name])
    end
  end

  # Pair elements that share the same "name#identity" key
  def match_by_identity(elems1, elems2, path, matched1, matched2)
    map2 = build_identity_map(elems2)

    build_identity_map(elems1).each do |identity, elem1|
      next unless map2.key?(identity)

      record_match(elem1, map2[identity], path + [elem1.name],
                   matched1, matched2)
    end
  end

  # Match remaining elements by name and position
  #
  # Elements are grouped by name; within each name the i-th remaining
  # element of the first list is paired with the i-th of the second.
  def match_by_position(elems1, elems2, path, matched1, matched2)
    by_name2 = elems2.group_by(&:name)

    elems1.group_by(&:name).each do |name, list1|
      next unless by_name2.key?(name)

      list2 = by_name2[name]

      [list1.length, list2.length].min.times do |i|
        elem1 = list1[i]
        elem2 = list2[i]

        next if matched1.key?(elem1) || matched2.key?(elem2)

        record_match(elem1, elem2, path + [name], matched1, matched2)
      end
    end
  end

  # Record a matched pair, mark both elements as paired and descend into
  # their children
  def record_match(elem1, elem2, elem_path, matched1, matched2)
    @matches << MatchResult.new(:matched, elem1, elem2, elem_path)
    matched1[elem1] = true
    matched2[elem2] = true

    match_children(elem1.children, elem2.children, elem_path)
  end

  # Build map of identity → element
  #
  # Elements without an identity attribute are left out. When several
  # elements produce the same key, the last one wins.
  def build_identity_map(elements)
    elements.each_with_object({}) do |elem, map|
      identity = extract_identity(elem)
      next unless identity

      # Use element name + identity as key to handle multiple element types
      map["#{elem.name}##{identity}"] = elem
    end
  end

  # Extract identity from element attributes
  #
  # @return [String, nil] value of the first configured identity attribute
  #   present on the element, or nil when none is present
  def extract_identity(elem)
    @identity_attrs.each do |attr_name|
      attr = elem.attribute_nodes.find { |a| a.name == attr_name }
      return attr.value if attr
    end
    nil
  end
end
195
+ end
196
+ end