canon 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +69 -92
  3. data/README.adoc +13 -13
  4. data/docs/.lycheeignore +69 -0
  5. data/docs/Gemfile +1 -0
  6. data/docs/_config.yml +90 -1
  7. data/docs/advanced/diff-classification.adoc +82 -2
  8. data/docs/advanced/extending-canon.adoc +193 -0
  9. data/docs/features/match-options/index.adoc +239 -1
  10. data/docs/internals/diffnode-enrichment.adoc +611 -0
  11. data/docs/internals/index.adoc +251 -0
  12. data/docs/lychee.toml +13 -6
  13. data/docs/understanding/architecture.adoc +749 -33
  14. data/docs/understanding/comparison-pipeline.adoc +122 -0
  15. data/lib/canon/cache.rb +129 -0
  16. data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
  17. data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
  18. data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
  19. data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
  20. data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
  21. data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
  22. data/lib/canon/comparison/dimensions/registry.rb +77 -0
  23. data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
  24. data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
  25. data/lib/canon/comparison/dimensions.rb +54 -0
  26. data/lib/canon/comparison/format_detector.rb +87 -0
  27. data/lib/canon/comparison/html_comparator.rb +70 -26
  28. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  29. data/lib/canon/comparison/html_parser.rb +80 -0
  30. data/lib/canon/comparison/json_comparator.rb +12 -0
  31. data/lib/canon/comparison/json_parser.rb +19 -0
  32. data/lib/canon/comparison/markup_comparator.rb +293 -0
  33. data/lib/canon/comparison/match_options/base_resolver.rb +150 -0
  34. data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
  35. data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
  36. data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
  37. data/lib/canon/comparison/match_options.rb +68 -463
  38. data/lib/canon/comparison/profile_definition.rb +149 -0
  39. data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
  40. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
  41. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  42. data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
  43. data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
  44. data/lib/canon/comparison/xml_comparator/child_comparison.rb +197 -0
  45. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
  46. data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
  47. data/lib/canon/comparison/xml_comparator/node_parser.rb +79 -0
  48. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +102 -0
  49. data/lib/canon/comparison/xml_comparator.rb +97 -684
  50. data/lib/canon/comparison/xml_node_comparison.rb +319 -0
  51. data/lib/canon/comparison/xml_parser.rb +19 -0
  52. data/lib/canon/comparison/yaml_comparator.rb +3 -3
  53. data/lib/canon/comparison.rb +265 -110
  54. data/lib/canon/diff/diff_classifier.rb +101 -2
  55. data/lib/canon/diff/diff_node.rb +32 -2
  56. data/lib/canon/diff/formatting_detector.rb +1 -1
  57. data/lib/canon/diff/node_serializer.rb +191 -0
  58. data/lib/canon/diff/path_builder.rb +143 -0
  59. data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
  60. data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
  61. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
  62. data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
  63. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
  64. data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
  65. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
  66. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
  67. data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
  68. data/lib/canon/diff_formatter.rb +1 -1
  69. data/lib/canon/rspec_matchers.rb +38 -9
  70. data/lib/canon/tree_diff/operation_converter.rb +92 -338
  71. data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
  72. data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
  73. data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
  74. data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
  75. data/lib/canon/version.rb +1 -1
  76. data/lib/canon/xml/data_model.rb +24 -13
  77. metadata +48 -2
@@ -83,6 +83,128 @@ graph TD
83
83
 
84
84
  **Documentation**: See link:../features/diff-formatting/[Diff Formatting] and link:../features/diff-formatting/algorithm-specific-output.adoc[Algorithm-Specific Output]
85
85
 
86
+ == DiffNode Data Flow
87
+
88
+ === How differences flow through the layers
89
+
90
+ DiffNode objects are created in Layer 2, enriched with metadata, and flow through to Layer 4 for rendering:
91
+
92
+ [mermaid]
93
+ ----
94
+ graph LR
95
+ A[Layer 2: Algorithm] --> B[Create DiffNode]
96
+ B --> C[Enrich Metadata]
97
+ C --> D[Layer 3: Classification]
98
+ D --> E[Layer 4: Rendering]
99
+ E --> F[Formatted Output]
100
+
101
+ C --> C1[PathBuilder]
102
+ C --> C2[NodeSerializer]
103
+ C1 --> G[path]
104
+ C2 --> H[serialized_before/after]
105
+ C2 --> I[attributes_before/after]
106
+
107
+ style A fill:#fff4e1
108
+ style C fill:#e1f5ff
109
+ style E fill:#e1ffe1
110
+ ----
111
+
112
+ === Layer 2: DiffNode creation
113
+
114
+ Each algorithm creates DiffNode objects when it finds differences:
115
+
116
+ [source,ruby]
117
+ ----
118
+ # DOM algorithm: Creates DiffNode during element-by-element comparison
119
+ diff_node = Canon::Diff::DiffNode.new(
120
+ node1: element1,
121
+ node2: element2,
122
+ dimension: :text_content,
123
+ reason: "Text content differs"
124
+ )
125
+
126
+ # Semantic algorithm: Creates DiffNode from tree operations
127
+ diff_node = Canon::Diff::DiffNode.new(
128
+ node1: nil,
129
+ node2: inserted_node,
130
+ dimension: :element_structure,
131
+ reason: "Element inserted"
132
+ )
133
+ ----
134
+
135
+ === Metadata enrichment
136
+
137
+ After creation, DiffNodes are enriched with metadata for Layer 4 rendering:
138
+
139
+ [source,ruby]
140
+ ----
141
+ # Enriched with:
142
+ {
143
+ path: "/#document/div[0]/p[1]/span[2]", # Canonical location
144
+ serialized_before: "<span>Old text</span>", # Captured state
145
+ serialized_after: "<span>New text</span>", # Captured state
146
+ attributes_before: {"id" => "old"}, # Normalized attrs
147
+ attributes_after: {"id" => "new"} # Normalized attrs
148
+ }
149
+ ----
150
+
151
+ **Enrichment utilities**:
152
+
153
+ * **PathBuilder**: Generates canonical paths with ordinal indices
154
+ * **NodeSerializer**: Library-agnostic serialization of node content
155
+ * **Attribute extraction**: Normalized attribute hashes
156
+
157
+ See link:../internals/diffnode-enrichment.adoc[DiffNode Enrichment] for implementation details.
158
+
159
+ === Layer 3: Classification
160
+
161
+ DiffNodes are classified to determine their impact:
162
+
163
+ [source,ruby]
164
+ ----
165
+ diff_node.normative = true # Affects semantic equivalence
166
+ diff_node.formatting = true # Purely cosmetic difference
167
+ ----
168
+
169
+ This classification affects whether differences cause `equivalent?` to return false.
170
+
171
+ === Layer 4: Rendering
172
+
173
+ Layer 4 formatters use enriched metadata to display differences:
174
+
175
+ [source,text]
176
+ ----
177
+ 🔍 DIFFERENCE #1/3 [NORMATIVE]
178
+ ════════════════════════════════════════════════════════════════════════
179
+ Dimension: text_content
180
+ Location: /#document/div[0]/p[1]/span[2]
181
+
182
+ ⊖ Expected (File 1):
183
+ <span>Old text</span>
184
+
185
+ ⊕ Actual (File 2):
186
+ <span>New text</span>
187
+
188
+ ✨ Changes:
189
+ Text content changed from "Old text" to "New text"
190
+ ----
191
+
192
+ * The `Location` field uses the enriched `path`
193
+ * The before/after content uses `serialized_before/after`
194
+ * Attribute differences use `attributes_before/after`
195
+
196
+ This ensures accurate display regardless of which parsing library was used.
197
+
198
+ === Benefits of enriched DiffNodes
199
+
200
+ **Library flexibility**: Layer 4 works with any parsing library through enriched metadata
201
+
202
+ **Performance**: Metadata is captured once at diff creation and is not recomputed during rendering
203
+
204
+ **Accuracy**: Shows the node state as it was when the difference was found, not its current state
205
+
206
+ **Debuggability**: Ordinal indices in paths make it easy to locate specific elements
207
+
86
208
  == Complete Example
87
209
 
88
210
  Here's a full 4-layer configuration showing all layers working together:
@@ -0,0 +1,129 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "digest"
4
+
5
module Canon
  # Cache for expensive operations during document comparison.
  #
  # Values are grouped into categories (e.g. +:document_parse+); each category
  # is an independent store holding at most MAX_CACHE_SIZE entries. When a
  # category is full, the least recently used entry is evicted.
  #
  # Recency is tracked through Hash insertion order: a hit re-inserts the
  # entry at the end of its store, so the first entry is always the least
  # recently used one. This avoids depending on the wall clock.
  #
  # All access to the stores is serialized by a mutex. The block given to
  # {fetch} runs outside the lock, so a block may itself call {fetch} without
  # deadlocking; the trade-off is that two threads missing on the same key at
  # the same time may both compute the value (the last store wins).
  #
  # @example Cache a parsed document
  #   key = Cache.key_for_document(xml_string, :xml, :none)
  #   parsed = Cache.fetch(:document_parse, key) { parse_xml(xml_string) }
  #
  # @example Clear all caches (e.g., between test cases)
  #   Cache.clear_all
  module Cache
    class << self
      # Maximum number of entries per cache category
      MAX_CACHE_SIZE = 100

      # Serializes every read and write of the per-category stores
      CACHE_LOCK = Mutex.new

      # Fetch a value from cache, or compute and cache it
      #
      # A cached +nil+ or +false+ counts as a hit; the block is only run when
      # the key is absent. If the block raises, nothing is cached.
      #
      # @param category [Symbol] Cache category (:document_parse, :format_detect, etc.)
      # @param key [String] Cache key
      # @yield Block to compute value if not cached
      # @return [Object] Cached or computed value
      def fetch(category, key)
        CACHE_LOCK.synchronize do
          store = cache_for(category)
          if store.key?(key)
            # Re-insert so the entry becomes the most recently used one
            value = store.delete(key)
            store[key] = value
            return value
          end
        end

        # Computed outside the lock so nested fetch calls cannot deadlock
        value = yield

        CACHE_LOCK.synchronize do
          store = cache_for(category)
          # Drop a copy stored meanwhile so it does not count towards the limit
          store.delete(key)
          # The first entry in insertion order is the least recently used
          store.shift while store.size >= MAX_CACHE_SIZE
          store[key] = value
        end

        value
      end

      # Clear all caches
      #
      # Useful for tests or when memory needs to be freed
      #
      # @return [nil]
      def clear_all
        CACHE_LOCK.synchronize do
          @caches&.each_value(&:clear)
          @caches = nil
        end
      end

      # Clear a specific cache category
      #
      # @param category [Symbol] Cache category to clear
      # @return [Hash, nil] The emptied store, or nil if the category was never used
      def clear_category(category)
        CACHE_LOCK.synchronize do
          store = @caches && @caches[category]
          store&.clear
        end
      end

      # Get cache statistics
      #
      # @return [Hash{Symbol => Integer}] Number of entries per category
      def stats
        CACHE_LOCK.synchronize { @caches&.transform_values(&:size) || {} }
      end

      # Generate cache key for document parsing
      #
      # The key embeds the first 17 hex characters of the content's SHA-256.
      #
      # @param content [String] Document content
      # @param format [Symbol] Document format
      # @param preprocessing [Symbol] Preprocessing option
      # @return [String] Cache key
      def key_for_document(content, format, preprocessing)
        digest = Digest::SHA256.hexdigest(content)
        "doc:#{format}:#{preprocessing}:#{digest[0..16]}"
      end

      # Generate cache key for format detection
      #
      # Only a prefix of the content (its first 101 characters) and the total
      # length are hashed, so the key is cheap to compute for large documents.
      #
      # @param content [String] Document content
      # @return [String] Cache key
      def key_for_format_detection(content)
        preview = content[0..100]
        digest = Digest::SHA256.hexdigest(preview + content.length.to_s)
        "fmt:#{digest[0..16]}"
      end

      # Generate cache key for XML canonicalization
      #
      # @param content [String] XML content
      # @param with_comments [Boolean] Whether to include comments
      # @return [String] Cache key
      def key_for_c14n(content, with_comments)
        digest = Digest::SHA256.hexdigest(content)
        "c14n:#{with_comments}:#{digest[0..16]}"
      end

      # Generate cache key for preprocessing
      #
      # @param content [String] Original content
      # @param preprocessing [Symbol] Preprocessing type
      # @return [String] Cache key
      def key_for_preprocessing(content, preprocessing)
        digest = Digest::SHA256.hexdigest(content)
        "pre:#{preprocessing}:#{digest[0..16]}"
      end

      private

      # Get or create cache for a category
      #
      # Callers must already hold CACHE_LOCK.
      #
      # @param category [Symbol] Cache category
      # @return [Hash] Cache hash for category
      def cache_for(category)
        @caches ||= {}
        @caches[category] ||= {}
      end
    end
  end
end
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base_dimension"
4
+
5
module Canon
  module Comparison
    module Dimensions
      # Dimension covering the order in which attributes appear on an element.
      #
      # Behaviors:
      # - :strict - both nodes must list their attributes in the same order
      # - :ignore - attribute order is not compared
      class AttributeOrderDimension < BaseDimension
        # Collect the attribute names of a node, in document order
        #
        # @param node [Moxml::Node, Nokogiri::XML::Node] Node to inspect
        # @return [Array<Symbol>] Attribute names in order; empty for nil,
        #   non-element or unsupported nodes
        def extract_data(node)
          return [] unless node

          case node
          when Moxml::Node then extract_from_moxml(node)
          when Nokogiri::XML::Node then extract_from_nokogiri(node)
          else []
          end
        end

        # Compare two attribute orderings exactly
        #
        # @param order1 [Array<Symbol>] Attribute names of the first node
        # @param order2 [Array<Symbol>] Attribute names of the second node
        # @return [Boolean] true when both lists are identical, order included
        def compare_strict(order1, order2)
          order1 == order2
        end

        private

        # Attribute names of a Moxml element, in order
        #
        # @param node [Moxml::Node] Moxml node
        # @return [Array<Symbol>] Names in order, or [] for non-elements
        def extract_from_moxml(node)
          if node.node_type == :element
            node.attributes.map { |attribute| attribute.name.to_sym }
          else
            []
          end
        end

        # Attribute names of a Nokogiri element, in order
        #
        # @param node [Nokogiri::XML::Node] Nokogiri node
        # @return [Array<Symbol>] Names in order, or [] for non-elements
        def extract_from_nokogiri(node)
          if node.node_type == Nokogiri::XML::Node::ELEMENT_NODE
            node.attribute_nodes.map { |attribute| attribute.name.to_sym }
          else
            []
          end
        end
      end
    end
  end
end
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base_dimension"
4
+
5
module Canon
  module Comparison
    module Dimensions
      # Dimension covering which attributes exist on an element, regardless
      # of their order or values.
      #
      # Behaviors:
      # - :strict - both nodes must carry the same set of attribute names
      # - :ignore - attribute presence is not compared
      class AttributePresenceDimension < BaseDimension
        # Collect the attribute names of a node
        #
        # @param node [Moxml::Node, Nokogiri::XML::Node] Node to inspect
        # @return [Array<Symbol>] Attribute names; empty for nil, non-element
        #   or unsupported nodes
        def extract_data(node)
          return [] unless node
          return extract_from_moxml(node) if node.is_a?(Moxml::Node)
          return extract_from_nokogiri(node) if node.is_a?(Nokogiri::XML::Node)

          []
        end

        # Compare two lists of attribute names, ignoring their order
        #
        # @param names1 [Array<Symbol>] Attribute names of the first node
        # @param names2 [Array<Symbol>] Attribute names of the second node
        # @return [Boolean] true when both lists are equal once sorted
        def compare_strict(names1, names2)
          sorted_first = names1.sort
          sorted_second = names2.sort
          sorted_first == sorted_second
        end

        private

        # Attribute names of a Moxml element
        #
        # @param node [Moxml::Node] Moxml node
        # @return [Array<Symbol>] Attribute names, or [] for non-elements
        def extract_from_moxml(node)
          element = node.node_type == :element
          element ? node.attributes.map { |a| a.name.to_sym } : []
        end

        # Attribute names of a Nokogiri element
        #
        # @param node [Nokogiri::XML::Node] Nokogiri node
        # @return [Array<Symbol>] Attribute names, or [] for non-elements
        def extract_from_nokogiri(node)
          element = node.node_type == Nokogiri::XML::Node::ELEMENT_NODE
          element ? node.attribute_nodes.map { |a| a.name.to_sym } : []
        end
      end
    end
  end
end
@@ -0,0 +1,171 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base_dimension"
4
+ require_relative "../match_options"
5
+
6
module Canon
  module Comparison
    module Dimensions
      # Dimension covering the values of an element's attributes.
      #
      # Behaviors:
      # - :strict - values must be exactly equal
      # - :strip - values are compared after removing leading/trailing whitespace
      # - :compact - values are compared after collapsing whitespace runs
      # - :normalize - values are compared after MatchOptions.normalize_text
      # - :ignore - attribute values are not compared
      #
      # In every comparison a missing attribute is treated as an empty string,
      # because values are converted with +to_s+ before being compared.
      class AttributeValuesDimension < BaseDimension
        # Build a name => value hash for the attributes of a node
        #
        # @param node [Moxml::Node, Nokogiri::XML::Node] Node to inspect
        # @return [Hash] Attribute name to value mapping; empty for nil,
        #   non-element or unsupported nodes
        def extract_data(node)
          return {} unless node

          case node
          when Moxml::Node then extract_from_moxml(node)
          when Nokogiri::XML::Node then extract_from_nokogiri(node)
          else {}
          end
        end

        # Exact comparison of attribute values
        #
        # @param attrs1 [Hash] Attributes of the first node
        # @param attrs2 [Hash] Attributes of the second node
        # @return [Boolean] true if every value is exactly equal
        def compare_strict(attrs1, attrs2)
          values_match?(attrs1, attrs2) { |value| value }
        end

        # Comparison with leading/trailing whitespace removed
        #
        # @param attrs1 [Hash] Attributes of the first node
        # @param attrs2 [Hash] Attributes of the second node
        # @return [Boolean] true if every stripped value is equal
        def compare_strip(attrs1, attrs2)
          values_match?(attrs1, attrs2, &:strip)
        end

        # Comparison with whitespace runs collapsed to a single space
        #
        # @param attrs1 [Hash] Attributes of the first node
        # @param attrs2 [Hash] Attributes of the second node
        # @return [Boolean] true if every compacted value is equal
        def compare_compact(attrs1, attrs2)
          values_match?(attrs1, attrs2) { |value| compact_whitespace(value) }
        end

        # Comparison after text normalization
        #
        # @param attrs1 [Hash] Attributes of the first node
        # @param attrs2 [Hash] Attributes of the second node
        # @return [Boolean] true if every normalized value is equal
        def compare_normalize(attrs1, attrs2)
          values_match?(attrs1, attrs2) { |value| normalize_text(value) }
        end

        # Dispatch a comparison by behavior
        #
        # Handles the :strip and :compact behaviors specific to this
        # dimension and defers every other behavior to the superclass.
        #
        # @param data1 [Object] First data
        # @param data2 [Object] Second data
        # @param behavior [Symbol] Comparison behavior
        # @return [Boolean] true if data matches according to behavior
        def compare(data1, data2, behavior)
          if behavior == :strip
            compare_strip(data1, data2)
          elsif behavior == :compact
            compare_compact(data1, data2)
          else
            super
          end
        end

        private

        # Check every attribute name found in either hash, comparing the
        # transformed string values pairwise
        #
        # @param attrs1 [Hash] Attributes of the first node
        # @param attrs2 [Hash] Attributes of the second node
        # @yieldparam value [String] Attribute value converted with to_s
        # @yieldreturn [String] Value to use in the equality check
        # @return [Boolean] true if all transformed pairs are equal
        def values_match?(attrs1, attrs2)
          (attrs1.keys | attrs2.keys).all? do |name|
            yield(attrs1[name].to_s) == yield(attrs2[name].to_s)
          end
        end

        # Attributes of a Moxml element as a hash
        #
        # @param node [Moxml::Node] Moxml node
        # @return [Hash] Attribute name to value mapping, or {} for non-elements
        def extract_from_moxml(node)
          return {} unless node.node_type == :element

          node.attributes.each_with_object({}) do |attribute, collected|
            collected[attribute.name] = attribute.value
          end
        end

        # Attributes of a Nokogiri element as a hash
        #
        # @param node [Nokogiri::XML::Node] Nokogiri node
        # @return [Hash] Attribute name to value mapping, or {} for non-elements
        def extract_from_nokogiri(node)
          return {} unless node.node_type == Nokogiri::XML::Node::ELEMENT_NODE

          node.attribute_nodes.each_with_object({}) do |attribute, collected|
            collected[attribute.name] = attribute.value
          end
        end

        # Collapse each run of whitespace into one space, without trimming
        #
        # @param text [String] Text to compact
        # @return [String] Compacted text
        def compact_whitespace(text)
          text.gsub(/[\p{Space}\u00a0]+/, " ")
        end

        # Normalize text through MatchOptions
        #
        # @param text [String] Text to normalize
        # @return [String] Normalized text
        def normalize_text(text)
          MatchOptions.normalize_text(text)
        end
      end
    end
  end
end
@@ -0,0 +1,107 @@
1
+ # frozen_string_literal: true
2
+
3
module Canon
  module Comparison
    module Dimensions
      # Base class for comparison dimensions
      #
      # A dimension represents "WHAT to compare" - a specific aspect of a document
      # that can be compared (e.g., text content, attributes, comments).
      #
      # Each dimension knows how to:
      # - Extract relevant data from a node
      # - Compare data according to a behavior (:strict, :normalize, :ignore)
      #
      # Subclasses must implement:
      # - extract_data(node) - Extract relevant data from a node
      # - compare_strict(data1, data2) - Strict comparison
      # - compare_normalize(data1, data2) - Normalized comparison (optional)
      #
      # @abstract Subclass and implement abstract methods
      class BaseDimension
        # Behavior constants
        STRICT = :strict
        NORMALIZE = :normalize
        IGNORE = :ignore

        # Get the dimension name
        #
        # Derived from the class name: the namespace and the trailing
        # "Dimension" are dropped and the rest is converted from CamelCase to
        # snake_case, e.g. TextContentDimension => :text_content.
        #
        # @return [Symbol] Dimension name
        def dimension_name
          @dimension_name ||= self.class.name.split("::").last
            .sub(/Dimension\z/, "")
            # Split an acronym from a following word: "HTMLTag" => "HTML_Tag"
            .gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
            # Split at lower-to-upper boundaries: "TextContent" => "Text_Content"
            .gsub(/([a-z\d])([A-Z])/, '\1_\2')
            .downcase
            .to_sym
        end

        # Compare extracted data according to behavior
        #
        # @param data1 [Object] First data
        # @param data2 [Object] Second data
        # @param behavior [Symbol] Comparison behavior (:strict, :normalize, :ignore)
        # @return [Boolean] true if data matches according to behavior
        # @raise [Error] if the behavior is not one of the known constants
        def compare(data1, data2, behavior)
          case behavior
          when STRICT
            compare_strict(data1, data2)
          when NORMALIZE
            compare_normalize(data1, data2)
          when IGNORE
            # Ignored dimensions never produce a difference
            true
          else
            raise Error, "Unknown behavior: #{behavior}"
          end
        end

        # Check if two nodes are equivalent for this dimension
        #
        # @param node1 [Object] First node
        # @param node2 [Object] Second node
        # @param behavior [Symbol] Comparison behavior
        # @return [Boolean] true if nodes match for this dimension
        def equivalent?(node1, node2, behavior)
          data1 = extract_data(node1)
          data2 = extract_data(node2)
          compare(data1, data2, behavior)
        end

        # Extract data from a node
        #
        # @param node [Object] Node to extract data from
        # @return [Object] Extracted data
        # @raise [NotImplementedError] unless overridden
        # @abstract Subclass must implement
        def extract_data(node)
          raise NotImplementedError, "#{self.class} must implement extract_data"
        end

        # Strict comparison
        #
        # @param data1 [Object] First data
        # @param data2 [Object] Second data
        # @return [Boolean] true if data matches strictly
        # @raise [NotImplementedError] unless overridden
        # @abstract Subclass must implement
        def compare_strict(data1, data2)
          raise NotImplementedError,
                "#{self.class} must implement compare_strict"
        end

        # Normalized comparison
        #
        # @param data1 [Object] First data
        # @param data2 [Object] Second data
        # @return [Boolean] true if data matches after normalization
        def compare_normalize(data1, data2)
          # Default implementation: delegate to strict comparison
          compare_strict(data1, data2)
        end

        # Check if this dimension supports normalization
        #
        # @return [Boolean] true if normalization is supported
        def supports_normalization?
          # Supported only when a subclass (or mixin) overrides compare_normalize
          method(:compare_normalize).owner != BaseDimension
        end
      end
    end
  end
end