canon 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +69 -92
- data/README.adoc +13 -13
- data/docs/.lycheeignore +69 -0
- data/docs/Gemfile +1 -0
- data/docs/_config.yml +90 -1
- data/docs/advanced/diff-classification.adoc +82 -2
- data/docs/advanced/extending-canon.adoc +193 -0
- data/docs/features/match-options/index.adoc +239 -1
- data/docs/internals/diffnode-enrichment.adoc +611 -0
- data/docs/internals/index.adoc +251 -0
- data/docs/lychee.toml +13 -6
- data/docs/understanding/architecture.adoc +749 -33
- data/docs/understanding/comparison-pipeline.adoc +122 -0
- data/lib/canon/cache.rb +129 -0
- data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
- data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
- data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
- data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
- data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
- data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
- data/lib/canon/comparison/dimensions/registry.rb +77 -0
- data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
- data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
- data/lib/canon/comparison/dimensions.rb +54 -0
- data/lib/canon/comparison/format_detector.rb +87 -0
- data/lib/canon/comparison/html_comparator.rb +70 -26
- data/lib/canon/comparison/html_compare_profile.rb +8 -2
- data/lib/canon/comparison/html_parser.rb +80 -0
- data/lib/canon/comparison/json_comparator.rb +12 -0
- data/lib/canon/comparison/json_parser.rb +19 -0
- data/lib/canon/comparison/markup_comparator.rb +293 -0
- data/lib/canon/comparison/match_options/base_resolver.rb +150 -0
- data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
- data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
- data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
- data/lib/canon/comparison/match_options.rb +68 -463
- data/lib/canon/comparison/profile_definition.rb +149 -0
- data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
- data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
- data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
- data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +197 -0
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
- data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +79 -0
- data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +102 -0
- data/lib/canon/comparison/xml_comparator.rb +97 -684
- data/lib/canon/comparison/xml_node_comparison.rb +319 -0
- data/lib/canon/comparison/xml_parser.rb +19 -0
- data/lib/canon/comparison/yaml_comparator.rb +3 -3
- data/lib/canon/comparison.rb +265 -110
- data/lib/canon/diff/diff_classifier.rb +101 -2
- data/lib/canon/diff/diff_node.rb +32 -2
- data/lib/canon/diff/formatting_detector.rb +1 -1
- data/lib/canon/diff/node_serializer.rb +191 -0
- data/lib/canon/diff/path_builder.rb +143 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
- data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
- data/lib/canon/diff_formatter.rb +1 -1
- data/lib/canon/rspec_matchers.rb +38 -9
- data/lib/canon/tree_diff/operation_converter.rb +92 -338
- data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
- data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
- data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
- data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +24 -13
- metadata +48 -2
|
@@ -83,6 +83,128 @@ graph TD
|
|
|
83
83
|
|
|
84
84
|
**Documentation**: See link:../features/diff-formatting/[Diff Formatting] and link:../features/diff-formatting/algorithm-specific-output.adoc[Algorithm-Specific Output]
|
|
85
85
|
|
|
86
|
+
== DiffNode Data Flow
|
|
87
|
+
|
|
88
|
+
=== How differences flow through the layers
|
|
89
|
+
|
|
90
|
+
DiffNode objects are created in Layer 2, enriched with metadata, and flow through to Layer 4 for rendering:
|
|
91
|
+
|
|
92
|
+
[mermaid]
|
|
93
|
+
----
|
|
94
|
+
graph LR
|
|
95
|
+
A[Layer 2: Algorithm] --> B[Create DiffNode]
|
|
96
|
+
B --> C[Enrich Metadata]
|
|
97
|
+
C --> D[Layer 3: Classification]
|
|
98
|
+
D --> E[Layer 4: Rendering]
|
|
99
|
+
E --> F[Formatted Output]
|
|
100
|
+
|
|
101
|
+
C --> C1[PathBuilder]
|
|
102
|
+
C --> C2[NodeSerializer]
|
|
103
|
+
C1 --> G[path]
|
|
104
|
+
C2 --> H[serialized_before/after]
|
|
105
|
+
C2 --> I[attributes_before/after]
|
|
106
|
+
|
|
107
|
+
style A fill:#fff4e1
|
|
108
|
+
style C fill:#e1f5ff
|
|
109
|
+
style E fill:#e1ffe1
|
|
110
|
+
----
|
|
111
|
+
|
|
112
|
+
=== Layer 2: DiffNode creation
|
|
113
|
+
|
|
114
|
+
Each algorithm creates DiffNode objects when it finds differences:
|
|
115
|
+
|
|
116
|
+
[source,ruby]
|
|
117
|
+
----
|
|
118
|
+
# DOM algorithm: Creates DiffNode during element-by-element comparison
|
|
119
|
+
diff_node = Canon::Diff::DiffNode.new(
|
|
120
|
+
node1: element1,
|
|
121
|
+
node2: element2,
|
|
122
|
+
dimension: :text_content,
|
|
123
|
+
reason: "Text content differs"
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
# Semantic algorithm: Creates DiffNode from tree operations
|
|
127
|
+
diff_node = Canon::Diff::DiffNode.new(
|
|
128
|
+
node1: nil,
|
|
129
|
+
node2: inserted_node,
|
|
130
|
+
dimension: :element_structure,
|
|
131
|
+
reason: "Element inserted"
|
|
132
|
+
)
|
|
133
|
+
----
|
|
134
|
+
|
|
135
|
+
=== Metadata enrichment
|
|
136
|
+
|
|
137
|
+
After creation, DiffNodes are enriched with metadata for Layer 4 rendering:
|
|
138
|
+
|
|
139
|
+
[source,ruby]
|
|
140
|
+
----
|
|
141
|
+
# Enriched with:
|
|
142
|
+
{
|
|
143
|
+
path: "/#document/div[0]/p[1]/span[2]", # Canonical location
|
|
144
|
+
serialized_before: "<span>Old text</span>", # Captured state
|
|
145
|
+
serialized_after: "<span>New text</span>", # Captured state
|
|
146
|
+
attributes_before: {"id" => "old"}, # Normalized attrs
|
|
147
|
+
attributes_after: {"id" => "new"} # Normalized attrs
|
|
148
|
+
}
|
|
149
|
+
----
|
|
150
|
+
|
|
151
|
+
**Enrichment utilities**:
|
|
152
|
+
|
|
153
|
+
* **PathBuilder**: Generates canonical paths with ordinal indices
|
|
154
|
+
* **NodeSerializer**: Library-agnostic serialization of node content
|
|
155
|
+
* **Attribute extraction**: Normalized attribute hashes
|
|
156
|
+
|
|
157
|
+
See link:../internals/diffnode-enrichment.adoc[DiffNode Enrichment] for implementation details.
|
|
158
|
+
|
|
159
|
+
=== Layer 3: Classification
|
|
160
|
+
|
|
161
|
+
DiffNodes are classified to determine their impact:
|
|
162
|
+
|
|
163
|
+
[source,ruby]
|
|
164
|
+
----
|
|
165
|
+
diff_node.normative = true # Affects semantic equivalence
|
|
166
|
+
diff_node.formatting = true # Purely cosmetic difference
|
|
167
|
+
----
|
|
168
|
+
|
|
169
|
+
This classification affects whether differences cause `equivalent?` to return false.
|
|
170
|
+
|
|
171
|
+
=== Layer 4: Rendering
|
|
172
|
+
|
|
173
|
+
Layer 4 formatters use enriched metadata to display differences:
|
|
174
|
+
|
|
175
|
+
[source,text]
|
|
176
|
+
----
|
|
177
|
+
🔍 DIFFERENCE #1/3 [NORMATIVE]
|
|
178
|
+
════════════════════════════════════════════════════════════════════════
|
|
179
|
+
Dimension: text_content
|
|
180
|
+
Location: /#document/div[0]/p[1]/span[2]
|
|
181
|
+
|
|
182
|
+
⊖ Expected (File 1):
|
|
183
|
+
<span>Old text</span>
|
|
184
|
+
|
|
185
|
+
⊕ Actual (File 2):
|
|
186
|
+
<span>New text</span>
|
|
187
|
+
|
|
188
|
+
✨ Changes:
|
|
189
|
+
Text content changed from "Old text" to "New text"
|
|
190
|
+
----
|
|
191
|
+
|
|
192
|
+
* The `Location` field uses the enriched `path`
|
|
193
|
+
* The before/after content uses `serialized_before/after`
|
|
194
|
+
* Attribute differences use `attributes_before/after`
|
|
195
|
+
|
|
196
|
+
This ensures accurate display regardless of which parsing library was used.
|
|
197
|
+
|
|
198
|
+
=== Benefits of enriched DiffNodes
|
|
199
|
+
|
|
200
|
+
**Library flexibility**: Layer 4 works with any parsing library through enriched metadata
|
|
201
|
+
|
|
202
|
+
**Performance**: Metadata captured once at diff creation, not recomputed during rendering
|
|
203
|
+
|
|
204
|
+
**Accuracy**: Shows actual node state when difference was found, not current state
|
|
205
|
+
|
|
206
|
+
**Debuggability**: Ordinal indices in paths make it easy to locate specific elements
|
|
207
|
+
|
|
86
208
|
== Complete Example
|
|
87
209
|
|
|
88
210
|
Here's a full 4-layer configuration showing all layers working together:
|
data/lib/canon/cache.rb
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "digest"
|
|
4
|
+
|
|
5
|
+
module Canon
  # Cache for expensive operations during document comparison
  #
  # Provides thread-safe caching with size limits to prevent memory bloat:
  # every read and write of the underlying hashes happens while holding a
  # single mutex. Uses LRU (Least Recently Used) eviction when a category
  # is full.
  #
  # @example Cache a parsed document
  #   key = Cache.key_for_document(xml_string, :xml, :none)
  #   parsed = Cache.fetch(:document_parse, key) { parse_xml(xml_string) }
  #
  # @example Clear all caches (e.g., between test cases)
  #   Cache.clear_all
  module Cache
    class << self
      # Maximum number of entries per cache category
      MAX_CACHE_SIZE = 100

      # Guards every access to @caches and the per-category hashes
      LOCK = Mutex.new

      # Fetch a value from cache, or compute and cache it
      #
      # The block is executed outside the lock, so a slow computation does
      # not stall other threads and a block may itself call {fetch} without
      # deadlocking. As a consequence, two threads missing on the same key
      # at the same time may both run the block; the last store wins.
      #
      # @param category [Symbol] Cache category (:document_parse, :format_detect, etc.)
      # @param key [String] Cache key
      # @yield Block to compute value if not cached
      # @return [Object] Cached or computed value
      def fetch(category, key)
        LOCK.synchronize do
          entry = cache_for(category)[key]
          if entry
            # Update access time for LRU
            entry[:accessed] = Time.now
            return entry[:value]
          end
        end

        # Cache miss: compute without holding the lock
        value = yield

        LOCK.synchronize { store(category, key, value) }
        value
      end

      # Clear all caches
      #
      # Useful for tests or when memory needs to be freed
      #
      # @return [nil]
      def clear_all
        LOCK.synchronize do
          @caches&.each_value(&:clear)
          @caches = nil
        end
      end

      # Clear a specific cache category
      #
      # @param category [Symbol] Cache category to clear
      # @return [Hash, nil] The emptied category hash, or nil when the
      #   category has never been used
      def clear_category(category)
        LOCK.synchronize do
          bucket = @caches && @caches[category]
          bucket&.clear
        end
      end

      # Get cache statistics
      #
      # @return [Hash] Number of cached entries per category
      def stats
        LOCK.synchronize { @caches&.transform_values(&:size) || {} }
      end

      # Generate cache key for document parsing
      #
      # The key embeds the first 17 hex characters of the content's
      # SHA-256 digest.
      #
      # @param content [String] Document content
      # @param format [Symbol] Document format
      # @param preprocessing [Symbol] Preprocessing option
      # @return [String] Cache key
      def key_for_document(content, format, preprocessing)
        digest = Digest::SHA256.hexdigest(content)
        "doc:#{format}:#{preprocessing}:#{digest[0..16]}"
      end

      # Generate cache key for format detection
      #
      # @param content [String] Document content
      # @return [String] Cache key
      def key_for_format_detection(content)
        # Hash only a short leading preview (up to 101 characters) plus the
        # total length, so building the key stays cheap for large documents
        preview = content[0..100]
        digest = Digest::SHA256.hexdigest(preview + content.length.to_s)
        "fmt:#{digest[0..16]}"
      end

      # Generate cache key for XML canonicalization
      #
      # @param content [String] XML content
      # @param with_comments [Boolean] Whether to include comments
      # @return [String] Cache key
      def key_for_c14n(content, with_comments)
        digest = Digest::SHA256.hexdigest(content)
        "c14n:#{with_comments}:#{digest[0..16]}"
      end

      # Generate cache key for preprocessing
      #
      # @param content [String] Original content
      # @param preprocessing [Symbol] Preprocessing type
      # @return [String] Cache key
      def key_for_preprocessing(content, preprocessing)
        digest = Digest::SHA256.hexdigest(content)
        "pre:#{preprocessing}:#{digest[0..16]}"
      end

      private

      # Get or create cache for a category (caller must hold LOCK)
      #
      # @param category [Symbol] Cache category
      # @return [Hash] Cache hash for category
      def cache_for(category)
        @caches ||= {}
        @caches[category] ||= {}
      end

      # Insert a value, evicting the least recently used entry when the
      # category is full (caller must hold LOCK)
      #
      # @param category [Symbol] Cache category
      # @param key [String] Cache key
      # @param value [Object] Value to cache
      # @return [Hash] The stored entry
      def store(category, key, value)
        cache = cache_for(category)

        # Overwriting an existing key does not grow the hash, so only
        # evict when a genuinely new key would exceed the limit
        if !cache.key?(key) && cache.size >= MAX_CACHE_SIZE
          oldest_key = cache.min_by { |_, entry| entry[:accessed] }&.first
          cache.delete(oldest_key) if oldest_key
        end

        cache[key] = { value: value, accessed: Time.now }
      end
    end
  end
end
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "base_dimension"
|
|
4
|
+
|
|
5
|
+
module Canon
  module Comparison
    module Dimensions
      # Dimension describing the order in which attributes appear.
      #
      # Supports :strict and :ignore behaviors.
      #
      # Behaviors:
      # - :strict - Both nodes must list their attributes in the same order
      # - :ignore - Attribute order is not taken into account
      class AttributeOrderDimension < BaseDimension
        # Collect the attribute names of a node, in the order the node
        # reports them.
        #
        # @param node [Moxml::Node, Nokogiri::XML::Node] Node to inspect
        # @return [Array<Symbol>] Attribute names in order; empty for nil
        #   or for any other kind of object
        def extract_data(node)
          return [] unless node
          return extract_from_moxml(node) if node.is_a?(Moxml::Node)
          return extract_from_nokogiri(node) if node.is_a?(Nokogiri::XML::Node)

          []
        end

        # Strict comparison: the two name lists must be identical,
        # position by position.
        #
        # @param order1 [Array<Symbol>] Attribute names of the first node
        # @param order2 [Array<Symbol>] Attribute names of the second node
        # @return [Boolean] true when both lists are equal
        def compare_strict(order1, order2)
          order1 == order2
        end

        private

        # Attribute names of a Moxml node (non-element nodes yield []).
        #
        # @param node [Moxml::Node] Moxml node
        # @return [Array<Symbol>] Attribute names in order
        def extract_from_moxml(node)
          if node.node_type == :element
            node.attributes.map { |attribute| attribute.name.to_sym }
          else
            []
          end
        end

        # Attribute names of a Nokogiri node (non-element nodes yield []).
        #
        # @param node [Nokogiri::XML::Node] Nokogiri node
        # @return [Array<Symbol>] Attribute names in order
        def extract_from_nokogiri(node)
          if node.node_type == Nokogiri::XML::Node::ELEMENT_NODE
            node.attribute_nodes.map { |attribute| attribute.name.to_sym }
          else
            []
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "base_dimension"
|
|
4
|
+
|
|
5
|
+
module Canon
  module Comparison
    module Dimensions
      # Dimension describing which attributes exist on a node.
      #
      # Supports :strict and :ignore behaviors.
      #
      # Behaviors:
      # - :strict - Both nodes must carry exactly the same attribute names
      # - :ignore - Attribute presence is not compared
      class AttributePresenceDimension < BaseDimension
        # Collect the attribute names of a node.
        #
        # @param node [Moxml::Node, Nokogiri::XML::Node] Node to inspect
        # @return [Array<Symbol>] Attribute names; empty for nil or for
        #   any other kind of object
        def extract_data(node)
          return [] unless node
          return extract_from_moxml(node) if node.is_a?(Moxml::Node)
          return extract_from_nokogiri(node) if node.is_a?(Nokogiri::XML::Node)

          []
        end

        # Strict comparison: the same names must be present on both
        # sides; both lists are sorted first, so ordering is irrelevant.
        #
        # @param names1 [Array<Symbol>] Attribute names of the first node
        # @param names2 [Array<Symbol>] Attribute names of the second node
        # @return [Boolean] true when the sorted name lists are equal
        def compare_strict(names1, names2)
          sorted_first = names1.sort
          sorted_second = names2.sort
          sorted_first == sorted_second
        end

        private

        # Attribute names of a Moxml node (non-element nodes yield []).
        #
        # @param node [Moxml::Node] Moxml node
        # @return [Array<Symbol>] Attribute names
        def extract_from_moxml(node)
          if node.node_type == :element
            node.attributes.map { |attribute| attribute.name.to_sym }
          else
            []
          end
        end

        # Attribute names of a Nokogiri node (non-element nodes yield []).
        #
        # @param node [Nokogiri::XML::Node] Nokogiri node
        # @return [Array<Symbol>] Attribute names
        def extract_from_nokogiri(node)
          if node.node_type == Nokogiri::XML::Node::ELEMENT_NODE
            node.attribute_nodes.map { |attribute| attribute.name.to_sym }
          else
            []
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "base_dimension"
|
|
4
|
+
require_relative "../match_options"
|
|
5
|
+
|
|
6
|
+
module Canon
  module Comparison
    module Dimensions
      # Dimension describing the values held by a node's attributes.
      #
      # Supports :strict, :strip, :compact, :normalize, and :ignore behaviors.
      #
      # Behaviors:
      # - :strict - Values must be exactly equal
      # - :strip - Values are compared after trimming leading/trailing whitespace
      # - :compact - Values are compared after collapsing runs of whitespace
      # - :normalize - Values are compared after stripping and collapsing whitespace
      # - :ignore - Attribute values are not compared
      class AttributeValuesDimension < BaseDimension
        # Build a name => value mapping for a node's attributes.
        #
        # @param node [Moxml::Node, Nokogiri::XML::Node] Node to inspect
        # @return [Hash] Attribute name to value mapping; empty for nil or
        #   for any other kind of object
        def extract_data(node)
          return {} unless node
          return extract_from_moxml(node) if node.is_a?(Moxml::Node)
          return extract_from_nokogiri(node) if node.is_a?(Nokogiri::XML::Node)

          {}
        end

        # Strict comparison: every value must match exactly once
        # converted to a string.
        #
        # @param attrs1 [Hash] First attributes hash
        # @param attrs2 [Hash] Second attributes hash
        # @return [Boolean] true if all attribute values are exactly equal
        def compare_strict(attrs1, attrs2)
          values_equal_after?(attrs1, attrs2) { |value| value }
        end

        # Strip comparison: leading/trailing whitespace is removed from
        # each value before comparing.
        #
        # @param attrs1 [Hash] First attributes hash
        # @param attrs2 [Hash] Second attributes hash
        # @return [Boolean] true if stripped values are equal
        def compare_strip(attrs1, attrs2)
          values_equal_after?(attrs1, attrs2) { |value| value.strip }
        end

        # Compact comparison: runs of whitespace inside each value are
        # collapsed before comparing.
        #
        # @param attrs1 [Hash] First attributes hash
        # @param attrs2 [Hash] Second attributes hash
        # @return [Boolean] true if compacted values are equal
        def compare_compact(attrs1, attrs2)
          values_equal_after?(attrs1, attrs2) { |value| compact_whitespace(value) }
        end

        # Normalized comparison: each value goes through normalize_text
        # before comparing.
        #
        # @param attrs1 [Hash] First attributes hash
        # @param attrs2 [Hash] Second attributes hash
        # @return [Boolean] true if normalized values are equal
        def compare_normalize(attrs1, attrs2)
          values_equal_after?(attrs1, attrs2) { |value| normalize_text(value) }
        end

        # Compare extracted data according to a behavior.
        #
        # Handles the extra :strip and :compact behaviors here and hands
        # every other behavior to the superclass implementation.
        #
        # @param data1 [Object] First data
        # @param data2 [Object] Second data
        # @param behavior [Symbol] Comparison behavior
        # @return [Boolean] true if data matches according to behavior
        def compare(data1, data2, behavior)
          return compare_strip(data1, data2) if :strip == behavior
          return compare_compact(data1, data2) if :compact == behavior

          super
        end

        private

        # Check every attribute name found on either side: both values are
        # converted to strings, passed through the given block, and must
        # then be equal.
        #
        # @param attrs1 [Hash] First attributes hash
        # @param attrs2 [Hash] Second attributes hash
        # @yieldparam value [String] Attribute value as a string
        # @yieldreturn [String] Value to use for the equality check
        # @return [Boolean] true if every pair of transformed values is equal
        def values_equal_after?(attrs1, attrs2)
          names = attrs1.keys | attrs2.keys

          names.all? do |name|
            yield(attrs1[name].to_s) == yield(attrs2[name].to_s)
          end
        end

        # Name => value mapping for a Moxml node (non-element nodes yield {}).
        #
        # @param node [Moxml::Node] Moxml node
        # @return [Hash] Attribute name to value mapping
        def extract_from_moxml(node)
          return {} unless node.node_type == :element

          node.attributes.each_with_object({}) do |attribute, values|
            values[attribute.name] = attribute.value
          end
        end

        # Name => value mapping for a Nokogiri node (non-element nodes yield {}).
        #
        # @param node [Nokogiri::XML::Node] Nokogiri node
        # @return [Hash] Attribute name to value mapping
        def extract_from_nokogiri(node)
          return {} unless node.node_type == Nokogiri::XML::Node::ELEMENT_NODE

          node.attribute_nodes.each_with_object({}) do |attribute, values|
            values[attribute.name] = attribute.value
          end
        end

        # Replace each run of whitespace (including U+00A0) with a single
        # space; the ends of the string are not trimmed.
        #
        # @param text [String] Text to compact
        # @return [String] Compacted text
        def compact_whitespace(text)
          text.gsub(/[\p{Space}\u00a0]+/, " ")
        end

        # Normalize text by delegating to MatchOptions.normalize_text.
        #
        # @param text [String] Text to normalize
        # @return [String] Normalized text
        def normalize_text(text)
          MatchOptions.normalize_text(text)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
  module Comparison
    module Dimensions
      # Base class for comparison dimensions
      #
      # A dimension represents "WHAT to compare" - a specific aspect of a document
      # that can be compared (e.g., text content, attributes, comments).
      #
      # Each dimension knows how to:
      # - Extract relevant data from a node
      # - Compare data according to a behavior (:strict, :normalize, :ignore)
      #
      # Subclasses must implement:
      # - extract_data(node) - Extract relevant data from a node
      # - compare_strict(data1, data2) - Strict comparison
      # - compare_normalize(data1, data2) - Normalized comparison (optional)
      #
      # @abstract Subclass and implement abstract methods
      class BaseDimension
        # Behavior constants
        STRICT = :strict
        NORMALIZE = :normalize
        IGNORE = :ignore

        # Get the dimension name
        #
        # Derived from the class name: the namespace and the trailing
        # "Dimension" are dropped and the remaining CamelCase words are
        # converted to snake_case, e.g. TextContentDimension becomes
        # :text_content and CommentsDimension becomes :comments.
        #
        # @return [Symbol] Dimension name in snake_case
        def dimension_name
          @dimension_name ||= self.class.name
            .split("::").last
            .sub(/Dimension\z/, "")
            # Split acronym runs from a following word ("XMLNode" -> "XML_Node")
            .gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
            # Split a lower-case letter or digit from a following capital
            .gsub(/([a-z\d])([A-Z])/, '\1_\2')
            .downcase
            .to_sym
        end

        # Compare extracted data according to behavior
        #
        # @param data1 [Object] First data
        # @param data2 [Object] Second data
        # @param behavior [Symbol] Comparison behavior (:strict, :normalize, :ignore)
        # @return [Boolean] true if data matches according to behavior
        # @raise [Error] if the behavior is not one of the known constants
        def compare(data1, data2, behavior)
          case behavior
          when STRICT
            compare_strict(data1, data2)
          when NORMALIZE
            compare_normalize(data1, data2)
          when IGNORE
            # Ignored dimensions always count as matching
            true
          else
            raise Error, "Unknown behavior: #{behavior}"
          end
        end

        # Check if two nodes are equivalent for this dimension
        #
        # @param node1 [Object] First node
        # @param node2 [Object] Second node
        # @param behavior [Symbol] Comparison behavior
        # @return [Boolean] true if nodes match for this dimension
        def equivalent?(node1, node2, behavior)
          data1 = extract_data(node1)
          data2 = extract_data(node2)
          compare(data1, data2, behavior)
        end

        # Extract data from a node
        #
        # @param node [Object] Node to extract data from
        # @return [Object] Extracted data
        # @raise [NotImplementedError] unless overridden by the subclass
        # @abstract Subclass must implement
        def extract_data(node)
          raise NotImplementedError, "#{self.class} must implement extract_data"
        end

        # Strict comparison
        #
        # @param data1 [Object] First data
        # @param data2 [Object] Second data
        # @return [Boolean] true if data matches strictly
        # @raise [NotImplementedError] unless overridden by the subclass
        # @abstract Subclass must implement
        def compare_strict(data1, data2)
          raise NotImplementedError,
                "#{self.class} must implement compare_strict"
        end

        # Normalized comparison
        #
        # @param data1 [Object] First data
        # @param data2 [Object] Second data
        # @return [Boolean] true if data matches after normalization
        def compare_normalize(data1, data2)
          # Default implementation: delegate to strict comparison
          compare_strict(data1, data2)
        end

        # Check if this dimension supports normalization
        #
        # @return [Boolean] true if normalization is supported
        def supports_normalization?
          # Check if compare_normalize is overridden (not the default implementation)
          method(:compare_normalize).owner != BaseDimension
        end
      end
    end
  end
end
|