canon 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +69 -92
- data/README.adoc +13 -13
- data/docs/.lycheeignore +69 -0
- data/docs/Gemfile +1 -0
- data/docs/_config.yml +90 -1
- data/docs/advanced/diff-classification.adoc +82 -2
- data/docs/advanced/extending-canon.adoc +193 -0
- data/docs/features/match-options/index.adoc +239 -1
- data/docs/internals/diffnode-enrichment.adoc +611 -0
- data/docs/internals/index.adoc +251 -0
- data/docs/lychee.toml +13 -6
- data/docs/understanding/architecture.adoc +749 -33
- data/docs/understanding/comparison-pipeline.adoc +122 -0
- data/lib/canon/cache.rb +129 -0
- data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
- data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
- data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
- data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
- data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
- data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
- data/lib/canon/comparison/dimensions/registry.rb +77 -0
- data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
- data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
- data/lib/canon/comparison/dimensions.rb +54 -0
- data/lib/canon/comparison/format_detector.rb +87 -0
- data/lib/canon/comparison/html_comparator.rb +70 -26
- data/lib/canon/comparison/html_compare_profile.rb +8 -2
- data/lib/canon/comparison/html_parser.rb +80 -0
- data/lib/canon/comparison/json_comparator.rb +12 -0
- data/lib/canon/comparison/json_parser.rb +19 -0
- data/lib/canon/comparison/markup_comparator.rb +293 -0
- data/lib/canon/comparison/match_options/base_resolver.rb +150 -0
- data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
- data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
- data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
- data/lib/canon/comparison/match_options.rb +68 -463
- data/lib/canon/comparison/profile_definition.rb +149 -0
- data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
- data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
- data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
- data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +197 -0
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
- data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +79 -0
- data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +102 -0
- data/lib/canon/comparison/xml_comparator.rb +97 -684
- data/lib/canon/comparison/xml_node_comparison.rb +319 -0
- data/lib/canon/comparison/xml_parser.rb +19 -0
- data/lib/canon/comparison/yaml_comparator.rb +3 -3
- data/lib/canon/comparison.rb +265 -110
- data/lib/canon/diff/diff_classifier.rb +101 -2
- data/lib/canon/diff/diff_node.rb +32 -2
- data/lib/canon/diff/formatting_detector.rb +1 -1
- data/lib/canon/diff/node_serializer.rb +191 -0
- data/lib/canon/diff/path_builder.rb +143 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
- data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
- data/lib/canon/diff_formatter.rb +1 -1
- data/lib/canon/rspec_matchers.rb +38 -9
- data/lib/canon/tree_diff/operation_converter.rb +92 -338
- data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
- data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
- data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
- data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +24 -13
- metadata +48 -2
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "base_dimension"
|
|
4
|
+
|
|
5
|
+
module Canon
|
|
6
|
+
module Comparison
|
|
7
|
+
module Dimensions
|
|
8
|
+
# Comments dimension
|
|
9
|
+
#
|
|
10
|
+
# Handles comparison of comment nodes.
|
|
11
|
+
# Supports :strict and :ignore behaviors; compare_normalize additionally provides a whitespace-collapsing comparison.
|
|
12
|
+
#
|
|
13
|
+
# Behaviors:
|
|
14
|
+
# - :strict - Exact comment comparison including whitespace
|
|
15
|
+
# - :ignore - Skip comment comparison
|
|
16
|
+
class CommentsDimension < BaseDimension
|
|
17
|
+
# Extract comments from a node
|
|
18
|
+
#
|
|
19
|
+
# @param node [Moxml::Node, Nokogiri::XML::Node] Node to extract from
|
|
20
|
+
# @return [Array<String>] Array of comment strings
|
|
21
|
+
def extract_data(node)
  # A missing node contributes no comments.
  return [] unless node

  # Dispatch on the parser backend that produced the node; anything that is
  # neither a Moxml nor a Nokogiri node yields an empty list.
  case node
  when Moxml::Node then extract_from_moxml(node)
  when Nokogiri::XML::Node then extract_from_nokogiri(node)
  else []
  end
end
|
|
34
|
+
|
|
35
|
+
# Strict comment comparison
|
|
36
|
+
#
|
|
37
|
+
# @param comments1 [Array<String>] First comments array
|
|
38
|
+
# @param comments2 [Array<String>] Second comments array
|
|
39
|
+
# @return [Boolean] true if comments are exactly equal
|
|
40
|
+
def compare_strict(comments1, comments2)
  # Strict mode: the two comment lists must be equal element by element,
  # with no whitespace normalization applied.
  comments1 == comments2
end
|
|
43
|
+
|
|
44
|
+
# Normalized comment comparison
|
|
45
|
+
#
|
|
46
|
+
# For comments, normalized comparison collapses whitespace in each comment.
|
|
47
|
+
#
|
|
48
|
+
# @param comments1 [Array<String>] First comments array
|
|
49
|
+
# @param comments2 [Array<String>] Second comments array
|
|
50
|
+
# @return [Boolean] true if normalized comments are equal
|
|
51
|
+
def compare_normalize(comments1, comments2)
  # Normalize both sides first, then compare the resulting lists.
  left = normalize_comments(comments1)
  right = normalize_comments(comments2)
  left == right
end
|
|
54
|
+
|
|
55
|
+
private
|
|
56
|
+
|
|
57
|
+
# Extract comments from Moxml node
|
|
58
|
+
#
|
|
59
|
+
# @param node [Moxml::Node] Moxml node
|
|
60
|
+
# @return [Array<String>] Array of comment strings
|
|
61
|
+
def extract_from_moxml(node)
  # The node's own content comes first when the node is itself a comment.
  own = node.node_type == :comment ? [node.content] : []

  # Then the content of each direct child that is a comment, in document
  # order. Only direct children are inspected, not deeper descendants.
  nested = node.children
    .select { |child| child.node_type == :comment }
    .map(&:content)

  own + nested
end
|
|
76
|
+
|
|
77
|
+
# Extract comments from Nokogiri node
|
|
78
|
+
#
|
|
79
|
+
# @param node [Nokogiri::XML::Node] Nokogiri node
|
|
80
|
+
# @return [Array<String>] Array of comment strings
|
|
81
|
+
def extract_from_nokogiri(node)
  comment_type = Nokogiri::XML::Node::COMMENT_NODE

  # The node's own content comes first when the node is itself a comment.
  own = node.node_type == comment_type ? [node.content] : []

  # Then the content of each direct child that is a comment, in document
  # order. Only direct children are inspected, not deeper descendants.
  nested = node.children
    .select { |child| child.node_type == comment_type }
    .map(&:content)

  own + nested
end
|
|
98
|
+
|
|
99
|
+
# Normalize comments by collapsing whitespace
|
|
100
|
+
#
|
|
101
|
+
# @param comments [Array<String>] Comments to normalize
|
|
102
|
+
# @return [Array<String>] Normalized comments
|
|
103
|
+
def normalize_comments(comments)
  # Apply the per-string normalization to every comment, keeping order.
  comments.map { |comment| normalize_text(comment) }
end
|
|
106
|
+
|
|
107
|
+
# Normalize text by collapsing whitespace
|
|
108
|
+
#
|
|
109
|
+
# @param text [String, nil] Text to normalize
|
|
110
|
+
# @return [String] Normalized text
|
|
111
|
+
def normalize_text(text)
  # nil normalizes to the empty string. Otherwise every run of whitespace
  # (including the non-breaking space U+00A0) becomes a single ASCII space
  # and the result is trimmed at both ends.
  text.nil? ? "" : text.to_s.gsub(/[\p{Space}\u00a0]+/, " ").strip
end
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
end
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "base_dimension"
|
|
4
|
+
|
|
5
|
+
module Canon
|
|
6
|
+
module Comparison
|
|
7
|
+
module Dimensions
|
|
8
|
+
# Element position dimension
|
|
9
|
+
#
|
|
10
|
+
# Handles comparison of element positions within their parent.
|
|
11
|
+
# Supports :strict and :ignore behaviors.
|
|
12
|
+
#
|
|
13
|
+
# Behaviors:
|
|
14
|
+
# - :strict - Elements must appear in the same position (index)
|
|
15
|
+
# - :ignore - Element position doesn't matter
|
|
16
|
+
class ElementPositionDimension < BaseDimension
|
|
17
|
+
# Extract element position from a node
|
|
18
|
+
#
|
|
19
|
+
# Returns the index of this node among all children of its parent (0 when it has no parent).
|
|
20
|
+
#
|
|
21
|
+
# @param node [Moxml::Node, Nokogiri::XML::Node] Node to extract from
|
|
22
|
+
# @return [Integer] Position index (0-based)
|
|
23
|
+
def extract_data(node)
  # A missing node is reported at position 0.
  return 0 unless node

  # Dispatch on the parser backend that produced the node; anything that is
  # neither a Moxml nor a Nokogiri node is also reported at position 0.
  case node
  when Moxml::Node then extract_from_moxml(node)
  when Nokogiri::XML::Node then extract_from_nokogiri(node)
  else 0
  end
end
|
|
36
|
+
|
|
37
|
+
# Strict element position comparison
|
|
38
|
+
#
|
|
39
|
+
# @param pos1 [Integer] First position
|
|
40
|
+
# @param pos2 [Integer] Second position
|
|
41
|
+
# @return [Boolean] true if positions are equal
|
|
42
|
+
def compare_strict(pos1, pos2)
  # Strict mode: both nodes must sit at the same index.
  pos1 == pos2
end
|
|
45
|
+
|
|
46
|
+
private
|
|
47
|
+
|
|
48
|
+
# Extract position from Moxml node
|
|
49
|
+
#
|
|
50
|
+
# @param node [Moxml::Node] Moxml node
|
|
51
|
+
# @return [Integer] Position index
|
|
52
|
+
def extract_from_moxml(node)
  parent = node.parent
  return 0 unless parent

  # The position is counted across ALL children of the parent (text,
  # comments and elements alike), not only siblings sharing the node's name.
  # The previous bare `node.name` statement had no effect on the result and
  # is removed, so nodes without a #name method no longer raise here.
  # Falls back to 0 when the node is not found among the parent's children.
  parent.children.find_index { |sibling| sibling == node } || 0
end
|
|
67
|
+
|
|
68
|
+
# Extract position from Nokogiri node
|
|
69
|
+
#
|
|
70
|
+
# @param node [Nokogiri::XML::Node] Nokogiri node
|
|
71
|
+
# @return [Integer] Position index
|
|
72
|
+
def extract_from_nokogiri(node)
  parent = node.parent
  return 0 unless parent

  # The position is counted across ALL children of the parent (text,
  # comments and elements alike). The previous bare `node.name` statement had
  # no effect on the result and is removed.
  # Falls back to 0 when the node is not found among the parent's children.
  parent.children.find_index { |sibling| sibling == node } || 0
end
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "base_dimension"
|
|
4
|
+
require_relative "text_content_dimension"
|
|
5
|
+
require_relative "comments_dimension"
|
|
6
|
+
require_relative "attribute_values_dimension"
|
|
7
|
+
require_relative "attribute_presence_dimension"
|
|
8
|
+
require_relative "attribute_order_dimension"
|
|
9
|
+
require_relative "element_position_dimension"
|
|
10
|
+
require_relative "structural_whitespace_dimension"
|
|
11
|
+
|
|
12
|
+
module Canon
|
|
13
|
+
module Comparison
|
|
14
|
+
module Dimensions
|
|
15
|
+
# Registry for comparison dimensions
|
|
16
|
+
#
|
|
17
|
+
# Provides a central access point for all dimension classes
|
|
18
|
+
# and maps dimension symbols to their implementations.
|
|
19
|
+
module Registry
|
|
20
|
+
# Dimension class mappings
|
|
21
|
+
# Maps each dimension symbol to the class implementing it. The insertion
# order is the order reported by the key listing and by error messages.
DIMENSION_CLASSES = {
  text_content: TextContentDimension,
  comments: CommentsDimension,
  attribute_values: AttributeValuesDimension,
  attribute_presence: AttributePresenceDimension,
  attribute_order: AttributeOrderDimension,
  element_position: ElementPositionDimension,
  structural_whitespace: StructuralWhitespaceDimension,
}.freeze
|
|
30
|
+
|
|
31
|
+
# Get a dimension instance by name
|
|
32
|
+
#
|
|
33
|
+
# @param dimension_name [Symbol] Dimension name
|
|
34
|
+
# @return [BaseDimension] Dimension instance
|
|
35
|
+
# @raise [Canon::Error] if dimension is unknown
|
|
36
|
+
def self.get(dimension_name)
  # Look the class up; an unregistered name raises with the list of valid
  # dimension names.
  dimension_class = DIMENSION_CLASSES.fetch(dimension_name) do
    raise Canon::Error,
          "Unknown dimension: #{dimension_name}. " \
          "Valid dimensions: #{DIMENSION_CLASSES.keys.join(', ')}"
  end

  # A fresh instance is built on every call.
  dimension_class.new
end
|
|
47
|
+
|
|
48
|
+
# Get all available dimension names
|
|
49
|
+
#
|
|
50
|
+
# @return [Array<Symbol>] Available dimension names
|
|
51
|
+
def self.available_dimensions
  # Registered dimension names, in registration order.
  DIMENSION_CLASSES.keys
end
|
|
54
|
+
|
|
55
|
+
# Check if a dimension is available
|
|
56
|
+
#
|
|
57
|
+
# @param dimension_name [Symbol] Dimension name
|
|
58
|
+
# @return [Boolean] true if dimension is available
|
|
59
|
+
def self.dimension_exists?(dimension_name)
  # True only when the exact key is registered.
  DIMENSION_CLASSES.key?(dimension_name)
end
|
|
62
|
+
|
|
63
|
+
# Compare two nodes for a specific dimension
|
|
64
|
+
#
|
|
65
|
+
# @param dimension_name [Symbol] Dimension name
|
|
66
|
+
# @param node1 [Object] First node
|
|
67
|
+
# @param node2 [Object] Second node
|
|
68
|
+
# @param behavior [Symbol] Comparison behavior
|
|
69
|
+
# @return [Boolean] true if nodes match for this dimension
|
|
70
|
+
def self.compare(dimension_name, node1, node2, behavior)
  # Resolve the dimension (raises for unknown names) and delegate the
  # comparison to it.
  get(dimension_name).equivalent?(node1, node2, behavior)
end
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
end
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "base_dimension"
|
|
4
|
+
require_relative "../match_options"
|
|
5
|
+
|
|
6
|
+
module Canon
|
|
7
|
+
module Comparison
|
|
8
|
+
module Dimensions
|
|
9
|
+
# Structural whitespace dimension
|
|
10
|
+
#
|
|
11
|
+
# Handles comparison of structural whitespace (whitespace between elements).
|
|
12
|
+
# Supports :strict, :normalize, and :ignore behaviors.
|
|
13
|
+
#
|
|
14
|
+
# Behaviors:
|
|
15
|
+
# - :strict - Exact whitespace comparison
|
|
16
|
+
# - :normalize - Collapse whitespace and compare
|
|
17
|
+
# - :ignore - Skip structural whitespace comparison
|
|
18
|
+
class StructuralWhitespaceDimension < BaseDimension
|
|
19
|
+
# Extract structural whitespace from a node
|
|
20
|
+
#
|
|
21
|
+
# Returns whitespace text nodes that are between elements (structural).
|
|
22
|
+
#
|
|
23
|
+
# @param node [Moxml::Node, Nokogiri::XML::Node] Node to extract from
|
|
24
|
+
# @return [Array<String>] Array of structural whitespace strings
|
|
25
|
+
def extract_data(node)
  # A missing node contributes no whitespace entries.
  return [] unless node

  # Dispatch on the parser backend that produced the node; anything that is
  # neither a Moxml nor a Nokogiri node yields an empty list.
  case node
  when Moxml::Node then extract_from_moxml(node)
  when Nokogiri::XML::Node then extract_from_nokogiri(node)
  else []
  end
end
|
|
38
|
+
|
|
39
|
+
# Strict structural whitespace comparison
|
|
40
|
+
#
|
|
41
|
+
# @param ws1 [Array<String>] First whitespace array
|
|
42
|
+
# @param ws2 [Array<String>] Second whitespace array
|
|
43
|
+
# @return [Boolean] true if structural whitespace is exactly equal
|
|
44
|
+
def compare_strict(ws1, ws2)
  # Strict mode: the whitespace lists must match entry for entry.
  ws1 == ws2
end
|
|
47
|
+
|
|
48
|
+
# Normalized structural whitespace comparison
|
|
49
|
+
#
|
|
50
|
+
# Collapses whitespace in each entry and compares.
|
|
51
|
+
#
|
|
52
|
+
# @param ws1 [Array<String>] First whitespace array
|
|
53
|
+
# @param ws2 [Array<String>] Second whitespace array
|
|
54
|
+
# @return [Boolean] true if normalized structural whitespace is equal
|
|
55
|
+
def compare_normalize(ws1, ws2)
  # Normalize both sides first, then compare the resulting lists.
  left = normalize_whitespace(ws1)
  right = normalize_whitespace(ws2)
  left == right
end
|
|
58
|
+
|
|
59
|
+
private
|
|
60
|
+
|
|
61
|
+
# Extract structural whitespace from Moxml node
|
|
62
|
+
#
|
|
63
|
+
# @param node [Moxml::Node] Moxml node
|
|
64
|
+
# @return [Array<String>] Array of structural whitespace strings
|
|
65
|
+
def extract_from_moxml(node)
  blanks = []

  node.children.each do |child|
    next unless child.node_type == :text

    # A text child counts as structural when nothing remains after
    # stripping, i.e. it is empty or whitespace-only. The raw, unstripped
    # content is what gets collected.
    raw = child.content
    blanks << raw if raw.strip.empty?
  end

  blanks
end
|
|
80
|
+
|
|
81
|
+
# Extract structural whitespace from Nokogiri node
|
|
82
|
+
#
|
|
83
|
+
# @param node [Nokogiri::XML::Node] Nokogiri node
|
|
84
|
+
# @return [Array<String>] Array of structural whitespace strings
|
|
85
|
+
def extract_from_nokogiri(node)
  blanks = []

  node.children.each do |child|
    next unless child.node_type == Nokogiri::XML::Node::TEXT_NODE

    # A text child counts as structural when nothing remains after
    # stripping, i.e. it is empty or whitespace-only. The raw, unstripped
    # content is what gets collected.
    raw = child.content
    blanks << raw if raw.strip.empty?
  end

  blanks
end
|
|
100
|
+
|
|
101
|
+
# Normalize whitespace array
|
|
102
|
+
#
|
|
103
|
+
# @param whitespace [Array<String>] Whitespace strings
|
|
104
|
+
# @return [Array<String>] Normalized whitespace strings
|
|
105
|
+
def normalize_whitespace(whitespace)
  # Apply the per-string normalization to every entry, keeping order.
  whitespace.map { |entry| normalize_text(entry) }
end
|
|
108
|
+
|
|
109
|
+
# Normalize text
|
|
110
|
+
#
|
|
111
|
+
# @param text [String, nil] Text to normalize
|
|
112
|
+
# @return [String] Normalized text
|
|
113
|
+
def normalize_text(text)
  # Delegates to the shared MatchOptions normalizer.
  MatchOptions.normalize_text(text)
end
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
end
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "base_dimension"
|
|
4
|
+
require_relative "../match_options"
|
|
5
|
+
|
|
6
|
+
module Canon
|
|
7
|
+
module Comparison
|
|
8
|
+
module Dimensions
|
|
9
|
+
# Text content dimension
|
|
10
|
+
#
|
|
11
|
+
# Handles comparison of text content in nodes.
|
|
12
|
+
# Supports :strict, :normalize, and :ignore behaviors.
|
|
13
|
+
#
|
|
14
|
+
# Behaviors:
|
|
15
|
+
# - :strict - Exact text comparison including whitespace
|
|
16
|
+
# - :normalize - Collapse whitespace and compare
|
|
17
|
+
# - :ignore - Skip text content comparison
|
|
18
|
+
class TextContentDimension < BaseDimension
|
|
19
|
+
# Extract text content from a node
|
|
20
|
+
#
|
|
21
|
+
# @param node [Moxml::Node, Nokogiri::XML::Node] Node to extract from
|
|
22
|
+
# @return [String, nil] Text content or nil if not a text node
|
|
23
|
+
def extract_data(node)
  # A missing node has no text.
  return nil unless node

  # Dispatch on the parser backend that produced the node. Any other kind of
  # object falls through the case and yields nil.
  case node
  when Moxml::Node then extract_from_moxml(node)
  when Nokogiri::XML::Node then extract_from_nokogiri(node)
  end
end
|
|
34
|
+
|
|
35
|
+
# Strict text comparison
|
|
36
|
+
#
|
|
37
|
+
# @param text1 [String, nil] First text
|
|
38
|
+
# @param text2 [String, nil] Second text
|
|
39
|
+
# @return [Boolean] true if texts are exactly equal
|
|
40
|
+
def compare_strict(text1, text2)
  # Both sides are stringified first, so nil compares equal to "".
  left = text1.to_s
  right = text2.to_s
  left == right
end
|
|
43
|
+
|
|
44
|
+
# Normalized text comparison
|
|
45
|
+
#
|
|
46
|
+
# Collapses whitespace and compares.
|
|
47
|
+
#
|
|
48
|
+
# @param text1 [String, nil] First text
|
|
49
|
+
# @param text2 [String, nil] Second text
|
|
50
|
+
# @return [Boolean] true if normalized texts are equal
|
|
51
|
+
def compare_normalize(text1, text2)
  # Normalize both sides first, then compare.
  left = normalize_text(text1)
  right = normalize_text(text2)
  left == right
end
|
|
54
|
+
|
|
55
|
+
private
|
|
56
|
+
|
|
57
|
+
# Extract text from Moxml node
|
|
58
|
+
#
|
|
59
|
+
# @param node [Moxml::Node] Moxml node
|
|
60
|
+
# @return [String, nil] Text content
|
|
61
|
+
def extract_from_moxml(node)
  kind = node.node_type

  # Text and CDATA nodes carry their text in #content.
  return node.content if %i[text cdata].include?(kind)

  # Elements report their text via #text; every other node type yields nil.
  node.text if kind == :element
end
|
|
70
|
+
|
|
71
|
+
# Extract text from Nokogiri node
|
|
72
|
+
#
|
|
73
|
+
# @param node [Nokogiri::XML::Node] Nokogiri node
|
|
74
|
+
# @return [String, nil] Text content
|
|
75
|
+
def extract_from_nokogiri(node)
  # Text, CDATA and element nodes all report their text through #content;
  # every other node type yields nil.
  textual_types = [
    Nokogiri::XML::Node::TEXT_NODE,
    Nokogiri::XML::Node::CDATA_SECTION_NODE,
    Nokogiri::XML::Node::ELEMENT_NODE,
  ]

  node.content if textual_types.include?(node.node_type)
end
|
|
83
|
+
|
|
84
|
+
# Normalize text by collapsing whitespace
|
|
85
|
+
#
|
|
86
|
+
# Uses MatchOptions.normalize_text for consistency.
|
|
87
|
+
#
|
|
88
|
+
# @param text [String, nil] Text to normalize
|
|
89
|
+
# @return [String] Normalized text
|
|
90
|
+
def normalize_text(text)
  # Delegates to the shared MatchOptions normalizer.
  MatchOptions.normalize_text(text)
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
end
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Comparison dimensions
|
|
4
|
+
#
|
|
5
|
+
# Provides dimension classes for comparing specific aspects of documents.
|
|
6
|
+
# Each dimension knows how to extract and compare data according to different behaviors.
|
|
7
|
+
#
|
|
8
|
+
# == Architecture
|
|
9
|
+
#
|
|
10
|
+
# Dimensions represent "WHAT to compare" - specific aspects of a document that can be compared:
|
|
11
|
+
# - Text content
|
|
12
|
+
# - Comments
|
|
13
|
+
# - Attribute values
|
|
14
|
+
# - Attribute presence
|
|
15
|
+
# - Attribute order
|
|
16
|
+
# - Element position
|
|
17
|
+
# - Structural whitespace
|
|
18
|
+
#
|
|
19
|
+
# == Behaviors
|
|
20
|
+
#
|
|
21
|
+
# Each dimension supports comparison behaviors:
|
|
22
|
+
# - :strict - Exact comparison
|
|
23
|
+
# - :normalize - Normalized comparison (e.g., collapse whitespace)
|
|
24
|
+
# - :ignore - Skip comparison
|
|
25
|
+
#
|
|
26
|
+
# == Usage
|
|
27
|
+
#
|
|
28
|
+
# # Get a dimension instance
|
|
29
|
+
# dimension = Canon::Comparison::Dimensions::Registry.get(:text_content)
|
|
30
|
+
#
|
|
31
|
+
# # Compare two nodes
|
|
32
|
+
# dimension.equivalent?(node1, node2, :normalize)
|
|
33
|
+
#
|
|
34
|
+
# # Or use the registry directly
|
|
35
|
+
# Canon::Comparison::Dimensions::Registry.compare(:text_content, node1, node2, :normalize)
|
|
36
|
+
|
|
37
|
+
require_relative "dimensions/base_dimension"
|
|
38
|
+
require_relative "dimensions/registry"
|
|
39
|
+
require_relative "dimensions/text_content_dimension"
|
|
40
|
+
require_relative "dimensions/comments_dimension"
|
|
41
|
+
require_relative "dimensions/attribute_values_dimension"
|
|
42
|
+
require_relative "dimensions/attribute_presence_dimension"
|
|
43
|
+
require_relative "dimensions/attribute_order_dimension"
|
|
44
|
+
require_relative "dimensions/element_position_dimension"
|
|
45
|
+
require_relative "dimensions/structural_whitespace_dimension"
|
|
46
|
+
|
|
47
|
+
module Canon
  module Comparison
    # Namespace for the comparison dimension classes and their registry.
    module Dimensions
      # Version string of the dimensions module.
      VERSION = "1.0.0"
    end
  end
end
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
|
|
4
|
+
module Comparison
|
|
5
|
+
# Format detection service for auto-detecting document formats
|
|
6
|
+
#
|
|
7
|
+
# Provides format detection for various document types including XML, HTML,
|
|
8
|
+
# JSON, YAML, and plain text. Uses caching for performance optimization.
|
|
9
|
+
#
|
|
10
|
+
# @example Detect format from a string
|
|
11
|
+
# FormatDetector.detect("<root>content</root>") # => :xml
|
|
12
|
+
#
|
|
13
|
+
# @example Detect format from an object
|
|
14
|
+
# FormatDetector.detect(Moxml::Document.new) # => :xml
|
|
15
|
+
class FormatDetector
|
|
16
|
+
# Supported format types
|
|
17
|
+
FORMATS = %i[xml html json yaml ruby_object string].freeze
|
|
18
|
+
|
|
19
|
+
class << self
|
|
20
|
+
# Detect the format of an object
|
|
21
|
+
#
|
|
22
|
+
# @param obj [Object] Object to detect format of
|
|
23
|
+
# @return [Symbol] Format type (:xml, :html, :json, :yaml, :ruby_object, :string)
|
|
24
|
+
def detect(obj)
  case obj
  when Moxml::Node, Moxml::Document
    :xml
  when Nokogiri::HTML::DocumentFragment, Nokogiri::HTML5::DocumentFragment
    # HTML fragments are matched before the generic XML fragment class.
    :html
  when Nokogiri::XML::DocumentFragment
    # An XML fragment may still belong to an HTML document.
    obj.document&.html? ? :html : :xml
  when Nokogiri::HTML::Document, Nokogiri::HTML5::Document
    # Must precede the Nokogiri::XML::Document branch: the HTML document
    # classes inherit from it, so in the previous ordering this branch could
    # never be reached.
    :html
  when Nokogiri::XML::Document, Nokogiri::XML::Node
    # NOTE(review): for a non-document node that lives inside an HTML
    # document, `html?` may report false; confirm whether
    # `obj.document&.html?` (as used for fragments) is what is intended.
    obj.html? ? :html : :xml
  when String
    detect_string(obj)
  when Hash, Array
    # Raw Ruby objects (e.g. from parsed JSON/YAML).
    :ruby_object
  else
    raise Canon::Error, "Unknown format for object: #{obj.class}"
  end
end
|
|
48
|
+
|
|
49
|
+
# Detect the format of a string with caching
|
|
50
|
+
#
|
|
51
|
+
# @param str [String] String to detect format of
|
|
52
|
+
# @return [Symbol] Format type
|
|
53
|
+
def detect_string(str)
  # Look the result up in the :format_detect cache category; the block runs
  # the uncached detection when Cache.fetch yields.
  cache_key = Cache.key_for_format_detection(str)
  Cache.fetch(:format_detect, cache_key) { detect_string_uncached(str) }
end
|
|
59
|
+
|
|
60
|
+
# Detect the format of a string without caching
|
|
61
|
+
#
|
|
62
|
+
# @param str [String] String to detect format of
|
|
63
|
+
# @return [Symbol] Format type
|
|
64
|
+
def detect_string_uncached(str)
  trimmed = str.strip

  # An explicit document-start marker is treated as YAML.
  return :yaml if trimmed.start_with?("---")

  # Markup indicators, computed up front so they can veto the YAML heuristic.
  # The HTML prefix is matched case-insensitively so that the common
  # lowercase `<!doctype html>` is recognised as HTML rather than XML.
  html_like = trimmed.match?(/\A(?:<!DOCTYPE html|<html)/i)
  # XML must start with < and end with >.
  xml_like = trimmed.start_with?("<") && trimmed.end_with?(">")

  # YAML "key: value" heuristic. `^` matches at the start of ANY line (so a
  # leading YAML comment line does not defeat it), which also means a
  # "word: " line inside markup (inline CSS, body text) would match. It is
  # therefore applied only when the string does not look like markup.
  unless html_like || xml_like
    return :yaml if trimmed.match?(/^[a-zA-Z_]\w*:\s/)
  end

  # JSON indicators.
  return :json if trimmed.start_with?("{", "[")

  return :html if html_like
  return :xml if xml_like

  # Default to plain string for everything else.
  :string
end
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
end
|
|
87
|
+
end
|