canon 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +69 -92
  3. data/README.adoc +13 -13
  4. data/docs/.lycheeignore +69 -0
  5. data/docs/Gemfile +1 -0
  6. data/docs/_config.yml +90 -1
  7. data/docs/advanced/diff-classification.adoc +82 -2
  8. data/docs/advanced/extending-canon.adoc +193 -0
  9. data/docs/features/match-options/index.adoc +239 -1
  10. data/docs/internals/diffnode-enrichment.adoc +611 -0
  11. data/docs/internals/index.adoc +251 -0
  12. data/docs/lychee.toml +13 -6
  13. data/docs/understanding/architecture.adoc +749 -33
  14. data/docs/understanding/comparison-pipeline.adoc +122 -0
  15. data/lib/canon/cache.rb +129 -0
  16. data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
  17. data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
  18. data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
  19. data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
  20. data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
  21. data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
  22. data/lib/canon/comparison/dimensions/registry.rb +77 -0
  23. data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
  24. data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
  25. data/lib/canon/comparison/dimensions.rb +54 -0
  26. data/lib/canon/comparison/format_detector.rb +87 -0
  27. data/lib/canon/comparison/html_comparator.rb +70 -26
  28. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  29. data/lib/canon/comparison/html_parser.rb +80 -0
  30. data/lib/canon/comparison/json_comparator.rb +12 -0
  31. data/lib/canon/comparison/json_parser.rb +19 -0
  32. data/lib/canon/comparison/markup_comparator.rb +293 -0
  33. data/lib/canon/comparison/match_options/base_resolver.rb +150 -0
  34. data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
  35. data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
  36. data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
  37. data/lib/canon/comparison/match_options.rb +68 -463
  38. data/lib/canon/comparison/profile_definition.rb +149 -0
  39. data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
  40. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
  41. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  42. data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
  43. data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
  44. data/lib/canon/comparison/xml_comparator/child_comparison.rb +197 -0
  45. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
  46. data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
  47. data/lib/canon/comparison/xml_comparator/node_parser.rb +79 -0
  48. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +102 -0
  49. data/lib/canon/comparison/xml_comparator.rb +97 -684
  50. data/lib/canon/comparison/xml_node_comparison.rb +319 -0
  51. data/lib/canon/comparison/xml_parser.rb +19 -0
  52. data/lib/canon/comparison/yaml_comparator.rb +3 -3
  53. data/lib/canon/comparison.rb +265 -110
  54. data/lib/canon/diff/diff_classifier.rb +101 -2
  55. data/lib/canon/diff/diff_node.rb +32 -2
  56. data/lib/canon/diff/formatting_detector.rb +1 -1
  57. data/lib/canon/diff/node_serializer.rb +191 -0
  58. data/lib/canon/diff/path_builder.rb +143 -0
  59. data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
  60. data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
  61. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
  62. data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
  63. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
  64. data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
  65. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
  66. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
  67. data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
  68. data/lib/canon/diff_formatter.rb +1 -1
  69. data/lib/canon/rspec_matchers.rb +38 -9
  70. data/lib/canon/tree_diff/operation_converter.rb +92 -338
  71. data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
  72. data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
  73. data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
  74. data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
  75. data/lib/canon/version.rb +1 -1
  76. data/lib/canon/xml/data_model.rb +24 -13
  77. metadata +48 -2
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base_dimension"
4
+
5
+ module Canon
6
+ module Comparison
7
+ module Dimensions
8
+ # Comments dimension
9
+ #
10
+ # Handles comparison of comment nodes.
11
+ # Supports :strict and :ignore behaviors; a whitespace-collapsing :normalize comparison is also implemented (see compare_normalize).
12
+ #
13
+ # Behaviors:
14
+ # - :strict - Exact comment comparison including whitespace
15
+ # - :ignore - Skip comment comparison
16
+ class CommentsDimension < BaseDimension
17
# Collect the comment strings associated with +node+.
#
# Dispatches to the helper matching the node's backing library. A missing
# node, or an object that is neither a Moxml nor a Nokogiri node, yields
# an empty list.
#
# @param node [Moxml::Node, Nokogiri::XML::Node, nil] Node to inspect
# @return [Array<String>] Comment strings found for the node
def extract_data(node)
  return [] unless node
  return extract_from_moxml(node) if node.is_a?(Moxml::Node)
  return extract_from_nokogiri(node) if node.is_a?(Nokogiri::XML::Node)

  []
end
34
+
35
# Exact comparison of two comment lists.
#
# Both the order of the comments and every character in them
# (whitespace included) must match.
#
# @param comments1 [Array<String>] Comments from the first node
# @param comments2 [Array<String>] Comments from the second node
# @return [Boolean] true when both lists are equal
def compare_strict(comments1, comments2)
  comments1 == comments2
end
43
+
44
# Whitespace-insensitive comparison of two comment lists.
#
# Each comment is normalized via normalize_comments before the lists
# are compared element by element.
#
# @param comments1 [Array<String>] Comments from the first node
# @param comments2 [Array<String>] Comments from the second node
# @return [Boolean] true when the normalized lists are equal
def compare_normalize(comments1, comments2)
  left = normalize_comments(comments1)
  right = normalize_comments(comments2)
  left == right
end
54
+
55
+ private
56
+
57
# Gather comment content from a Moxml node.
#
# The node's own content comes first when the node itself is a comment,
# followed by the content of each direct child whose node_type is
# :comment. Deeper descendants are not visited.
#
# @param node [Moxml::Node] Moxml node
# @return [Array<String>] Comment strings in document order
def extract_from_moxml(node)
  found = node.node_type == :comment ? [node.content] : []

  node.children.each do |child|
    next unless child.node_type == :comment

    found << child.content
  end

  found
end
76
+
77
# Gather comment content from a Nokogiri node.
#
# The node's own content comes first when the node itself is a comment
# node, followed by the content of each direct child that is a comment
# node. Deeper descendants are not visited.
#
# @param node [Nokogiri::XML::Node] Nokogiri node
# @return [Array<String>] Comment strings in document order
def extract_from_nokogiri(node)
  found = []
  found << node.content if node.node_type == Nokogiri::XML::Node::COMMENT_NODE

  node.children.each do |child|
    next unless child.node_type == Nokogiri::XML::Node::COMMENT_NODE

    found << child.content
  end

  found
end
98
+
99
# Apply normalize_text to every comment in the list.
#
# @param comments [Array<String>] Comments to normalize
# @return [Array<String>] Normalized comments, same order and length
def normalize_comments(comments)
  comments.map { |comment| normalize_text(comment) }
end
106
+
107
# Collapse whitespace in a single comment.
#
# Every run of whitespace (including no-break spaces) becomes one ASCII
# space, and leading/trailing whitespace is removed. nil becomes "".
#
# @param text [String, nil] Text to normalize
# @return [String] Normalized text
def normalize_text(text)
  text.nil? ? "" : text.to_s.gsub(/[\p{Space}\u00a0]+/, " ").strip
end
118
+ end
119
+ end
120
+ end
121
+ end
@@ -0,0 +1,90 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base_dimension"
4
+
5
+ module Canon
6
+ module Comparison
7
+ module Dimensions
8
+ # Element position dimension
9
+ #
10
+ # Handles comparison of element positions within their parent.
11
+ # Supports :strict and :ignore behaviors.
12
+ #
13
+ # Behaviors:
14
+ # - :strict - Elements must appear in the same position (index)
15
+ # - :ignore - Element position doesn't matter
16
+ class ElementPositionDimension < BaseDimension
17
# Determine the position of +node+ within its parent.
#
# Dispatches to the helper matching the node's backing library. A
# missing node, or an object that is neither a Moxml nor a Nokogiri
# node, is reported as position 0.
#
# @param node [Moxml::Node, Nokogiri::XML::Node, nil] Node to inspect
# @return [Integer] Position index (0-based)
def extract_data(node)
  return 0 unless node
  return extract_from_moxml(node) if node.is_a?(Moxml::Node)
  return extract_from_nokogiri(node) if node.is_a?(Nokogiri::XML::Node)

  0
end
36
+
37
# Exact comparison of two element positions.
#
# @param pos1 [Integer] Position of the first node
# @param pos2 [Integer] Position of the second node
# @return [Boolean] true when both positions are the same
def compare_strict(pos1, pos2)
  pos1 == pos2
end
45
+
46
+ private
47
+
48
# Extract position from a Moxml node.
#
# The position is the node's 0-based index among ALL children of its
# parent (not only same-named siblings). A node without a parent, or one
# that cannot be found among the parent's children, is reported as 0.
#
# The previous implementation also evaluated +node.name+ and discarded
# the result; that stray call is removed because it had no effect on the
# result and would raise for any node object that does not respond to
# +name+.
#
# @param node [Moxml::Node] Moxml node
# @return [Integer] Position index (0-based)
def extract_from_moxml(node)
  parent = node.parent
  return 0 unless parent

  # Block form keeps the original `sibling == node` operand order.
  parent.children.find_index { |sibling| sibling == node } || 0
end
67
+
68
# Extract position from a Nokogiri node.
#
# The position is the node's 0-based index among ALL children of its
# parent. A node without a parent, or one that cannot be found among the
# parent's children, is reported as 0.
#
# The previous implementation also evaluated +node.name+ and discarded
# the result; that dead statement is removed.
#
# @param node [Nokogiri::XML::Node] Nokogiri node
# @return [Integer] Position index (0-based)
def extract_from_nokogiri(node)
  parent = node.parent
  return 0 unless parent

  # Block form keeps the original `sibling == node` operand order.
  parent.children.find_index { |sibling| sibling == node } || 0
end
87
+ end
88
+ end
89
+ end
90
+ end
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base_dimension"
4
+ require_relative "text_content_dimension"
5
+ require_relative "comments_dimension"
6
+ require_relative "attribute_values_dimension"
7
+ require_relative "attribute_presence_dimension"
8
+ require_relative "attribute_order_dimension"
9
+ require_relative "element_position_dimension"
10
+ require_relative "structural_whitespace_dimension"
11
+
12
+ module Canon
13
+ module Comparison
14
+ module Dimensions
15
+ # Registry for comparison dimensions
16
+ #
17
+ # Provides a central access point for all dimension classes
18
+ # and maps dimension symbols to their implementations.
19
+ module Registry
20
+ # Dimension class mappings
21
+ DIMENSION_CLASSES = {
22
+ text_content: TextContentDimension,
23
+ comments: CommentsDimension,
24
+ attribute_values: AttributeValuesDimension,
25
+ attribute_presence: AttributePresenceDimension,
26
+ attribute_order: AttributeOrderDimension,
27
+ element_position: ElementPositionDimension,
28
+ structural_whitespace: StructuralWhitespaceDimension,
29
+ }.freeze
30
+
31
# Build a fresh dimension instance for the given name.
#
# Looks the name up in DIMENSION_CLASSES and instantiates the mapped
# class with no arguments; a new object is created on every call.
#
# @param dimension_name [Symbol] Dimension name
# @return [BaseDimension] Dimension instance
# @raise [Canon::Error] if no class is registered under the name
def self.get(dimension_name)
  dimension_class = DIMENSION_CLASSES[dimension_name]
  return dimension_class.new if dimension_class

  raise Canon::Error,
        "Unknown dimension: #{dimension_name}. " \
        "Valid dimensions: #{DIMENSION_CLASSES.keys.join(', ')}"
end
47
+
48
# List the names of every registered dimension.
#
# @return [Array<Symbol>] Keys of DIMENSION_CLASSES, in definition order
def self.available_dimensions
  DIMENSION_CLASSES.keys
end
54
+
55
# Tell whether a dimension is registered under the given name.
#
# @param dimension_name [Symbol] Dimension name
# @return [Boolean] true when DIMENSION_CLASSES has that key
def self.dimension_exists?(dimension_name)
  DIMENSION_CLASSES.key?(dimension_name)
end
62
+
63
# Compare two nodes along a single named dimension.
#
# Convenience wrapper: obtains the dimension through .get and forwards
# the nodes and behavior to its equivalent? method (defined outside this
# file section, presumably on BaseDimension).
#
# @param dimension_name [Symbol] Dimension name
# @param node1 [Object] First node
# @param node2 [Object] Second node
# @param behavior [Symbol] Comparison behavior
# @return [Boolean] true if nodes match for this dimension
# @raise [Canon::Error] if the dimension name is unknown (raised by .get)
def self.compare(dimension_name, node1, node2, behavior)
  get(dimension_name).equivalent?(node1, node2, behavior)
end
74
+ end
75
+ end
76
+ end
77
+ end
@@ -0,0 +1,119 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base_dimension"
4
+ require_relative "../match_options"
5
+
6
+ module Canon
7
+ module Comparison
8
+ module Dimensions
9
+ # Structural whitespace dimension
10
+ #
11
+ # Handles comparison of structural whitespace (whitespace between elements).
12
+ # Supports :strict, :normalize, and :ignore behaviors.
13
+ #
14
+ # Behaviors:
15
+ # - :strict - Exact whitespace comparison
16
+ # - :normalize - Collapse whitespace and compare
17
+ # - :ignore - Skip structural whitespace comparison
18
+ class StructuralWhitespaceDimension < BaseDimension
19
# Collect the structural (whitespace-only) text found under +node+.
#
# Dispatches to the helper matching the node's backing library. A
# missing node, or an object that is neither a Moxml nor a Nokogiri
# node, yields an empty list.
#
# @param node [Moxml::Node, Nokogiri::XML::Node, nil] Node to inspect
# @return [Array<String>] Array of structural whitespace strings
def extract_data(node)
  return [] unless node
  return extract_from_moxml(node) if node.is_a?(Moxml::Node)
  return extract_from_nokogiri(node) if node.is_a?(Nokogiri::XML::Node)

  []
end
38
+
39
# Exact comparison of two structural-whitespace lists.
#
# @param ws1 [Array<String>] Whitespace strings from the first node
# @param ws2 [Array<String>] Whitespace strings from the second node
# @return [Boolean] true when both lists are equal entry for entry
def compare_strict(ws1, ws2)
  ws1 == ws2
end
47
+
48
# Normalized comparison of two structural-whitespace lists.
#
# Every entry is run through normalize_whitespace first, then the
# resulting lists are compared.
#
# @param ws1 [Array<String>] Whitespace strings from the first node
# @param ws2 [Array<String>] Whitespace strings from the second node
# @return [Boolean] true when the normalized lists are equal
def compare_normalize(ws1, ws2)
  left = normalize_whitespace(ws1)
  right = normalize_whitespace(ws2)
  left == right
end
58
+
59
+ private
60
+
61
# Collect whitespace-only text children of a Moxml node.
#
# Only direct children whose node_type is :text are considered; a child
# is kept (with its content unchanged) when stripping that content
# leaves nothing behind.
#
# @param node [Moxml::Node] Moxml node
# @return [Array<String>] Array of structural whitespace strings
def extract_from_moxml(node)
  blanks = []

  node.children.each do |child|
    next unless child.node_type == :text

    raw = child.content
    # Anything matching /\A\s*\z/ also strips to "", so one check suffices.
    blanks << raw if raw.strip.empty?
  end

  blanks
end
80
+
81
# Collect whitespace-only text children of a Nokogiri node.
#
# Only direct children that are text nodes are considered; a child is
# kept (with its content unchanged) when stripping that content leaves
# nothing behind.
#
# @param node [Nokogiri::XML::Node] Nokogiri node
# @return [Array<String>] Array of structural whitespace strings
def extract_from_nokogiri(node)
  blanks = []

  node.children.each do |child|
    next unless child.node_type == Nokogiri::XML::Node::TEXT_NODE

    raw = child.content
    # Anything matching /\A\s*\z/ also strips to "", so one check suffices.
    blanks << raw if raw.strip.empty?
  end

  blanks
end
100
+
101
# Apply normalize_text to every entry of a whitespace list.
#
# @param whitespace [Array<String>] Whitespace strings
# @return [Array<String>] Normalized strings, same order and length
def normalize_whitespace(whitespace)
  whitespace.map { |entry| normalize_text(entry) }
end
108
+
109
# Normalize a single string.
#
# Thin delegation to MatchOptions.normalize_text, which is defined
# outside this section (presumably it collapses whitespace; verify there).
#
# @param text [String, nil] Text to normalize
# @return [String] Normalized text
def normalize_text(text)
  MatchOptions.normalize_text(text)
end
116
+ end
117
+ end
118
+ end
119
+ end
@@ -0,0 +1,96 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base_dimension"
4
+ require_relative "../match_options"
5
+
6
+ module Canon
7
+ module Comparison
8
+ module Dimensions
9
+ # Text content dimension
10
+ #
11
+ # Handles comparison of text content in nodes.
12
+ # Supports :strict, :normalize, and :ignore behaviors.
13
+ #
14
+ # Behaviors:
15
+ # - :strict - Exact text comparison including whitespace
16
+ # - :normalize - Collapse whitespace and compare
17
+ # - :ignore - Skip text content comparison
18
+ class TextContentDimension < BaseDimension
19
# Obtain the text content of +node+.
#
# Dispatches to the helper matching the node's backing library. Returns
# nil for a missing node and for objects that are neither Moxml nor
# Nokogiri nodes.
#
# @param node [Moxml::Node, Nokogiri::XML::Node, nil] Node to inspect
# @return [String, nil] Text content, or nil when none can be extracted
def extract_data(node)
  return nil unless node
  return extract_from_moxml(node) if node.is_a?(Moxml::Node)

  extract_from_nokogiri(node) if node.is_a?(Nokogiri::XML::Node)
end
34
+
35
# Exact comparison of two text values.
#
# Both sides are converted with to_s first, so nil and "" are treated
# as equal.
#
# @param text1 [String, nil] First text
# @param text2 [String, nil] Second text
# @return [Boolean] true when the string forms are identical
def compare_strict(text1, text2)
  text1.to_s == text2.to_s
end
43
+
44
# Normalized comparison of two text values.
#
# Each side is passed through normalize_text before comparing.
#
# @param text1 [String, nil] First text
# @param text2 [String, nil] Second text
# @return [Boolean] true when the normalized texts are equal
def compare_normalize(text1, text2)
  left = normalize_text(text1)
  right = normalize_text(text2)
  left == right
end
54
+
55
+ private
56
+
57
# Read text from a Moxml node according to its node_type.
#
# :text and :cdata nodes return their content; :element nodes return
# node.text (the element's text as reported by Moxml). Any other node
# type yields nil.
#
# @param node [Moxml::Node] Moxml node
# @return [String, nil] Text content
def extract_from_moxml(node)
  case node.node_type
  when :text, :cdata then node.content
  when :element then node.text
  end
end
70
+
71
# Read text from a Nokogiri node according to its node_type.
#
# Text, CDATA and element nodes all return node.content; any other node
# type yields nil.
#
# @param node [Nokogiri::XML::Node] Nokogiri node
# @return [String, nil] Text content
def extract_from_nokogiri(node)
  case node.node_type
  when Nokogiri::XML::Node::TEXT_NODE,
       Nokogiri::XML::Node::CDATA_SECTION_NODE,
       Nokogiri::XML::Node::ELEMENT_NODE
    node.content
  end
end
83
+
84
# Normalize a single text value.
#
# Thin delegation to MatchOptions.normalize_text so that this dimension
# shares one normalization routine with other callers of that method.
# Its exact rules live outside this section (presumably whitespace
# collapsing; verify there).
#
# @param text [String, nil] Text to normalize
# @return [String] Normalized text
def normalize_text(text)
  MatchOptions.normalize_text(text)
end
93
+ end
94
+ end
95
+ end
96
+ end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Comparison dimensions
4
+ #
5
+ # Provides dimension classes for comparing specific aspects of documents.
6
+ # Each dimension knows how to extract and compare data according to different behaviors.
7
+ #
8
+ # == Architecture
9
+ #
10
+ # Dimensions represent "WHAT to compare" - specific aspects of a document that can be compared:
11
+ # - Text content
12
+ # - Comments
13
+ # - Attribute values
14
+ # - Attribute presence
15
+ # - Attribute order
16
+ # - Element position
17
+ # - Structural whitespace
18
+ #
19
+ # == Behaviors
20
+ #
21
+ # Each dimension supports comparison behaviors:
22
+ # - :strict - Exact comparison
23
+ # - :normalize - Normalized comparison (e.g., collapse whitespace)
24
+ # - :ignore - Skip comparison
25
+ #
26
+ # == Usage
27
+ #
28
+ # # Get a dimension instance
29
+ # dimension = Canon::Comparison::Dimensions::Registry.get(:text_content)
30
+ #
31
+ # # Compare two nodes
32
+ # dimension.equivalent?(node1, node2, :normalize)
33
+ #
34
+ # # Or use the registry directly
35
+ # Canon::Comparison::Dimensions::Registry.compare(:text_content, node1, node2, :normalize)
36
+
37
+ require_relative "dimensions/base_dimension"
38
+ require_relative "dimensions/registry"
39
+ require_relative "dimensions/text_content_dimension"
40
+ require_relative "dimensions/comments_dimension"
41
+ require_relative "dimensions/attribute_values_dimension"
42
+ require_relative "dimensions/attribute_presence_dimension"
43
+ require_relative "dimensions/attribute_order_dimension"
44
+ require_relative "dimensions/element_position_dimension"
45
+ require_relative "dimensions/structural_whitespace_dimension"
46
+
47
+ module Canon
48
+ module Comparison
49
+ module Dimensions
50
+ # Version constant for the dimensions module
51
+ VERSION = "1.0.0"
52
+ end
53
+ end
54
+ end
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Canon
4
+ module Comparison
5
+ # Format detection service for auto-detecting document formats
6
+ #
7
+ # Provides format detection for various document types including XML, HTML,
8
+ # JSON, YAML, and plain text. Uses caching for performance optimization.
9
+ #
10
+ # @example Detect format from a string
11
+ # FormatDetector.detect("<root>content</root>") # => :xml
12
+ #
13
+ # @example Detect format from an object
14
+ # FormatDetector.detect(Moxml::Document.new) # => :xml
15
+ class FormatDetector
16
+ # Supported format types
17
+ FORMATS = %i[xml html json yaml ruby_object string].freeze
18
+
19
+ class << self
20
# Classify an object by the document format it represents.
#
# Branch order matters because the first matching `when` wins.
#
# @param obj [Object] Object to detect format of
# @return [Symbol] Format type (:xml, :html, :json, :yaml, :ruby_object, :string)
# @raise [Canon::Error] if the object's class is not recognised
def detect(obj)
  case obj
  when Moxml::Node, Moxml::Document then :xml
  when Nokogiri::HTML::DocumentFragment, Nokogiri::HTML5::DocumentFragment then :html
  when Nokogiri::XML::DocumentFragment
    # Classify the fragment by its owning document; no document counts as XML.
    obj.document&.html? ? :html : :xml
  when Nokogiri::XML::Document, Nokogiri::XML::Node
    # The object itself reports whether it is HTML.
    obj.html? ? :html : :xml
  when Nokogiri::HTML::Document, Nokogiri::HTML5::Document
    # NOTE(review): likely unreachable if Nokogiri's HTML documents subclass
    # Nokogiri::XML::Document (handled by the branch above) - confirm.
    :html
  when String then detect_string(obj)
  when Hash, Array then :ruby_object # raw Ruby data, e.g. parsed JSON/YAML
  else
    raise Canon::Error, "Unknown format for object: #{obj.class}"
  end
end
48
+
49
# Detect the format of a string, going through the cache.
#
# The lookup uses the :format_detect cache category with a key derived
# from the string by Cache.key_for_format_detection; the block passed to
# Cache.fetch computes the answer with detect_string_uncached
# (presumably only on a cache miss - Cache is defined elsewhere).
#
# @param str [String] String to detect format of
# @return [Symbol] Format type
def detect_string(str)
  cache_key = Cache.key_for_format_detection(str)
  Cache.fetch(:format_detect, cache_key) { detect_string_uncached(str) }
end
59
+
60
# Detect the format of a string without caching.
#
# Heuristics are applied to the stripped string in this order:
#
# 1. a leading "---" document marker                       => :yaml
# 2. a leading "<" that opens an HTML doctype or <html tag => :html
# 3. a leading "<" together with a trailing ">"            => :xml
# 4. any line that starts with "key: "                     => :yaml
# 5. a leading "{" or "["                                  => :json
# 6. everything else                                       => :string
#
# Markup is tested before the per-line YAML heuristic (4) because that
# heuristic matches at the start of ANY line; otherwise multi-line XML or
# HTML whose text happens to contain a line such as "Note: text" would be
# reported as YAML. The HTML test is case-insensitive so the common
# lowercase "<!doctype html>" is recognised as HTML rather than XML.
#
# @param str [String] String to detect format of
# @return [Symbol] Format type
def detect_string_uncached(str)
  trimmed = str.strip

  # An explicit document-start marker is unambiguous YAML.
  return :yaml if trimmed.start_with?("---")

  if trimmed.start_with?("<")
    # HTML indicators: doctype or root <html> tag, in any letter case.
    return :html if trimmed.match?(/\A<(?:!doctype\s+html|html)/i)

    # XML indicators - must start with < and end with >
    return :xml if trimmed.end_with?(">")
  end

  # Loose YAML indicator: some line begins with an unquoted "key: ".
  return :yaml if trimmed.match?(/^[a-zA-Z_]\w*:\s/)

  # JSON indicators
  return :json if trimmed.start_with?("{", "[")

  # Default to plain string for everything else
  :string
end
84
+ end
85
+ end
86
+ end
87
+ end