canon 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +163 -67
  3. data/README.adoc +400 -7
  4. data/docs/Gemfile +9 -0
  5. data/docs/INDEX.adoc +99 -182
  6. data/docs/_config.yml +100 -0
  7. data/docs/advanced/diff-classification.adoc +547 -0
  8. data/docs/advanced/diff-pipeline.adoc +358 -0
  9. data/docs/advanced/index.adoc +214 -0
  10. data/docs/advanced/semantic-diff-report.adoc +390 -0
  11. data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
  12. data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
  13. data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
  14. data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
  15. data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
  16. data/docs/features/diff-formatting/display-filtering.adoc +472 -0
  17. data/docs/features/diff-formatting/index.adoc +140 -0
  18. data/docs/features/environment-configuration/index.adoc +327 -0
  19. data/docs/features/environment-configuration/override-system.adoc +436 -0
  20. data/docs/features/environment-configuration/size-limits.adoc +273 -0
  21. data/docs/features/index.adoc +173 -0
  22. data/docs/features/input-validation/index.adoc +521 -0
  23. data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
  24. data/docs/features/match-options/html-policies.adoc +312 -0
  25. data/docs/features/match-options/index.adoc +621 -0
  26. data/docs/getting-started/index.adoc +83 -0
  27. data/docs/getting-started/quick-start.adoc +76 -0
  28. data/docs/guides/choosing-configuration.adoc +689 -0
  29. data/docs/guides/index.adoc +181 -0
  30. data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
  31. data/docs/interfaces/index.adoc +101 -0
  32. data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
  33. data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
  34. data/docs/lychee.toml +65 -0
  35. data/docs/reference/cli-options.adoc +418 -0
  36. data/docs/reference/environment-variables.adoc +375 -0
  37. data/docs/reference/index.adoc +204 -0
  38. data/docs/reference/options-across-interfaces.adoc +417 -0
  39. data/docs/understanding/algorithms/dom-diff.adoc +389 -0
  40. data/docs/understanding/algorithms/index.adoc +314 -0
  41. data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
  42. data/docs/understanding/architecture.adoc +447 -0
  43. data/docs/understanding/comparison-pipeline.adoc +317 -0
  44. data/docs/understanding/formats/html.adoc +380 -0
  45. data/docs/understanding/formats/index.adoc +261 -0
  46. data/docs/understanding/formats/json.adoc +390 -0
  47. data/docs/understanding/formats/xml.adoc +366 -0
  48. data/docs/understanding/formats/yaml.adoc +504 -0
  49. data/docs/understanding/index.adoc +130 -0
  50. data/lib/canon/cli.rb +42 -1
  51. data/lib/canon/commands/diff_command.rb +108 -23
  52. data/lib/canon/comparison/compare_profile.rb +101 -0
  53. data/lib/canon/comparison/comparison_result.rb +41 -2
  54. data/lib/canon/comparison/html_comparator.rb +292 -71
  55. data/lib/canon/comparison/html_compare_profile.rb +117 -0
  56. data/lib/canon/comparison/match_options.rb +42 -4
  57. data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
  58. data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
  59. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
  60. data/lib/canon/comparison/xml_comparator.rb +695 -91
  61. data/lib/canon/comparison.rb +207 -2
  62. data/lib/canon/config/env_provider.rb +71 -0
  63. data/lib/canon/config/env_schema.rb +58 -0
  64. data/lib/canon/config/override_resolver.rb +55 -0
  65. data/lib/canon/config/type_converter.rb +59 -0
  66. data/lib/canon/config.rb +158 -29
  67. data/lib/canon/data_model.rb +29 -0
  68. data/lib/canon/diff/diff_classifier.rb +74 -14
  69. data/lib/canon/diff/diff_context_builder.rb +41 -0
  70. data/lib/canon/diff/diff_line.rb +18 -2
  71. data/lib/canon/diff/diff_node.rb +18 -3
  72. data/lib/canon/diff/diff_node_mapper.rb +71 -12
  73. data/lib/canon/diff/formatting_detector.rb +53 -0
  74. data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
  75. data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
  76. data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
  77. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
  78. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
  79. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
  80. data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
  81. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
  82. data/lib/canon/diff_formatter/debug_output.rb +7 -1
  83. data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
  84. data/lib/canon/diff_formatter/legend.rb +42 -0
  85. data/lib/canon/diff_formatter.rb +78 -9
  86. data/lib/canon/errors.rb +56 -0
  87. data/lib/canon/formatters/html_formatter_base.rb +35 -1
  88. data/lib/canon/formatters/json_formatter.rb +3 -0
  89. data/lib/canon/formatters/yaml_formatter.rb +3 -0
  90. data/lib/canon/html/data_model.rb +229 -0
  91. data/lib/canon/html.rb +9 -0
  92. data/lib/canon/options/cli_generator.rb +70 -0
  93. data/lib/canon/options/registry.rb +234 -0
  94. data/lib/canon/rspec_matchers.rb +34 -13
  95. data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
  96. data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
  97. data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
  98. data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
  99. data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
  100. data/lib/canon/tree_diff/core/matching.rb +241 -0
  101. data/lib/canon/tree_diff/core/node_signature.rb +164 -0
  102. data/lib/canon/tree_diff/core/node_weight.rb +135 -0
  103. data/lib/canon/tree_diff/core/tree_node.rb +450 -0
  104. data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
  105. data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
  106. data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
  107. data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
  108. data/lib/canon/tree_diff/operation_converter.rb +631 -0
  109. data/lib/canon/tree_diff/operations/operation.rb +92 -0
  110. data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
  111. data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
  112. data/lib/canon/tree_diff.rb +33 -0
  113. data/lib/canon/validators/json_validator.rb +3 -1
  114. data/lib/canon/validators/yaml_validator.rb +3 -1
  115. data/lib/canon/version.rb +1 -1
  116. data/lib/canon/xml/data_model.rb +22 -23
  117. data/lib/canon/xml/element_matcher.rb +128 -20
  118. data/lib/canon/xml/namespace_helper.rb +110 -0
  119. data/lib/canon.rb +3 -0
  120. metadata +81 -23
  121. data/_config.yml +0 -116
  122. data/docs/ADVANCED_TOPICS.adoc +0 -20
  123. data/docs/BASIC_USAGE.adoc +0 -16
  124. data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  125. data/docs/DIFF_ARCHITECTURE.adoc +0 -435
  126. data/docs/DIFF_FORMATTING.adoc +0 -540
  127. data/docs/FORMATS.adoc +0 -447
  128. data/docs/INPUT_VALIDATION.adoc +0 -477
  129. data/docs/MATCH_ARCHITECTURE.adoc +0 -463
  130. data/docs/MATCH_OPTIONS.adoc +0 -719
  131. data/docs/MODES.adoc +0 -432
  132. data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  133. data/docs/OPTIONS.adoc +0 -1387
  134. data/docs/PREPROCESSING.adoc +0 -491
  135. data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
  136. data/docs/UNDERSTANDING_CANON.adoc +0 -17
@@ -0,0 +1,234 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../comparison/match_options"
4
+
5
+ module Canon
6
+ module Options
7
+ # Centralized registry for all Canon options
8
+ # This is the SINGLE SOURCE OF TRUTH for option definitions
9
+ # All interfaces (CLI, Ruby API, RSpec) auto-generate from this registry
10
class Registry
  # Format variants that reuse another format's option definitions.
  # Every :applies_to list in this registry names only :xml, :html, :json
  # and :yaml, so HTML variants are looked up under :html.
  FORMAT_ALIASES = { html4: :html, html5: :html }.freeze

  class << self
    # Get all option definitions.
    #
    # The list is built once and memoized; the outer array is frozen.
    #
    # @return [Array<Hash>] option definition hashes
    def all_options
      @all_options ||= [
        preprocessing_option,
        diff_algorithm_option,
        diff_mode_option,
        *match_dimension_options,
        match_profile_option,
        *diff_formatting_options,
      ].freeze
    end

    # Get options applicable to a specific format.
    #
    # @param format [Symbol, String, nil] format name; strings and the
    #   aliases in FORMAT_ALIASES are accepted
    # @return [Array<Hash>] options whose :applies_to is nil or includes
    #   the (canonical) format
    def options_for_format(format)
      target = canonical_format(format)
      all_options.select do |opt|
        opt[:applies_to].nil? || opt[:applies_to].include?(target)
      end
    end

    # Validate an options hash against the registry.
    #
    # @param opts [Hash, nil] options keyed by option name; nil or empty
    #   means there is nothing to validate
    # @param format [Symbol, String] format the options are used with
    # @return [nil] when every key is a known option for the format
    # @raise [Canon::Error] when one or more keys are not valid
    def validate_options!(opts, format)
      return if opts.nil? || opts.empty?

      valid_option_names = options_for_format(format).map { |o| o[:name] }
      invalid = opts.keys - valid_option_names
      return if invalid.empty?

      raise Canon::Error,
            "Invalid options for #{format}: #{invalid.join(', ')}"
    end

    # Get the CLI flag name for an option.
    #
    # @param option_name [Symbol, String] option name
    # @return [String, nil] the flag, or nil for an unknown option
    def cli_flag_for(option_name)
      find_option(option_name)&.dig(:cli_flag)
    end

    # Get the default value for an option.
    #
    # A format-specific default wins over the generic one. The format is
    # tried exactly as given first and then in its canonical form, so an
    # explicit entry for a variant is never shadowed by its alias.
    #
    # @param option_name [Symbol, String] option name
    # @param format [Symbol, String, nil] optional format
    # @return [Object, nil] the default, or nil for an unknown option
    def default_for(option_name, format = nil)
      opt = find_option(option_name)
      return nil unless opt

      overrides = opt[:format_defaults]
      if format && overrides
        [format, canonical_format(format)].each do |key|
          return overrides[key] if overrides.key?(key)
        end
      end

      opt[:default]
    end

    private

    # Look up a single option definition by name.
    #
    # @param option_name [Symbol, String] option name
    # @return [Hash, nil] the definition, or nil when not registered
    def find_option(option_name)
      wanted = option_name.respond_to?(:to_sym) ? option_name.to_sym : option_name
      all_options.find { |o| o[:name] == wanted }
    end

    # Map a format name onto the symbol used in :applies_to lists.
    #
    # @param format [Symbol, String, nil] format name
    # @return [Symbol, nil, Object] canonical symbol; values that cannot
    #   be converted to a symbol (such as nil) are returned unchanged
    def canonical_format(format)
      return format unless format.respond_to?(:to_sym)

      key = format.to_sym
      FORMAT_ALIASES.fetch(key, key)
    end

    # Preprocessing option
    def preprocessing_option
      {
        name: :preprocessing,
        type: :enum,
        values: %w[none c14n normalize format],
        default: :none,
        cli_flag: "--preprocessing",
        description: "Preprocessing: none, c14n, normalize, or format",
        applies_to: %i[xml html json yaml],
      }
    end

    # Diff algorithm option
    def diff_algorithm_option
      {
        name: :diff_algorithm,
        type: :enum,
        values: %w[dom semantic],
        default: :dom,
        cli_flag: "--diff-algorithm",
        aliases: ["-a"],
        description: "Diff algorithm: dom (positional) or semantic (tree-based)",
        applies_to: %i[xml html json yaml],
      }
    end

    # Diff mode option (replaces --by-line flag); HTML defaults to by_line
    def diff_mode_option
      {
        name: :diff_mode,
        type: :enum,
        values: %w[by_line by_object],
        default: :by_object,
        format_defaults: {
          html: :by_line,
        },
        cli_flag: "--diff-mode",
        description: "Diff output mode: by_line or by_object",
        applies_to: %i[xml html json yaml],
      }
    end

    # Match profile option; valid values come from MatchOptions
    def match_profile_option
      {
        name: :match_profile,
        type: :enum,
        values: Canon::Comparison::MatchOptions::MATCH_PROFILES.keys.map(&:to_s),
        default: nil,
        cli_flag: "--match-profile",
        aliases: ["-p"],
        description: "Match profile: strict, rendered, spec_friendly, or content_only",
        applies_to: %i[xml html json yaml],
      }
    end

    # Match dimension options (one per MatchOptions::MATCH_DIMENSIONS entry)
    def match_dimension_options
      Canon::Comparison::MatchOptions::MATCH_DIMENSIONS.map do |dim|
        behaviors = behaviors_for_dimension(dim)
        {
          name: dim,
          type: :enum,
          values: behaviors,
          default: nil,
          format_defaults: format_defaults_for_dimension(dim),
          cli_flag: "--#{dim.to_s.tr('_', '-')}",
          description: "#{dimension_description(dim)}: #{behaviors.join(', ')}",
          applies_to: applicable_formats_for_dimension(dim),
        }
      end
    end

    # Diff formatting options
    def diff_formatting_options
      [
        {
          name: :color,
          type: :boolean,
          default: true,
          cli_flag: "--color",
          description: "Colorize diff output",
          applies_to: %i[xml html json yaml],
        },
        {
          name: :verbose,
          type: :boolean,
          default: false,
          cli_flag: "--verbose",
          aliases: ["-v"],
          description: "Show detailed differences",
          applies_to: %i[xml html json yaml],
        },
        {
          name: :context_lines,
          type: :numeric,
          default: 3,
          cli_flag: "--context-lines",
          description: "Number of context lines around changes",
          applies_to: %i[xml html json yaml],
        },
        {
          name: :diff_grouping_lines,
          type: :numeric,
          default: nil,
          cli_flag: "--diff-grouping-lines",
          description: "Group diffs within N lines into context blocks",
          applies_to: %i[xml html json yaml],
        },
      ]
    end

    # Get valid behaviors for a dimension.
    # Ordering and element dimensions are two-valued; all others also
    # accept "normalize".
    def behaviors_for_dimension(dimension)
      case dimension
      when :key_order, :attribute_order,
           :element_structure, :element_position, :element_hierarchy
        %w[strict ignore]
      else
        %w[strict normalize ignore]
      end
    end

    # Get format defaults for a dimension from MatchOptions.
    # Formats that define no value for the dimension are omitted.
    def format_defaults_for_dimension(dimension)
      Canon::Comparison::MatchOptions::FORMAT_DEFAULTS
        .transform_values { |v| v[dimension] }
        .compact
    end

    # Get applicable formats for a dimension
    def applicable_formats_for_dimension(dimension)
      case dimension
      when :attribute_whitespace, :attribute_order, :attribute_values
        %i[xml html]
      when :key_order
        %i[json yaml]
      else
        %i[xml html json yaml]
      end
    end

    # Get human-readable description for a dimension.
    # Unknown dimensions fall back to the name with underscores replaced
    # by spaces and the first letter capitalized.
    def dimension_description(dimension)
      case dimension
      when :text_content
        "Text content matching"
      when :structural_whitespace
        "Structural whitespace matching"
      when :attribute_whitespace
        "Attribute whitespace matching (XML/HTML only)"
      when :attribute_order
        "Attribute ordering (XML/HTML only)"
      when :attribute_values
        "Attribute value matching (XML/HTML only)"
      when :key_order
        "Key ordering (JSON/YAML only)"
      when :comments
        "Comment matching"
      when :element_structure
        "Element type/structure matching (semantic diff)"
      when :element_position
        "Element position/order matching (semantic diff)"
      when :element_hierarchy
        "Element hierarchy/parent-child matching (semantic diff)"
      else
        dimension.to_s.tr("_", " ").capitalize
      end
    end
  end
end
233
+ end
234
+ end
@@ -44,12 +44,23 @@ module Canon
44
44
  # This is a THIN WRAPPER around Canon::Comparison API
45
45
  class SerializationMatcher
46
46
  def initialize(expected, format = nil, match_profile: nil,
47
- match: nil, preprocessing: nil)
47
+ match: nil, preprocessing: nil, diff_algorithm: nil,
48
+ show_diffs: nil)
48
49
  @expected = expected
49
50
  @format = format&.to_sym
50
51
  @match_profile = match_profile
51
52
  @match = match
52
53
  @preprocessing = preprocessing
54
+ @diff_algorithm = diff_algorithm
55
+ @show_diffs = show_diffs
56
+ end
57
+
58
+ # Chain method for controlling diff display
59
+ # @param value [Symbol, String] :all, :normative, or :informative
60
+ # @return [SerializationMatcher] self for chaining
61
+ def show_diffs(value)
62
+ @show_diffs = value.to_sym
63
+ self
53
64
  end
54
65
 
55
66
  def matches?(target)
@@ -134,6 +145,8 @@ module Canon
134
145
  opts[:match_profile] = @match_profile if @match_profile
135
146
  opts[:match] = @match if @match
136
147
  opts[:preprocessing] = @preprocessing if @preprocessing
148
+ opts[:diff_algorithm] = @diff_algorithm if @diff_algorithm
149
+ opts[:show_diffs] = @show_diffs if @show_diffs
137
150
 
138
151
  # Add global configuration from Canon::Config (lower priority)
139
152
  if @format
@@ -151,6 +164,8 @@ module Canon
151
164
  format_config.match.options
152
165
  end
153
166
  opts[:preprocessing] ||= format_config.preprocessing
167
+ # Add diff algorithm from config if not explicitly set
168
+ opts[:diff_algorithm] ||= format_config.diff.algorithm if format_config.diff.algorithm
154
169
  elsif !%i[xml html html4 html5 json yaml
155
170
  string].include?(@format)
156
171
  # Unsupported format - raise error early
@@ -211,27 +226,30 @@ module Canon
211
226
  # Matcher methods
212
227
  def be_serialization_equivalent_to(expected, format: :xml,
213
228
  match_profile: nil, match: nil,
214
- preprocessing: nil)
229
+ preprocessing: nil, diff_algorithm: nil)
215
230
  SerializationMatcher.new(expected, format,
216
231
  match_profile: match_profile,
217
232
  match: match,
218
- preprocessing: preprocessing)
233
+ preprocessing: preprocessing,
234
+ diff_algorithm: diff_algorithm)
219
235
  end
220
236
 
221
237
  def be_analogous_with(expected, match_profile: nil, match: nil,
222
- preprocessing: nil)
238
+ preprocessing: nil, diff_algorithm: nil)
223
239
  SerializationMatcher.new(expected, :xml,
224
240
  match_profile: match_profile,
225
241
  match: match,
226
- preprocessing: preprocessing)
242
+ preprocessing: preprocessing,
243
+ diff_algorithm: diff_algorithm)
227
244
  end
228
245
 
229
246
  def be_xml_equivalent_to(expected, match_profile: nil, match: nil,
230
- preprocessing: nil)
247
+ preprocessing: nil, diff_algorithm: nil)
231
248
  SerializationMatcher.new(expected, :xml,
232
249
  match_profile: match_profile,
233
250
  match: match,
234
- preprocessing: preprocessing)
251
+ preprocessing: preprocessing,
252
+ diff_algorithm: diff_algorithm)
235
253
  end
236
254
 
237
255
  def be_yaml_equivalent_to(expected)
@@ -243,27 +261,30 @@ module Canon
243
261
  end
244
262
 
245
263
  def be_html_equivalent_to(expected, match_profile: nil, match: nil,
246
- preprocessing: nil)
264
+ preprocessing: nil, diff_algorithm: nil)
247
265
  SerializationMatcher.new(expected, :html,
248
266
  match_profile: match_profile,
249
267
  match: match,
250
- preprocessing: preprocessing)
268
+ preprocessing: preprocessing,
269
+ diff_algorithm: diff_algorithm)
251
270
  end
252
271
 
253
272
  def be_html4_equivalent_to(expected, match_profile: nil, match: nil,
254
- preprocessing: nil)
273
+ preprocessing: nil, diff_algorithm: nil)
255
274
  SerializationMatcher.new(expected, :html4,
256
275
  match_profile: match_profile,
257
276
  match: match,
258
- preprocessing: preprocessing)
277
+ preprocessing: preprocessing,
278
+ diff_algorithm: diff_algorithm)
259
279
  end
260
280
 
261
281
  def be_html5_equivalent_to(expected, match_profile: nil, match: nil,
262
- preprocessing: nil)
282
+ preprocessing: nil, diff_algorithm: nil)
263
283
  SerializationMatcher.new(expected, :html5,
264
284
  match_profile: match_profile,
265
285
  match: match,
266
- preprocessing: preprocessing)
286
+ preprocessing: preprocessing,
287
+ diff_algorithm: diff_algorithm)
267
288
  end
268
289
 
269
290
  def be_equivalent_to(expected)
@@ -0,0 +1,316 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+
5
+ module Canon
6
+ module TreeDiff
7
+ module Adapters
8
+ # HTMLAdapter converts Nokogiri HTML documents to TreeNode structures
9
+ # and back, enabling semantic tree diffing on HTML documents.
10
+ #
11
+ # This adapter:
12
+ # - Converts Nokogiri::HTML::Document to TreeNode tree
13
+ # - Preserves element names, text content, and attributes
14
+ # - Handles HTML-specific elements (script, style, etc.)
15
+ # - Maintains document structure for round-trip conversion
16
+ #
17
+ # @example Convert HTML to TreeNode
18
+ # html = Nokogiri::HTML("<html><body><p>text</p></body></html>")
19
+ # adapter = HTMLAdapter.new
20
+ # tree = adapter.to_tree(html)
21
+ #
22
class HTMLAdapter
  # HTML elements where whitespace is semantically significant.
  WHITESPACE_SENSITIVE_TAGS = %w[pre code textarea script style].freeze

  attr_reader :match_options

  # Initialize adapter with match options
  #
  # @param match_options [Hash] Match options for text/attribute normalization
  def initialize(match_options: {})
    @match_options = match_options
  end

  # Convert Nokogiri HTML document/element or Canon::Xml::Node to TreeNode
  #
  # Canon node types are listed first so they are dispatched before any
  # Nokogiri class is consulted; the Nokogiri branches are the legacy path.
  #
  # @param node [Nokogiri::HTML::Document, Nokogiri::XML::Element, Nokogiri::HTML::DocumentFragment, Canon::Xml::Node] HTML node
  # @return [Core::TreeNode, nil] Root tree node, or nil when a document or
  #   root node has nothing to convert (or a text node is empty)
  # @raise [ArgumentError] when the node type is not supported
  def to_tree(node)
    case node
    when Canon::Xml::Nodes::RootNode
      to_tree_from_canon_root(node)
    when Canon::Xml::Nodes::ElementNode
      to_tree_from_canon_element(node)
    when Canon::Xml::Nodes::TextNode
      to_tree_from_canon_text(node)
    when Canon::Xml::Nodes::CommentNode
      to_tree_from_canon_comment(node)
    when Nokogiri::HTML::Document, Nokogiri::HTML4::Document, Nokogiri::HTML5::Document
      # Start from html element or root element
      root = node.at_css("html") || node.root
      root ? to_tree(root) : nil
    when Nokogiri::HTML4::DocumentFragment, Nokogiri::HTML5::DocumentFragment, Nokogiri::XML::DocumentFragment
      # For DocumentFragment, create a wrapper root node and add all fragment children
      convert_fragment(node)
    when Nokogiri::XML::Element
      convert_element(node)
    else
      raise ArgumentError, "Unsupported node type: #{node.class}"
    end
  end

  # Convert TreeNode back to Nokogiri HTML
  #
  # @param tree_node [Core::TreeNode] Root tree node
  # @param doc [Nokogiri::HTML::Document] Optional document to use
  # @return [Nokogiri::HTML::Document, Nokogiri::XML::Element] the document
  #   when the built element became its root, otherwise the bare element
  def from_tree(tree_node, doc = nil)
    doc ||= Nokogiri::HTML::Document.new

    element = build_element(tree_node, doc)

    if doc.root.nil?
      doc.root = element
      doc
    else
      element
    end
  end

  private

  # Convert a DocumentFragment to TreeNode
  # Creates a synthetic root node containing the fragment's children
  #
  # NOTE(review): only element children are converted, so text placed
  # directly at the top level of the fragment is not represented in the
  # tree - confirm this is intended.
  #
  # @param fragment [Nokogiri::HTML::DocumentFragment] HTML fragment
  # @return [Core::TreeNode] Root tree node labelled "fragment"
  def convert_fragment(fragment)
    root = Core::TreeNode.new(
      label: "fragment",
      value: nil,
      attributes: {},
      source_node: fragment,
    )

    fragment.element_children.each do |child|
      root.add_child(convert_element(child))
    end

    root
  end

  # Convert a Nokogiri element to TreeNode
  #
  # Attributes keep their original order. Namespace declarations are
  # filtered out because they are typically added by parsers or tools
  # (e.g. MS Word) and are not significant for HTML comparison.
  #
  # @param element [Nokogiri::XML::Element] HTML element
  # @return [Core::TreeNode] Tree node
  def convert_element(element)
    attributes = {}
    element.attributes.each do |name, attr|
      next if namespace_declaration?(name)

      attributes[name] = attr.value
    end

    tree_node = Core::TreeNode.new(
      label: element.name.downcase, # Lowercase for HTML
      value: extract_text_value(element), # Direct text only, not from children
      attributes: attributes,
      source_node: element,
    )

    element.element_children.each do |child|
      tree_node.add_child(convert_element(child))
    end

    tree_node
  end

  # Check whether an attribute name is a namespace declaration
  #
  # Only the exact name "xmlns" and names with the "xmlns:" prefix count.
  # An ordinary attribute whose name merely begins with the letters
  # "xmlns" is kept so that differences in it remain visible.
  #
  # @param name [String] Attribute name
  # @return [Boolean] True for "xmlns" and "xmlns:*"
  def namespace_declaration?(name)
    name == "xmlns" || name.start_with?("xmlns:")
  end

  # Extract direct text content from element
  #
  # The text is returned unstripped; the original comments state that
  # normalization is applied later, during comparison, based on
  # match_options.
  #
  # For mixed content (text nodes + child elements), joins text nodes
  # with a space so text does not run together when elements like <br/>
  # separate it ("Text<br/>More" becomes "Text More"). Whitespace-sensitive
  # elements are joined without a separator to preserve exact whitespace.
  #
  # @param element [Nokogiri::XML::Element] HTML element
  # @return [String, nil] Text content, or nil when there is none
  def extract_text_value(element)
    text_nodes = element.children.select(&:text?)

    separator = if element.element_children.any? && !whitespace_sensitive?(element)
                  " "
                else
                  ""
                end
    text = text_nodes.map(&:text).join(separator)

    text.empty? ? nil : text
  end

  # Check if an element is whitespace-sensitive
  #
  # @param element [Nokogiri::XML::Element] Element to check
  # @return [Boolean] True if the element's lowercased name is one of
  #   WHITESPACE_SENSITIVE_TAGS
  def whitespace_sensitive?(element)
    return false unless element.respond_to?(:name)

    WHITESPACE_SENSITIVE_TAGS.include?(element.name.downcase)
  end

  # Build Nokogiri element from TreeNode
  #
  # NOTE(review): every tree node becomes an element named after its
  # label, so the "text" and "comment" nodes produced from Canon nodes
  # would be rebuilt as <text>/<comment> elements - confirm whether
  # round-tripping Canon-derived trees is expected to work.
  #
  # @param tree_node [Core::TreeNode] Tree node
  # @param doc [Nokogiri::HTML::Document] Document
  # @return [Nokogiri::XML::Element] HTML element
  def build_element(tree_node, doc)
    element = Nokogiri::XML::Element.new(tree_node.label, doc)

    tree_node.attributes.each do |name, value|
      element[name] = value
    end

    # Text is set before children are appended
    if tree_node.value && !tree_node.value.empty?
      element.content = tree_node.value
    end

    tree_node.children.each do |child|
      element.add_child(build_element(child, doc))
    end

    element
  end

  # Convert Canon::Xml::Nodes::RootNode to TreeNode
  #
  # @param root_node [Canon::Xml::Nodes::RootNode] Root node
  # @return [Core::TreeNode, nil] Tree for the first child, or nil when
  #   the root has no children
  def to_tree_from_canon_root(root_node)
    return nil if root_node.children.empty?

    to_tree(root_node.children.first)
  end

  # Convert Canon::Xml::Nodes::ElementNode to TreeNode
  #
  # @param element_node [Canon::Xml::Nodes::ElementNode] Element node
  # @return [Core::TreeNode] Tree node
  def to_tree_from_canon_element(element_node)
    tree_node = Core::TreeNode.new(
      label: element_node.name.downcase, # Lowercase for HTML
      value: nil, # Text lives in separate "text" child nodes on this path
      attributes: extract_canon_attributes(element_node),
      children: [],
      source_node: element_node, # Preserve reference to Canon node
    )

    element_node.children.each do |child|
      child_tree = to_tree(child)
      # to_tree returns nil for empty text nodes; skip those
      tree_node.add_child(child_tree) if child_tree
    end

    tree_node
  end

  # Convert Canon::Xml::Nodes::TextNode to TreeNode
  #
  # @param text_node [Canon::Xml::Nodes::TextNode] Text node
  # @return [Core::TreeNode, nil] Tree node or nil for empty text
  def to_tree_from_canon_text(text_node)
    text_value = text_node.value.to_s

    # Only truly empty text is dropped; whitespace is not stripped for HTML
    return nil if text_value.empty?

    Core::TreeNode.new(
      label: "text",
      value: text_value,
      attributes: {},
      children: [],
      source_node: text_node,
    )
  end

  # Convert Canon::Xml::Nodes::CommentNode to TreeNode
  #
  # @param comment_node [Canon::Xml::Nodes::CommentNode] Comment node
  # @return [Core::TreeNode] Tree node
  def to_tree_from_canon_comment(comment_node)
    Core::TreeNode.new(
      label: "comment",
      value: comment_node.value,
      attributes: {},
      children: [],
      source_node: comment_node,
    )
  end

  # Extract attributes from Canon::Xml::Nodes::ElementNode
  #
  # @param element_node [Canon::Xml::Nodes::ElementNode] Element node
  # @return [Hash] Attributes hash (preserves order, filters xmlns)
  def extract_canon_attributes(element_node)
    element_node.attribute_nodes.each_with_object({}) do |attr, attrs|
      # Skip namespace declarations, same rule as the Nokogiri path
      next if namespace_declaration?(attr.name)

      attrs[attr.name] = attr.value
    end
  end
end
314
+ end
315
+ end
316
+ end