canon 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +163 -67
  3. data/README.adoc +400 -7
  4. data/docs/Gemfile +9 -0
  5. data/docs/INDEX.adoc +99 -182
  6. data/docs/_config.yml +100 -0
  7. data/docs/advanced/diff-classification.adoc +547 -0
  8. data/docs/advanced/diff-pipeline.adoc +358 -0
  9. data/docs/advanced/index.adoc +214 -0
  10. data/docs/advanced/semantic-diff-report.adoc +390 -0
  11. data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
  12. data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
  13. data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
  14. data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
  15. data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
  16. data/docs/features/diff-formatting/display-filtering.adoc +472 -0
  17. data/docs/features/diff-formatting/index.adoc +140 -0
  18. data/docs/features/environment-configuration/index.adoc +327 -0
  19. data/docs/features/environment-configuration/override-system.adoc +436 -0
  20. data/docs/features/environment-configuration/size-limits.adoc +273 -0
  21. data/docs/features/index.adoc +173 -0
  22. data/docs/features/input-validation/index.adoc +521 -0
  23. data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
  24. data/docs/features/match-options/html-policies.adoc +312 -0
  25. data/docs/features/match-options/index.adoc +621 -0
  26. data/docs/getting-started/index.adoc +83 -0
  27. data/docs/getting-started/quick-start.adoc +76 -0
  28. data/docs/guides/choosing-configuration.adoc +689 -0
  29. data/docs/guides/index.adoc +181 -0
  30. data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
  31. data/docs/interfaces/index.adoc +101 -0
  32. data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
  33. data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
  34. data/docs/lychee.toml +65 -0
  35. data/docs/reference/cli-options.adoc +418 -0
  36. data/docs/reference/environment-variables.adoc +375 -0
  37. data/docs/reference/index.adoc +204 -0
  38. data/docs/reference/options-across-interfaces.adoc +417 -0
  39. data/docs/understanding/algorithms/dom-diff.adoc +389 -0
  40. data/docs/understanding/algorithms/index.adoc +314 -0
  41. data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
  42. data/docs/understanding/architecture.adoc +447 -0
  43. data/docs/understanding/comparison-pipeline.adoc +317 -0
  44. data/docs/understanding/formats/html.adoc +380 -0
  45. data/docs/understanding/formats/index.adoc +261 -0
  46. data/docs/understanding/formats/json.adoc +390 -0
  47. data/docs/understanding/formats/xml.adoc +366 -0
  48. data/docs/understanding/formats/yaml.adoc +504 -0
  49. data/docs/understanding/index.adoc +130 -0
  50. data/lib/canon/cli.rb +42 -1
  51. data/lib/canon/commands/diff_command.rb +108 -23
  52. data/lib/canon/comparison/compare_profile.rb +101 -0
  53. data/lib/canon/comparison/comparison_result.rb +41 -2
  54. data/lib/canon/comparison/html_comparator.rb +292 -71
  55. data/lib/canon/comparison/html_compare_profile.rb +117 -0
  56. data/lib/canon/comparison/match_options.rb +42 -4
  57. data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
  58. data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
  59. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
  60. data/lib/canon/comparison/xml_comparator.rb +695 -91
  61. data/lib/canon/comparison.rb +207 -2
  62. data/lib/canon/config/env_provider.rb +71 -0
  63. data/lib/canon/config/env_schema.rb +58 -0
  64. data/lib/canon/config/override_resolver.rb +55 -0
  65. data/lib/canon/config/type_converter.rb +59 -0
  66. data/lib/canon/config.rb +158 -29
  67. data/lib/canon/data_model.rb +29 -0
  68. data/lib/canon/diff/diff_classifier.rb +74 -14
  69. data/lib/canon/diff/diff_context_builder.rb +41 -0
  70. data/lib/canon/diff/diff_line.rb +18 -2
  71. data/lib/canon/diff/diff_node.rb +18 -3
  72. data/lib/canon/diff/diff_node_mapper.rb +71 -12
  73. data/lib/canon/diff/formatting_detector.rb +53 -0
  74. data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
  75. data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
  76. data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
  77. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
  78. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
  79. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
  80. data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
  81. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
  82. data/lib/canon/diff_formatter/debug_output.rb +7 -1
  83. data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
  84. data/lib/canon/diff_formatter/legend.rb +42 -0
  85. data/lib/canon/diff_formatter.rb +78 -9
  86. data/lib/canon/errors.rb +56 -0
  87. data/lib/canon/formatters/html_formatter_base.rb +35 -1
  88. data/lib/canon/formatters/json_formatter.rb +3 -0
  89. data/lib/canon/formatters/yaml_formatter.rb +3 -0
  90. data/lib/canon/html/data_model.rb +229 -0
  91. data/lib/canon/html.rb +9 -0
  92. data/lib/canon/options/cli_generator.rb +70 -0
  93. data/lib/canon/options/registry.rb +234 -0
  94. data/lib/canon/rspec_matchers.rb +34 -13
  95. data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
  96. data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
  97. data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
  98. data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
  99. data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
  100. data/lib/canon/tree_diff/core/matching.rb +241 -0
  101. data/lib/canon/tree_diff/core/node_signature.rb +164 -0
  102. data/lib/canon/tree_diff/core/node_weight.rb +135 -0
  103. data/lib/canon/tree_diff/core/tree_node.rb +450 -0
  104. data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
  105. data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
  106. data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
  107. data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
  108. data/lib/canon/tree_diff/operation_converter.rb +631 -0
  109. data/lib/canon/tree_diff/operations/operation.rb +92 -0
  110. data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
  111. data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
  112. data/lib/canon/tree_diff.rb +33 -0
  113. data/lib/canon/validators/json_validator.rb +3 -1
  114. data/lib/canon/validators/yaml_validator.rb +3 -1
  115. data/lib/canon/version.rb +1 -1
  116. data/lib/canon/xml/data_model.rb +22 -23
  117. data/lib/canon/xml/element_matcher.rb +128 -20
  118. data/lib/canon/xml/namespace_helper.rb +110 -0
  119. data/lib/canon.rb +3 -0
  120. metadata +81 -23
  121. data/_config.yml +0 -116
  122. data/docs/ADVANCED_TOPICS.adoc +0 -20
  123. data/docs/BASIC_USAGE.adoc +0 -16
  124. data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  125. data/docs/DIFF_ARCHITECTURE.adoc +0 -435
  126. data/docs/DIFF_FORMATTING.adoc +0 -540
  127. data/docs/FORMATS.adoc +0 -447
  128. data/docs/INPUT_VALIDATION.adoc +0 -477
  129. data/docs/MATCH_ARCHITECTURE.adoc +0 -463
  130. data/docs/MATCH_OPTIONS.adoc +0 -719
  131. data/docs/MODES.adoc +0 -432
  132. data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  133. data/docs/OPTIONS.adoc +0 -1387
  134. data/docs/PREPROCESSING.adoc +0 -491
  135. data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
  136. data/docs/UNDERSTANDING_CANON.adoc +0 -17
@@ -1,11 +1,16 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "nokogiri"
4
+ require_relative "../comparison" # Load base module with constants first
4
5
  require_relative "xml_comparator"
5
6
  require_relative "match_options"
6
7
  require_relative "comparison_result"
8
+ require_relative "compare_profile"
9
+ require_relative "html_compare_profile"
7
10
  require_relative "../diff/diff_node"
8
11
  require_relative "../diff/diff_classifier"
12
+ require_relative "strategies/match_strategy_factory"
13
+ require_relative "../html/data_model"
9
14
 
10
15
  module Canon
11
16
  module Comparison
@@ -59,24 +64,41 @@ module Canon
59
64
  global_options: opts[:global_options],
60
65
  )
61
66
 
67
+ # Parse nodes to detect HTML version before creating profile
68
+ # We need to parse early to know if we're dealing with HTML4 or HTML5
69
+ node1 = parse_node(html1, match_opts_hash[:preprocessing],
70
+ match_opts_hash)
71
+ node2 = parse_node(html2, match_opts_hash[:preprocessing],
72
+ match_opts_hash)
73
+
74
+ # Detect HTML version from parsed nodes
75
+ html_version = detect_html_version_from_node(node1)
76
+
77
+ # Create HTML-specific compare profile
78
+ compare_profile = HtmlCompareProfile.new(
79
+ match_opts_hash,
80
+ html_version: html_version,
81
+ )
82
+
62
83
  # Wrap in ResolvedMatchOptions for DiffClassifier
63
84
  match_opts = Canon::Comparison::ResolvedMatchOptions.new(
64
85
  match_opts_hash,
65
86
  format: :html,
87
+ compare_profile: compare_profile,
66
88
  )
67
89
 
68
90
  # Store resolved match options hash for use in comparison logic
69
91
  opts[:match_opts] = match_opts_hash
70
92
 
93
+ # Use tree diff if semantic_diff option is enabled
94
+ if match_opts.semantic_diff?
95
+ return perform_semantic_tree_diff(html1, html2, opts,
96
+ match_opts_hash)
97
+ end
98
+
71
99
  # Create child_opts with resolved options
72
100
  child_opts = opts.merge(child_opts)
73
101
 
74
- # Parse nodes if they are strings, applying preprocessing if needed
75
- node1 = parse_node(html1, match_opts_hash[:preprocessing],
76
- match_opts_hash)
77
- node2 = parse_node(html2, match_opts_hash[:preprocessing],
78
- match_opts_hash)
79
-
80
102
  # Serialize preprocessed nodes for diff display (avoid re-preprocessing)
81
103
  preprocessed_str1 = serialize_for_display(node1)
82
104
  preprocessed_str2 = serialize_for_display(node2)
@@ -86,11 +108,19 @@ module Canon
86
108
 
87
109
  # DocumentFragment nodes need special handling - compare their children
88
110
  # instead of the fragment nodes themselves
89
- if node1.is_a?(Nokogiri::HTML4::DocumentFragment) &&
90
- node2.is_a?(Nokogiri::HTML4::DocumentFragment)
91
- # Compare children of fragments
92
- children1 = node1.children.to_a
93
- children2 = node2.children.to_a
111
+ if (node1.is_a?(Nokogiri::HTML4::DocumentFragment) ||
112
+ node1.is_a?(Nokogiri::XML::DocumentFragment)) &&
113
+ (node2.is_a?(Nokogiri::HTML4::DocumentFragment) ||
114
+ node2.is_a?(Nokogiri::XML::DocumentFragment))
115
+ # Compare children of fragments - filter them first
116
+ all_children1 = node1.children.to_a
117
+ all_children2 = node2.children.to_a
118
+
119
+ # Filter children based on match options (e.g., ignore comments)
120
+ children1 = XmlComparator.send(:filter_children, all_children1,
121
+ opts)
122
+ children2 = XmlComparator.send(:filter_children, all_children2,
123
+ opts)
94
124
 
95
125
  if children1.length != children2.length
96
126
  result = Comparison::UNEQUAL_ELEMENTS
@@ -129,79 +159,205 @@ module Canon
129
159
  format: :html,
130
160
  html_version: detect_html_version_from_node(node1),
131
161
  match_options: match_opts_hash,
162
+ algorithm: :dom,
132
163
  )
164
+ elsif result != Comparison::EQUIVALENT && !differences.empty?
165
+ # Non-verbose mode: check equivalence
166
+ # If comparison found differences, classify them to determine if normative
167
+ classifier = Canon::Diff::DiffClassifier.new(match_opts)
168
+ classifier.classify_all(differences.select do |d|
169
+ d.is_a?(Canon::Diff::DiffNode)
170
+ end)
171
+ # Equivalent if no normative differences (matches semantic algorithm)
172
+ differences.none?(&:normative?)
133
173
  else
174
+ # Either equivalent or no differences tracked
134
175
  result == Comparison::EQUIVALENT
135
176
  end
136
177
  end
137
178
 
138
179
  private
139
180
 
181
+ # Perform semantic tree diff using SemanticTreeMatchStrategy
182
+ #
183
+ # @param html1 [String, Nokogiri::HTML::Document] First HTML
184
+ # @param html2 [String, Nokogiri::HTML::Document] Second HTML
185
+ # @param opts [Hash] Comparison options
186
+ # @param match_opts_hash [Hash] Resolved match options
187
+ # @return [Boolean, ComparisonResult] Result of tree diff comparison
188
+ def perform_semantic_tree_diff(html1, html2, opts, match_opts_hash)
189
+ # Parse to Canon::Xml::Node (preserves preprocessing)
190
+ # The HTML is parsed through Canon::Html::DataModel to get the Canon::Xml::Node structure
191
+ node1 = parse_node_for_semantic(html1,
192
+ match_opts_hash[:preprocessing])
193
+ node2 = parse_node_for_semantic(html2,
194
+ match_opts_hash[:preprocessing])
195
+
196
+ # Create strategy using factory
197
+ strategy = Strategies::MatchStrategyFactory.create(
198
+ format: :html,
199
+ match_options: match_opts_hash,
200
+ )
201
+
202
+ # Pass Canon::Xml::Node directly - adapter now handles it
203
+ differences = strategy.match(node1, node2)
204
+
205
+ # Return based on verbose mode
206
+ if opts[:verbose]
207
+ # Get preprocessed strings for display
208
+ preprocessed = strategy.preprocess_for_display(node1, node2)
209
+
210
+ # HTML version is not detected on this path; Canon nodes are always reported as HTML5
211
+ html_version = :html5
212
+
213
+ # Return ComparisonResult with strategy metadata
214
+ ComparisonResult.new(
215
+ differences: differences,
216
+ preprocessed_strings: preprocessed,
217
+ format: :html,
218
+ html_version: html_version,
219
+ match_options: match_opts_hash.merge(strategy.metadata),
220
+ algorithm: :semantic,
221
+ )
222
+ else
223
+ # Simple boolean result - equivalent if no normative differences
224
+ differences.none?(&:normative?)
225
+ end
226
+ end
227
+
228
+ # Parse node as fragment to preserve actual content
229
+ # Re-parses the content with Nokogiri::XML.fragment rather than an HTML4/HTML5 fragment parser
230
+ #
231
+ # @param node [String, Nokogiri node] Node to parse
232
+ # @param preprocessing [Symbol] Preprocessing mode
233
+ # @param match_opts [Hash] Match options
234
+ # @return [Nokogiri::XML::DocumentFragment] Parsed fragment
235
+ def parse_node_as_fragment(node, preprocessing = :none, match_opts = {})
236
+ # If already an XML fragment (no meta tags), return it
237
+ if node.is_a?(Nokogiri::XML::DocumentFragment)
238
+ return node
239
+ end
240
+
241
+ # Convert HTML fragments to string and re-parse as XML to remove phantom tags
242
+ # This handles cases where pre-parsed HTML4/HTML5 fragments have auto-inserted meta
243
+ html_string = if node.is_a?(Nokogiri::HTML4::DocumentFragment) ||
244
+ node.is_a?(Nokogiri::HTML5::DocumentFragment)
245
+ node.to_s # Use to_s to avoid re-inserting meta tags
246
+ elsif node.is_a?(String)
247
+ node
248
+ else
249
+ node.to_html
250
+ end
251
+
252
+ # Use XML fragment parser to preserve structure without auto-generated elements
253
+ # This avoids both HTML4's meta tag insertion and HTML5's tag stripping
254
+ # See: https://stackoverflow.com/questions/25998824/stop-nokogiri-from-adding-doctype-and-meta-tags
255
+ frag = Nokogiri::XML.fragment(html_string)
256
+
257
+ # Apply preprocessing if needed
258
+ if preprocessing == :rendered
259
+ normalize_html_style_script_comments(frag)
260
+ normalize_rendered_whitespace(frag, match_opts)
261
+ remove_whitespace_only_text_nodes(frag)
262
+ end
263
+
264
+ frag
265
+ end
266
+
267
+ # Parse HTML for semantic tree diff using Canon::Html::DataModel
268
+ # Returns Canon::Xml::Node for preprocessing preservation
269
+ #
270
+ # @param html [String, Object] HTML to parse
271
+ # @param preprocessing [Symbol] Preprocessing mode
272
+ # @return [Canon::Xml::Node] Parsed Canon node
273
+ def parse_node_for_semantic(html, preprocessing = :none)
274
+ # If already a Canon::Xml::Node, return as-is
275
+ return html if html.is_a?(Canon::Xml::Node)
276
+
277
+ # Convert to string if needed
278
+ html_string = if html.is_a?(String)
279
+ html
280
+ elsif html.respond_to?(:to_html)
281
+ html.to_html
282
+ elsif html.respond_to?(:to_s)
283
+ html.to_s
284
+ else
285
+ raise Canon::Error,
286
+ "Unable to convert HTML to string: #{html.class}"
287
+ end
288
+
289
+ # Strip DOCTYPE for consistent parsing
290
+ html_string = html_string.gsub(/<!DOCTYPE[^>]*>/i, "").strip
291
+
292
+ # Apply preprocessing to HTML string before parsing
293
+ processed_html = case preprocessing
294
+ when :normalize
295
+ # Normalize whitespace
296
+ html_string.lines.map(&:strip).reject(&:empty?).join("\n")
297
+ when :c14n
298
+ # Canonicalize
299
+ Canon::Xml::C14n.canonicalize(html_string,
300
+ with_comments: false)
301
+ when :format
302
+ # Pretty format
303
+ Canon.format(html_string, :html)
304
+ else
305
+ # :none or unrecognized
306
+ html_string
307
+ end
308
+
309
+ # Parse using Canon::Html::DataModel to get Canon::Xml::Node
310
+ # HTML parsing with proper HTML-specific handling
311
+ Canon::Html::DataModel.from_html(processed_html)
312
+ end
313
+
140
314
  # Parse a node from string or return as-is
141
315
  # Applies preprocessing transformation before parsing if specified
316
+ # For DOM comparison, returns Nokogiri nodes (not Canon::Xml::Node)
142
317
  def parse_node(node, preprocessing = :none, match_opts = {})
143
318
  # If already a Nokogiri node, check for incompatible XML documents
144
- # Only raise error for non-string incompatible formats
145
319
  unless node.is_a?(String)
146
320
  # Detect if this is an XML document (not HTML)
147
- # Strings are allowed since they can be wrapped/parsed as needed
148
321
  if is_xml_document?(node)
149
322
  raise Canon::CompareFormatMismatchError.new(:xml, :html)
150
323
  end
151
324
 
152
- # For :rendered preprocessing, apply normalization even to pre-parsed nodes
153
- if preprocessing == :rendered
154
- # If already a DocumentFragment with :rendered, just normalize it
155
- if node.is_a?(Nokogiri::HTML4::DocumentFragment) ||
156
- node.is_a?(Nokogiri::HTML5::DocumentFragment) ||
157
- node.is_a?(Nokogiri::XML::DocumentFragment)
158
- # Normalize whitespace directly without re-parsing
159
- normalize_html_style_script_comments(node)
160
- normalize_rendered_whitespace(node, match_opts)
161
- return node
325
+ # Normalize HTML documents to fragments to avoid DTD differences
326
+ # This ensures comparing string with document works correctly
327
+ if node.is_a?(Nokogiri::HTML::Document) ||
328
+ node.is_a?(Nokogiri::HTML4::Document) ||
329
+ node.is_a?(Nokogiri::HTML5::Document)
330
+ # Get root element and create fragment from its outer HTML
331
+ # This avoids DOCTYPE and other document-level nodes
332
+ root = node.at_css("html") || node.root
333
+ if root
334
+ node = Nokogiri::XML.fragment(root.to_html)
162
335
  end
336
+ end
163
337
 
164
- # Normalize whitespace directly without re-parsing
165
- normalize_html_style_script_comments(node)
166
- normalize_rendered_whitespace(node, match_opts)
167
- return node
338
+ # For :rendered preprocessing with Nokogiri nodes
339
+ if preprocessing == :rendered
340
+ # Normalize and return
341
+ frag = node.is_a?(Nokogiri::XML::DocumentFragment) ? node : Nokogiri::XML.fragment(node.to_html)
342
+ normalize_html_style_script_comments(frag)
343
+ normalize_rendered_whitespace(frag, match_opts)
344
+ remove_whitespace_only_text_nodes(frag)
345
+ return frag
168
346
  end
169
347
 
170
- # For other preprocessing, just return the node (including DocumentFragments)
348
+ # Return Nokogiri node (now normalized if it was a document)
171
349
  return node
172
350
  end
173
351
 
174
352
  # Check if string contains XML declaration but is actually HTML
175
- # Nokogiri::HTML4.to_s adds <?xml...?> but the content is still HTML
176
- # Check if this is actually HTML content after the declaration
177
- # Look for <html tag which indicates HTML
178
353
  if node.strip.start_with?("<?xml") && !node.match?(/<html[\s>]/i)
179
354
  # No <html> tag, this is likely pure XML
180
355
  raise Canon::CompareFormatMismatchError.new(:xml, :html)
181
356
  end
182
357
 
183
- # Has <?xml but also <html> tag, so it's HTML with XML declaration
184
- # (common output from Nokogiri::HTML4#to_s)
185
-
186
- # For :rendered preprocessing, handle separately to avoid double-parsing
187
- if preprocessing == :rendered
188
- # Check if this is a full HTML document or a fragment
189
- # Use full document parsing if it has <html> tag
190
- if node.match?(/<html[\s>]/i)
191
- doc = Nokogiri::HTML(node, &:noblanks)
192
- normalize_html_style_script_comments(doc)
193
- normalize_rendered_whitespace(doc, match_opts)
194
- remove_whitespace_only_text_nodes(doc)
195
- return doc
196
- else
197
- # Use fragment for partial HTML
198
- frag = Nokogiri::HTML4.fragment(node)
199
- normalize_html_style_script_comments(frag)
200
- normalize_rendered_whitespace(frag, match_opts)
201
- remove_whitespace_only_text_nodes(frag)
202
- return frag
203
- end
204
- end
358
+ # Strip DOCTYPE declarations from HTML strings
359
+ # This normalizes parsed HTML (which includes DOCTYPE) with raw HTML strings
360
+ node = node.gsub(/<!DOCTYPE[^>]*>/i, "").strip
205
361
 
206
362
  # Apply preprocessing to HTML string before parsing
207
363
  html_string = case preprocessing
@@ -216,15 +372,52 @@ module Canon
216
372
  # Pretty format the HTML
217
373
  Canon.format(node, :html)
218
374
  else
219
- # :none or unrecognized - use as-is
375
+ # :none, :rendered or unrecognized - use as-is
220
376
  node
221
377
  end
222
378
 
223
- # Use Nokogiri for HTML and normalize style/script comments
224
- # Use noblanks to prevent Nokogiri from adding structural whitespace
225
- doc = Nokogiri::HTML(html_string, &:noblanks)
226
- normalize_html_style_script_comments(doc)
227
- doc
379
+ # Parse as Nokogiri fragment for DOM comparison
380
+ # Use XML fragment parser to avoid auto-inserted meta tags
381
+ frag = Nokogiri::XML.fragment(html_string)
382
+
383
+ # Apply :rendered preprocessing if needed
384
+ if preprocessing == :rendered
385
+ normalize_html_style_script_comments(frag)
386
+ normalize_rendered_whitespace(frag, match_opts)
387
+ remove_whitespace_only_text_nodes(frag)
388
+ end
389
+
390
+ frag
391
+ end
392
+
393
+ # Normalize HTML comments within style and script tags for DataModel nodes
394
+ def normalize_html_style_script_comments_datamodel(root)
395
+ # Walk the tree to find style/script elements
396
+ find_and_normalize_style_script(root)
397
+ end
398
+
399
+ def find_and_normalize_style_script(node)
400
+ return unless node.respond_to?(:children)
401
+
402
+ node.children.each do |child|
403
+ next unless child.is_a?(Canon::Xml::Nodes::ElementNode)
404
+
405
+ # If this is a style or script element, normalize its text content
406
+ if %w[style script].include?(child.name.downcase)
407
+ # Get text children and remove HTML comments from them
408
+ child.children.each do |text_child|
409
+ next unless text_child.is_a?(Canon::Xml::Nodes::TextNode)
410
+
411
+ # Remove HTML comments from text content
412
+ normalized = text_child.value.gsub(/<!--.*?-->/m, "").strip
413
+ # Update the text value
414
+ text_child.instance_variable_set(:@value, normalized)
415
+ end
416
+ end
417
+
418
+ # Recursively process children
419
+ find_and_normalize_style_script(child)
420
+ end
228
421
  end
229
422
 
230
423
  # Detect HTML version from content
@@ -244,12 +437,12 @@ module Canon
244
437
  end
245
438
  end
246
439
 
247
- # Detect HTML version from Nokogiri node
440
+ # Detect HTML version from node
248
441
  #
249
- # @param node [Nokogiri::XML::Node] Nokogiri HTML node
442
+ # @param node [Canon::Xml::Node, Nokogiri::XML::Node] HTML node
250
443
  # @return [Symbol] :html5 or :html4
251
444
  def detect_html_version_from_node(node)
252
- # Check node type
445
+ # Check node type for Nokogiri
253
446
  if node.is_a?(Nokogiri::HTML5::Document) ||
254
447
  node.is_a?(Nokogiri::HTML5::DocumentFragment)
255
448
  :html5
@@ -257,20 +450,27 @@ module Canon
257
450
  node.is_a?(Nokogiri::HTML4::DocumentFragment)
258
451
  :html4
259
452
  else
260
- # Default to HTML4 for compatibility
261
- :html4
453
+ # Default to HTML5 for Canon::Xml::Node and unknown types
454
+ :html5
262
455
  end
263
456
  end
264
457
 
265
458
  # Serialize node to string for diff display
266
459
  # This ensures the displayed diff matches what was compared
267
460
  #
268
- # @param node [Nokogiri::HTML::Document] Parsed HTML node
461
+ # @param node [Canon::Xml::Node, Nokogiri::HTML::Document] Parsed node
269
462
  # @return [String] Serialized HTML string
270
463
  def serialize_for_display(node)
271
- # Get string representation with formatting for line-by-line diffs
272
- # Use to_html which preserves line structure for diff display
273
- node.to_html
464
+ # Use XmlComparator's serializer for Canon::Xml::Node
465
+ if node.is_a?(Canon::Xml::Node)
466
+ XmlComparator.send(:serialize_node_to_xml, node)
467
+ elsif node.respond_to?(:to_html)
468
+ node.to_html
469
+ elsif node.respond_to?(:to_xml)
470
+ node.to_xml
471
+ else
472
+ node.to_s
473
+ end
274
474
  end
275
475
 
276
476
  # Normalize HTML comments within style and script tags
@@ -301,14 +501,25 @@ module Canon
301
501
  #
302
502
  # @param doc [Nokogiri::XML::Node] Document or fragment to normalize
303
503
  # @param match_opts [Hash] Match options to respect during normalization
304
- def normalize_rendered_whitespace(doc, match_opts = {})
504
+ # @param compare_profile [HtmlCompareProfile] Optional profile for whitespace rules
505
+ def normalize_rendered_whitespace(doc, match_opts = {},
506
+ compare_profile = nil)
305
507
  # If text_content is :strict, don't normalize ANY text content
306
508
  # This allows users to explicitly request strict text matching
307
509
  return if match_opts[:text_content] == :strict
308
510
 
309
511
  # Elements where whitespace is significant - don't normalize
310
- # This is an HTML rendering rule, not a match option
311
- preserve_whitespace = %w[pre code textarea script style]
512
+ # Use profile if available, otherwise use default list
513
+ preserve_whitespace = if compare_profile.is_a?(HtmlCompareProfile)
514
+ # Profile handles HTML-specific whitespace rules
515
+ %w[pre code textarea script
516
+ style].select do |elem|
517
+ compare_profile.preserve_whitespace?(elem)
518
+ end
519
+ else
520
+ # Fallback to default list
521
+ %w[pre code textarea script style]
522
+ end
312
523
 
313
524
  # Walk all text nodes
314
525
  doc.xpath(".//text()").each do |text_node|
@@ -360,8 +571,18 @@ module Canon
360
571
  # Remove whitespace-only text nodes from the document
361
572
  # These are typically insignificant in HTML rendering (e.g., between
362
573
  # block elements)
574
+ #
575
+ # CRITICAL: Do NOT remove whitespace-only text nodes from whitespace-sensitive
576
+ # elements like <pre>, <code>, <textarea>, <script>, <style>
363
577
  def remove_whitespace_only_text_nodes(doc)
578
+ # Elements where whitespace is significant - don't remove whitespace-only nodes
579
+ preserve_whitespace = %w[pre code textarea script style]
580
+
364
581
  doc.xpath(".//text()").each do |text_node|
582
+ # CRITICAL: Skip if this text node is inside a whitespace-preserving element
583
+ parent = text_node.parent
584
+ next if ancestor_preserves_whitespace?(parent, preserve_whitespace)
585
+
365
586
  # Remove if the text is only whitespace (after normalization)
366
587
  if text_node.content.strip.empty?
367
588
  text_node.remove
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "compare_profile"
4
+
5
+ module Canon
6
+ module Comparison
7
+ # HtmlCompareProfile extends CompareProfile with HTML-specific comparison policies
8
+ #
9
+ # HTML has different semantics than XML:
10
+ # 1. Comments are presentational (default to :ignore unless explicitly :strict)
11
+ # 2. Whitespace preservation required in specific elements
12
+ # 3. Case sensitivity differs between HTML4 and HTML5
13
+ # 4. Self-closing tags handled differently
14
+ #
15
+ # This class provides HTML-specific policy decisions while maintaining
16
+ # the separation of concerns established by CompareProfile.
17
+ class HtmlCompareProfile < CompareProfile
18
+ attr_reader :html_version
19
+
20
+ # @param match_options [ResolvedMatchOptions, Hash] The match options to use
21
+ # @param html_version [Symbol] The HTML version (:html4 or :html5)
22
+ def initialize(match_options, html_version: :html5)
23
+ super(match_options)
24
+ @html_version = html_version
25
+ end
26
+
27
+ # Override for HTML-specific comment handling
28
+ #
29
+ # In HTML, comments are presentational content (not part of the DOM semantics)
30
+ # unless explicitly set to :strict. This differs from XML where comments
31
+ # may carry semantic meaning.
32
+ #
33
+ # HTML default for comments is :ignore, so comments don't affect equivalence
34
+ # unless the user explicitly sets comments: :strict
35
+ #
36
+ # @param dimension [Symbol] The match dimension to check
37
+ # @return [Boolean] true if differences affect equivalence
38
+ def affects_equivalence?(dimension)
39
+ # Comments in HTML: default is :ignore (presentational)
40
+ # Only affect equivalence if explicitly set to :strict
41
+ if dimension == :comments
42
+ # Check if comments key exists in options
43
+ if match_options.is_a?(Hash)
44
+ # If comments key doesn't exist, default to false (HTML default: ignore)
45
+ return false unless match_options.key?(:comments)
46
+
47
+ # If key exists, check if it's :strict
48
+ return match_options[:comments] == :strict
49
+ elsif match_options.respond_to?(:behavior_for)
50
+ behavior = behavior_for(dimension)
51
+ # In HTML, only :strict makes comments affect equivalence
52
+ return behavior == :strict
53
+ end
54
+ # Default: comments don't affect equivalence in HTML
55
+ return false
56
+ end
57
+
58
+ # All other dimensions use base class behavior
59
+ super
60
+ end
61
+
62
+ # Check if whitespace should be preserved for a given element
63
+ #
64
+ # HTML has specific elements where whitespace is significant:
65
+ # <pre>, <code>, <textarea>, <script>, <style>
66
+ #
67
+ # @param element_name [String] The element name to check
68
+ # @return [Boolean] true if whitespace should be preserved
69
+ def preserve_whitespace?(element_name)
70
+ whitespace_sensitive_elements.include?(element_name.to_s.downcase)
71
+ end
72
+
73
+ # Check if element names should be compared case-sensitively
74
+ #
75
+ # Policy: HTML4 names compare case-insensitively, HTML5 names case-sensitively (NOTE(review): HTML syntax is case-insensitive in both; confirm intent)
76
+ #
77
+ # @return [Boolean] true if case-sensitive comparison
78
+ def case_sensitive?
79
+ @html_version == :html5
80
+ end
81
+
82
+ private
83
+
84
+ # Elements where whitespace is semantically significant in HTML
85
+ # @return [Array<String>] List of element names
86
+ def whitespace_sensitive_elements
87
+ %w[pre code textarea script style]
88
+ end
89
+
90
+ # Check if a dimension is explicitly set to :strict
91
+ # @param dimension [Symbol] The match dimension
92
+ # @return [Boolean] true if explicitly :strict
93
+ def explicitly_strict?(dimension)
94
+ behavior_for(dimension) == :strict
95
+ end
96
+
97
+ # Check if an option was explicitly provided in match_options
98
+ # @param dimension [Symbol] The match dimension
99
+ # @return [Boolean] true if option was explicitly set
100
+ def has_explicit_option?(dimension)
101
+ if match_options.is_a?(Hash)
102
+ match_options.key?(dimension)
103
+ elsif match_options.respond_to?(:[])
104
+ # For other indexable options (e.g. ResolvedMatchOptions), count the option as set unless the lookup raises
105
+ begin
106
+ match_options[dimension]
107
+ true
108
+ rescue StandardError
109
+ false
110
+ end
111
+ else
112
+ false
113
+ end
114
+ end
115
+ end
116
+ end
117
+ end