canon 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +69 -92
  3. data/README.adoc +13 -13
  4. data/docs/.lycheeignore +69 -0
  5. data/docs/Gemfile +1 -0
  6. data/docs/_config.yml +90 -1
  7. data/docs/advanced/diff-classification.adoc +82 -2
  8. data/docs/advanced/extending-canon.adoc +193 -0
  9. data/docs/features/match-options/index.adoc +239 -1
  10. data/docs/internals/diffnode-enrichment.adoc +611 -0
  11. data/docs/internals/index.adoc +251 -0
  12. data/docs/lychee.toml +13 -6
  13. data/docs/understanding/architecture.adoc +749 -33
  14. data/docs/understanding/comparison-pipeline.adoc +122 -0
  15. data/lib/canon/cache.rb +129 -0
  16. data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
  17. data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
  18. data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
  19. data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
  20. data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
  21. data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
  22. data/lib/canon/comparison/dimensions/registry.rb +77 -0
  23. data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
  24. data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
  25. data/lib/canon/comparison/dimensions.rb +54 -0
  26. data/lib/canon/comparison/format_detector.rb +87 -0
  27. data/lib/canon/comparison/html_comparator.rb +70 -26
  28. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  29. data/lib/canon/comparison/html_parser.rb +80 -0
  30. data/lib/canon/comparison/json_comparator.rb +12 -0
  31. data/lib/canon/comparison/json_parser.rb +19 -0
  32. data/lib/canon/comparison/markup_comparator.rb +293 -0
  33. data/lib/canon/comparison/match_options/base_resolver.rb +150 -0
  34. data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
  35. data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
  36. data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
  37. data/lib/canon/comparison/match_options.rb +68 -463
  38. data/lib/canon/comparison/profile_definition.rb +149 -0
  39. data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
  40. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
  41. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  42. data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
  43. data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
  44. data/lib/canon/comparison/xml_comparator/child_comparison.rb +197 -0
  45. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
  46. data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
  47. data/lib/canon/comparison/xml_comparator/node_parser.rb +79 -0
  48. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +102 -0
  49. data/lib/canon/comparison/xml_comparator.rb +97 -684
  50. data/lib/canon/comparison/xml_node_comparison.rb +319 -0
  51. data/lib/canon/comparison/xml_parser.rb +19 -0
  52. data/lib/canon/comparison/yaml_comparator.rb +3 -3
  53. data/lib/canon/comparison.rb +265 -110
  54. data/lib/canon/diff/diff_classifier.rb +101 -2
  55. data/lib/canon/diff/diff_node.rb +32 -2
  56. data/lib/canon/diff/formatting_detector.rb +1 -1
  57. data/lib/canon/diff/node_serializer.rb +191 -0
  58. data/lib/canon/diff/path_builder.rb +143 -0
  59. data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
  60. data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
  61. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
  62. data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
  63. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
  64. data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
  65. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
  66. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
  67. data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
  68. data/lib/canon/diff_formatter.rb +1 -1
  69. data/lib/canon/rspec_matchers.rb +38 -9
  70. data/lib/canon/tree_diff/operation_converter.rb +92 -338
  71. data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
  72. data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
  73. data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
  74. data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
  75. data/lib/canon/version.rb +1 -1
  76. data/lib/canon/xml/data_model.rb +24 -13
  77. metadata +48 -2
@@ -7,11 +7,15 @@ require_relative "comparison/xml_comparator"
7
7
  require_relative "comparison/html_comparator"
8
8
  require_relative "comparison/json_comparator"
9
9
  require_relative "comparison/yaml_comparator"
10
+ require_relative "comparison/profile_definition"
11
+ require_relative "comparison/format_detector"
12
+ require_relative "comparison/html_parser"
10
13
  require_relative "diff/diff_node_mapper"
11
14
  require_relative "diff/diff_line"
12
15
  require_relative "diff/diff_block_builder"
13
16
  require_relative "diff/diff_context_builder"
14
17
  require_relative "diff/diff_report_builder"
18
+ require_relative "cache"
15
19
 
16
20
  module Canon
17
21
  # Comparison module for XML, HTML, JSON, and YAML documents
@@ -36,25 +40,36 @@ module Canon
36
40
  # == Comparison Options
37
41
  #
38
42
  # Common options across all formats:
39
- # - collapse_whitespace: Normalize whitespace in text (default: true)
40
- # - ignore_attr_order: Ignore attribute/key ordering (default: true)
41
- # - ignore_comments: Skip comment nodes (default: true)
42
- # - ignore_text_nodes: Skip all text content (default: false)
43
- # - ignore_children: Skip child nodes (default: false)
43
+ # - profile: Comparison profile (Symbol for preset, Hash for custom)
44
+ # * Presets: :strict, :rendered, :html4, :html5, :spec_friendly, :content_only
45
+ # * Custom: { text_content: :normalize, comments: :ignore, ... }
46
+ # - diff_algorithm: Algorithm to use (:dom or :semantic, default: :dom)
44
47
  # - verbose: Return detailed diff array (default: false)
45
48
  #
46
49
  # == Usage Examples
47
50
  #
48
- # # XML comparison
51
+ # # XML comparison with default profile
49
52
  # Canon::Comparison.equivalent?(xml1, xml2)
50
- # Canon::Comparison.equivalent?(xml1, xml2, verbose: true)
51
53
  #
52
- # # HTML comparison
53
- # Canon::Comparison.equivalent?(html1, html2, ignore_comments: true)
54
+ # # XML comparison with preset profile
55
+ # Canon::Comparison.equivalent?(xml1, xml2, profile: :strict)
56
+ # Canon::Comparison.equivalent?(xml1, xml2, profile: :spec_friendly)
54
57
  #
55
- # # JSON comparison
56
- # Canon::Comparison.equivalent?(json1, json2)
57
- # Canon::Comparison.equivalent?(hash1, hash2) # Pre-parsed objects
58
+ # # HTML comparison with custom inline profile
59
+ # Canon::Comparison.equivalent?(html1, html2,
60
+ # profile: { text_content: :normalize, comments: :ignore })
61
+ #
62
+ # # Define and use a custom profile
63
+ # Canon::Comparison.define_profile(:my_custom) do
64
+ # text_content :normalize
65
+ # comments :ignore
66
+ # preprocessing :rendered
67
+ # end
68
+ # Canon::Comparison.equivalent?(doc1, doc2, profile: :my_custom)
69
+ #
70
+ # # JSON comparison with semantic tree diff
71
+ # Canon::Comparison.equivalent?(json1, json2,
72
+ # diff_algorithm: :semantic, profile: :spec_friendly)
58
73
  #
59
74
  # # With detailed output
60
75
  # diffs = Canon::Comparison.equivalent?(doc1, doc2, verbose: true)
@@ -88,10 +103,11 @@ module Canon
88
103
  UNEQUAL_TEXT_CONTENTS = 9
89
104
  MISSING_HASH_KEY = 10
90
105
  UNEQUAL_HASH_VALUES = 11
91
- UNEQUAL_ARRAY_LENGTHS = 12
92
- UNEQUAL_ARRAY_ELEMENTS = 13
93
- UNEQUAL_TYPES = 14
94
- UNEQUAL_PRIMITIVES = 15
106
+ UNEQUAL_HASH_KEY_ORDER = 12
107
+ UNEQUAL_ARRAY_LENGTHS = 13
108
+ UNEQUAL_ARRAY_ELEMENTS = 14
109
+ UNEQUAL_TYPES = 15
110
+ UNEQUAL_PRIMITIVES = 16
95
111
 
96
112
  class << self
97
113
  # Auto-detect format and compare two objects
@@ -99,8 +115,10 @@ module Canon
99
115
  # @param obj1 [Object] First object to compare
100
116
  # @param obj2 [Object] Second object to compare
101
117
  # @param opts [Hash] Comparison options
118
+ # - :profile - Profile to use (Symbol for preset, Hash for custom)
102
119
  # - :format - Format hint (:xml, :html, :html4, :html5, :json, :yaml, :string)
103
120
  # - :diff_algorithm - Algorithm to use (:dom or :semantic)
121
+ # - :verbose - Return detailed diff array (default: false)
104
122
  # @return [Boolean, Array] true if equivalent, or array of diffs if verbose
105
123
  def equivalent?(obj1, obj2, opts = {})
106
124
  # Check if semantic tree diff is requested
@@ -113,6 +131,56 @@ module Canon
113
131
  dom_diff(obj1, obj2, opts)
114
132
  end
115
133
 
134
+ # Define a custom comparison profile with DSL syntax
135
+ #
136
+ # @param name [Symbol] Profile name
137
+ # @yield [ProfileDefinition] DSL block for defining profile
138
+ # @return [Symbol] Profile name
139
+ # @raise [ProfileError] if profile definition is invalid
140
+ #
141
+ # @example Define a custom profile
142
+ # Canon::Comparison.define_profile(:my_custom) do
143
+ # text_content :normalize
144
+ # comments :ignore
145
+ # preprocessing :rendered
146
+ # end
147
+ def define_profile(name, &block)
148
+ definition = ProfileDefinition.define(name, &block)
149
+
150
+ @custom_profiles ||= {}
151
+ @custom_profiles[name] = definition
152
+
153
+ name
154
+ end
155
+
156
+ # Load a profile (custom or preset)
157
+ #
158
+ # @param name [Symbol] Profile name
159
+ # @return [Hash] Profile settings
160
+ def load_profile(name)
161
+ # Check custom profiles first
162
+ if @custom_profiles&.key?(name)
163
+ return @custom_profiles[name].dup
164
+ end
165
+
166
+ # Fall back to presets - try Xml first (most common)
167
+ begin
168
+ MatchOptions::Xml.get_profile_options(name)
169
+ rescue Error
170
+ # Try other formats
171
+ MatchOptions::Json.get_profile_options(name)
172
+ end
173
+ end
174
+
175
+ # List all available profiles (custom + presets)
176
+ #
177
+ # @return [Array<Symbol>] Available profile names
178
+ def available_profiles
179
+ custom = @custom_profiles&.keys || []
180
+ presets = MatchOptions::Xml::MATCH_PROFILES.keys
181
+ (custom + presets).sort.uniq
182
+ end
183
+
116
184
  private
117
185
 
118
186
  # Perform semantic tree diff comparison
@@ -120,8 +188,8 @@ module Canon
120
188
  require_relative "tree_diff"
121
189
 
122
190
  # Detect format for both objects
123
- format1 = opts[:format] || detect_format(obj1)
124
- format2 = opts[:format] || detect_format(obj2)
191
+ format1 = opts[:format] || FormatDetector.detect(obj1)
192
+ format2 = opts[:format] || FormatDetector.detect(obj2)
125
193
 
126
194
  # Handle string format (plain text comparison) - semantic tree doesn't support it
127
195
  if format1 == :string
@@ -203,39 +271,141 @@ module Canon
203
271
  # @param opts [Hash] User options
204
272
  # @return [Hash] Resolved match options
205
273
  def resolve_match_options(format, opts)
274
+ # Process unified profile parameter first
275
+ processed_opts = process_profile_parameter(opts)
276
+
206
277
  case format
207
278
  when :xml, :html, :html4, :html5
208
279
  MatchOptions::Xml.resolve(
209
280
  format: format,
210
- match_profile: opts[:match_profile],
211
- match: opts[:match],
212
- preprocessing: opts[:preprocessing],
213
- global_profile: opts[:global_profile],
214
- global_options: opts[:global_options],
281
+ match_profile: processed_opts[:match_profile],
282
+ match: processed_opts[:match],
283
+ preprocessing: processed_opts[:preprocessing],
284
+ global_profile: processed_opts[:global_profile],
285
+ global_options: processed_opts[:global_options],
215
286
  )
216
287
  when :json
217
288
  MatchOptions::Json.resolve(
218
289
  format: format,
219
- match_profile: opts[:match_profile],
220
- match: opts[:match],
221
- preprocessing: opts[:preprocessing],
222
- global_profile: opts[:global_profile],
223
- global_options: opts[:global_options],
290
+ match_profile: processed_opts[:match_profile],
291
+ match: processed_opts[:match],
292
+ preprocessing: processed_opts[:preprocessing],
293
+ global_profile: processed_opts[:global_profile],
294
+ global_options: processed_opts[:global_options],
224
295
  )
225
296
  when :yaml
226
297
  MatchOptions::Yaml.resolve(
227
298
  format: format,
228
- match_profile: opts[:match_profile],
229
- match: opts[:match],
230
- preprocessing: opts[:preprocessing],
231
- global_profile: opts[:global_profile],
232
- global_options: opts[:global_options],
299
+ match_profile: processed_opts[:match_profile],
300
+ match: processed_opts[:match],
301
+ preprocessing: processed_opts[:preprocessing],
302
+ global_profile: processed_opts[:global_profile],
303
+ global_options: processed_opts[:global_options],
233
304
  )
234
305
  else
235
- opts[:match] || {}
306
+ processed_opts[:match] || {}
307
+ end
308
+ end
309
+
310
+ # Process unified profile parameter
311
+ #
312
+ # Converts the new :profile parameter into the legacy format expected
313
+ # by MatchOptions resolvers. Handles:
314
+ # - Symbol → preset profile (uses :match_profile)
315
+ # - Hash → custom profile (validates and uses :match)
316
+ #
317
+ # @param opts [Hash] Original user options
318
+ # @return [Hash] Processed options with legacy format
319
+ def process_profile_parameter(opts)
320
+ processed = opts.dup
321
+
322
+ # Handle unified :profile parameter
323
+ if opts.key?(:profile)
324
+ profile = opts[:profile]
325
+
326
+ case profile
327
+ when Symbol
328
+ # Preset profile name
329
+ processed[:match_profile] = profile
330
+ when Hash
331
+ # Inline custom profile - validate and use as :match
332
+ validate_custom_profile!(profile, format_from_opts(opts))
333
+ processed[:match] = profile
334
+ else
335
+ raise Canon::Error,
336
+ "Invalid profile type: #{profile.class}. " \
337
+ "Expected Symbol (preset name) or Hash (custom profile)."
338
+ end
339
+ end
340
+
341
+ processed
342
+ end
343
+
344
+ # Validate custom profile hash
345
+ #
346
+ # Ensures all dimensions and behaviors in a custom profile are valid.
347
+ # Uses ProfileDefinition validation logic.
348
+ #
349
+ # @param profile [Hash] Custom profile hash
350
+ # @param format [Symbol] Format type for validation context
351
+ # @raise [Canon::Error] if profile contains invalid dimensions or behaviors
352
+ def validate_custom_profile!(profile, format)
353
+ profile.each do |dimension, behavior|
354
+ # Skip preprocessing and special options
355
+ next if dimension == :preprocessing
356
+ next if dimension == :semantic_diff
357
+ next if dimension == :similarity_threshold
358
+
359
+ # Validate dimension is known
360
+ valid_dimensions = valid_dimensions_for_format(format)
361
+ unless valid_dimensions.include?(dimension)
362
+ raise Canon::Error,
363
+ "Unknown dimension: #{dimension}. " \
364
+ "Valid dimensions for #{format}: #{valid_dimensions.join(', ')}"
365
+ end
366
+
367
+ # Validate behavior is allowed for this dimension
368
+ valid_behaviors = ProfileDefinition::DIMENSION_BEHAVIORS[dimension]
369
+ if valid_behaviors && !valid_behaviors.include?(behavior)
370
+ raise Canon::Error,
371
+ "Invalid behavior '#{behavior}' for dimension '#{dimension}'. " \
372
+ "Valid behaviors: #{valid_behaviors.join(', ')}"
373
+ end
374
+
375
+ # Validate behavior is in general MATCH_BEHAVIORS
376
+ unless MatchOptions::MATCH_BEHAVIORS.include?(behavior)
377
+ raise Canon::Error,
378
+ "Unknown match behavior: #{behavior}. " \
379
+ "Valid behaviors: #{MatchOptions::MATCH_BEHAVIORS.join(', ')}"
380
+ end
236
381
  end
237
382
  end
238
383
 
384
+ # Get valid dimensions for a format
385
+ #
386
+ # @param format [Symbol] Format type
387
+ # @return [Array<Symbol>] Valid dimensions for the format
388
+ def valid_dimensions_for_format(format)
389
+ case format
390
+ when :xml, :html, :html4, :html5
391
+ MatchOptions::Xml::MATCH_DIMENSIONS
392
+ when :json
393
+ MatchOptions::Json::MATCH_DIMENSIONS
394
+ when :yaml
395
+ MatchOptions::Yaml::MATCH_DIMENSIONS
396
+ else
397
+ []
398
+ end
399
+ end
400
+
401
+ # Helper to extract format from opts for validation
402
+ #
403
+ # @param opts [Hash] User options
404
+ # @return [Symbol] Format type or :xml as default
405
+ def format_from_opts(opts)
406
+ opts[:format] || :xml
407
+ end
408
+
239
409
  # Parse documents using comparator's parse logic (reuses preprocessing)
240
410
  #
241
411
  # @param obj1 [Object] First object
@@ -250,32 +420,66 @@ module Canon
250
420
  when :xml
251
421
  # Delegate to XmlComparator's parse_node - returns Canon::Xml::Node
252
422
  # Adapter now handles Canon::Xml::Node directly
253
- doc1 = XmlComparator.send(:parse_node, obj1, preprocessing)
254
- doc2 = XmlComparator.send(:parse_node, obj2, preprocessing)
423
+ doc1 = parse_with_cache(obj1, format, preprocessing) do |doc|
424
+ XmlComparator.send(:parse_node, doc, preprocessing)
425
+ end
426
+ doc2 = parse_with_cache(obj2, format, preprocessing) do |doc|
427
+ XmlComparator.send(:parse_node, doc, preprocessing)
428
+ end
255
429
  [doc1, doc2]
256
430
  when :html, :html4, :html5
257
431
  # Delegate to HtmlComparator's parse_node_for_semantic for Canon::Xml::Node
258
432
  [
259
- HtmlComparator.send(:parse_node_for_semantic, obj1, preprocessing),
260
- HtmlComparator.send(:parse_node_for_semantic, obj2, preprocessing),
433
+ parse_with_cache(obj1, format, preprocessing) do |doc|
434
+ HtmlComparator.send(:parse_node_for_semantic, doc, preprocessing)
435
+ end,
436
+ parse_with_cache(obj2, format, preprocessing) do |doc|
437
+ HtmlComparator.send(:parse_node_for_semantic, doc, preprocessing)
438
+ end,
261
439
  ]
262
440
  when :json
263
441
  # Delegate to JsonComparator's parse_json
264
442
  [
265
- JsonComparator.send(:parse_json, obj1),
266
- JsonComparator.send(:parse_json, obj2),
443
+ parse_with_cache(obj1, format, :none) do |doc|
444
+ JsonComparator.send(:parse_json, doc)
445
+ end,
446
+ parse_with_cache(obj2, format, :none) do |doc|
447
+ JsonComparator.send(:parse_json, doc)
448
+ end,
267
449
  ]
268
450
  when :yaml
269
451
  # Delegate to YamlComparator's parse_yaml
270
452
  [
271
- YamlComparator.send(:parse_yaml, obj1),
272
- YamlComparator.send(:parse_yaml, obj2),
453
+ parse_with_cache(obj1, format, :none) do |doc|
454
+ YamlComparator.send(:parse_yaml, doc)
455
+ end,
456
+ parse_with_cache(obj2, format, :none) do |doc|
457
+ YamlComparator.send(:parse_yaml, doc)
458
+ end,
273
459
  ]
274
460
  else
275
461
  [obj1, obj2]
276
462
  end
277
463
  end
278
464
 
465
+ # Parse a document with caching
466
+ #
467
+ # @param doc [Object] Document to parse (string or already parsed)
468
+ # @param format [Symbol] Document format
469
+ # @param preprocessing [Symbol] Preprocessing option
470
+ # @yield Block to parse the document if not cached
471
+ # @return [Object] Parsed document
472
+ def parse_with_cache(doc, format, preprocessing)
473
+ # If already a parsed node, return as-is
474
+ return doc unless doc.is_a?(String)
475
+
476
+ # Use cache for string documents
477
+ Cache.fetch(:document_parse,
478
+ Cache.key_for_document(doc, format, preprocessing)) do
479
+ yield doc
480
+ end
481
+ end
482
+
279
483
  # Normalize format for TreeDiff (html4/html5 -> html)
280
484
  #
281
485
  # @param format [Symbol] Original format
@@ -314,14 +518,14 @@ module Canon
314
518
  format1 = format2 = opts[:format]
315
519
  # Parse HTML strings if format is html/html4/html5
316
520
  if %i[html html4 html5].include?(opts[:format])
317
- obj1 = parse_html(obj1, opts[:format]) if obj1.is_a?(String)
318
- obj2 = parse_html(obj2, opts[:format]) if obj2.is_a?(String)
319
- # Normalize html4/html5 to html for comparison
320
- format1 = format2 = :html
521
+ obj1 = HtmlParser.parse(obj1, opts[:format]) if obj1.is_a?(String)
522
+ obj2 = HtmlParser.parse(obj2, opts[:format]) if obj2.is_a?(String)
523
+ # Note: We preserve html4/html5 format instead of normalizing to :html
524
+ # This allows HtmlComparator to use the correct parsing behavior
321
525
  end
322
526
  else
323
- format1 = detect_format(obj1)
324
- format2 = detect_format(obj2)
527
+ format1 = FormatDetector.detect(obj1)
528
+ format2 = FormatDetector.detect(obj2)
325
529
  end
326
530
 
327
531
  # Handle string format (plain text comparison)
@@ -357,7 +561,7 @@ module Canon
357
561
  case comparison_format
358
562
  when :xml
359
563
  XmlComparator.equivalent?(obj1, obj2, opts)
360
- when :html
564
+ when :html, :html4, :html5
361
565
  HtmlComparator.equivalent?(obj1, obj2, opts)
362
566
  when :json
363
567
  JsonComparator.equivalent?(obj1, obj2, opts)
@@ -366,78 +570,29 @@ module Canon
366
570
  end
367
571
  end
368
572
 
369
- # Parse HTML string into Nokogiri document
370
- #
371
- # @param content [String, Object] Content to parse (returns as-is if not a string)
372
- # @param format [Symbol] HTML format (:html, :html4, :html5)
373
- # @return [Nokogiri::HTML::Document, Nokogiri::HTML5::Document, Nokogiri::HTML::DocumentFragment, Object]
374
- def parse_html(content, _format)
375
- return content unless content.is_a?(String)
376
- return content if content.is_a?(Nokogiri::HTML::Document) ||
377
- content.is_a?(Nokogiri::HTML5::Document) ||
378
- content.is_a?(Nokogiri::XML::Document) ||
379
- content.is_a?(Nokogiri::HTML::DocumentFragment) ||
380
- content.is_a?(Nokogiri::HTML5::DocumentFragment) ||
381
- content.is_a?(Nokogiri::XML::DocumentFragment)
382
-
383
- # Let HtmlComparator's parse_node handle parsing with preprocessing
384
- # For now, just return the string and let it be parsed by HtmlComparator
385
- content
386
- rescue StandardError
387
- content
388
- end
389
-
390
- # Detect the format of an object
573
+ # Detect the format of an object (delegates to FormatDetector)
391
574
  #
392
575
  # @param obj [Object] Object to detect format of
393
576
  # @return [Symbol] Format type
394
577
  def detect_format(obj)
395
- case obj
396
- when Moxml::Node, Moxml::Document
397
- :xml
398
- when Nokogiri::HTML::DocumentFragment, Nokogiri::HTML5::DocumentFragment
399
- # HTML DocumentFragments
400
- :html
401
- when Nokogiri::XML::DocumentFragment
402
- # XML DocumentFragments - check if it's actually HTML
403
- obj.document&.html? ? :html : :xml
404
- when Nokogiri::XML::Document, Nokogiri::XML::Node
405
- # Check if it's HTML by looking at the document type
406
- obj.html? ? :html : :xml
407
- when Nokogiri::HTML::Document, Nokogiri::HTML5::Document
408
- :html
409
- when String
410
- detect_string_format(obj)
411
- when Hash, Array
412
- # Raw Ruby objects (from parsed JSON/YAML)
413
- :ruby_object
414
- else
415
- raise Canon::Error, "Unknown format for object: #{obj.class}"
416
- end
578
+ FormatDetector.detect(obj)
417
579
  end
418
580
 
419
- # Detect the format of a string
581
+ # Detect the format of a string (delegates to FormatDetector)
420
582
  #
421
583
  # @param str [String] String to detect format of
422
584
  # @return [Symbol] Format type
423
585
  def detect_string_format(str)
424
- trimmed = str.strip
425
-
426
- # YAML indicators
427
- return :yaml if trimmed.start_with?("---")
428
- return :yaml if trimmed.match?(/^[a-zA-Z_]\w*:\s/)
429
-
430
- # JSON indicators
431
- return :json if trimmed.start_with?("{", "[")
432
-
433
- # HTML indicators
434
- return :html if trimmed.start_with?("<!DOCTYPE html", "<html", "<HTML")
435
-
436
- # XML indicators - must start with < and end with >
437
- return :xml if trimmed.start_with?("<") && trimmed.end_with?(">")
586
+ FormatDetector.detect_string(str)
587
+ end
438
588
 
439
- # Default to plain string for everything else
440
- :string
589
+ # Parse HTML string into Nokogiri document (delegates to HtmlParser)
590
+ #
591
+ # @param content [String, Object] Content to parse
592
+ # @param format [Symbol] HTML format (:html, :html4, :html5)
593
+ # @return [Object] Parsed document
594
+ def parse_html(content, format)
595
+ HtmlParser.parse(content, format)
441
596
  end
442
597
  end
443
598
  end
@@ -2,6 +2,7 @@
2
2
 
3
3
  require_relative "formatting_detector"
4
4
  require_relative "../comparison/compare_profile"
5
+ require_relative "../comparison/whitespace_sensitivity"
5
6
 
6
7
  module Canon
7
8
  module Diff
@@ -28,6 +29,28 @@ module Canon
28
29
  # @param diff_node [DiffNode] The diff node to classify
29
30
  # @return [DiffNode] The same diff node with normative/formatting attributes set
30
31
  def classify(diff_node)
32
+ # SPECIAL CASE: text_content with :normalize behavior
33
+ # When text_content is :normalize and the difference is formatting-only,
34
+ # it should be marked as non-normative (informative)
35
+ # This ensures that verbose and non-verbose modes give consistent results
36
+ #
37
+ # EXCEPTION: If the text node is inside a whitespace-sensitive element
38
+ # (like <pre>, <code>, <textarea> in HTML), don't apply formatting detection
39
+ # because whitespace should be preserved in these elements
40
+ #
41
+ # This check must come FIRST, before normative_dimension? is called,
42
+ # because normative_dimension? returns true for text_content: :normalize
43
+ # (since the dimension affects equivalence), which would prevent formatting
44
+ # detection from being applied.
45
+ if diff_node.dimension == :text_content &&
46
+ profile.send(:behavior_for, :text_content) == :normalize &&
47
+ !inside_whitespace_sensitive_element?(diff_node) &&
48
+ formatting_only_diff?(diff_node)
49
+ diff_node.formatting = true
50
+ diff_node.normative = false
51
+ return diff_node
52
+ end
53
+
31
54
  # FIRST: Determine if this dimension is normative based on CompareProfile
32
55
  # This respects the policy settings (strict/normalize/ignore)
33
56
  is_normative = profile.normative_dimension?(diff_node.dimension)
@@ -45,7 +68,7 @@ module Canon
45
68
  return diff_node
46
69
  end
47
70
 
48
- # Otherwise, use the normative determination from CompareProfile
71
+ # THIRD: Apply the normative determination from CompareProfile
49
72
  diff_node.formatting = false
50
73
  diff_node.normative = is_normative
51
74
 
@@ -65,10 +88,86 @@ module Canon
65
88
  # @param diff_node [DiffNode] The diff node to check
66
89
  # @return [Boolean] true if formatting-only
67
90
  def formatting_only_diff?(diff_node)
91
+ # Only apply formatting detection to actual text content differences
92
+ # If the nodes are not text nodes (e.g., element nodes), don't apply formatting detection
93
+ node1 = diff_node.node1
94
+ node2 = diff_node.node2
95
+
96
+ # Check if both nodes are text nodes
97
+ # If not, this is not a formatting-only difference
98
+ return false unless text_node?(node1) && text_node?(node2)
99
+
68
100
  text1 = extract_text_content(diff_node.node1)
69
101
  text2 = extract_text_content(diff_node.node2)
70
102
 
71
- FormattingDetector.formatting_only?(text1, text2)
103
+ # For text_content dimension, use normalized text comparison
104
+ # This handles cases like "" vs " " (both normalize to "")
105
+ if diff_node.dimension == :text_content
106
+ normalized_equivalent?(text1, text2)
107
+ else
108
+ FormattingDetector.formatting_only?(text1, text2)
109
+ end
110
+ end
111
+
112
+ # Check if two texts are equivalent after normalization
113
+ # This detects formatting-only differences where normalized texts match
114
+ # @param text1 [String, nil] First text
115
+ # @param text2 [String, nil] Second text
116
+ # @return [Boolean] true if normalized texts are equivalent
117
+ def normalized_equivalent?(text1, text2)
118
+ return false if text1.nil? && text2.nil?
119
+ return false if text1.nil? || text2.nil?
120
+
121
+ # Use MatchOptions.normalize_text for consistency
122
+ normalized1 = Canon::Comparison::MatchOptions.normalize_text(text1)
123
+ normalized2 = Canon::Comparison::MatchOptions.normalize_text(text2)
124
+
125
+ # If normalized texts are equivalent but originals are different,
126
+ # it's a formatting-only difference
127
+ normalized1 == normalized2 && text1 != text2
128
+ end
129
+
130
+ # Check if a node is a text node
131
+ # @param node [Object] The node to check
132
+ # @return [Boolean] true if the node is a text node
133
+ def text_node?(node)
134
+ return false if node.nil?
135
+
136
+ # Canon::Xml::Nodes::TextNode
137
+ return true if node.is_a?(Canon::Xml::Nodes::TextNode)
138
+
139
+ # Nokogiri text nodes (node_type returns integer constant like 3)
140
+ return true if node.respond_to?(:node_type) &&
141
+ node.node_type.is_a?(Integer) &&
142
+ node.node_type == Nokogiri::XML::Node::TEXT_NODE
143
+
144
+ # Moxml text nodes (node_type returns symbol)
145
+ return true if node.respond_to?(:node_type) && node.node_type == :text
146
+
147
+ # String
148
+ return true if node.is_a?(String)
149
+
150
+ # Test doubles or objects with text node-like interface
151
+ # Check if it has a value method (contains text content)
152
+ return true if node.respond_to?(:value)
153
+
154
+ false
155
+ end
156
+
157
+ # Check if the text node is inside a whitespace-sensitive element
158
+ # @param diff_node [DiffNode] The diff node to check
159
+ # @return [Boolean] true if inside a whitespace-sensitive element
160
+ def inside_whitespace_sensitive_element?(diff_node)
161
+ # Get the text node (not the parent element)
162
+ node = diff_node.node1 || diff_node.node2
163
+ return false unless node
164
+
165
+ # WhitespaceSensitivity.element_sensitive? expects a text node
166
+ # and checks its parent element
167
+ # We need to pass the full options structure with :match_opts key
168
+ opts = { match_opts: @match_options.options }
169
+
170
+ Canon::Comparison::WhitespaceSensitivity.element_sensitive?(node, opts)
72
171
  end
73
172
 
74
173
  # Extract text content from a node for formatting comparison