canon 0.1.7 → 0.1.8

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (144) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +25 -135
  3. data/README.adoc +13 -13
  4. data/docs/.lycheeignore +69 -0
  5. data/docs/advanced/extending-canon.adoc +193 -0
  6. data/docs/internals/diffnode-enrichment.adoc +611 -0
  7. data/docs/internals/index.adoc +251 -0
  8. data/docs/lychee.toml +13 -6
  9. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +250 -0
  10. data/docs/understanding/architecture.adoc +749 -33
  11. data/docs/understanding/comparison-pipeline.adoc +122 -0
  12. data/false_positive_analysis.txt +0 -0
  13. data/file1.html +1 -0
  14. data/file2.html +1 -0
  15. data/lib/canon/cache.rb +129 -0
  16. data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
  17. data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
  18. data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
  19. data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
  20. data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
  21. data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
  22. data/lib/canon/comparison/dimensions/registry.rb +77 -0
  23. data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
  24. data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
  25. data/lib/canon/comparison/dimensions.rb +54 -0
  26. data/lib/canon/comparison/format_detector.rb +86 -0
  27. data/lib/canon/comparison/html_comparator.rb +51 -18
  28. data/lib/canon/comparison/html_parser.rb +80 -0
  29. data/lib/canon/comparison/json_comparator.rb +12 -0
  30. data/lib/canon/comparison/json_parser.rb +19 -0
  31. data/lib/canon/comparison/markup_comparator.rb +293 -0
  32. data/lib/canon/comparison/match_options/base_resolver.rb +143 -0
  33. data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
  34. data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
  35. data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
  36. data/lib/canon/comparison/match_options.rb +68 -463
  37. data/lib/canon/comparison/profile_definition.rb +149 -0
  38. data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
  39. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
  40. data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
  41. data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
  42. data/lib/canon/comparison/xml_comparator/child_comparison.rb +189 -0
  43. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
  44. data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
  45. data/lib/canon/comparison/xml_comparator/node_parser.rb +74 -0
  46. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +95 -0
  47. data/lib/canon/comparison/xml_comparator.rb +52 -664
  48. data/lib/canon/comparison/xml_node_comparison.rb +297 -0
  49. data/lib/canon/comparison/xml_parser.rb +19 -0
  50. data/lib/canon/comparison/yaml_comparator.rb +3 -3
  51. data/lib/canon/comparison.rb +265 -110
  52. data/lib/canon/diff/diff_node.rb +32 -2
  53. data/lib/canon/diff/node_serializer.rb +191 -0
  54. data/lib/canon/diff/path_builder.rb +143 -0
  55. data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
  56. data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
  57. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
  58. data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
  59. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
  60. data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
  61. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
  62. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
  63. data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
  64. data/lib/canon/diff_formatter.rb +1 -1
  65. data/lib/canon/rspec_matchers.rb +1 -1
  66. data/lib/canon/tree_diff/operation_converter.rb +92 -338
  67. data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
  68. data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
  69. data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
  70. data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
  71. data/lib/canon/version.rb +1 -1
  72. data/old-docs/ADVANCED_TOPICS.adoc +20 -0
  73. data/old-docs/BASIC_USAGE.adoc +16 -0
  74. data/old-docs/CHARACTER_VISUALIZATION.adoc +567 -0
  75. data/old-docs/CLI.adoc +497 -0
  76. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
  77. data/old-docs/DIFF_ARCHITECTURE.adoc +435 -0
  78. data/old-docs/DIFF_FORMATTING.adoc +540 -0
  79. data/old-docs/DIFF_PARAMETERS.adoc +261 -0
  80. data/old-docs/DOM_DIFF.adoc +1017 -0
  81. data/old-docs/ENV_CONFIG.adoc +876 -0
  82. data/old-docs/FORMATS.adoc +867 -0
  83. data/old-docs/INPUT_VALIDATION.adoc +477 -0
  84. data/old-docs/MATCHER_BEHAVIOR.adoc +90 -0
  85. data/old-docs/MATCH_ARCHITECTURE.adoc +463 -0
  86. data/old-docs/MATCH_OPTIONS.adoc +912 -0
  87. data/old-docs/MODES.adoc +432 -0
  88. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
  89. data/old-docs/OPTIONS.adoc +1387 -0
  90. data/old-docs/PREPROCESSING.adoc +491 -0
  91. data/old-docs/README.old.adoc +2831 -0
  92. data/old-docs/RSPEC.adoc +814 -0
  93. data/old-docs/RUBY_API.adoc +485 -0
  94. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +646 -0
  95. data/old-docs/SEMANTIC_TREE_DIFF.adoc +765 -0
  96. data/old-docs/STRING_COMPARE.adoc +345 -0
  97. data/old-docs/TMP.adoc +3384 -0
  98. data/old-docs/TREE_DIFF.adoc +1080 -0
  99. data/old-docs/UNDERSTANDING_CANON.adoc +17 -0
  100. data/old-docs/VERBOSE.adoc +482 -0
  101. data/old-docs/VISUALIZATION_MAP.adoc +625 -0
  102. data/old-docs/WHITESPACE_TREATMENT.adoc +1155 -0
  103. data/scripts/analyze_current_state.rb +85 -0
  104. data/scripts/analyze_false_positives.rb +114 -0
  105. data/scripts/analyze_remaining_failures.rb +105 -0
  106. data/scripts/compare_current_failures.rb +95 -0
  107. data/scripts/compare_dom_tree_diff.rb +158 -0
  108. data/scripts/compare_failures.rb +151 -0
  109. data/scripts/debug_attribute_extraction.rb +66 -0
  110. data/scripts/debug_blocks_839.rb +115 -0
  111. data/scripts/debug_meta_matching.rb +52 -0
  112. data/scripts/debug_p_matching.rb +192 -0
  113. data/scripts/debug_signature_matching.rb +118 -0
  114. data/scripts/debug_sourcecode_124.rb +32 -0
  115. data/scripts/debug_whitespace_sensitive.rb +192 -0
  116. data/scripts/extract_false_positives.rb +138 -0
  117. data/scripts/find_actual_false_positives.rb +125 -0
  118. data/scripts/investigate_all_false_positives.rb +161 -0
  119. data/scripts/investigate_batch1.rb +127 -0
  120. data/scripts/investigate_classification.rb +150 -0
  121. data/scripts/investigate_classification_detailed.rb +190 -0
  122. data/scripts/investigate_common_failures.rb +342 -0
  123. data/scripts/investigate_false_negative.rb +80 -0
  124. data/scripts/investigate_false_positive.rb +83 -0
  125. data/scripts/investigate_false_positives.rb +227 -0
  126. data/scripts/investigate_false_positives_batch.rb +163 -0
  127. data/scripts/investigate_mixed_content.rb +125 -0
  128. data/scripts/investigate_remaining_16.rb +214 -0
  129. data/scripts/run_single_test.rb +29 -0
  130. data/scripts/test_all_false_positives.rb +95 -0
  131. data/scripts/test_attribute_details.rb +61 -0
  132. data/scripts/test_both_algorithms.rb +49 -0
  133. data/scripts/test_both_simple.rb +49 -0
  134. data/scripts/test_enhanced_semantic_output.rb +125 -0
  135. data/scripts/test_readme_examples.rb +131 -0
  136. data/scripts/test_semantic_tree_diff.rb +99 -0
  137. data/scripts/test_semantic_ux_improvements.rb +135 -0
  138. data/scripts/test_single_false_positive.rb +119 -0
  139. data/scripts/test_size_limits.rb +99 -0
  140. data/test_html_1.html +21 -0
  141. data/test_html_2.html +21 -0
  142. data/test_nokogiri.rb +33 -0
  143. data/test_normalize.rb +45 -0
  144. metadata +123 -2
@@ -7,11 +7,15 @@ require_relative "comparison/xml_comparator"
7
7
  require_relative "comparison/html_comparator"
8
8
  require_relative "comparison/json_comparator"
9
9
  require_relative "comparison/yaml_comparator"
10
+ require_relative "comparison/profile_definition"
11
+ require_relative "comparison/format_detector"
12
+ require_relative "comparison/html_parser"
10
13
  require_relative "diff/diff_node_mapper"
11
14
  require_relative "diff/diff_line"
12
15
  require_relative "diff/diff_block_builder"
13
16
  require_relative "diff/diff_context_builder"
14
17
  require_relative "diff/diff_report_builder"
18
+ require_relative "cache"
15
19
 
16
20
  module Canon
17
21
  # Comparison module for XML, HTML, JSON, and YAML documents
@@ -36,25 +40,36 @@ module Canon
36
40
  # == Comparison Options
37
41
  #
38
42
  # Common options across all formats:
39
- # - collapse_whitespace: Normalize whitespace in text (default: true)
40
- # - ignore_attr_order: Ignore attribute/key ordering (default: true)
41
- # - ignore_comments: Skip comment nodes (default: true)
42
- # - ignore_text_nodes: Skip all text content (default: false)
43
- # - ignore_children: Skip child nodes (default: false)
43
+ # - profile: Comparison profile (Symbol for preset, Hash for custom)
44
+ # * Presets: :strict, :rendered, :html4, :html5, :spec_friendly, :content_only
45
+ # * Custom: { text_content: :normalize, comments: :ignore, ... }
46
+ # - diff_algorithm: Algorithm to use (:dom or :semantic, default: :dom)
44
47
  # - verbose: Return detailed diff array (default: false)
45
48
  #
46
49
  # == Usage Examples
47
50
  #
48
- # # XML comparison
51
+ # # XML comparison with default profile
49
52
  # Canon::Comparison.equivalent?(xml1, xml2)
50
- # Canon::Comparison.equivalent?(xml1, xml2, verbose: true)
51
53
  #
52
- # # HTML comparison
53
- # Canon::Comparison.equivalent?(html1, html2, ignore_comments: true)
54
+ # # XML comparison with preset profile
55
+ # Canon::Comparison.equivalent?(xml1, xml2, profile: :strict)
56
+ # Canon::Comparison.equivalent?(xml1, xml2, profile: :spec_friendly)
54
57
  #
55
- # # JSON comparison
56
- # Canon::Comparison.equivalent?(json1, json2)
57
- # Canon::Comparison.equivalent?(hash1, hash2) # Pre-parsed objects
58
+ # # HTML comparison with custom inline profile
59
+ # Canon::Comparison.equivalent?(html1, html2,
60
+ # profile: { text_content: :normalize, comments: :ignore })
61
+ #
62
+ # # Define and use a custom profile
63
+ # Canon::Comparison.define_profile(:my_custom) do
64
+ # text_content :normalize
65
+ # comments :ignore
66
+ # preprocessing :rendered
67
+ # end
68
+ # Canon::Comparison.equivalent?(doc1, doc2, profile: :my_custom)
69
+ #
70
+ # # JSON comparison with semantic tree diff
71
+ # Canon::Comparison.equivalent?(json1, json2,
72
+ # diff_algorithm: :semantic, profile: :spec_friendly)
58
73
  #
59
74
  # # With detailed output
60
75
  # diffs = Canon::Comparison.equivalent?(doc1, doc2, verbose: true)
@@ -88,10 +103,11 @@ module Canon
88
103
  UNEQUAL_TEXT_CONTENTS = 9
89
104
  MISSING_HASH_KEY = 10
90
105
  UNEQUAL_HASH_VALUES = 11
91
- UNEQUAL_ARRAY_LENGTHS = 12
92
- UNEQUAL_ARRAY_ELEMENTS = 13
93
- UNEQUAL_TYPES = 14
94
- UNEQUAL_PRIMITIVES = 15
106
+ UNEQUAL_HASH_KEY_ORDER = 12
107
+ UNEQUAL_ARRAY_LENGTHS = 13
108
+ UNEQUAL_ARRAY_ELEMENTS = 14
109
+ UNEQUAL_TYPES = 15
110
+ UNEQUAL_PRIMITIVES = 16
95
111
 
96
112
  class << self
97
113
  # Auto-detect format and compare two objects
@@ -99,8 +115,10 @@ module Canon
99
115
  # @param obj1 [Object] First object to compare
100
116
  # @param obj2 [Object] Second object to compare
101
117
  # @param opts [Hash] Comparison options
118
+ # - :profile - Profile to use (Symbol for preset, Hash for custom)
102
119
  # - :format - Format hint (:xml, :html, :html4, :html5, :json, :yaml, :string)
103
120
  # - :diff_algorithm - Algorithm to use (:dom or :semantic)
121
+ # - :verbose - Return detailed diff array (default: false)
104
122
  # @return [Boolean, Array] true if equivalent, or array of diffs if verbose
105
123
  def equivalent?(obj1, obj2, opts = {})
106
124
  # Check if semantic tree diff is requested
@@ -113,6 +131,56 @@ module Canon
113
131
  dom_diff(obj1, obj2, opts)
114
132
  end
115
133
 
134
+ # Define a custom comparison profile with DSL syntax
135
+ #
136
+ # @param name [Symbol] Profile name
137
+ # @yield [ProfileDefinition] DSL block for defining profile
138
+ # @return [Symbol] Profile name
139
+ # @raise [ProfileError] if profile definition is invalid
140
+ #
141
+ # @example Define a custom profile
142
+ # Canon::Comparison.define_profile(:my_custom) do
143
+ # text_content :normalize
144
+ # comments :ignore
145
+ # preprocessing :rendered
146
+ # end
147
+ def define_profile(name, &block)
148
+ definition = ProfileDefinition.define(name, &block)
149
+
150
+ @custom_profiles ||= {}
151
+ @custom_profiles[name] = definition
152
+
153
+ name
154
+ end
155
+
156
+ # Load a profile (custom or preset)
157
+ #
158
+ # @param name [Symbol] Profile name
159
+ # @return [Hash] Profile settings
160
+ def load_profile(name)
161
+ # Check custom profiles first
162
+ if @custom_profiles&.key?(name)
163
+ return @custom_profiles[name].dup
164
+ end
165
+
166
+ # Fall back to presets - try Xml first (most common)
167
+ begin
168
+ MatchOptions::Xml.get_profile_options(name)
169
+ rescue Error
170
+ # XML lookup raised - fall back to the JSON presets
171
+ MatchOptions::Json.get_profile_options(name)
172
+ end
173
+ end
174
+
175
+ # List all available profile names (custom profiles + XML presets), sorted and de-duplicated
176
+ #
177
+ # @return [Array<Symbol>] Available profile names
178
+ def available_profiles
179
+ custom = @custom_profiles&.keys || []
180
+ presets = MatchOptions::Xml::MATCH_PROFILES.keys
181
+ (custom + presets).sort.uniq
182
+ end
183
+
116
184
  private
117
185
 
118
186
  # Perform semantic tree diff comparison
@@ -120,8 +188,8 @@ module Canon
120
188
  require_relative "tree_diff"
121
189
 
122
190
  # Detect format for both objects
123
- format1 = opts[:format] || detect_format(obj1)
124
- format2 = opts[:format] || detect_format(obj2)
191
+ format1 = opts[:format] || FormatDetector.detect(obj1)
192
+ format2 = opts[:format] || FormatDetector.detect(obj2)
125
193
 
126
194
  # Handle string format (plain text comparison) - semantic tree doesn't support it
127
195
  if format1 == :string
@@ -203,39 +271,141 @@ module Canon
203
271
  # @param opts [Hash] User options
204
272
  # @return [Hash] Resolved match options
205
273
  def resolve_match_options(format, opts)
274
+ # Process unified profile parameter first
275
+ processed_opts = process_profile_parameter(opts)
276
+
206
277
  case format
207
278
  when :xml, :html, :html4, :html5
208
279
  MatchOptions::Xml.resolve(
209
280
  format: format,
210
- match_profile: opts[:match_profile],
211
- match: opts[:match],
212
- preprocessing: opts[:preprocessing],
213
- global_profile: opts[:global_profile],
214
- global_options: opts[:global_options],
281
+ match_profile: processed_opts[:match_profile],
282
+ match: processed_opts[:match],
283
+ preprocessing: processed_opts[:preprocessing],
284
+ global_profile: processed_opts[:global_profile],
285
+ global_options: processed_opts[:global_options],
215
286
  )
216
287
  when :json
217
288
  MatchOptions::Json.resolve(
218
289
  format: format,
219
- match_profile: opts[:match_profile],
220
- match: opts[:match],
221
- preprocessing: opts[:preprocessing],
222
- global_profile: opts[:global_profile],
223
- global_options: opts[:global_options],
290
+ match_profile: processed_opts[:match_profile],
291
+ match: processed_opts[:match],
292
+ preprocessing: processed_opts[:preprocessing],
293
+ global_profile: processed_opts[:global_profile],
294
+ global_options: processed_opts[:global_options],
224
295
  )
225
296
  when :yaml
226
297
  MatchOptions::Yaml.resolve(
227
298
  format: format,
228
- match_profile: opts[:match_profile],
229
- match: opts[:match],
230
- preprocessing: opts[:preprocessing],
231
- global_profile: opts[:global_profile],
232
- global_options: opts[:global_options],
299
+ match_profile: processed_opts[:match_profile],
300
+ match: processed_opts[:match],
301
+ preprocessing: processed_opts[:preprocessing],
302
+ global_profile: processed_opts[:global_profile],
303
+ global_options: processed_opts[:global_options],
233
304
  )
234
305
  else
235
- opts[:match] || {}
306
+ processed_opts[:match] || {}
307
+ end
308
+ end
309
+
310
+ # Process unified profile parameter
311
+ #
312
+ # Converts the new :profile parameter into the legacy format expected
313
+ # by MatchOptions resolvers. Handles:
314
+ # - Symbol → preset profile (uses :match_profile)
315
+ # - Hash → custom profile (validates and uses :match)
316
+ #
317
+ # @param opts [Hash] Original user options
318
+ # @return [Hash] Processed options with legacy format
319
+ def process_profile_parameter(opts)
320
+ processed = opts.dup
321
+
322
+ # Handle unified :profile parameter
323
+ if opts.key?(:profile)
324
+ profile = opts[:profile]
325
+
326
+ case profile
327
+ when Symbol
328
+ # Preset profile name
329
+ processed[:match_profile] = profile
330
+ when Hash
331
+ # Inline custom profile - validate and use as :match
332
+ validate_custom_profile!(profile, format_from_opts(opts))
333
+ processed[:match] = profile
334
+ else
335
+ raise Canon::Error,
336
+ "Invalid profile type: #{profile.class}. " \
337
+ "Expected Symbol (preset name) or Hash (custom profile)."
338
+ end
339
+ end
340
+
341
+ processed
342
+ end
343
+
344
+ # Validate custom profile hash
345
+ #
346
+ # Ensures all dimensions and behaviors in a custom profile are valid.
347
+ # Uses ProfileDefinition validation logic.
348
+ #
349
+ # @param profile [Hash] Custom profile hash
350
+ # @param format [Symbol] Format type for validation context
351
+ # @raise [Canon::Error] if profile contains invalid dimensions or behaviors
352
+ def validate_custom_profile!(profile, format)
353
+ profile.each do |dimension, behavior|
354
+ # Skip preprocessing and special options
355
+ next if dimension == :preprocessing
356
+ next if dimension == :semantic_diff
357
+ next if dimension == :similarity_threshold
358
+
359
+ # Validate dimension is known
360
+ valid_dimensions = valid_dimensions_for_format(format)
361
+ unless valid_dimensions.include?(dimension)
362
+ raise Canon::Error,
363
+ "Unknown dimension: #{dimension}. " \
364
+ "Valid dimensions for #{format}: #{valid_dimensions.join(', ')}"
365
+ end
366
+
367
+ # Validate behavior is allowed for this dimension
368
+ valid_behaviors = ProfileDefinition::DIMENSION_BEHAVIORS[dimension]
369
+ if valid_behaviors && !valid_behaviors.include?(behavior)
370
+ raise Canon::Error,
371
+ "Invalid behavior '#{behavior}' for dimension '#{dimension}'. " \
372
+ "Valid behaviors: #{valid_behaviors.join(', ')}"
373
+ end
374
+
375
+ # Validate behavior is in general MATCH_BEHAVIORS
376
+ unless MatchOptions::MATCH_BEHAVIORS.include?(behavior)
377
+ raise Canon::Error,
378
+ "Unknown match behavior: #{behavior}. " \
379
+ "Valid behaviors: #{MatchOptions::MATCH_BEHAVIORS.join(', ')}"
380
+ end
236
381
  end
237
382
  end
238
383
 
384
+ # Get valid dimensions for a format
385
+ #
386
+ # @param format [Symbol] Format type
387
+ # @return [Array<Symbol>] Valid dimensions for the format
388
+ def valid_dimensions_for_format(format)
389
+ case format
390
+ when :xml, :html, :html4, :html5
391
+ MatchOptions::Xml::MATCH_DIMENSIONS
392
+ when :json
393
+ MatchOptions::Json::MATCH_DIMENSIONS
394
+ when :yaml
395
+ MatchOptions::Yaml::MATCH_DIMENSIONS
396
+ else
397
+ []
398
+ end
399
+ end
400
+
401
+ # Helper to extract format from opts for validation
402
+ #
403
+ # @param opts [Hash] User options
404
+ # @return [Symbol] Format type or :xml as default
405
+ def format_from_opts(opts)
406
+ opts[:format] || :xml
407
+ end
408
+
239
409
  # Parse documents using comparator's parse logic (reuses preprocessing)
240
410
  #
241
411
  # @param obj1 [Object] First object
@@ -250,32 +420,66 @@ module Canon
250
420
  when :xml
251
421
  # Delegate to XmlComparator's parse_node - returns Canon::Xml::Node
252
422
  # Adapter now handles Canon::Xml::Node directly
253
- doc1 = XmlComparator.send(:parse_node, obj1, preprocessing)
254
- doc2 = XmlComparator.send(:parse_node, obj2, preprocessing)
423
+ doc1 = parse_with_cache(obj1, format, preprocessing) do |doc|
424
+ XmlComparator.send(:parse_node, doc, preprocessing)
425
+ end
426
+ doc2 = parse_with_cache(obj2, format, preprocessing) do |doc|
427
+ XmlComparator.send(:parse_node, doc, preprocessing)
428
+ end
255
429
  [doc1, doc2]
256
430
  when :html, :html4, :html5
257
431
  # Delegate to HtmlComparator's parse_node_for_semantic for Canon::Xml::Node
258
432
  [
259
- HtmlComparator.send(:parse_node_for_semantic, obj1, preprocessing),
260
- HtmlComparator.send(:parse_node_for_semantic, obj2, preprocessing),
433
+ parse_with_cache(obj1, format, preprocessing) do |doc|
434
+ HtmlComparator.send(:parse_node_for_semantic, doc, preprocessing)
435
+ end,
436
+ parse_with_cache(obj2, format, preprocessing) do |doc|
437
+ HtmlComparator.send(:parse_node_for_semantic, doc, preprocessing)
438
+ end,
261
439
  ]
262
440
  when :json
263
441
  # Delegate to JsonComparator's parse_json
264
442
  [
265
- JsonComparator.send(:parse_json, obj1),
266
- JsonComparator.send(:parse_json, obj2),
443
+ parse_with_cache(obj1, format, :none) do |doc|
444
+ JsonComparator.send(:parse_json, doc)
445
+ end,
446
+ parse_with_cache(obj2, format, :none) do |doc|
447
+ JsonComparator.send(:parse_json, doc)
448
+ end,
267
449
  ]
268
450
  when :yaml
269
451
  # Delegate to YamlComparator's parse_yaml
270
452
  [
271
- YamlComparator.send(:parse_yaml, obj1),
272
- YamlComparator.send(:parse_yaml, obj2),
453
+ parse_with_cache(obj1, format, :none) do |doc|
454
+ YamlComparator.send(:parse_yaml, doc)
455
+ end,
456
+ parse_with_cache(obj2, format, :none) do |doc|
457
+ YamlComparator.send(:parse_yaml, doc)
458
+ end,
273
459
  ]
274
460
  else
275
461
  [obj1, obj2]
276
462
  end
277
463
  end
278
464
 
465
+ # Parse a document with caching
466
+ #
467
+ # @param doc [Object] Document to parse (string or already parsed)
468
+ # @param format [Symbol] Document format
469
+ # @param preprocessing [Symbol] Preprocessing option
470
+ # @yield Block to parse the document if not cached
471
+ # @return [Object] Parsed document
472
+ def parse_with_cache(doc, format, preprocessing)
473
+ # If already a parsed node, return as-is
474
+ return doc unless doc.is_a?(String)
475
+
476
+ # Use cache for string documents
477
+ Cache.fetch(:document_parse,
478
+ Cache.key_for_document(doc, format, preprocessing)) do
479
+ yield doc
480
+ end
481
+ end
482
+
279
483
  # Normalize format for TreeDiff (html4/html5 -> html)
280
484
  #
281
485
  # @param format [Symbol] Original format
@@ -314,14 +518,14 @@ module Canon
314
518
  format1 = format2 = opts[:format]
315
519
  # Parse HTML strings if format is html/html4/html5
316
520
  if %i[html html4 html5].include?(opts[:format])
317
- obj1 = parse_html(obj1, opts[:format]) if obj1.is_a?(String)
318
- obj2 = parse_html(obj2, opts[:format]) if obj2.is_a?(String)
319
- # Normalize html4/html5 to html for comparison
320
- format1 = format2 = :html
521
+ obj1 = HtmlParser.parse(obj1, opts[:format]) if obj1.is_a?(String)
522
+ obj2 = HtmlParser.parse(obj2, opts[:format]) if obj2.is_a?(String)
523
+ # Note: We preserve html4/html5 format instead of normalizing to :html
524
+ # This allows HtmlComparator to use the correct parsing behavior
321
525
  end
322
526
  else
323
- format1 = detect_format(obj1)
324
- format2 = detect_format(obj2)
527
+ format1 = FormatDetector.detect(obj1)
528
+ format2 = FormatDetector.detect(obj2)
325
529
  end
326
530
 
327
531
  # Handle string format (plain text comparison)
@@ -357,7 +561,7 @@ module Canon
357
561
  case comparison_format
358
562
  when :xml
359
563
  XmlComparator.equivalent?(obj1, obj2, opts)
360
- when :html
564
+ when :html, :html4, :html5
361
565
  HtmlComparator.equivalent?(obj1, obj2, opts)
362
566
  when :json
363
567
  JsonComparator.equivalent?(obj1, obj2, opts)
@@ -366,78 +570,29 @@ module Canon
366
570
  end
367
571
  end
368
572
 
369
- # Parse HTML string into Nokogiri document
370
- #
371
- # @param content [String, Object] Content to parse (returns as-is if not a string)
372
- # @param format [Symbol] HTML format (:html, :html4, :html5)
373
- # @return [Nokogiri::HTML::Document, Nokogiri::HTML5::Document, Nokogiri::HTML::DocumentFragment, Object]
374
- def parse_html(content, _format)
375
- return content unless content.is_a?(String)
376
- return content if content.is_a?(Nokogiri::HTML::Document) ||
377
- content.is_a?(Nokogiri::HTML5::Document) ||
378
- content.is_a?(Nokogiri::XML::Document) ||
379
- content.is_a?(Nokogiri::HTML::DocumentFragment) ||
380
- content.is_a?(Nokogiri::HTML5::DocumentFragment) ||
381
- content.is_a?(Nokogiri::XML::DocumentFragment)
382
-
383
- # Let HtmlComparator's parse_node handle parsing with preprocessing
384
- # For now, just return the string and let it be parsed by HtmlComparator
385
- content
386
- rescue StandardError
387
- content
388
- end
389
-
390
- # Detect the format of an object
573
+ # Detect the format of an object (delegates to FormatDetector)
391
574
  #
392
575
  # @param obj [Object] Object to detect format of
393
576
  # @return [Symbol] Format type
394
577
  def detect_format(obj)
395
- case obj
396
- when Moxml::Node, Moxml::Document
397
- :xml
398
- when Nokogiri::HTML::DocumentFragment, Nokogiri::HTML5::DocumentFragment
399
- # HTML DocumentFragments
400
- :html
401
- when Nokogiri::XML::DocumentFragment
402
- # XML DocumentFragments - check if it's actually HTML
403
- obj.document&.html? ? :html : :xml
404
- when Nokogiri::XML::Document, Nokogiri::XML::Node
405
- # Check if it's HTML by looking at the document type
406
- obj.html? ? :html : :xml
407
- when Nokogiri::HTML::Document, Nokogiri::HTML5::Document
408
- :html
409
- when String
410
- detect_string_format(obj)
411
- when Hash, Array
412
- # Raw Ruby objects (from parsed JSON/YAML)
413
- :ruby_object
414
- else
415
- raise Canon::Error, "Unknown format for object: #{obj.class}"
416
- end
578
+ FormatDetector.detect(obj)
417
579
  end
418
580
 
419
- # Detect the format of a string
581
+ # Detect the format of a string (delegates to FormatDetector)
420
582
  #
421
583
  # @param str [String] String to detect format of
422
584
  # @return [Symbol] Format type
423
585
  def detect_string_format(str)
424
- trimmed = str.strip
425
-
426
- # YAML indicators
427
- return :yaml if trimmed.start_with?("---")
428
- return :yaml if trimmed.match?(/^[a-zA-Z_]\w*:\s/)
429
-
430
- # JSON indicators
431
- return :json if trimmed.start_with?("{", "[")
432
-
433
- # HTML indicators
434
- return :html if trimmed.start_with?("<!DOCTYPE html", "<html", "<HTML")
435
-
436
- # XML indicators - must start with < and end with >
437
- return :xml if trimmed.start_with?("<") && trimmed.end_with?(">")
586
+ FormatDetector.detect_string(str)
587
+ end
438
588
 
439
- # Default to plain string for everything else
440
- :string
589
+ # Parse HTML string into Nokogiri document (delegates to HtmlParser)
590
+ #
591
+ # @param content [String, Object] Content to parse
592
+ # @param format [Symbol] HTML format (:html, :html4, :html5)
593
+ # @return [Object] Parsed document
594
+ def parse_html(content, format)
595
+ HtmlParser.parse(content, format)
441
596
  end
442
597
  end
443
598
  end
@@ -5,9 +5,19 @@ module Canon
5
5
  # Represents a semantic difference between two nodes in a comparison tree
6
6
  # This is created during the Comparison Layer and carries information about
7
7
  # which dimension caused the difference and whether it's normative or informative
8
+ #
9
+ # DiffNode is library-agnostic - it works with data extracted from nodes,
10
+ # not the raw node references themselves. This allows Canon to work with
11
+ # any parsing library (Nokogiri, Moxml, etc.) without being tied to it.
8
12
  class DiffNode
9
13
  attr_reader :node1, :node2
10
- attr_accessor :dimension, :reason, :normative, :formatting
14
+ attr_accessor :dimension, :reason, :normative, :formatting,
15
+ # Enriched metadata for Stage 4 rendering
16
+ :path, # Canonical path with ordinal indices
17
+ :serialized_before, # Serialized content for display (before)
18
+ :serialized_after, # Serialized content for display (after)
19
+ :attributes_before, # Normalized attributes hash (before)
20
+ :attributes_after # Normalized attributes hash (after)
11
21
 
12
22
  # @param node1 [Object] The first node being compared
13
23
  # @param node2 [Object] The second node being compared
@@ -15,13 +25,26 @@ module Canon
15
25
  # (e.g., :text_content, :attribute_whitespace, :structural_whitespace,
16
26
  # :comments, :key_order)
17
27
  # @param reason [String] Human-readable explanation of the difference
18
- def initialize(node1:, node2:, dimension:, reason:)
28
+ # @param path [String, nil] Optional canonical path with ordinal indices
29
+ # @param serialized_before [String, nil] Optional serialized content for display (before)
30
+ # @param serialized_after [String, nil] Optional serialized content for display (after)
31
+ # @param attributes_before [Hash, nil] Optional normalized attributes hash (before)
32
+ # @param attributes_after [Hash, nil] Optional normalized attributes hash (after)
33
+ def initialize(node1:, node2:, dimension:, reason:,
34
+ path: nil, serialized_before: nil, serialized_after: nil,
35
+ attributes_before: nil, attributes_after: nil)
19
36
  @node1 = node1
20
37
  @node2 = node2
21
38
  @dimension = dimension
22
39
  @reason = reason
23
40
  @normative = nil # Will be set by DiffClassifier
24
41
  @formatting = nil # Will be set by DiffClassifier
42
+ # Enriched metadata (optional, populated by PathBuilder and NodeSerializer)
43
+ @path = path
44
+ @serialized_before = serialized_before
45
+ @serialized_after = serialized_after
46
+ @attributes_before = attributes_before
47
+ @attributes_after = attributes_after
25
48
  end
26
49
 
27
50
  # @return [Boolean] true if this diff is normative (affects equivalence)
@@ -54,6 +77,11 @@ module Canon
54
77
  reason: reason,
55
78
  normative: normative,
56
79
  formatting: formatting,
80
+ path: path,
81
+ serialized_before: serialized_before,
82
+ serialized_after: serialized_after,
83
+ attributes_before: attributes_before,
84
+ attributes_after: attributes_after,
57
85
  }
58
86
  end
59
87
 
@@ -65,6 +93,8 @@ module Canon
65
93
  reason == other.reason &&
66
94
  normative == other.normative &&
67
95
  formatting == other.formatting
96
+ # Note: path, serialized content and attribute hashes are not part of equality
97
+ # since they're derived from nodes, not independent properties
68
98
  end
69
99
  end
70
100
  end