canon 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +25 -135
- data/README.adoc +13 -13
- data/docs/.lycheeignore +69 -0
- data/docs/advanced/extending-canon.adoc +193 -0
- data/docs/internals/diffnode-enrichment.adoc +611 -0
- data/docs/internals/index.adoc +251 -0
- data/docs/lychee.toml +13 -6
- data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +250 -0
- data/docs/understanding/architecture.adoc +749 -33
- data/docs/understanding/comparison-pipeline.adoc +122 -0
- data/false_positive_analysis.txt +0 -0
- data/file1.html +1 -0
- data/file2.html +1 -0
- data/lib/canon/cache.rb +129 -0
- data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
- data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
- data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
- data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
- data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
- data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
- data/lib/canon/comparison/dimensions/registry.rb +77 -0
- data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
- data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
- data/lib/canon/comparison/dimensions.rb +54 -0
- data/lib/canon/comparison/format_detector.rb +86 -0
- data/lib/canon/comparison/html_comparator.rb +51 -18
- data/lib/canon/comparison/html_parser.rb +80 -0
- data/lib/canon/comparison/json_comparator.rb +12 -0
- data/lib/canon/comparison/json_parser.rb +19 -0
- data/lib/canon/comparison/markup_comparator.rb +293 -0
- data/lib/canon/comparison/match_options/base_resolver.rb +143 -0
- data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
- data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
- data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
- data/lib/canon/comparison/match_options.rb +68 -463
- data/lib/canon/comparison/profile_definition.rb +149 -0
- data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
- data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
- data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +189 -0
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
- data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +74 -0
- data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +95 -0
- data/lib/canon/comparison/xml_comparator.rb +52 -664
- data/lib/canon/comparison/xml_node_comparison.rb +297 -0
- data/lib/canon/comparison/xml_parser.rb +19 -0
- data/lib/canon/comparison/yaml_comparator.rb +3 -3
- data/lib/canon/comparison.rb +265 -110
- data/lib/canon/diff/diff_node.rb +32 -2
- data/lib/canon/diff/node_serializer.rb +191 -0
- data/lib/canon/diff/path_builder.rb +143 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
- data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
- data/lib/canon/diff_formatter.rb +1 -1
- data/lib/canon/rspec_matchers.rb +1 -1
- data/lib/canon/tree_diff/operation_converter.rb +92 -338
- data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
- data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
- data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
- data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
- data/lib/canon/version.rb +1 -1
- data/old-docs/ADVANCED_TOPICS.adoc +20 -0
- data/old-docs/BASIC_USAGE.adoc +16 -0
- data/old-docs/CHARACTER_VISUALIZATION.adoc +567 -0
- data/old-docs/CLI.adoc +497 -0
- data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
- data/old-docs/DIFF_ARCHITECTURE.adoc +435 -0
- data/old-docs/DIFF_FORMATTING.adoc +540 -0
- data/old-docs/DIFF_PARAMETERS.adoc +261 -0
- data/old-docs/DOM_DIFF.adoc +1017 -0
- data/old-docs/ENV_CONFIG.adoc +876 -0
- data/old-docs/FORMATS.adoc +867 -0
- data/old-docs/INPUT_VALIDATION.adoc +477 -0
- data/old-docs/MATCHER_BEHAVIOR.adoc +90 -0
- data/old-docs/MATCH_ARCHITECTURE.adoc +463 -0
- data/old-docs/MATCH_OPTIONS.adoc +912 -0
- data/old-docs/MODES.adoc +432 -0
- data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
- data/old-docs/OPTIONS.adoc +1387 -0
- data/old-docs/PREPROCESSING.adoc +491 -0
- data/old-docs/README.old.adoc +2831 -0
- data/old-docs/RSPEC.adoc +814 -0
- data/old-docs/RUBY_API.adoc +485 -0
- data/old-docs/SEMANTIC_DIFF_REPORT.adoc +646 -0
- data/old-docs/SEMANTIC_TREE_DIFF.adoc +765 -0
- data/old-docs/STRING_COMPARE.adoc +345 -0
- data/old-docs/TMP.adoc +3384 -0
- data/old-docs/TREE_DIFF.adoc +1080 -0
- data/old-docs/UNDERSTANDING_CANON.adoc +17 -0
- data/old-docs/VERBOSE.adoc +482 -0
- data/old-docs/VISUALIZATION_MAP.adoc +625 -0
- data/old-docs/WHITESPACE_TREATMENT.adoc +1155 -0
- data/scripts/analyze_current_state.rb +85 -0
- data/scripts/analyze_false_positives.rb +114 -0
- data/scripts/analyze_remaining_failures.rb +105 -0
- data/scripts/compare_current_failures.rb +95 -0
- data/scripts/compare_dom_tree_diff.rb +158 -0
- data/scripts/compare_failures.rb +151 -0
- data/scripts/debug_attribute_extraction.rb +66 -0
- data/scripts/debug_blocks_839.rb +115 -0
- data/scripts/debug_meta_matching.rb +52 -0
- data/scripts/debug_p_matching.rb +192 -0
- data/scripts/debug_signature_matching.rb +118 -0
- data/scripts/debug_sourcecode_124.rb +32 -0
- data/scripts/debug_whitespace_sensitive.rb +192 -0
- data/scripts/extract_false_positives.rb +138 -0
- data/scripts/find_actual_false_positives.rb +125 -0
- data/scripts/investigate_all_false_positives.rb +161 -0
- data/scripts/investigate_batch1.rb +127 -0
- data/scripts/investigate_classification.rb +150 -0
- data/scripts/investigate_classification_detailed.rb +190 -0
- data/scripts/investigate_common_failures.rb +342 -0
- data/scripts/investigate_false_negative.rb +80 -0
- data/scripts/investigate_false_positive.rb +83 -0
- data/scripts/investigate_false_positives.rb +227 -0
- data/scripts/investigate_false_positives_batch.rb +163 -0
- data/scripts/investigate_mixed_content.rb +125 -0
- data/scripts/investigate_remaining_16.rb +214 -0
- data/scripts/run_single_test.rb +29 -0
- data/scripts/test_all_false_positives.rb +95 -0
- data/scripts/test_attribute_details.rb +61 -0
- data/scripts/test_both_algorithms.rb +49 -0
- data/scripts/test_both_simple.rb +49 -0
- data/scripts/test_enhanced_semantic_output.rb +125 -0
- data/scripts/test_readme_examples.rb +131 -0
- data/scripts/test_semantic_tree_diff.rb +99 -0
- data/scripts/test_semantic_ux_improvements.rb +135 -0
- data/scripts/test_single_false_positive.rb +119 -0
- data/scripts/test_size_limits.rb +99 -0
- data/test_html_1.html +21 -0
- data/test_html_2.html +21 -0
- data/test_nokogiri.rb +33 -0
- data/test_normalize.rb +45 -0
- metadata +123 -2
data/lib/canon/comparison.rb
CHANGED
|
@@ -7,11 +7,15 @@ require_relative "comparison/xml_comparator"
|
|
|
7
7
|
require_relative "comparison/html_comparator"
|
|
8
8
|
require_relative "comparison/json_comparator"
|
|
9
9
|
require_relative "comparison/yaml_comparator"
|
|
10
|
+
require_relative "comparison/profile_definition"
|
|
11
|
+
require_relative "comparison/format_detector"
|
|
12
|
+
require_relative "comparison/html_parser"
|
|
10
13
|
require_relative "diff/diff_node_mapper"
|
|
11
14
|
require_relative "diff/diff_line"
|
|
12
15
|
require_relative "diff/diff_block_builder"
|
|
13
16
|
require_relative "diff/diff_context_builder"
|
|
14
17
|
require_relative "diff/diff_report_builder"
|
|
18
|
+
require_relative "cache"
|
|
15
19
|
|
|
16
20
|
module Canon
|
|
17
21
|
# Comparison module for XML, HTML, JSON, and YAML documents
|
|
@@ -36,25 +40,36 @@ module Canon
|
|
|
36
40
|
# == Comparison Options
|
|
37
41
|
#
|
|
38
42
|
# Common options across all formats:
|
|
39
|
-
# -
|
|
40
|
-
#
|
|
41
|
-
#
|
|
42
|
-
# -
|
|
43
|
-
# - ignore_children: Skip child nodes (default: false)
|
|
43
|
+
# - profile: Comparison profile (Symbol for preset, Hash for custom)
|
|
44
|
+
# * Presets: :strict, :rendered, :html4, :html5, :spec_friendly, :content_only
|
|
45
|
+
# * Custom: { text_content: :normalize, comments: :ignore, ... }
|
|
46
|
+
# - diff_algorithm: Algorithm to use (:dom or :semantic, default: :dom)
|
|
44
47
|
# - verbose: Return detailed diff array (default: false)
|
|
45
48
|
#
|
|
46
49
|
# == Usage Examples
|
|
47
50
|
#
|
|
48
|
-
# # XML comparison
|
|
51
|
+
# # XML comparison with default profile
|
|
49
52
|
# Canon::Comparison.equivalent?(xml1, xml2)
|
|
50
|
-
# Canon::Comparison.equivalent?(xml1, xml2, verbose: true)
|
|
51
53
|
#
|
|
52
|
-
# #
|
|
53
|
-
# Canon::Comparison.equivalent?(
|
|
54
|
+
# # XML comparison with preset profile
|
|
55
|
+
# Canon::Comparison.equivalent?(xml1, xml2, profile: :strict)
|
|
56
|
+
# Canon::Comparison.equivalent?(xml1, xml2, profile: :spec_friendly)
|
|
54
57
|
#
|
|
55
|
-
# #
|
|
56
|
-
# Canon::Comparison.equivalent?(
|
|
57
|
-
#
|
|
58
|
+
# # HTML comparison with custom inline profile
|
|
59
|
+
# Canon::Comparison.equivalent?(html1, html2,
|
|
60
|
+
# profile: { text_content: :normalize, comments: :ignore })
|
|
61
|
+
#
|
|
62
|
+
# # Define and use a custom profile
|
|
63
|
+
# Canon::Comparison.define_profile(:my_custom) do
|
|
64
|
+
# text_content :normalize
|
|
65
|
+
# comments :ignore
|
|
66
|
+
# preprocessing :rendered
|
|
67
|
+
# end
|
|
68
|
+
# Canon::Comparison.equivalent?(doc1, doc2, profile: :my_custom)
|
|
69
|
+
#
|
|
70
|
+
# # JSON comparison with semantic tree diff
|
|
71
|
+
# Canon::Comparison.equivalent?(json1, json2,
|
|
72
|
+
# diff_algorithm: :semantic, profile: :spec_friendly)
|
|
58
73
|
#
|
|
59
74
|
# # With detailed output
|
|
60
75
|
# diffs = Canon::Comparison.equivalent?(doc1, doc2, verbose: true)
|
|
@@ -88,10 +103,11 @@ module Canon
|
|
|
88
103
|
UNEQUAL_TEXT_CONTENTS = 9
|
|
89
104
|
MISSING_HASH_KEY = 10
|
|
90
105
|
UNEQUAL_HASH_VALUES = 11
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
106
|
+
UNEQUAL_HASH_KEY_ORDER = 12
|
|
107
|
+
UNEQUAL_ARRAY_LENGTHS = 13
|
|
108
|
+
UNEQUAL_ARRAY_ELEMENTS = 14
|
|
109
|
+
UNEQUAL_TYPES = 15
|
|
110
|
+
UNEQUAL_PRIMITIVES = 16
|
|
95
111
|
|
|
96
112
|
class << self
|
|
97
113
|
# Auto-detect format and compare two objects
|
|
@@ -99,8 +115,10 @@ module Canon
|
|
|
99
115
|
# @param obj1 [Object] First object to compare
|
|
100
116
|
# @param obj2 [Object] Second object to compare
|
|
101
117
|
# @param opts [Hash] Comparison options
|
|
118
|
+
# - :profile - Profile to use (Symbol for preset, Hash for custom)
|
|
102
119
|
# - :format - Format hint (:xml, :html, :html4, :html5, :json, :yaml, :string)
|
|
103
120
|
# - :diff_algorithm - Algorithm to use (:dom or :semantic)
|
|
121
|
+
# - :verbose - Return detailed diff array (default: false)
|
|
104
122
|
# @return [Boolean, Array] true if equivalent, or array of diffs if verbose
|
|
105
123
|
def equivalent?(obj1, obj2, opts = {})
|
|
106
124
|
# Check if semantic tree diff is requested
|
|
@@ -113,6 +131,56 @@ module Canon
|
|
|
113
131
|
dom_diff(obj1, obj2, opts)
|
|
114
132
|
end
|
|
115
133
|
|
|
134
|
+
# Define a custom comparison profile with DSL syntax
|
|
135
|
+
#
|
|
136
|
+
# @param name [Symbol] Profile name
|
|
137
|
+
# @yield [ProfileDefinition] DSL block for defining profile
|
|
138
|
+
# @return [Symbol] Profile name
|
|
139
|
+
# @raise [ProfileError] if profile definition is invalid
|
|
140
|
+
#
|
|
141
|
+
# @example Define a custom profile
|
|
142
|
+
# Canon::Comparison.define_profile(:my_custom) do
|
|
143
|
+
# text_content :normalize
|
|
144
|
+
# comments :ignore
|
|
145
|
+
# preprocessing :rendered
|
|
146
|
+
# end
|
|
147
|
+
def define_profile(name, &block)
|
|
148
|
+
definition = ProfileDefinition.define(name, &block)
|
|
149
|
+
|
|
150
|
+
@custom_profiles ||= {}
|
|
151
|
+
@custom_profiles[name] = definition
|
|
152
|
+
|
|
153
|
+
name
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
# Load a profile (custom or preset)
|
|
157
|
+
#
|
|
158
|
+
# @param name [Symbol] Profile name
|
|
159
|
+
# @return [Hash] Profile settings
|
|
160
|
+
def load_profile(name)
|
|
161
|
+
# Check custom profiles first
|
|
162
|
+
if @custom_profiles&.key?(name)
|
|
163
|
+
return @custom_profiles[name].dup
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
# Fall back to presets - try Xml first (most common)
|
|
167
|
+
begin
|
|
168
|
+
MatchOptions::Xml.get_profile_options(name)
|
|
169
|
+
rescue Error
|
|
170
|
+
# Try other formats
|
|
171
|
+
MatchOptions::Json.get_profile_options(name)
|
|
172
|
+
end
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
# List all available profiles (custom + presets)
|
|
176
|
+
#
|
|
177
|
+
# @return [Array<Symbol>] Available profile names
|
|
178
|
+
def available_profiles
|
|
179
|
+
custom = @custom_profiles&.keys || []
|
|
180
|
+
presets = MatchOptions::Xml::MATCH_PROFILES.keys
|
|
181
|
+
(custom + presets).sort.uniq
|
|
182
|
+
end
|
|
183
|
+
|
|
116
184
|
private
|
|
117
185
|
|
|
118
186
|
# Perform semantic tree diff comparison
|
|
@@ -120,8 +188,8 @@ module Canon
|
|
|
120
188
|
require_relative "tree_diff"
|
|
121
189
|
|
|
122
190
|
# Detect format for both objects
|
|
123
|
-
format1 = opts[:format] ||
|
|
124
|
-
format2 = opts[:format] ||
|
|
191
|
+
format1 = opts[:format] || FormatDetector.detect(obj1)
|
|
192
|
+
format2 = opts[:format] || FormatDetector.detect(obj2)
|
|
125
193
|
|
|
126
194
|
# Handle string format (plain text comparison) - semantic tree doesn't support it
|
|
127
195
|
if format1 == :string
|
|
@@ -203,39 +271,141 @@ module Canon
|
|
|
203
271
|
# @param opts [Hash] User options
|
|
204
272
|
# @return [Hash] Resolved match options
|
|
205
273
|
def resolve_match_options(format, opts)
|
|
274
|
+
# Process unified profile parameter first
|
|
275
|
+
processed_opts = process_profile_parameter(opts)
|
|
276
|
+
|
|
206
277
|
case format
|
|
207
278
|
when :xml, :html, :html4, :html5
|
|
208
279
|
MatchOptions::Xml.resolve(
|
|
209
280
|
format: format,
|
|
210
|
-
match_profile:
|
|
211
|
-
match:
|
|
212
|
-
preprocessing:
|
|
213
|
-
global_profile:
|
|
214
|
-
global_options:
|
|
281
|
+
match_profile: processed_opts[:match_profile],
|
|
282
|
+
match: processed_opts[:match],
|
|
283
|
+
preprocessing: processed_opts[:preprocessing],
|
|
284
|
+
global_profile: processed_opts[:global_profile],
|
|
285
|
+
global_options: processed_opts[:global_options],
|
|
215
286
|
)
|
|
216
287
|
when :json
|
|
217
288
|
MatchOptions::Json.resolve(
|
|
218
289
|
format: format,
|
|
219
|
-
match_profile:
|
|
220
|
-
match:
|
|
221
|
-
preprocessing:
|
|
222
|
-
global_profile:
|
|
223
|
-
global_options:
|
|
290
|
+
match_profile: processed_opts[:match_profile],
|
|
291
|
+
match: processed_opts[:match],
|
|
292
|
+
preprocessing: processed_opts[:preprocessing],
|
|
293
|
+
global_profile: processed_opts[:global_profile],
|
|
294
|
+
global_options: processed_opts[:global_options],
|
|
224
295
|
)
|
|
225
296
|
when :yaml
|
|
226
297
|
MatchOptions::Yaml.resolve(
|
|
227
298
|
format: format,
|
|
228
|
-
match_profile:
|
|
229
|
-
match:
|
|
230
|
-
preprocessing:
|
|
231
|
-
global_profile:
|
|
232
|
-
global_options:
|
|
299
|
+
match_profile: processed_opts[:match_profile],
|
|
300
|
+
match: processed_opts[:match],
|
|
301
|
+
preprocessing: processed_opts[:preprocessing],
|
|
302
|
+
global_profile: processed_opts[:global_profile],
|
|
303
|
+
global_options: processed_opts[:global_options],
|
|
233
304
|
)
|
|
234
305
|
else
|
|
235
|
-
|
|
306
|
+
processed_opts[:match] || {}
|
|
307
|
+
end
|
|
308
|
+
end
|
|
309
|
+
|
|
310
|
+
# Process unified profile parameter
|
|
311
|
+
#
|
|
312
|
+
# Converts the new :profile parameter into the legacy format expected
|
|
313
|
+
# by MatchOptions resolvers. Handles:
|
|
314
|
+
# - Symbol → preset profile (uses :match_profile)
|
|
315
|
+
# - Hash → custom profile (validates and uses :match)
|
|
316
|
+
#
|
|
317
|
+
# @param opts [Hash] Original user options
|
|
318
|
+
# @return [Hash] Processed options with legacy format
|
|
319
|
+
def process_profile_parameter(opts)
|
|
320
|
+
processed = opts.dup
|
|
321
|
+
|
|
322
|
+
# Handle unified :profile parameter
|
|
323
|
+
if opts.key?(:profile)
|
|
324
|
+
profile = opts[:profile]
|
|
325
|
+
|
|
326
|
+
case profile
|
|
327
|
+
when Symbol
|
|
328
|
+
# Preset profile name
|
|
329
|
+
processed[:match_profile] = profile
|
|
330
|
+
when Hash
|
|
331
|
+
# Inline custom profile - validate and use as :match
|
|
332
|
+
validate_custom_profile!(profile, format_from_opts(opts))
|
|
333
|
+
processed[:match] = profile
|
|
334
|
+
else
|
|
335
|
+
raise Canon::Error,
|
|
336
|
+
"Invalid profile type: #{profile.class}. " \
|
|
337
|
+
"Expected Symbol (preset name) or Hash (custom profile)."
|
|
338
|
+
end
|
|
339
|
+
end
|
|
340
|
+
|
|
341
|
+
processed
|
|
342
|
+
end
|
|
343
|
+
|
|
344
|
+
# Validate custom profile hash
|
|
345
|
+
#
|
|
346
|
+
# Ensures all dimensions and behaviors in a custom profile are valid.
|
|
347
|
+
# Uses ProfileDefinition validation logic.
|
|
348
|
+
#
|
|
349
|
+
# @param profile [Hash] Custom profile hash
|
|
350
|
+
# @param format [Symbol] Format type for validation context
|
|
351
|
+
# @raise [Canon::Error] if profile contains invalid dimensions or behaviors
|
|
352
|
+
def validate_custom_profile!(profile, format)
|
|
353
|
+
profile.each do |dimension, behavior|
|
|
354
|
+
# Skip preprocessing and special options
|
|
355
|
+
next if dimension == :preprocessing
|
|
356
|
+
next if dimension == :semantic_diff
|
|
357
|
+
next if dimension == :similarity_threshold
|
|
358
|
+
|
|
359
|
+
# Validate dimension is known
|
|
360
|
+
valid_dimensions = valid_dimensions_for_format(format)
|
|
361
|
+
unless valid_dimensions.include?(dimension)
|
|
362
|
+
raise Canon::Error,
|
|
363
|
+
"Unknown dimension: #{dimension}. " \
|
|
364
|
+
"Valid dimensions for #{format}: #{valid_dimensions.join(', ')}"
|
|
365
|
+
end
|
|
366
|
+
|
|
367
|
+
# Validate behavior is allowed for this dimension
|
|
368
|
+
valid_behaviors = ProfileDefinition::DIMENSION_BEHAVIORS[dimension]
|
|
369
|
+
if valid_behaviors && !valid_behaviors.include?(behavior)
|
|
370
|
+
raise Canon::Error,
|
|
371
|
+
"Invalid behavior '#{behavior}' for dimension '#{dimension}'. " \
|
|
372
|
+
"Valid behaviors: #{valid_behaviors.join(', ')}"
|
|
373
|
+
end
|
|
374
|
+
|
|
375
|
+
# Validate behavior is in general MATCH_BEHAVIORS
|
|
376
|
+
unless MatchOptions::MATCH_BEHAVIORS.include?(behavior)
|
|
377
|
+
raise Canon::Error,
|
|
378
|
+
"Unknown match behavior: #{behavior}. " \
|
|
379
|
+
"Valid behaviors: #{MatchOptions::MATCH_BEHAVIORS.join(', ')}"
|
|
380
|
+
end
|
|
236
381
|
end
|
|
237
382
|
end
|
|
238
383
|
|
|
384
|
+
# Get valid dimensions for a format
|
|
385
|
+
#
|
|
386
|
+
# @param format [Symbol] Format type
|
|
387
|
+
# @return [Array<Symbol>] Valid dimensions for the format
|
|
388
|
+
def valid_dimensions_for_format(format)
|
|
389
|
+
case format
|
|
390
|
+
when :xml, :html, :html4, :html5
|
|
391
|
+
MatchOptions::Xml::MATCH_DIMENSIONS
|
|
392
|
+
when :json
|
|
393
|
+
MatchOptions::Json::MATCH_DIMENSIONS
|
|
394
|
+
when :yaml
|
|
395
|
+
MatchOptions::Yaml::MATCH_DIMENSIONS
|
|
396
|
+
else
|
|
397
|
+
[]
|
|
398
|
+
end
|
|
399
|
+
end
|
|
400
|
+
|
|
401
|
+
# Helper to extract format from opts for validation
|
|
402
|
+
#
|
|
403
|
+
# @param opts [Hash] User options
|
|
404
|
+
# @return [Symbol] Format type or :xml as default
|
|
405
|
+
def format_from_opts(opts)
|
|
406
|
+
opts[:format] || :xml
|
|
407
|
+
end
|
|
408
|
+
|
|
239
409
|
# Parse documents using comparator's parse logic (reuses preprocessing)
|
|
240
410
|
#
|
|
241
411
|
# @param obj1 [Object] First object
|
|
@@ -250,32 +420,66 @@ module Canon
|
|
|
250
420
|
when :xml
|
|
251
421
|
# Delegate to XmlComparator's parse_node - returns Canon::Xml::Node
|
|
252
422
|
# Adapter now handles Canon::Xml::Node directly
|
|
253
|
-
doc1 =
|
|
254
|
-
|
|
423
|
+
doc1 = parse_with_cache(obj1, format, preprocessing) do |doc|
|
|
424
|
+
XmlComparator.send(:parse_node, doc, preprocessing)
|
|
425
|
+
end
|
|
426
|
+
doc2 = parse_with_cache(obj2, format, preprocessing) do |doc|
|
|
427
|
+
XmlComparator.send(:parse_node, doc, preprocessing)
|
|
428
|
+
end
|
|
255
429
|
[doc1, doc2]
|
|
256
430
|
when :html, :html4, :html5
|
|
257
431
|
# Delegate to HtmlComparator's parse_node_for_semantic for Canon::Xml::Node
|
|
258
432
|
[
|
|
259
|
-
|
|
260
|
-
|
|
433
|
+
parse_with_cache(obj1, format, preprocessing) do |doc|
|
|
434
|
+
HtmlComparator.send(:parse_node_for_semantic, doc, preprocessing)
|
|
435
|
+
end,
|
|
436
|
+
parse_with_cache(obj2, format, preprocessing) do |doc|
|
|
437
|
+
HtmlComparator.send(:parse_node_for_semantic, doc, preprocessing)
|
|
438
|
+
end,
|
|
261
439
|
]
|
|
262
440
|
when :json
|
|
263
441
|
# Delegate to JsonComparator's parse_json
|
|
264
442
|
[
|
|
265
|
-
|
|
266
|
-
|
|
443
|
+
parse_with_cache(obj1, format, :none) do |doc|
|
|
444
|
+
JsonComparator.send(:parse_json, doc)
|
|
445
|
+
end,
|
|
446
|
+
parse_with_cache(obj2, format, :none) do |doc|
|
|
447
|
+
JsonComparator.send(:parse_json, doc)
|
|
448
|
+
end,
|
|
267
449
|
]
|
|
268
450
|
when :yaml
|
|
269
451
|
# Delegate to YamlComparator's parse_yaml
|
|
270
452
|
[
|
|
271
|
-
|
|
272
|
-
|
|
453
|
+
parse_with_cache(obj1, format, :none) do |doc|
|
|
454
|
+
YamlComparator.send(:parse_yaml, doc)
|
|
455
|
+
end,
|
|
456
|
+
parse_with_cache(obj2, format, :none) do |doc|
|
|
457
|
+
YamlComparator.send(:parse_yaml, doc)
|
|
458
|
+
end,
|
|
273
459
|
]
|
|
274
460
|
else
|
|
275
461
|
[obj1, obj2]
|
|
276
462
|
end
|
|
277
463
|
end
|
|
278
464
|
|
|
465
|
+
# Parse a document with caching
|
|
466
|
+
#
|
|
467
|
+
# @param doc [Object] Document to parse (string or already parsed)
|
|
468
|
+
# @param format [Symbol] Document format
|
|
469
|
+
# @param preprocessing [Symbol] Preprocessing option
|
|
470
|
+
# @yield Block to parse the document if not cached
|
|
471
|
+
# @return [Object] Parsed document
|
|
472
|
+
def parse_with_cache(doc, format, preprocessing)
|
|
473
|
+
# If already a parsed node, return as-is
|
|
474
|
+
return doc unless doc.is_a?(String)
|
|
475
|
+
|
|
476
|
+
# Use cache for string documents
|
|
477
|
+
Cache.fetch(:document_parse,
|
|
478
|
+
Cache.key_for_document(doc, format, preprocessing)) do
|
|
479
|
+
yield doc
|
|
480
|
+
end
|
|
481
|
+
end
|
|
482
|
+
|
|
279
483
|
# Normalize format for TreeDiff (html4/html5 -> html)
|
|
280
484
|
#
|
|
281
485
|
# @param format [Symbol] Original format
|
|
@@ -314,14 +518,14 @@ module Canon
|
|
|
314
518
|
format1 = format2 = opts[:format]
|
|
315
519
|
# Parse HTML strings if format is html/html4/html5
|
|
316
520
|
if %i[html html4 html5].include?(opts[:format])
|
|
317
|
-
obj1 =
|
|
318
|
-
obj2 =
|
|
319
|
-
#
|
|
320
|
-
|
|
521
|
+
obj1 = HtmlParser.parse(obj1, opts[:format]) if obj1.is_a?(String)
|
|
522
|
+
obj2 = HtmlParser.parse(obj2, opts[:format]) if obj2.is_a?(String)
|
|
523
|
+
# Note: We preserve html4/html5 format instead of normalizing to :html
|
|
524
|
+
# This allows HtmlComparator to use the correct parsing behavior
|
|
321
525
|
end
|
|
322
526
|
else
|
|
323
|
-
format1 =
|
|
324
|
-
format2 =
|
|
527
|
+
format1 = FormatDetector.detect(obj1)
|
|
528
|
+
format2 = FormatDetector.detect(obj2)
|
|
325
529
|
end
|
|
326
530
|
|
|
327
531
|
# Handle string format (plain text comparison)
|
|
@@ -357,7 +561,7 @@ module Canon
|
|
|
357
561
|
case comparison_format
|
|
358
562
|
when :xml
|
|
359
563
|
XmlComparator.equivalent?(obj1, obj2, opts)
|
|
360
|
-
when :html
|
|
564
|
+
when :html, :html4, :html5
|
|
361
565
|
HtmlComparator.equivalent?(obj1, obj2, opts)
|
|
362
566
|
when :json
|
|
363
567
|
JsonComparator.equivalent?(obj1, obj2, opts)
|
|
@@ -366,78 +570,29 @@ module Canon
|
|
|
366
570
|
end
|
|
367
571
|
end
|
|
368
572
|
|
|
369
|
-
#
|
|
370
|
-
#
|
|
371
|
-
# @param content [String, Object] Content to parse (returns as-is if not a string)
|
|
372
|
-
# @param format [Symbol] HTML format (:html, :html4, :html5)
|
|
373
|
-
# @return [Nokogiri::HTML::Document, Nokogiri::HTML5::Document, Nokogiri::HTML::DocumentFragment, Object]
|
|
374
|
-
def parse_html(content, _format)
|
|
375
|
-
return content unless content.is_a?(String)
|
|
376
|
-
return content if content.is_a?(Nokogiri::HTML::Document) ||
|
|
377
|
-
content.is_a?(Nokogiri::HTML5::Document) ||
|
|
378
|
-
content.is_a?(Nokogiri::XML::Document) ||
|
|
379
|
-
content.is_a?(Nokogiri::HTML::DocumentFragment) ||
|
|
380
|
-
content.is_a?(Nokogiri::HTML5::DocumentFragment) ||
|
|
381
|
-
content.is_a?(Nokogiri::XML::DocumentFragment)
|
|
382
|
-
|
|
383
|
-
# Let HtmlComparator's parse_node handle parsing with preprocessing
|
|
384
|
-
# For now, just return the string and let it be parsed by HtmlComparator
|
|
385
|
-
content
|
|
386
|
-
rescue StandardError
|
|
387
|
-
content
|
|
388
|
-
end
|
|
389
|
-
|
|
390
|
-
# Detect the format of an object
|
|
573
|
+
# Detect the format of an object (delegates to FormatDetector)
|
|
391
574
|
#
|
|
392
575
|
# @param obj [Object] Object to detect format of
|
|
393
576
|
# @return [Symbol] Format type
|
|
394
577
|
def detect_format(obj)
|
|
395
|
-
|
|
396
|
-
when Moxml::Node, Moxml::Document
|
|
397
|
-
:xml
|
|
398
|
-
when Nokogiri::HTML::DocumentFragment, Nokogiri::HTML5::DocumentFragment
|
|
399
|
-
# HTML DocumentFragments
|
|
400
|
-
:html
|
|
401
|
-
when Nokogiri::XML::DocumentFragment
|
|
402
|
-
# XML DocumentFragments - check if it's actually HTML
|
|
403
|
-
obj.document&.html? ? :html : :xml
|
|
404
|
-
when Nokogiri::XML::Document, Nokogiri::XML::Node
|
|
405
|
-
# Check if it's HTML by looking at the document type
|
|
406
|
-
obj.html? ? :html : :xml
|
|
407
|
-
when Nokogiri::HTML::Document, Nokogiri::HTML5::Document
|
|
408
|
-
:html
|
|
409
|
-
when String
|
|
410
|
-
detect_string_format(obj)
|
|
411
|
-
when Hash, Array
|
|
412
|
-
# Raw Ruby objects (from parsed JSON/YAML)
|
|
413
|
-
:ruby_object
|
|
414
|
-
else
|
|
415
|
-
raise Canon::Error, "Unknown format for object: #{obj.class}"
|
|
416
|
-
end
|
|
578
|
+
FormatDetector.detect(obj)
|
|
417
579
|
end
|
|
418
580
|
|
|
419
|
-
# Detect the format of a string
|
|
581
|
+
# Detect the format of a string (delegates to FormatDetector)
|
|
420
582
|
#
|
|
421
583
|
# @param str [String] String to detect format of
|
|
422
584
|
# @return [Symbol] Format type
|
|
423
585
|
def detect_string_format(str)
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
# YAML indicators
|
|
427
|
-
return :yaml if trimmed.start_with?("---")
|
|
428
|
-
return :yaml if trimmed.match?(/^[a-zA-Z_]\w*:\s/)
|
|
429
|
-
|
|
430
|
-
# JSON indicators
|
|
431
|
-
return :json if trimmed.start_with?("{", "[")
|
|
432
|
-
|
|
433
|
-
# HTML indicators
|
|
434
|
-
return :html if trimmed.start_with?("<!DOCTYPE html", "<html", "<HTML")
|
|
435
|
-
|
|
436
|
-
# XML indicators - must start with < and end with >
|
|
437
|
-
return :xml if trimmed.start_with?("<") && trimmed.end_with?(">")
|
|
586
|
+
FormatDetector.detect_string(str)
|
|
587
|
+
end
|
|
438
588
|
|
|
439
|
-
|
|
440
|
-
|
|
589
|
+
# Parse HTML string into Nokogiri document (delegates to HtmlParser)
|
|
590
|
+
#
|
|
591
|
+
# @param content [String, Object] Content to parse
|
|
592
|
+
# @param format [Symbol] HTML format (:html, :html4, :html5)
|
|
593
|
+
# @return [Object] Parsed document
|
|
594
|
+
def parse_html(content, format)
|
|
595
|
+
HtmlParser.parse(content, format)
|
|
441
596
|
end
|
|
442
597
|
end
|
|
443
598
|
end
|
data/lib/canon/diff/diff_node.rb
CHANGED
|
@@ -5,9 +5,19 @@ module Canon
|
|
|
5
5
|
# Represents a semantic difference between two nodes in a comparison tree
|
|
6
6
|
# This is created during the Comparison Layer and carries information about
|
|
7
7
|
# which dimension caused the difference and whether it's normative or informative
|
|
8
|
+
#
|
|
9
|
+
# DiffNode is library-agnostic - it works with data extracted from nodes,
|
|
10
|
+
# not the raw node references themselves. This allows Canon to work with
|
|
11
|
+
# any parsing library (Nokogiri, Moxml, etc.) without being tied to it.
|
|
8
12
|
class DiffNode
|
|
9
13
|
attr_reader :node1, :node2
|
|
10
|
-
attr_accessor :dimension, :reason, :normative, :formatting
|
|
14
|
+
attr_accessor :dimension, :reason, :normative, :formatting,
|
|
15
|
+
# Enriched metadata for Stage 4 rendering
|
|
16
|
+
:path, # Canonical path with ordinal indices
|
|
17
|
+
:serialized_before, # Serialized content for display (before)
|
|
18
|
+
:serialized_after, # Serialized content for display (after)
|
|
19
|
+
:attributes_before, # Normalized attributes hash (before)
|
|
20
|
+
:attributes_after # Normalized attributes hash (after)
|
|
11
21
|
|
|
12
22
|
# @param node1 [Object] The first node being compared
|
|
13
23
|
# @param node2 [Object] The second node being compared
|
|
@@ -15,13 +25,26 @@ module Canon
|
|
|
15
25
|
# (e.g., :text_content, :attribute_whitespace, :structural_whitespace,
|
|
16
26
|
# :comments, :key_order)
|
|
17
27
|
# @param reason [String] Human-readable explanation of the difference
|
|
18
|
-
|
|
28
|
+
# @param path [String, nil] Optional canonical path with ordinal indices
|
|
29
|
+
# @param serialized_before [String, nil] Optional serialized content for display
|
|
30
|
+
# @param serialized_after [String, nil] Optional serialized content for display
|
|
31
|
+
# @param attributes_before [Hash, nil] Optional normalized attributes hash
|
|
32
|
+
# @param attributes_after [Hash, nil] Optional normalized attributes hash
|
|
33
|
+
def initialize(node1:, node2:, dimension:, reason:,
|
|
34
|
+
path: nil, serialized_before: nil, serialized_after: nil,
|
|
35
|
+
attributes_before: nil, attributes_after: nil)
|
|
19
36
|
@node1 = node1
|
|
20
37
|
@node2 = node2
|
|
21
38
|
@dimension = dimension
|
|
22
39
|
@reason = reason
|
|
23
40
|
@normative = nil # Will be set by DiffClassifier
|
|
24
41
|
@formatting = nil # Will be set by DiffClassifier
|
|
42
|
+
# Enriched metadata (optional, populated by PathBuilder and NodeSerializer)
|
|
43
|
+
@path = path
|
|
44
|
+
@serialized_before = serialized_before
|
|
45
|
+
@serialized_after = serialized_after
|
|
46
|
+
@attributes_before = attributes_before
|
|
47
|
+
@attributes_after = attributes_after
|
|
25
48
|
end
|
|
26
49
|
|
|
27
50
|
# @return [Boolean] true if this diff is normative (affects equivalence)
|
|
@@ -54,6 +77,11 @@ module Canon
|
|
|
54
77
|
reason: reason,
|
|
55
78
|
normative: normative,
|
|
56
79
|
formatting: formatting,
|
|
80
|
+
path: path,
|
|
81
|
+
serialized_before: serialized_before,
|
|
82
|
+
serialized_after: serialized_after,
|
|
83
|
+
attributes_before: attributes_before,
|
|
84
|
+
attributes_after: attributes_after,
|
|
57
85
|
}
|
|
58
86
|
end
|
|
59
87
|
|
|
@@ -65,6 +93,8 @@ module Canon
|
|
|
65
93
|
reason == other.reason &&
|
|
66
94
|
normative == other.normative &&
|
|
67
95
|
formatting == other.formatting
|
|
96
|
+
# Note: path and serialized content are not part of equality
|
|
97
|
+
# since they're derived from nodes, not independent properties
|
|
68
98
|
end
|
|
69
99
|
end
|
|
70
100
|
end
|