canon 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +69 -92
- data/README.adoc +13 -13
- data/docs/.lycheeignore +69 -0
- data/docs/Gemfile +1 -0
- data/docs/_config.yml +90 -1
- data/docs/advanced/diff-classification.adoc +82 -2
- data/docs/advanced/extending-canon.adoc +193 -0
- data/docs/features/match-options/index.adoc +239 -1
- data/docs/internals/diffnode-enrichment.adoc +611 -0
- data/docs/internals/index.adoc +251 -0
- data/docs/lychee.toml +13 -6
- data/docs/understanding/architecture.adoc +749 -33
- data/docs/understanding/comparison-pipeline.adoc +122 -0
- data/lib/canon/cache.rb +129 -0
- data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
- data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
- data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
- data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
- data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
- data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
- data/lib/canon/comparison/dimensions/registry.rb +77 -0
- data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
- data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
- data/lib/canon/comparison/dimensions.rb +54 -0
- data/lib/canon/comparison/format_detector.rb +87 -0
- data/lib/canon/comparison/html_comparator.rb +70 -26
- data/lib/canon/comparison/html_compare_profile.rb +8 -2
- data/lib/canon/comparison/html_parser.rb +80 -0
- data/lib/canon/comparison/json_comparator.rb +12 -0
- data/lib/canon/comparison/json_parser.rb +19 -0
- data/lib/canon/comparison/markup_comparator.rb +293 -0
- data/lib/canon/comparison/match_options/base_resolver.rb +150 -0
- data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
- data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
- data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
- data/lib/canon/comparison/match_options.rb +68 -463
- data/lib/canon/comparison/profile_definition.rb +149 -0
- data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
- data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
- data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
- data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +197 -0
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
- data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +79 -0
- data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +102 -0
- data/lib/canon/comparison/xml_comparator.rb +97 -684
- data/lib/canon/comparison/xml_node_comparison.rb +319 -0
- data/lib/canon/comparison/xml_parser.rb +19 -0
- data/lib/canon/comparison/yaml_comparator.rb +3 -3
- data/lib/canon/comparison.rb +265 -110
- data/lib/canon/diff/diff_classifier.rb +101 -2
- data/lib/canon/diff/diff_node.rb +32 -2
- data/lib/canon/diff/formatting_detector.rb +1 -1
- data/lib/canon/diff/node_serializer.rb +191 -0
- data/lib/canon/diff/path_builder.rb +143 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
- data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
- data/lib/canon/diff_formatter.rb +1 -1
- data/lib/canon/rspec_matchers.rb +38 -9
- data/lib/canon/tree_diff/operation_converter.rb +92 -338
- data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
- data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
- data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
- data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +24 -13
- metadata +48 -2
data/lib/canon/comparison.rb
CHANGED
|
@@ -7,11 +7,15 @@ require_relative "comparison/xml_comparator"
|
|
|
7
7
|
require_relative "comparison/html_comparator"
|
|
8
8
|
require_relative "comparison/json_comparator"
|
|
9
9
|
require_relative "comparison/yaml_comparator"
|
|
10
|
+
require_relative "comparison/profile_definition"
|
|
11
|
+
require_relative "comparison/format_detector"
|
|
12
|
+
require_relative "comparison/html_parser"
|
|
10
13
|
require_relative "diff/diff_node_mapper"
|
|
11
14
|
require_relative "diff/diff_line"
|
|
12
15
|
require_relative "diff/diff_block_builder"
|
|
13
16
|
require_relative "diff/diff_context_builder"
|
|
14
17
|
require_relative "diff/diff_report_builder"
|
|
18
|
+
require_relative "cache"
|
|
15
19
|
|
|
16
20
|
module Canon
|
|
17
21
|
# Comparison module for XML, HTML, JSON, and YAML documents
|
|
@@ -36,25 +40,36 @@ module Canon
|
|
|
36
40
|
# == Comparison Options
|
|
37
41
|
#
|
|
38
42
|
# Common options across all formats:
|
|
39
|
-
# -
|
|
40
|
-
#
|
|
41
|
-
#
|
|
42
|
-
# -
|
|
43
|
-
# - ignore_children: Skip child nodes (default: false)
|
|
43
|
+
# - profile: Comparison profile (Symbol for preset, Hash for custom)
|
|
44
|
+
# * Presets: :strict, :rendered, :html4, :html5, :spec_friendly, :content_only
|
|
45
|
+
# * Custom: { text_content: :normalize, comments: :ignore, ... }
|
|
46
|
+
# - diff_algorithm: Algorithm to use (:dom or :semantic, default: :dom)
|
|
44
47
|
# - verbose: Return detailed diff array (default: false)
|
|
45
48
|
#
|
|
46
49
|
# == Usage Examples
|
|
47
50
|
#
|
|
48
|
-
# # XML comparison
|
|
51
|
+
# # XML comparison with default profile
|
|
49
52
|
# Canon::Comparison.equivalent?(xml1, xml2)
|
|
50
|
-
# Canon::Comparison.equivalent?(xml1, xml2, verbose: true)
|
|
51
53
|
#
|
|
52
|
-
# #
|
|
53
|
-
# Canon::Comparison.equivalent?(
|
|
54
|
+
# # XML comparison with preset profile
|
|
55
|
+
# Canon::Comparison.equivalent?(xml1, xml2, profile: :strict)
|
|
56
|
+
# Canon::Comparison.equivalent?(xml1, xml2, profile: :spec_friendly)
|
|
54
57
|
#
|
|
55
|
-
# #
|
|
56
|
-
# Canon::Comparison.equivalent?(
|
|
57
|
-
#
|
|
58
|
+
# # HTML comparison with custom inline profile
|
|
59
|
+
# Canon::Comparison.equivalent?(html1, html2,
|
|
60
|
+
# profile: { text_content: :normalize, comments: :ignore })
|
|
61
|
+
#
|
|
62
|
+
# # Define and use a custom profile
|
|
63
|
+
# Canon::Comparison.define_profile(:my_custom) do
|
|
64
|
+
# text_content :normalize
|
|
65
|
+
# comments :ignore
|
|
66
|
+
# preprocessing :rendered
|
|
67
|
+
# end
|
|
68
|
+
# Canon::Comparison.equivalent?(doc1, doc2, profile: :my_custom)
|
|
69
|
+
#
|
|
70
|
+
# # JSON comparison with semantic tree diff
|
|
71
|
+
# Canon::Comparison.equivalent?(json1, json2,
|
|
72
|
+
# diff_algorithm: :semantic, profile: :spec_friendly)
|
|
58
73
|
#
|
|
59
74
|
# # With detailed output
|
|
60
75
|
# diffs = Canon::Comparison.equivalent?(doc1, doc2, verbose: true)
|
|
@@ -88,10 +103,11 @@ module Canon
|
|
|
88
103
|
UNEQUAL_TEXT_CONTENTS = 9
|
|
89
104
|
MISSING_HASH_KEY = 10
|
|
90
105
|
UNEQUAL_HASH_VALUES = 11
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
106
|
+
UNEQUAL_HASH_KEY_ORDER = 12
|
|
107
|
+
UNEQUAL_ARRAY_LENGTHS = 13
|
|
108
|
+
UNEQUAL_ARRAY_ELEMENTS = 14
|
|
109
|
+
UNEQUAL_TYPES = 15
|
|
110
|
+
UNEQUAL_PRIMITIVES = 16
|
|
95
111
|
|
|
96
112
|
class << self
|
|
97
113
|
# Auto-detect format and compare two objects
|
|
@@ -99,8 +115,10 @@ module Canon
|
|
|
99
115
|
# @param obj1 [Object] First object to compare
|
|
100
116
|
# @param obj2 [Object] Second object to compare
|
|
101
117
|
# @param opts [Hash] Comparison options
|
|
118
|
+
# - :profile - Profile to use (Symbol for preset, Hash for custom)
|
|
102
119
|
# - :format - Format hint (:xml, :html, :html4, :html5, :json, :yaml, :string)
|
|
103
120
|
# - :diff_algorithm - Algorithm to use (:dom or :semantic)
|
|
121
|
+
# - :verbose - Return detailed diff array (default: false)
|
|
104
122
|
# @return [Boolean, Array] true if equivalent, or array of diffs if verbose
|
|
105
123
|
def equivalent?(obj1, obj2, opts = {})
|
|
106
124
|
# Check if semantic tree diff is requested
|
|
@@ -113,6 +131,56 @@ module Canon
|
|
|
113
131
|
dom_diff(obj1, obj2, opts)
|
|
114
132
|
end
|
|
115
133
|
|
|
134
|
+
# Define a custom comparison profile with DSL syntax
|
|
135
|
+
#
|
|
136
|
+
# @param name [Symbol] Profile name
|
|
137
|
+
# @yield [ProfileDefinition] DSL block for defining profile
|
|
138
|
+
# @return [Symbol] Profile name
|
|
139
|
+
# @raise [ProfileError] if profile definition is invalid
|
|
140
|
+
#
|
|
141
|
+
# @example Define a custom profile
|
|
142
|
+
# Canon::Comparison.define_profile(:my_custom) do
|
|
143
|
+
# text_content :normalize
|
|
144
|
+
# comments :ignore
|
|
145
|
+
# preprocessing :rendered
|
|
146
|
+
# end
|
|
147
|
+
def define_profile(name, &block)
|
|
148
|
+
definition = ProfileDefinition.define(name, &block)
|
|
149
|
+
|
|
150
|
+
@custom_profiles ||= {}
|
|
151
|
+
@custom_profiles[name] = definition
|
|
152
|
+
|
|
153
|
+
name
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
# Load a profile (custom or preset)
|
|
157
|
+
#
|
|
158
|
+
# @param name [Symbol] Profile name
|
|
159
|
+
# @return [Hash] Profile settings
|
|
160
|
+
def load_profile(name)
|
|
161
|
+
# Check custom profiles first
|
|
162
|
+
if @custom_profiles&.key?(name)
|
|
163
|
+
return @custom_profiles[name].dup
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
# Fall back to presets - try Xml first (most common)
|
|
167
|
+
begin
|
|
168
|
+
MatchOptions::Xml.get_profile_options(name)
|
|
169
|
+
rescue Error
|
|
170
|
+
# Try other formats
|
|
171
|
+
MatchOptions::Json.get_profile_options(name)
|
|
172
|
+
end
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
# List all available profiles (custom + presets)
|
|
176
|
+
#
|
|
177
|
+
# @return [Array<Symbol>] Available profile names
|
|
178
|
+
def available_profiles
|
|
179
|
+
custom = @custom_profiles&.keys || []
|
|
180
|
+
presets = MatchOptions::Xml::MATCH_PROFILES.keys
|
|
181
|
+
(custom + presets).sort.uniq
|
|
182
|
+
end
|
|
183
|
+
|
|
116
184
|
private
|
|
117
185
|
|
|
118
186
|
# Perform semantic tree diff comparison
|
|
@@ -120,8 +188,8 @@ module Canon
|
|
|
120
188
|
require_relative "tree_diff"
|
|
121
189
|
|
|
122
190
|
# Detect format for both objects
|
|
123
|
-
format1 = opts[:format] ||
|
|
124
|
-
format2 = opts[:format] ||
|
|
191
|
+
format1 = opts[:format] || FormatDetector.detect(obj1)
|
|
192
|
+
format2 = opts[:format] || FormatDetector.detect(obj2)
|
|
125
193
|
|
|
126
194
|
# Handle string format (plain text comparison) - semantic tree doesn't support it
|
|
127
195
|
if format1 == :string
|
|
@@ -203,39 +271,141 @@ module Canon
|
|
|
203
271
|
# @param opts [Hash] User options
|
|
204
272
|
# @return [Hash] Resolved match options
|
|
205
273
|
def resolve_match_options(format, opts)
|
|
274
|
+
# Process unified profile parameter first
|
|
275
|
+
processed_opts = process_profile_parameter(opts)
|
|
276
|
+
|
|
206
277
|
case format
|
|
207
278
|
when :xml, :html, :html4, :html5
|
|
208
279
|
MatchOptions::Xml.resolve(
|
|
209
280
|
format: format,
|
|
210
|
-
match_profile:
|
|
211
|
-
match:
|
|
212
|
-
preprocessing:
|
|
213
|
-
global_profile:
|
|
214
|
-
global_options:
|
|
281
|
+
match_profile: processed_opts[:match_profile],
|
|
282
|
+
match: processed_opts[:match],
|
|
283
|
+
preprocessing: processed_opts[:preprocessing],
|
|
284
|
+
global_profile: processed_opts[:global_profile],
|
|
285
|
+
global_options: processed_opts[:global_options],
|
|
215
286
|
)
|
|
216
287
|
when :json
|
|
217
288
|
MatchOptions::Json.resolve(
|
|
218
289
|
format: format,
|
|
219
|
-
match_profile:
|
|
220
|
-
match:
|
|
221
|
-
preprocessing:
|
|
222
|
-
global_profile:
|
|
223
|
-
global_options:
|
|
290
|
+
match_profile: processed_opts[:match_profile],
|
|
291
|
+
match: processed_opts[:match],
|
|
292
|
+
preprocessing: processed_opts[:preprocessing],
|
|
293
|
+
global_profile: processed_opts[:global_profile],
|
|
294
|
+
global_options: processed_opts[:global_options],
|
|
224
295
|
)
|
|
225
296
|
when :yaml
|
|
226
297
|
MatchOptions::Yaml.resolve(
|
|
227
298
|
format: format,
|
|
228
|
-
match_profile:
|
|
229
|
-
match:
|
|
230
|
-
preprocessing:
|
|
231
|
-
global_profile:
|
|
232
|
-
global_options:
|
|
299
|
+
match_profile: processed_opts[:match_profile],
|
|
300
|
+
match: processed_opts[:match],
|
|
301
|
+
preprocessing: processed_opts[:preprocessing],
|
|
302
|
+
global_profile: processed_opts[:global_profile],
|
|
303
|
+
global_options: processed_opts[:global_options],
|
|
233
304
|
)
|
|
234
305
|
else
|
|
235
|
-
|
|
306
|
+
processed_opts[:match] || {}
|
|
307
|
+
end
|
|
308
|
+
end
|
|
309
|
+
|
|
310
|
+
# Process unified profile parameter
|
|
311
|
+
#
|
|
312
|
+
# Converts the new :profile parameter into the legacy format expected
|
|
313
|
+
# by MatchOptions resolvers. Handles:
|
|
314
|
+
# - Symbol → preset profile (uses :match_profile)
|
|
315
|
+
# - Hash → custom profile (validates and uses :match)
|
|
316
|
+
#
|
|
317
|
+
# @param opts [Hash] Original user options
|
|
318
|
+
# @return [Hash] Processed options with legacy format
|
|
319
|
+
def process_profile_parameter(opts)
|
|
320
|
+
processed = opts.dup
|
|
321
|
+
|
|
322
|
+
# Handle unified :profile parameter
|
|
323
|
+
if opts.key?(:profile)
|
|
324
|
+
profile = opts[:profile]
|
|
325
|
+
|
|
326
|
+
case profile
|
|
327
|
+
when Symbol
|
|
328
|
+
# Preset profile name
|
|
329
|
+
processed[:match_profile] = profile
|
|
330
|
+
when Hash
|
|
331
|
+
# Inline custom profile - validate and use as :match
|
|
332
|
+
validate_custom_profile!(profile, format_from_opts(opts))
|
|
333
|
+
processed[:match] = profile
|
|
334
|
+
else
|
|
335
|
+
raise Canon::Error,
|
|
336
|
+
"Invalid profile type: #{profile.class}. " \
|
|
337
|
+
"Expected Symbol (preset name) or Hash (custom profile)."
|
|
338
|
+
end
|
|
339
|
+
end
|
|
340
|
+
|
|
341
|
+
processed
|
|
342
|
+
end
|
|
343
|
+
|
|
344
|
+
# Validate custom profile hash
|
|
345
|
+
#
|
|
346
|
+
# Ensures all dimensions and behaviors in a custom profile are valid.
|
|
347
|
+
# Uses ProfileDefinition validation logic.
|
|
348
|
+
#
|
|
349
|
+
# @param profile [Hash] Custom profile hash
|
|
350
|
+
# @param format [Symbol] Format type for validation context
|
|
351
|
+
# @raise [Canon::Error] if profile contains invalid dimensions or behaviors
|
|
352
|
+
def validate_custom_profile!(profile, format)
|
|
353
|
+
profile.each do |dimension, behavior|
|
|
354
|
+
# Skip preprocessing and special options
|
|
355
|
+
next if dimension == :preprocessing
|
|
356
|
+
next if dimension == :semantic_diff
|
|
357
|
+
next if dimension == :similarity_threshold
|
|
358
|
+
|
|
359
|
+
# Validate dimension is known
|
|
360
|
+
valid_dimensions = valid_dimensions_for_format(format)
|
|
361
|
+
unless valid_dimensions.include?(dimension)
|
|
362
|
+
raise Canon::Error,
|
|
363
|
+
"Unknown dimension: #{dimension}. " \
|
|
364
|
+
"Valid dimensions for #{format}: #{valid_dimensions.join(', ')}"
|
|
365
|
+
end
|
|
366
|
+
|
|
367
|
+
# Validate behavior is allowed for this dimension
|
|
368
|
+
valid_behaviors = ProfileDefinition::DIMENSION_BEHAVIORS[dimension]
|
|
369
|
+
if valid_behaviors && !valid_behaviors.include?(behavior)
|
|
370
|
+
raise Canon::Error,
|
|
371
|
+
"Invalid behavior '#{behavior}' for dimension '#{dimension}'. " \
|
|
372
|
+
"Valid behaviors: #{valid_behaviors.join(', ')}"
|
|
373
|
+
end
|
|
374
|
+
|
|
375
|
+
# Validate behavior is in general MATCH_BEHAVIORS
|
|
376
|
+
unless MatchOptions::MATCH_BEHAVIORS.include?(behavior)
|
|
377
|
+
raise Canon::Error,
|
|
378
|
+
"Unknown match behavior: #{behavior}. " \
|
|
379
|
+
"Valid behaviors: #{MatchOptions::MATCH_BEHAVIORS.join(', ')}"
|
|
380
|
+
end
|
|
236
381
|
end
|
|
237
382
|
end
|
|
238
383
|
|
|
384
|
+
# Get valid dimensions for a format
|
|
385
|
+
#
|
|
386
|
+
# @param format [Symbol] Format type
|
|
387
|
+
# @return [Array<Symbol>] Valid dimensions for the format
|
|
388
|
+
def valid_dimensions_for_format(format)
|
|
389
|
+
case format
|
|
390
|
+
when :xml, :html, :html4, :html5
|
|
391
|
+
MatchOptions::Xml::MATCH_DIMENSIONS
|
|
392
|
+
when :json
|
|
393
|
+
MatchOptions::Json::MATCH_DIMENSIONS
|
|
394
|
+
when :yaml
|
|
395
|
+
MatchOptions::Yaml::MATCH_DIMENSIONS
|
|
396
|
+
else
|
|
397
|
+
[]
|
|
398
|
+
end
|
|
399
|
+
end
|
|
400
|
+
|
|
401
|
+
# Helper to extract format from opts for validation
|
|
402
|
+
#
|
|
403
|
+
# @param opts [Hash] User options
|
|
404
|
+
# @return [Symbol] Format type or :xml as default
|
|
405
|
+
def format_from_opts(opts)
|
|
406
|
+
opts[:format] || :xml
|
|
407
|
+
end
|
|
408
|
+
|
|
239
409
|
# Parse documents using comparator's parse logic (reuses preprocessing)
|
|
240
410
|
#
|
|
241
411
|
# @param obj1 [Object] First object
|
|
@@ -250,32 +420,66 @@ module Canon
|
|
|
250
420
|
when :xml
|
|
251
421
|
# Delegate to XmlComparator's parse_node - returns Canon::Xml::Node
|
|
252
422
|
# Adapter now handles Canon::Xml::Node directly
|
|
253
|
-
doc1 =
|
|
254
|
-
|
|
423
|
+
doc1 = parse_with_cache(obj1, format, preprocessing) do |doc|
|
|
424
|
+
XmlComparator.send(:parse_node, doc, preprocessing)
|
|
425
|
+
end
|
|
426
|
+
doc2 = parse_with_cache(obj2, format, preprocessing) do |doc|
|
|
427
|
+
XmlComparator.send(:parse_node, doc, preprocessing)
|
|
428
|
+
end
|
|
255
429
|
[doc1, doc2]
|
|
256
430
|
when :html, :html4, :html5
|
|
257
431
|
# Delegate to HtmlComparator's parse_node_for_semantic for Canon::Xml::Node
|
|
258
432
|
[
|
|
259
|
-
|
|
260
|
-
|
|
433
|
+
parse_with_cache(obj1, format, preprocessing) do |doc|
|
|
434
|
+
HtmlComparator.send(:parse_node_for_semantic, doc, preprocessing)
|
|
435
|
+
end,
|
|
436
|
+
parse_with_cache(obj2, format, preprocessing) do |doc|
|
|
437
|
+
HtmlComparator.send(:parse_node_for_semantic, doc, preprocessing)
|
|
438
|
+
end,
|
|
261
439
|
]
|
|
262
440
|
when :json
|
|
263
441
|
# Delegate to JsonComparator's parse_json
|
|
264
442
|
[
|
|
265
|
-
|
|
266
|
-
|
|
443
|
+
parse_with_cache(obj1, format, :none) do |doc|
|
|
444
|
+
JsonComparator.send(:parse_json, doc)
|
|
445
|
+
end,
|
|
446
|
+
parse_with_cache(obj2, format, :none) do |doc|
|
|
447
|
+
JsonComparator.send(:parse_json, doc)
|
|
448
|
+
end,
|
|
267
449
|
]
|
|
268
450
|
when :yaml
|
|
269
451
|
# Delegate to YamlComparator's parse_yaml
|
|
270
452
|
[
|
|
271
|
-
|
|
272
|
-
|
|
453
|
+
parse_with_cache(obj1, format, :none) do |doc|
|
|
454
|
+
YamlComparator.send(:parse_yaml, doc)
|
|
455
|
+
end,
|
|
456
|
+
parse_with_cache(obj2, format, :none) do |doc|
|
|
457
|
+
YamlComparator.send(:parse_yaml, doc)
|
|
458
|
+
end,
|
|
273
459
|
]
|
|
274
460
|
else
|
|
275
461
|
[obj1, obj2]
|
|
276
462
|
end
|
|
277
463
|
end
|
|
278
464
|
|
|
465
|
+
# Parse a document with caching
|
|
466
|
+
#
|
|
467
|
+
# @param doc [Object] Document to parse (string or already parsed)
|
|
468
|
+
# @param format [Symbol] Document format
|
|
469
|
+
# @param preprocessing [Symbol] Preprocessing option
|
|
470
|
+
# @yield Block to parse the document if not cached
|
|
471
|
+
# @return [Object] Parsed document
|
|
472
|
+
def parse_with_cache(doc, format, preprocessing)
|
|
473
|
+
# If already a parsed node, return as-is
|
|
474
|
+
return doc unless doc.is_a?(String)
|
|
475
|
+
|
|
476
|
+
# Use cache for string documents
|
|
477
|
+
Cache.fetch(:document_parse,
|
|
478
|
+
Cache.key_for_document(doc, format, preprocessing)) do
|
|
479
|
+
yield doc
|
|
480
|
+
end
|
|
481
|
+
end
|
|
482
|
+
|
|
279
483
|
# Normalize format for TreeDiff (html4/html5 -> html)
|
|
280
484
|
#
|
|
281
485
|
# @param format [Symbol] Original format
|
|
@@ -314,14 +518,14 @@ module Canon
|
|
|
314
518
|
format1 = format2 = opts[:format]
|
|
315
519
|
# Parse HTML strings if format is html/html4/html5
|
|
316
520
|
if %i[html html4 html5].include?(opts[:format])
|
|
317
|
-
obj1 =
|
|
318
|
-
obj2 =
|
|
319
|
-
#
|
|
320
|
-
|
|
521
|
+
obj1 = HtmlParser.parse(obj1, opts[:format]) if obj1.is_a?(String)
|
|
522
|
+
obj2 = HtmlParser.parse(obj2, opts[:format]) if obj2.is_a?(String)
|
|
523
|
+
# Note: We preserve html4/html5 format instead of normalizing to :html
|
|
524
|
+
# This allows HtmlComparator to use the correct parsing behavior
|
|
321
525
|
end
|
|
322
526
|
else
|
|
323
|
-
format1 =
|
|
324
|
-
format2 =
|
|
527
|
+
format1 = FormatDetector.detect(obj1)
|
|
528
|
+
format2 = FormatDetector.detect(obj2)
|
|
325
529
|
end
|
|
326
530
|
|
|
327
531
|
# Handle string format (plain text comparison)
|
|
@@ -357,7 +561,7 @@ module Canon
|
|
|
357
561
|
case comparison_format
|
|
358
562
|
when :xml
|
|
359
563
|
XmlComparator.equivalent?(obj1, obj2, opts)
|
|
360
|
-
when :html
|
|
564
|
+
when :html, :html4, :html5
|
|
361
565
|
HtmlComparator.equivalent?(obj1, obj2, opts)
|
|
362
566
|
when :json
|
|
363
567
|
JsonComparator.equivalent?(obj1, obj2, opts)
|
|
@@ -366,78 +570,29 @@ module Canon
|
|
|
366
570
|
end
|
|
367
571
|
end
|
|
368
572
|
|
|
369
|
-
#
|
|
370
|
-
#
|
|
371
|
-
# @param content [String, Object] Content to parse (returns as-is if not a string)
|
|
372
|
-
# @param format [Symbol] HTML format (:html, :html4, :html5)
|
|
373
|
-
# @return [Nokogiri::HTML::Document, Nokogiri::HTML5::Document, Nokogiri::HTML::DocumentFragment, Object]
|
|
374
|
-
def parse_html(content, _format)
|
|
375
|
-
return content unless content.is_a?(String)
|
|
376
|
-
return content if content.is_a?(Nokogiri::HTML::Document) ||
|
|
377
|
-
content.is_a?(Nokogiri::HTML5::Document) ||
|
|
378
|
-
content.is_a?(Nokogiri::XML::Document) ||
|
|
379
|
-
content.is_a?(Nokogiri::HTML::DocumentFragment) ||
|
|
380
|
-
content.is_a?(Nokogiri::HTML5::DocumentFragment) ||
|
|
381
|
-
content.is_a?(Nokogiri::XML::DocumentFragment)
|
|
382
|
-
|
|
383
|
-
# Let HtmlComparator's parse_node handle parsing with preprocessing
|
|
384
|
-
# For now, just return the string and let it be parsed by HtmlComparator
|
|
385
|
-
content
|
|
386
|
-
rescue StandardError
|
|
387
|
-
content
|
|
388
|
-
end
|
|
389
|
-
|
|
390
|
-
# Detect the format of an object
|
|
573
|
+
# Detect the format of an object (delegates to FormatDetector)
|
|
391
574
|
#
|
|
392
575
|
# @param obj [Object] Object to detect format of
|
|
393
576
|
# @return [Symbol] Format type
|
|
394
577
|
def detect_format(obj)
|
|
395
|
-
|
|
396
|
-
when Moxml::Node, Moxml::Document
|
|
397
|
-
:xml
|
|
398
|
-
when Nokogiri::HTML::DocumentFragment, Nokogiri::HTML5::DocumentFragment
|
|
399
|
-
# HTML DocumentFragments
|
|
400
|
-
:html
|
|
401
|
-
when Nokogiri::XML::DocumentFragment
|
|
402
|
-
# XML DocumentFragments - check if it's actually HTML
|
|
403
|
-
obj.document&.html? ? :html : :xml
|
|
404
|
-
when Nokogiri::XML::Document, Nokogiri::XML::Node
|
|
405
|
-
# Check if it's HTML by looking at the document type
|
|
406
|
-
obj.html? ? :html : :xml
|
|
407
|
-
when Nokogiri::HTML::Document, Nokogiri::HTML5::Document
|
|
408
|
-
:html
|
|
409
|
-
when String
|
|
410
|
-
detect_string_format(obj)
|
|
411
|
-
when Hash, Array
|
|
412
|
-
# Raw Ruby objects (from parsed JSON/YAML)
|
|
413
|
-
:ruby_object
|
|
414
|
-
else
|
|
415
|
-
raise Canon::Error, "Unknown format for object: #{obj.class}"
|
|
416
|
-
end
|
|
578
|
+
FormatDetector.detect(obj)
|
|
417
579
|
end
|
|
418
580
|
|
|
419
|
-
# Detect the format of a string
|
|
581
|
+
# Detect the format of a string (delegates to FormatDetector)
|
|
420
582
|
#
|
|
421
583
|
# @param str [String] String to detect format of
|
|
422
584
|
# @return [Symbol] Format type
|
|
423
585
|
def detect_string_format(str)
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
# YAML indicators
|
|
427
|
-
return :yaml if trimmed.start_with?("---")
|
|
428
|
-
return :yaml if trimmed.match?(/^[a-zA-Z_]\w*:\s/)
|
|
429
|
-
|
|
430
|
-
# JSON indicators
|
|
431
|
-
return :json if trimmed.start_with?("{", "[")
|
|
432
|
-
|
|
433
|
-
# HTML indicators
|
|
434
|
-
return :html if trimmed.start_with?("<!DOCTYPE html", "<html", "<HTML")
|
|
435
|
-
|
|
436
|
-
# XML indicators - must start with < and end with >
|
|
437
|
-
return :xml if trimmed.start_with?("<") && trimmed.end_with?(">")
|
|
586
|
+
FormatDetector.detect_string(str)
|
|
587
|
+
end
|
|
438
588
|
|
|
439
|
-
|
|
440
|
-
|
|
589
|
+
# Parse HTML string into Nokogiri document (delegates to HtmlParser)
|
|
590
|
+
#
|
|
591
|
+
# @param content [String, Object] Content to parse
|
|
592
|
+
# @param format [Symbol] HTML format (:html, :html4, :html5)
|
|
593
|
+
# @return [Object] Parsed document
|
|
594
|
+
def parse_html(content, format)
|
|
595
|
+
HtmlParser.parse(content, format)
|
|
441
596
|
end
|
|
442
597
|
end
|
|
443
598
|
end
|
|
data/lib/canon/diff/diff_classifier.rb
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require_relative "formatting_detector"
|
|
4
4
|
require_relative "../comparison/compare_profile"
|
|
5
|
+
require_relative "../comparison/whitespace_sensitivity"
|
|
5
6
|
|
|
6
7
|
module Canon
|
|
7
8
|
module Diff
|
|
@@ -28,6 +29,28 @@ module Canon
|
|
|
28
29
|
# @param diff_node [DiffNode] The diff node to classify
|
|
29
30
|
# @return [DiffNode] The same diff node with normative/formatting attributes set
|
|
30
31
|
def classify(diff_node)
|
|
32
|
+
# SPECIAL CASE: text_content with :normalize behavior
|
|
33
|
+
# When text_content is :normalize and the difference is formatting-only,
|
|
34
|
+
# it should be marked as non-normative (informative)
|
|
35
|
+
# This ensures that verbose and non-verbose modes give consistent results
|
|
36
|
+
#
|
|
37
|
+
# EXCEPTION: If the text node is inside a whitespace-sensitive element
|
|
38
|
+
# (like <pre>, <code>, <textarea> in HTML), don't apply formatting detection
|
|
39
|
+
# because whitespace should be preserved in these elements
|
|
40
|
+
#
|
|
41
|
+
# This check must come FIRST, before normative_dimension? is called,
|
|
42
|
+
# because normative_dimension? returns true for text_content: :normalize
|
|
43
|
+
# (since the dimension affects equivalence), which would prevent formatting
|
|
44
|
+
# detection from being applied.
|
|
45
|
+
if diff_node.dimension == :text_content &&
|
|
46
|
+
profile.send(:behavior_for, :text_content) == :normalize &&
|
|
47
|
+
!inside_whitespace_sensitive_element?(diff_node) &&
|
|
48
|
+
formatting_only_diff?(diff_node)
|
|
49
|
+
diff_node.formatting = true
|
|
50
|
+
diff_node.normative = false
|
|
51
|
+
return diff_node
|
|
52
|
+
end
|
|
53
|
+
|
|
31
54
|
# FIRST: Determine if this dimension is normative based on CompareProfile
|
|
32
55
|
# This respects the policy settings (strict/normalize/ignore)
|
|
33
56
|
is_normative = profile.normative_dimension?(diff_node.dimension)
|
|
@@ -45,7 +68,7 @@ module Canon
|
|
|
45
68
|
return diff_node
|
|
46
69
|
end
|
|
47
70
|
|
|
48
|
-
#
|
|
71
|
+
# THIRD: Apply the normative determination from CompareProfile
|
|
49
72
|
diff_node.formatting = false
|
|
50
73
|
diff_node.normative = is_normative
|
|
51
74
|
|
|
@@ -65,10 +88,86 @@ module Canon
|
|
|
65
88
|
# @param diff_node [DiffNode] The diff node to check
|
|
66
89
|
# @return [Boolean] true if formatting-only
|
|
67
90
|
def formatting_only_diff?(diff_node)
|
|
91
|
+
# Only apply formatting detection to actual text content differences
|
|
92
|
+
# If the nodes are not text nodes (e.g., element nodes), don't apply formatting detection
|
|
93
|
+
node1 = diff_node.node1
|
|
94
|
+
node2 = diff_node.node2
|
|
95
|
+
|
|
96
|
+
# Check if both nodes are text nodes
|
|
97
|
+
# If not, this is not a formatting-only difference
|
|
98
|
+
return false unless text_node?(node1) && text_node?(node2)
|
|
99
|
+
|
|
68
100
|
text1 = extract_text_content(diff_node.node1)
|
|
69
101
|
text2 = extract_text_content(diff_node.node2)
|
|
70
102
|
|
|
71
|
-
|
|
103
|
+
# For text_content dimension, use normalized text comparison
|
|
104
|
+
# This handles cases like "" vs " " (both normalize to "")
|
|
105
|
+
if diff_node.dimension == :text_content
|
|
106
|
+
normalized_equivalent?(text1, text2)
|
|
107
|
+
else
|
|
108
|
+
FormattingDetector.formatting_only?(text1, text2)
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
# Check if two texts are equivalent after normalization
|
|
113
|
+
# This detects formatting-only differences where normalized texts match
|
|
114
|
+
# @param text1 [String, nil] First text
|
|
115
|
+
# @param text2 [String, nil] Second text
|
|
116
|
+
# @return [Boolean] true if normalized texts are equivalent
|
|
117
|
+
def normalized_equivalent?(text1, text2)
|
|
118
|
+
return false if text1.nil? && text2.nil?
|
|
119
|
+
return false if text1.nil? || text2.nil?
|
|
120
|
+
|
|
121
|
+
# Use MatchOptions.normalize_text for consistency
|
|
122
|
+
normalized1 = Canon::Comparison::MatchOptions.normalize_text(text1)
|
|
123
|
+
normalized2 = Canon::Comparison::MatchOptions.normalize_text(text2)
|
|
124
|
+
|
|
125
|
+
# If normalized texts are equivalent but originals are different,
|
|
126
|
+
# it's a formatting-only difference
|
|
127
|
+
normalized1 == normalized2 && text1 != text2
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
# Check if a node is a text node
|
|
131
|
+
# @param node [Object] The node to check
|
|
132
|
+
# @return [Boolean] true if the node is a text node
|
|
133
|
+
def text_node?(node)
|
|
134
|
+
return false if node.nil?
|
|
135
|
+
|
|
136
|
+
# Canon::Xml::Nodes::TextNode
|
|
137
|
+
return true if node.is_a?(Canon::Xml::Nodes::TextNode)
|
|
138
|
+
|
|
139
|
+
# Nokogiri text nodes (node_type returns integer constant like 3)
|
|
140
|
+
return true if node.respond_to?(:node_type) &&
|
|
141
|
+
node.node_type.is_a?(Integer) &&
|
|
142
|
+
node.node_type == Nokogiri::XML::Node::TEXT_NODE
|
|
143
|
+
|
|
144
|
+
# Moxml text nodes (node_type returns symbol)
|
|
145
|
+
return true if node.respond_to?(:node_type) && node.node_type == :text
|
|
146
|
+
|
|
147
|
+
# String
|
|
148
|
+
return true if node.is_a?(String)
|
|
149
|
+
|
|
150
|
+
# Test doubles or objects with text node-like interface
|
|
151
|
+
# Check if it has a value method (contains text content)
|
|
152
|
+
return true if node.respond_to?(:value)
|
|
153
|
+
|
|
154
|
+
false
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
# Check if the text node is inside a whitespace-sensitive element
|
|
158
|
+
# @param diff_node [DiffNode] The diff node to check
|
|
159
|
+
# @return [Boolean] true if inside a whitespace-sensitive element
|
|
160
|
+
def inside_whitespace_sensitive_element?(diff_node)
|
|
161
|
+
# Get the text node (not the parent element)
|
|
162
|
+
node = diff_node.node1 || diff_node.node2
|
|
163
|
+
return false unless node
|
|
164
|
+
|
|
165
|
+
# WhitespaceSensitivity.element_sensitive? expects a text node
|
|
166
|
+
# and checks its parent element
|
|
167
|
+
# We need to pass the full options structure with :match_opts key
|
|
168
|
+
opts = { match_opts: @match_options.options }
|
|
169
|
+
|
|
170
|
+
Canon::Comparison::WhitespaceSensitivity.element_sensitive?(node, opts)
|
|
72
171
|
end
|
|
73
172
|
|
|
74
173
|
# Extract text content from a node for formatting comparison
|