canon 0.2.11 → 0.2.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +12 -22
- data/Rakefile +5 -2
- data/lib/canon/cache.rb +3 -1
- data/lib/canon/cli.rb +0 -3
- data/lib/canon/commands/diff_command.rb +0 -6
- data/lib/canon/commands/format_command.rb +0 -4
- data/lib/canon/commands.rb +9 -0
- data/lib/canon/comparison/child_realignment.rb +0 -2
- data/lib/canon/comparison/compare_profile.rb +30 -36
- data/lib/canon/comparison/comparison_result.rb +0 -2
- data/lib/canon/comparison/diff_node_builder.rb +353 -0
- data/lib/canon/comparison/dimensions/dimension.rb +51 -0
- data/lib/canon/comparison/dimensions/dimension_set.rb +49 -0
- data/lib/canon/comparison/dimensions/registry.rb +101 -60
- data/lib/canon/comparison/dimensions.rb +15 -46
- data/lib/canon/comparison/html_comparator.rb +18 -141
- data/lib/canon/comparison/html_compare_profile.rb +15 -18
- data/lib/canon/comparison/json_comparator.rb +4 -165
- data/lib/canon/comparison/json_parser.rb +0 -2
- data/lib/canon/comparison/markup_comparator.rb +14 -210
- data/lib/canon/comparison/match_options/base_resolver.rb +18 -29
- data/lib/canon/comparison/match_options/json_resolver.rb +4 -28
- data/lib/canon/comparison/match_options/xml_resolver.rb +4 -45
- data/lib/canon/comparison/match_options/yaml_resolver.rb +4 -30
- data/lib/canon/comparison/match_options.rb +13 -88
- data/lib/canon/comparison/pipeline.rb +269 -0
- data/lib/canon/comparison/profile_definition.rb +0 -2
- data/lib/canon/comparison/ruby_object_comparator.rb +1 -1
- data/lib/canon/comparison/strategies/match_strategy_factory.rb +9 -58
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +4 -11
- data/lib/canon/comparison/strategies.rb +16 -0
- data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +0 -3
- data/lib/canon/comparison/xml_comparator/attribute_filter.rb +0 -3
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +0 -6
- data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +1 -6
- data/lib/canon/comparison/xml_comparator/node_parser.rb +0 -4
- data/lib/canon/comparison/xml_comparator.rb +4 -492
- data/lib/canon/comparison/xml_comparator_helpers.rb +21 -0
- data/lib/canon/comparison/xml_node_comparison.rb +4 -119
- data/lib/canon/comparison/yaml_comparator.rb +0 -3
- data/lib/canon/comparison.rb +143 -266
- data/lib/canon/config/config_dsl.rb +159 -0
- data/lib/canon/config/env_provider.rb +0 -3
- data/lib/canon/config/env_schema.rb +48 -58
- data/lib/canon/config/profile_loader.rb +0 -1
- data/lib/canon/config.rb +116 -468
- data/lib/canon/diff/diff_block_builder.rb +0 -2
- data/lib/canon/diff/diff_classifier.rb +0 -5
- data/lib/canon/diff/diff_context.rb +0 -2
- data/lib/canon/diff/diff_context_builder.rb +0 -2
- data/lib/canon/diff/diff_line_builder.rb +0 -3
- data/lib/canon/diff/diff_node_enricher.rb +0 -4
- data/lib/canon/diff/diff_node_mapper.rb +0 -4
- data/lib/canon/diff/diff_report_builder.rb +0 -4
- data/lib/canon/diff/formatting_detector.rb +0 -1
- data/lib/canon/diff/node_serializer.rb +0 -7
- data/lib/canon/diff.rb +39 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +4 -17
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +7 -19
- data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -3
- data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -3
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +7 -26
- data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -3
- data/lib/canon/diff_formatter/by_object/base_formatter.rb +8 -15
- data/lib/canon/diff_formatter/by_object/json_formatter.rb +0 -2
- data/lib/canon/diff_formatter/by_object/xml_formatter.rb +0 -2
- data/lib/canon/diff_formatter/by_object/yaml_formatter.rb +0 -2
- data/lib/canon/diff_formatter/debug_output.rb +0 -2
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +24 -58
- data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +0 -2
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +1 -2
- data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +1 -7
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +0 -7
- data/lib/canon/diff_formatter/diff_detail_formatter_helpers.rb +23 -0
- data/lib/canon/diff_formatter.rb +11 -9
- data/lib/canon/formatters/html4_formatter.rb +0 -2
- data/lib/canon/formatters/html5_formatter.rb +0 -2
- data/lib/canon/formatters/html_formatter.rb +0 -3
- data/lib/canon/formatters/json_formatter.rb +0 -1
- data/lib/canon/formatters/xml_formatter.rb +0 -4
- data/lib/canon/formatters/yaml_formatter.rb +0 -1
- data/lib/canon/formatters.rb +16 -0
- data/lib/canon/html/data_model.rb +0 -10
- data/lib/canon/html.rb +4 -3
- data/lib/canon/options/cli_generator.rb +0 -2
- data/lib/canon/options/registry.rb +0 -2
- data/lib/canon/options.rb +9 -0
- data/lib/canon/pretty_printer/html.rb +0 -1
- data/lib/canon/pretty_printer/xml_normalized.rb +0 -2
- data/lib/canon/pretty_printer.rb +12 -0
- data/lib/canon/tree_diff/adapters/html_adapter.rb +1 -1
- data/lib/canon/tree_diff/adapters.rb +14 -0
- data/lib/canon/tree_diff/core/attribute_comparator.rb +0 -6
- data/lib/canon/tree_diff/core/node_signature.rb +1 -1
- data/lib/canon/tree_diff/core/tree_node.rb +12 -5
- data/lib/canon/tree_diff/core.rb +17 -0
- data/lib/canon/tree_diff/matchers/hash_matcher.rb +0 -7
- data/lib/canon/tree_diff/matchers/similarity_matcher.rb +1 -5
- data/lib/canon/tree_diff/matchers/structural_propagator.rb +1 -5
- data/lib/canon/tree_diff/matchers.rb +15 -0
- data/lib/canon/tree_diff/operation_converter.rb +0 -8
- data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +2 -12
- data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +13 -7
- data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +2 -2
- data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +4 -6
- data/lib/canon/tree_diff/operation_converter_helpers.rb +18 -0
- data/lib/canon/tree_diff/operations/operation_detector.rb +2 -5
- data/lib/canon/tree_diff/operations.rb +13 -0
- data/lib/canon/tree_diff.rb +26 -27
- data/lib/canon/validators/base_validator.rb +0 -2
- data/lib/canon/validators/html_validator.rb +0 -1
- data/lib/canon/validators/json_validator.rb +0 -1
- data/lib/canon/validators/xml_validator.rb +0 -1
- data/lib/canon/validators/yaml_validator.rb +0 -1
- data/lib/canon/validators.rb +12 -0
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/c14n.rb +0 -4
- data/lib/canon/xml/data_model.rb +0 -10
- data/lib/canon/xml/line_range_mapper.rb +0 -2
- data/lib/canon/xml/nodes/attribute_node.rb +0 -2
- data/lib/canon/xml/nodes/comment_node.rb +0 -2
- data/lib/canon/xml/nodes/element_node.rb +0 -2
- data/lib/canon/xml/nodes/namespace_node.rb +0 -2
- data/lib/canon/xml/nodes/processing_instruction_node.rb +0 -2
- data/lib/canon/xml/nodes/root_node.rb +0 -2
- data/lib/canon/xml/nodes/text_node.rb +0 -2
- data/lib/canon/xml/nodes.rb +19 -0
- data/lib/canon/xml/processor.rb +0 -5
- data/lib/canon/xml/sax_builder.rb +0 -7
- data/lib/canon/xml.rb +33 -0
- data/lib/canon/xml_backend.rb +50 -14
- data/lib/canon/xml_parsing.rb +4 -2
- data/lib/canon.rb +25 -15
- data/lib/tasks/performance.rake +0 -58
- data/lib/tasks/performance_comparator.rb +132 -65
- data/lib/tasks/performance_helpers.rb +4 -249
- data/lib/tasks/performance_report.rb +309 -0
- metadata +24 -11
- data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +0 -64
- data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +0 -64
- data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +0 -167
- data/lib/canon/comparison/dimensions/base_dimension.rb +0 -107
- data/lib/canon/comparison/dimensions/comments_dimension.rb +0 -117
- data/lib/canon/comparison/dimensions/element_position_dimension.rb +0 -86
- data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +0 -115
- data/lib/canon/comparison/dimensions/text_content_dimension.rb +0 -102
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +0 -300
data/lib/canon/comparison.rb
CHANGED
|
@@ -2,21 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
require "moxml"
|
|
4
4
|
require "nokogiri" if Canon::XmlBackend.nokogiri?
|
|
5
|
-
require_relative "xml/whitespace_normalizer"
|
|
6
|
-
require_relative "comparison/xml_comparator"
|
|
7
|
-
require_relative "comparison/html_comparator"
|
|
8
|
-
require_relative "comparison/json_comparator"
|
|
9
|
-
require_relative "comparison/yaml_comparator"
|
|
10
|
-
require_relative "errors"
|
|
11
|
-
require_relative "comparison/profile_definition"
|
|
12
|
-
require_relative "comparison/format_detector"
|
|
13
|
-
require_relative "comparison/html_parser"
|
|
14
|
-
require_relative "diff/diff_node_mapper"
|
|
15
|
-
require_relative "diff/diff_line"
|
|
16
|
-
require_relative "diff/diff_block_builder"
|
|
17
|
-
require_relative "diff/diff_context_builder"
|
|
18
|
-
require_relative "diff/diff_report_builder"
|
|
19
|
-
require_relative "cache"
|
|
20
5
|
|
|
21
6
|
module Canon
|
|
22
7
|
# Comparison module for XML, HTML, JSON, and YAML documents
|
|
@@ -104,7 +89,31 @@ module Canon
|
|
|
104
89
|
# - diff_code: Type of difference
|
|
105
90
|
#
|
|
106
91
|
module Comparison
|
|
92
|
+
autoload :BaseComparator, "canon/comparison/base_comparator"
|
|
107
93
|
autoload :ChildRealignment, "canon/comparison/child_realignment"
|
|
94
|
+
autoload :CompareProfile, "canon/comparison/compare_profile"
|
|
95
|
+
autoload :ComparisonResult, "canon/comparison/comparison_result"
|
|
96
|
+
autoload :DiffNodeBuilder, "canon/comparison/diff_node_builder"
|
|
97
|
+
autoload :Dimensions, "canon/comparison/dimensions"
|
|
98
|
+
autoload :FormatDetector, "canon/comparison/format_detector"
|
|
99
|
+
autoload :HtmlComparator, "canon/comparison/html_comparator"
|
|
100
|
+
autoload :HtmlCompareProfile, "canon/comparison/html_compare_profile"
|
|
101
|
+
autoload :HtmlParser, "canon/comparison/html_parser"
|
|
102
|
+
autoload :JsonComparator, "canon/comparison/json_comparator"
|
|
103
|
+
autoload :JsonParser, "canon/comparison/json_parser"
|
|
104
|
+
autoload :MarkupComparator, "canon/comparison/markup_comparator"
|
|
105
|
+
autoload :MatchOptions, "canon/comparison/match_options"
|
|
106
|
+
autoload :NodeInspector, "canon/comparison/node_inspector"
|
|
107
|
+
autoload :Pipeline, "canon/comparison/pipeline"
|
|
108
|
+
autoload :ProfileDefinition, "canon/comparison/profile_definition"
|
|
109
|
+
autoload :RubyObjectComparator, "canon/comparison/ruby_object_comparator"
|
|
110
|
+
autoload :Strategies, "canon/comparison/strategies"
|
|
111
|
+
autoload :WhitespaceSensitivity, "canon/comparison/whitespace_sensitivity"
|
|
112
|
+
autoload :XmlComparator, "canon/comparison/xml_comparator"
|
|
113
|
+
autoload :XmlComparatorHelpers, "canon/comparison/xml_comparator_helpers"
|
|
114
|
+
autoload :XmlNodeComparison, "canon/comparison/xml_node_comparison"
|
|
115
|
+
autoload :XmlParser, "canon/comparison/xml_parser"
|
|
116
|
+
autoload :YamlComparator, "canon/comparison/yaml_comparator"
|
|
108
117
|
|
|
109
118
|
# Comparison result constants
|
|
110
119
|
EQUIVALENT = 1
|
|
@@ -124,6 +133,32 @@ module Canon
|
|
|
124
133
|
UNEQUAL_TYPES = 15
|
|
125
134
|
UNEQUAL_PRIMITIVES = 16
|
|
126
135
|
|
|
136
|
+
# Keys that OperationConverter and SemanticTreeMatchStrategy accept.
|
|
137
|
+
# Used to strip diff-only keys (e.g. +max_node_count+) from the
|
|
138
|
+
# fully-resolved match options hash before passing it to components
|
|
139
|
+
# that expect match options only.
|
|
140
|
+
MATCH_OPTION_KEYS = %i[
|
|
141
|
+
match_profile
|
|
142
|
+
match
|
|
143
|
+
preprocessing
|
|
144
|
+
text_content
|
|
145
|
+
structural_whitespace
|
|
146
|
+
attribute_presence
|
|
147
|
+
attribute_order
|
|
148
|
+
attribute_values
|
|
149
|
+
element_position
|
|
150
|
+
comments
|
|
151
|
+
format
|
|
152
|
+
similarity_threshold
|
|
153
|
+
hash_matching
|
|
154
|
+
similarity_matching
|
|
155
|
+
propagation
|
|
156
|
+
preserve_whitespace_elements
|
|
157
|
+
collapse_whitespace_elements
|
|
158
|
+
strip_whitespace_elements
|
|
159
|
+
respect_xml_space
|
|
160
|
+
].freeze
|
|
161
|
+
|
|
127
162
|
# Human-readable labels for the integer comparison-result constants
|
|
128
163
|
# above. Used by the diff reason builders so user-facing reason text
|
|
129
164
|
# never leaks raw numeric codes (e.g. "7 vs 7" — see lutaml/canon#127).
|
|
@@ -195,13 +230,17 @@ module Canon
|
|
|
195
230
|
# - :verbose - Return detailed diff array (default: false)
|
|
196
231
|
# @return [Boolean, Array] true if equivalent, or array of diffs if verbose
|
|
197
232
|
def equivalent?(obj1, obj2, opts = {})
|
|
198
|
-
#
|
|
199
|
-
|
|
233
|
+
# Normalize: match: { semantic_diff: true } → diff_algorithm: :semantic
|
|
234
|
+
if opts.dig(:match, :semantic_diff) || opts.dig(:match, :semantic_tree)
|
|
235
|
+
opts = opts.merge(diff_algorithm: :semantic)
|
|
236
|
+
opts = opts.merge(match: opts[:match].except(:semantic_diff,
|
|
237
|
+
:semantic_tree))
|
|
238
|
+
end
|
|
239
|
+
|
|
200
240
|
if %i[semantic semantic_tree].include?(opts[:diff_algorithm])
|
|
201
241
|
return semantic_diff(obj1, obj2, opts)
|
|
202
242
|
end
|
|
203
243
|
|
|
204
|
-
# Otherwise use DOM-based comparison (default)
|
|
205
244
|
dom_diff(obj1, obj2, opts)
|
|
206
245
|
end
|
|
207
246
|
|
|
@@ -288,113 +327,90 @@ module Canon
|
|
|
288
327
|
|
|
289
328
|
# Perform semantic tree diff comparison
|
|
290
329
|
def semantic_diff(obj1, obj2, opts = {})
|
|
291
|
-
|
|
330
|
+
resolved = opts.dup
|
|
331
|
+
format_hint = resolved[:format]
|
|
292
332
|
|
|
293
|
-
# Capture original strings BEFORE any parsing/transformation
|
|
294
|
-
# These are used for display to preserve original formatting
|
|
295
|
-
|
|
296
|
-
original_str1 = extract_original_string(obj1, format_hint)
|
|
297
|
-
original_str2 = extract_original_string(obj2, format_hint)
|
|
333
|
+
# Capture original strings BEFORE any parsing/transformation.
|
|
334
|
+
# These are used for display to preserve original formatting.
|
|
335
|
+
original_str1, original_str2 = Pipeline.capture_originals(obj1, obj2)
|
|
298
336
|
|
|
299
|
-
# Detect format for both objects
|
|
300
|
-
format1 = opts[:format] || FormatDetector.detect(obj1)
|
|
301
|
-
format2 = opts[:format] || FormatDetector.detect(obj2)
|
|
337
|
+
# Detect format for both objects.
|
|
338
|
+
format1, format2 = Pipeline.detect_formats(obj1, obj2, format_hint)
|
|
302
339
|
|
|
303
|
-
#
|
|
340
|
+
# Semantic tree doesn't support plain-string comparison.
|
|
304
341
|
if format1 == :string
|
|
305
|
-
if opts[:verbose]
|
|
342
|
+
if resolved[:verbose]
|
|
306
343
|
return obj1.to_s == obj2.to_s ? [] : [:different]
|
|
307
344
|
else
|
|
308
345
|
return obj1.to_s == obj2.to_s
|
|
309
346
|
end
|
|
310
347
|
end
|
|
311
348
|
|
|
312
|
-
#
|
|
313
|
-
|
|
314
|
-
raise Canon::CompareFormatMismatchError.new(format1, format2)
|
|
315
|
-
end
|
|
349
|
+
# Semantic requires exact format match (no ruby_object cross-compat).
|
|
350
|
+
Pipeline.validate_compatible!(format1, format2, strict: true)
|
|
316
351
|
|
|
317
|
-
#
|
|
318
|
-
|
|
319
|
-
if !(opts[:match_profile] || opts[:global_options]) && %i[xml html json
|
|
320
|
-
yaml string].include?(format1)
|
|
321
|
-
format_config = Canon::Config.instance.public_send(format1)
|
|
322
|
-
if format_config.match.profile
|
|
323
|
-
opts[:match_profile] =
|
|
324
|
-
format_config.match.profile
|
|
325
|
-
end
|
|
326
|
-
if format_config.match.options && !format_config.match.options.empty?
|
|
327
|
-
opts[:global_options] =
|
|
328
|
-
format_config.match.options
|
|
329
|
-
end
|
|
330
|
-
end
|
|
352
|
+
# Merge global config-sourced profile and options into opts.
|
|
353
|
+
resolved = Pipeline.resolve_config(format1, resolved)
|
|
331
354
|
|
|
332
|
-
# Resolve match options for the format
|
|
333
|
-
match_opts_hash = resolve_match_options(format1,
|
|
355
|
+
# Resolve match options for the format.
|
|
356
|
+
match_opts_hash = resolve_match_options(format1, resolved)
|
|
334
357
|
|
|
335
|
-
# Also read diff options from config (e.g., max_node_count for
|
|
336
|
-
#
|
|
337
|
-
|
|
338
|
-
|
|
358
|
+
# Also read diff options from config (e.g., max_node_count for
|
|
359
|
+
# large documents). Independent of match options; passed to
|
|
360
|
+
# TreeDiffIntegrator.
|
|
361
|
+
if !match_opts_hash[:max_node_count] &&
|
|
362
|
+
Pipeline::CONFIG_BACKED_FORMATS.include?(format1)
|
|
339
363
|
diff_max_node = Canon::Config.instance.public_send(format1).diff.max_node_count
|
|
340
364
|
if diff_max_node > 10_000
|
|
341
|
-
match_opts_hash[:max_node_count] =
|
|
342
|
-
diff_max_node
|
|
365
|
+
match_opts_hash[:max_node_count] = diff_max_node
|
|
343
366
|
end
|
|
344
367
|
end
|
|
345
368
|
|
|
346
|
-
# Delegate parsing to comparators (reuses existing preprocessing
|
|
347
|
-
doc1, doc2 =
|
|
369
|
+
# Delegate parsing to comparators (reuses existing preprocessing).
|
|
370
|
+
doc1, doc2 = Pipeline.parse_pair(obj1, obj2, format1, match_opts_hash)
|
|
348
371
|
|
|
349
|
-
# Normalize format for TreeDiff (html4/html5 -> html)
|
|
372
|
+
# Normalize format for TreeDiff (html4/html5 -> html).
|
|
350
373
|
tree_diff_format = normalize_format_for_tree_diff(format1)
|
|
351
374
|
|
|
352
|
-
# Create TreeDiff integrator for the format
|
|
353
|
-
# CRITICAL: Use match_opts_hash (resolved options with profile)
|
|
375
|
+
# Create TreeDiff integrator for the format.
|
|
376
|
+
# CRITICAL: Use match_opts_hash (resolved options with profile)
|
|
377
|
+
# not opts[:match].
|
|
354
378
|
integrator = Canon::TreeDiff::TreeDiffIntegrator.new(
|
|
355
379
|
format: tree_diff_format,
|
|
356
380
|
options: match_opts_hash,
|
|
357
381
|
)
|
|
358
382
|
|
|
359
|
-
# Perform diff
|
|
383
|
+
# Perform diff.
|
|
360
384
|
tree_diff_result = integrator.diff(doc1, doc2)
|
|
361
385
|
|
|
362
|
-
# Extract only match-related keys for OperationConverter and
|
|
363
|
-
# These components expect match
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
similarity_matching propagation
|
|
369
|
-
preserve_whitespace_elements
|
|
370
|
-
collapse_whitespace_elements
|
|
371
|
-
strip_whitespace_elements respect_xml_space]
|
|
372
|
-
match_options_only = match_opts_hash.slice(*match_only_keys)
|
|
373
|
-
|
|
374
|
-
# Convert operations to DiffNodes for unified pipeline
|
|
375
|
-
# CRITICAL: Use match_opts_hash (resolved options with profile) not opts[:match]
|
|
386
|
+
# Extract only match-related keys for OperationConverter and
|
|
387
|
+
# SemanticTreeMatchStrategy. These components expect match
|
|
388
|
+
# options, not diff options like max_node_count.
|
|
389
|
+
match_options_only = match_opts_hash.slice(*MATCH_OPTION_KEYS)
|
|
390
|
+
|
|
391
|
+
# Convert operations to DiffNodes for unified pipeline.
|
|
376
392
|
converter = Canon::TreeDiff::OperationConverter.new(
|
|
377
393
|
format: format1,
|
|
378
394
|
match_options: match_options_only,
|
|
379
395
|
)
|
|
380
396
|
diff_nodes = converter.convert(tree_diff_result[:operations])
|
|
381
397
|
|
|
382
|
-
# CRITICAL: Use strategy's preprocess_for_display to ensure proper
|
|
383
|
-
# This matches DOM diff preprocessing pattern
|
|
384
|
-
|
|
398
|
+
# CRITICAL: Use strategy's preprocess_for_display to ensure proper
|
|
399
|
+
# line-breaking. This matches DOM diff preprocessing pattern
|
|
400
|
+
# (xml_comparator.rb:106-109).
|
|
385
401
|
strategy = Comparison::Strategies::SemanticTreeMatchStrategy.new(
|
|
386
402
|
format: format1, match_options: match_options_only,
|
|
387
403
|
)
|
|
388
404
|
str1, str2 = strategy.preprocess_for_display(doc1, doc2)
|
|
389
405
|
|
|
390
|
-
# Store tree diff data in match_options for access via result
|
|
406
|
+
# Store tree diff data in match_options for access via result.
|
|
391
407
|
enhanced_match_options = match_opts_hash.merge(
|
|
392
408
|
tree_diff_operations: tree_diff_result[:operations],
|
|
393
409
|
tree_diff_statistics: tree_diff_result[:statistics],
|
|
394
410
|
tree_diff_matching: tree_diff_result[:matching],
|
|
395
411
|
)
|
|
396
412
|
|
|
397
|
-
# Create ComparisonResult for unified handling
|
|
413
|
+
# Create ComparisonResult for unified handling.
|
|
398
414
|
result = Canon::Comparison::ComparisonResult.new(
|
|
399
415
|
differences: diff_nodes,
|
|
400
416
|
preprocessed_strings: [str1, str2],
|
|
@@ -405,8 +421,8 @@ module Canon
|
|
|
405
421
|
algorithm: :semantic,
|
|
406
422
|
)
|
|
407
423
|
|
|
408
|
-
# Return boolean or ComparisonResult based on verbose flag
|
|
409
|
-
if opts[:verbose]
|
|
424
|
+
# Return boolean or ComparisonResult based on verbose flag.
|
|
425
|
+
if resolved[:verbose]
|
|
410
426
|
result
|
|
411
427
|
else
|
|
412
428
|
result.equivalent?
|
|
@@ -534,16 +550,7 @@ module Canon
|
|
|
534
550
|
# @param format [Symbol] Format type
|
|
535
551
|
# @return [Array<Symbol>] Valid dimensions for the format
|
|
536
552
|
def valid_dimensions_for_format(format)
|
|
537
|
-
|
|
538
|
-
when :xml, :html, :html4, :html5
|
|
539
|
-
MatchOptions::Xml::MATCH_DIMENSIONS
|
|
540
|
-
when :json
|
|
541
|
-
MatchOptions::Json::MATCH_DIMENSIONS
|
|
542
|
-
when :yaml
|
|
543
|
-
MatchOptions::Yaml::MATCH_DIMENSIONS
|
|
544
|
-
else
|
|
545
|
-
[]
|
|
546
|
-
end
|
|
553
|
+
Dimensions::Registry.for(format).names
|
|
547
554
|
end
|
|
548
555
|
|
|
549
556
|
# Helper to extract format from opts for validation
|
|
@@ -554,76 +561,6 @@ module Canon
|
|
|
554
561
|
opts[:format] || :xml
|
|
555
562
|
end
|
|
556
563
|
|
|
557
|
-
# Parse documents using comparator's parse logic (reuses preprocessing)
|
|
558
|
-
#
|
|
559
|
-
# @param obj1 [Object] First object
|
|
560
|
-
# @param obj2 [Object] Second object
|
|
561
|
-
# @param format [Symbol] Format type
|
|
562
|
-
# @param match_opts_hash [Hash] Resolved match options
|
|
563
|
-
# @return [Array<Object, Object>] Parsed documents
|
|
564
|
-
def parse_with_comparator(obj1, obj2, format, match_opts_hash)
|
|
565
|
-
preprocessing = match_opts_hash[:preprocessing] || :none
|
|
566
|
-
|
|
567
|
-
case format
|
|
568
|
-
when :xml
|
|
569
|
-
# Delegate to XmlComparator's parse - returns Canon::Xml::Node
|
|
570
|
-
doc1 = parse_with_cache(obj1, format, preprocessing) do |doc|
|
|
571
|
-
XmlComparator.parse(doc, preprocessing)
|
|
572
|
-
end
|
|
573
|
-
doc2 = parse_with_cache(obj2, format, preprocessing) do |doc|
|
|
574
|
-
XmlComparator.parse(doc, preprocessing)
|
|
575
|
-
end
|
|
576
|
-
[doc1, doc2]
|
|
577
|
-
when :html, :html4, :html5
|
|
578
|
-
[
|
|
579
|
-
parse_with_cache(obj1, format, preprocessing) do |doc|
|
|
580
|
-
HtmlComparator.parse(doc, preprocessing)
|
|
581
|
-
end,
|
|
582
|
-
parse_with_cache(obj2, format, preprocessing) do |doc|
|
|
583
|
-
HtmlComparator.parse(doc, preprocessing)
|
|
584
|
-
end,
|
|
585
|
-
]
|
|
586
|
-
when :json
|
|
587
|
-
[
|
|
588
|
-
parse_with_cache(obj1, format, :none) do |doc|
|
|
589
|
-
JsonComparator.parse(doc)
|
|
590
|
-
end,
|
|
591
|
-
parse_with_cache(obj2, format, :none) do |doc|
|
|
592
|
-
JsonComparator.parse(doc)
|
|
593
|
-
end,
|
|
594
|
-
]
|
|
595
|
-
when :yaml
|
|
596
|
-
[
|
|
597
|
-
parse_with_cache(obj1, format, :none) do |doc|
|
|
598
|
-
YamlComparator.parse(doc)
|
|
599
|
-
end,
|
|
600
|
-
parse_with_cache(obj2, format, :none) do |doc|
|
|
601
|
-
YamlComparator.parse(doc)
|
|
602
|
-
end,
|
|
603
|
-
]
|
|
604
|
-
else
|
|
605
|
-
[obj1, obj2]
|
|
606
|
-
end
|
|
607
|
-
end
|
|
608
|
-
|
|
609
|
-
# Parse a document with caching
|
|
610
|
-
#
|
|
611
|
-
# @param doc [Object] Document to parse (string or already parsed)
|
|
612
|
-
# @param format [Symbol] Document format
|
|
613
|
-
# @param preprocessing [Symbol] Preprocessing option
|
|
614
|
-
# @yield Block to parse the document if not cached
|
|
615
|
-
# @return [Object] Parsed document
|
|
616
|
-
def parse_with_cache(doc, format, preprocessing)
|
|
617
|
-
# If already a parsed node, return as-is
|
|
618
|
-
return doc unless doc.is_a?(String)
|
|
619
|
-
|
|
620
|
-
# Use cache for string documents
|
|
621
|
-
Cache.fetch(:document_parse,
|
|
622
|
-
Cache.key_for_document(doc, format, preprocessing)) do # rubocop:disable Lint/UselessDefaultValueArgument
|
|
623
|
-
yield doc
|
|
624
|
-
end
|
|
625
|
-
end
|
|
626
|
-
|
|
627
564
|
# Normalize format for TreeDiff (html4/html5 -> html)
|
|
628
565
|
#
|
|
629
566
|
# @param format [Symbol] Original format
|
|
@@ -637,28 +574,6 @@ module Canon
|
|
|
637
574
|
end
|
|
638
575
|
end
|
|
639
576
|
|
|
640
|
-
# Extract original string from various input types
|
|
641
|
-
# This preserves the original formatting without minification
|
|
642
|
-
#
|
|
643
|
-
# @param obj [String, Nokogiri::Node, Canon::Xml::Node, Object] Input object
|
|
644
|
-
# @param format [Symbol] Format type for context
|
|
645
|
-
# @return [String] Original string representation
|
|
646
|
-
def extract_original_string(obj, _format = nil)
|
|
647
|
-
case obj
|
|
648
|
-
when String
|
|
649
|
-
obj
|
|
650
|
-
when Nokogiri::XML::Document, Nokogiri::HTML::Document,
|
|
651
|
-
Nokogiri::XML::DocumentFragment, Nokogiri::HTML::DocumentFragment
|
|
652
|
-
obj.to_html
|
|
653
|
-
else
|
|
654
|
-
if Canon::XmlParsing.xml_node?(obj) || obj.is_a?(Canon::Xml::Node)
|
|
655
|
-
Canon::XmlParsing.serialize(obj)
|
|
656
|
-
else
|
|
657
|
-
obj.to_s
|
|
658
|
-
end
|
|
659
|
-
end
|
|
660
|
-
end
|
|
661
|
-
|
|
662
577
|
# Serialize document back to string
|
|
663
578
|
def serialize_document(doc, format)
|
|
664
579
|
case format
|
|
@@ -683,108 +598,70 @@ module Canon
|
|
|
683
598
|
|
|
684
599
|
# Perform DOM-based comparison (original behavior)
|
|
685
600
|
def dom_diff(obj1, obj2, opts = {})
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
# to dodge Nokogiri::HTML4.fragment's destructive DOM
|
|
701
|
-
# mutations. That avoided one problem but introduced a
|
|
702
|
-
# bigger one: XML whitespace rules were being applied to
|
|
703
|
-
# HTML content. HTML's content model — identical between
|
|
704
|
-
# HTML4 and HTML5 — treats whitespace-only text between
|
|
705
|
-
# block-level children as insignificant; XML treats every
|
|
706
|
-
# whitespace text node as significant. Routing html4 input
|
|
707
|
-
# through an XML parser therefore made
|
|
708
|
-
# be_html4_equivalent_to reject inputs that
|
|
709
|
-
# be_html5_equivalent_to (correctly) accepts.
|
|
710
|
-
# Nokogiri::HTML5.fragment is non-destructive (the original
|
|
711
|
-
# HTML4.fragment concern does not apply to it) and applies
|
|
712
|
-
# HTML's content model uniformly.
|
|
713
|
-
obj1 = HtmlParser.parse(obj1, :html5) if obj1.is_a?(String)
|
|
714
|
-
obj2 = HtmlParser.parse(obj2, :html5) if obj2.is_a?(String)
|
|
715
|
-
end
|
|
716
|
-
else
|
|
717
|
-
format1 = FormatDetector.detect(obj1)
|
|
718
|
-
format2 = FormatDetector.detect(obj2)
|
|
601
|
+
resolved = opts.dup
|
|
602
|
+
format_hint = resolved[:format]
|
|
603
|
+
|
|
604
|
+
# Detect formats (with explicit hint) and pre-parse HTML strings
|
|
605
|
+
# through Nokogiri::HTML5 so html4 and html5 share HTML's
|
|
606
|
+
# whitespace-sensitivity semantics (issue #118). Pre-parsing
|
|
607
|
+
# also lets us snapshot the original strings before the HTML
|
|
608
|
+
# fragment parser mutates the DOM.
|
|
609
|
+
format1, format2 = Pipeline.detect_formats(obj1, obj2, format_hint)
|
|
610
|
+
if %i[html html4 html5].include?(format_hint) && obj1.is_a?(String) &&
|
|
611
|
+
obj2.is_a?(String)
|
|
612
|
+
resolved[:_original_str1] = obj1
|
|
613
|
+
resolved[:_original_str2] = obj2
|
|
614
|
+
obj1, obj2 = Pipeline.preparse_html_pair(obj1, obj2)
|
|
719
615
|
end
|
|
720
616
|
|
|
721
|
-
# Handle string format (plain text comparison)
|
|
617
|
+
# Handle string format (plain text comparison).
|
|
722
618
|
if format1 == :string
|
|
723
|
-
if opts[:verbose]
|
|
619
|
+
if resolved[:verbose]
|
|
724
620
|
return obj1.to_s == obj2.to_s ? [] : [:different]
|
|
725
621
|
else
|
|
726
622
|
return obj1.to_s == obj2.to_s
|
|
727
623
|
end
|
|
728
624
|
end
|
|
729
625
|
|
|
730
|
-
#
|
|
731
|
-
|
|
732
|
-
formats_compatible = format1 == format2 ||
|
|
733
|
-
(%i[json ruby_object].include?(format1) &&
|
|
734
|
-
%i[json ruby_object].include?(format2)) ||
|
|
735
|
-
(%i[yaml ruby_object].include?(format1) &&
|
|
736
|
-
%i[yaml ruby_object].include?(format2))
|
|
626
|
+
# DOM allows ruby_object <-> json/yaml cross-compatibility.
|
|
627
|
+
Pipeline.validate_compatible!(format1, format2, strict: false)
|
|
737
628
|
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
end
|
|
629
|
+
# Normalize comparison format (ruby_object -> json by default).
|
|
630
|
+
comparison_format = normalize_comparison_format(format1, format2)
|
|
741
631
|
|
|
742
|
-
#
|
|
743
|
-
|
|
744
|
-
when :ruby_object
|
|
745
|
-
# If comparing ruby_object with json/yaml, use that format
|
|
746
|
-
%i[json yaml].include?(format2) ? format2 : :json
|
|
747
|
-
else
|
|
748
|
-
format1
|
|
749
|
-
end
|
|
750
|
-
|
|
751
|
-
# get match_profile if it is not defined in options
|
|
752
|
-
# but defined in config
|
|
753
|
-
if %i[xml html json yaml string].include?(comparison_format)
|
|
754
|
-
format_config = Canon::Config.instance.public_send(comparison_format)
|
|
755
|
-
if opts[:global_profile].nil? && format_config.match.profile
|
|
756
|
-
# Config-sourced profile has *global* priority (applied before
|
|
757
|
-
# global_options), so that YAML profile_options like
|
|
758
|
-
# whitespace_type: :normalize can override the built-in profile
|
|
759
|
-
# (e.g. :spec_friendly)'s whitespace_type: :strict. Writing to
|
|
760
|
-
# :match_profile here gave the config profile per-call priority,
|
|
761
|
-
# which incorrectly overrode the YAML's own overrides.
|
|
762
|
-
opts[:global_profile] = format_config.match.profile
|
|
763
|
-
end
|
|
764
|
-
# Pass YAML profile's extra match options (e.g., preserve_whitespace_elements)
|
|
765
|
-
# that are stored in MatchConfig's resolver but not exposed via the
|
|
766
|
-
# built-in MATCH_PROFILES system. These supplement the built-in profile.
|
|
767
|
-
profile_opts = format_config.match.profile_options
|
|
768
|
-
if profile_opts.any? && opts[:global_options].nil?
|
|
769
|
-
opts[:global_options] = profile_opts
|
|
770
|
-
elsif profile_opts.any?
|
|
771
|
-
# Merge: global_options already set (e.g., per-call) takes precedence
|
|
772
|
-
opts[:global_options] = opts[:global_options].merge(profile_opts)
|
|
773
|
-
end
|
|
774
|
-
end
|
|
632
|
+
# Merge global config-sourced profile and options into opts.
|
|
633
|
+
resolved = Pipeline.resolve_config(comparison_format, resolved)
|
|
775
634
|
|
|
776
635
|
case comparison_format
|
|
777
636
|
when :xml
|
|
778
|
-
XmlComparator.equivalent?(obj1, obj2,
|
|
637
|
+
XmlComparator.equivalent?(obj1, obj2, resolved)
|
|
779
638
|
when :html, :html4, :html5
|
|
780
|
-
HtmlComparator.equivalent?(obj1, obj2,
|
|
639
|
+
HtmlComparator.equivalent?(obj1, obj2, resolved)
|
|
781
640
|
when :json
|
|
782
|
-
JsonComparator.equivalent?(obj1, obj2,
|
|
641
|
+
JsonComparator.equivalent?(obj1, obj2, resolved)
|
|
783
642
|
when :yaml
|
|
784
|
-
YamlComparator.equivalent?(obj1, obj2,
|
|
643
|
+
YamlComparator.equivalent?(obj1, obj2, resolved)
|
|
785
644
|
end
|
|
786
645
|
end
|
|
787
646
|
|
|
647
|
+
# Pick the format used for actual comparison.
|
|
648
|
+
#
|
|
649
|
+
# When comparing ruby_object with json/yaml, use the json/yaml side
|
|
650
|
+
# so both inputs parse to the same Ruby structure. When both sides
|
|
651
|
+
# are ruby_object (or the other side is not json/yaml), default to
|
|
652
|
+
# JSON since ruby_object has no comparator of its own.
|
|
653
|
+
#
|
|
654
|
+
# @param format1 [Symbol]
|
|
655
|
+
# @param format2 [Symbol]
|
|
656
|
+
# @return [Symbol]
|
|
657
|
+
def normalize_comparison_format(format1, format2)
|
|
658
|
+
return format2 if format1 == :ruby_object &&
|
|
659
|
+
%i[json yaml].include?(format2)
|
|
660
|
+
return :json if format1 == :ruby_object
|
|
661
|
+
|
|
662
|
+
format1
|
|
663
|
+
end
|
|
664
|
+
|
|
788
665
|
# Strip XML declarations and DOCTYPE preambles from an HTML string
|
|
789
666
|
# so it can be safely parsed with Nokogiri::XML.fragment without
|
|
790
667
|
# generating processing-instruction nodes.
|