canon 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +163 -67
- data/README.adoc +400 -7
- data/docs/Gemfile +9 -0
- data/docs/INDEX.adoc +99 -182
- data/docs/_config.yml +100 -0
- data/docs/advanced/diff-classification.adoc +547 -0
- data/docs/advanced/diff-pipeline.adoc +358 -0
- data/docs/advanced/index.adoc +214 -0
- data/docs/advanced/semantic-diff-report.adoc +390 -0
- data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
- data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
- data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
- data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
- data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
- data/docs/features/diff-formatting/display-filtering.adoc +472 -0
- data/docs/features/diff-formatting/index.adoc +140 -0
- data/docs/features/environment-configuration/index.adoc +327 -0
- data/docs/features/environment-configuration/override-system.adoc +436 -0
- data/docs/features/environment-configuration/size-limits.adoc +273 -0
- data/docs/features/index.adoc +173 -0
- data/docs/features/input-validation/index.adoc +521 -0
- data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
- data/docs/features/match-options/html-policies.adoc +312 -0
- data/docs/features/match-options/index.adoc +621 -0
- data/docs/getting-started/index.adoc +83 -0
- data/docs/getting-started/quick-start.adoc +76 -0
- data/docs/guides/choosing-configuration.adoc +689 -0
- data/docs/guides/index.adoc +181 -0
- data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
- data/docs/interfaces/index.adoc +101 -0
- data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
- data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
- data/docs/lychee.toml +65 -0
- data/docs/reference/cli-options.adoc +418 -0
- data/docs/reference/environment-variables.adoc +375 -0
- data/docs/reference/index.adoc +204 -0
- data/docs/reference/options-across-interfaces.adoc +417 -0
- data/docs/understanding/algorithms/dom-diff.adoc +389 -0
- data/docs/understanding/algorithms/index.adoc +314 -0
- data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
- data/docs/understanding/architecture.adoc +447 -0
- data/docs/understanding/comparison-pipeline.adoc +317 -0
- data/docs/understanding/formats/html.adoc +380 -0
- data/docs/understanding/formats/index.adoc +261 -0
- data/docs/understanding/formats/json.adoc +390 -0
- data/docs/understanding/formats/xml.adoc +366 -0
- data/docs/understanding/formats/yaml.adoc +504 -0
- data/docs/understanding/index.adoc +130 -0
- data/lib/canon/cli.rb +42 -1
- data/lib/canon/commands/diff_command.rb +108 -23
- data/lib/canon/comparison/compare_profile.rb +101 -0
- data/lib/canon/comparison/comparison_result.rb +41 -2
- data/lib/canon/comparison/html_comparator.rb +292 -71
- data/lib/canon/comparison/html_compare_profile.rb +117 -0
- data/lib/canon/comparison/match_options.rb +42 -4
- data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
- data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
- data/lib/canon/comparison/xml_comparator.rb +695 -91
- data/lib/canon/comparison.rb +207 -2
- data/lib/canon/config/env_provider.rb +71 -0
- data/lib/canon/config/env_schema.rb +58 -0
- data/lib/canon/config/override_resolver.rb +55 -0
- data/lib/canon/config/type_converter.rb +59 -0
- data/lib/canon/config.rb +158 -29
- data/lib/canon/data_model.rb +29 -0
- data/lib/canon/diff/diff_classifier.rb +74 -14
- data/lib/canon/diff/diff_context_builder.rb +41 -0
- data/lib/canon/diff/diff_line.rb +18 -2
- data/lib/canon/diff/diff_node.rb +18 -3
- data/lib/canon/diff/diff_node_mapper.rb +71 -12
- data/lib/canon/diff/formatting_detector.rb +53 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
- data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
- data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
- data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
- data/lib/canon/diff_formatter/debug_output.rb +7 -1
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
- data/lib/canon/diff_formatter/legend.rb +42 -0
- data/lib/canon/diff_formatter.rb +78 -9
- data/lib/canon/errors.rb +56 -0
- data/lib/canon/formatters/html_formatter_base.rb +35 -1
- data/lib/canon/formatters/json_formatter.rb +3 -0
- data/lib/canon/formatters/yaml_formatter.rb +3 -0
- data/lib/canon/html/data_model.rb +229 -0
- data/lib/canon/html.rb +9 -0
- data/lib/canon/options/cli_generator.rb +70 -0
- data/lib/canon/options/registry.rb +234 -0
- data/lib/canon/rspec_matchers.rb +34 -13
- data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
- data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
- data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
- data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
- data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
- data/lib/canon/tree_diff/core/matching.rb +241 -0
- data/lib/canon/tree_diff/core/node_signature.rb +164 -0
- data/lib/canon/tree_diff/core/node_weight.rb +135 -0
- data/lib/canon/tree_diff/core/tree_node.rb +450 -0
- data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
- data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
- data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
- data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
- data/lib/canon/tree_diff/operation_converter.rb +631 -0
- data/lib/canon/tree_diff/operations/operation.rb +92 -0
- data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
- data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
- data/lib/canon/tree_diff.rb +33 -0
- data/lib/canon/validators/json_validator.rb +3 -1
- data/lib/canon/validators/yaml_validator.rb +3 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +22 -23
- data/lib/canon/xml/element_matcher.rb +128 -20
- data/lib/canon/xml/namespace_helper.rb +110 -0
- data/lib/canon.rb +3 -0
- metadata +81 -23
- data/_config.yml +0 -116
- data/docs/ADVANCED_TOPICS.adoc +0 -20
- data/docs/BASIC_USAGE.adoc +0 -16
- data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/docs/DIFF_FORMATTING.adoc +0 -540
- data/docs/FORMATS.adoc +0 -447
- data/docs/INPUT_VALIDATION.adoc +0 -477
- data/docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/docs/MATCH_OPTIONS.adoc +0 -719
- data/docs/MODES.adoc +0 -432
- data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/docs/OPTIONS.adoc +0 -1387
- data/docs/PREPROCESSING.adoc +0 -491
- data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
- data/docs/UNDERSTANDING_CANON.adoc +0 -17
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../comparison/match_options"
|
|
4
|
+
|
|
5
|
+
module Canon
|
|
6
|
+
module Options
|
|
7
|
+
# Centralized registry for all Canon options
|
|
8
|
+
# This is the SINGLE SOURCE OF TRUTH for option definitions
|
|
9
|
+
# All interfaces (CLI, Ruby API, RSpec) auto-generate from this registry
|
|
10
|
+
class Registry
  class << self
    # Returns every option definition known to the registry.
    #
    # The list is assembled once from the private builder methods below,
    # then memoized and frozen (the array itself; the hashes inside are
    # not frozen).
    #
    # @return [Array<Hash>] option definitions; each hash carries :name,
    #   :type, :default, :cli_flag, :description and :applies_to, plus
    #   optional :values, :aliases and :format_defaults
    def all_options
      @all_options ||= [
        preprocessing_option,
        diff_algorithm_option,
        diff_mode_option,
        *match_dimension_options,
        match_profile_option,
        *diff_formatting_options,
      ].freeze
    end

    # Returns the options that apply to a given format.
    #
    # An option applies when its :applies_to list is nil or contains the
    # format.
    #
    # @param format [Symbol, String, nil] format name (e.g. :xml); Strings
    #   are converted to Symbols before matching
    # @return [Array<Hash>] matching option definitions
    def options_for_format(format)
      target = symbolize(format)
      all_options.select do |opt|
        opt[:applies_to].nil? || opt[:applies_to].include?(target)
      end
    end

    # Validates an options hash against the registry.
    #
    # Keys may be Symbols or Strings; they are compared to the registered
    # option names after conversion to Symbols, so String-keyed hashes are
    # validated the same way as Symbol-keyed ones. Offending keys are
    # reported exactly as the caller supplied them.
    #
    # @param opts [Hash] options to validate (only the keys are inspected)
    # @param format [Symbol, String] format the options are meant for
    # @return [nil] when every key is a valid option for the format
    # @raise [Canon::Error] when at least one key is not valid
    def validate_options!(opts, format)
      valid_names = options_for_format(format).map { |opt| opt[:name] }
      invalid = opts.keys.reject { |key| valid_names.include?(symbolize(key)) }
      return if invalid.empty?

      raise Canon::Error,
            "Invalid options for #{format}: #{invalid.join(', ')}"
    end

    # Looks up the CLI flag of an option.
    #
    # @param option_name [Symbol, String] registered option name
    # @return [String, nil] the flag (e.g. "--diff-mode"), or nil when the
    #   option is unknown
    def cli_flag_for(option_name)
      opt = find_option(option_name)
      opt && opt[:cli_flag]
    end

    # Looks up the default value of an option.
    #
    # A format-specific default (from :format_defaults) wins over the
    # generic :default when a format is given and has an entry.
    #
    # @param option_name [Symbol, String] registered option name
    # @param format [Symbol, String, nil] optional format name
    # @return [Object, nil] the default value, or nil when the option is
    #   unknown or has no default
    def default_for(option_name, format = nil)
      opt = find_option(option_name)
      return nil unless opt

      format_key = symbolize(format)
      per_format = opt[:format_defaults]
      return per_format[format_key] if format_key && per_format&.key?(format_key)

      opt[:default]
    end

    private

    # Converts a name to a Symbol when it supports +to_sym+; anything else
    # (including nil) is returned unchanged.
    def symbolize(value)
      value.respond_to?(:to_sym) ? value.to_sym : value
    end

    # Finds the option definition with the given name, or nil.
    def find_option(option_name)
      key = symbolize(option_name)
      all_options.find { |opt| opt[:name] == key }
    end

    # Preprocessing option
    def preprocessing_option
      {
        name: :preprocessing,
        type: :enum,
        values: %w[none c14n normalize format],
        default: :none,
        cli_flag: "--preprocessing",
        description: "Preprocessing: none, c14n, normalize, or format",
        applies_to: %i[xml html json yaml],
      }
    end

    # Diff algorithm option (NEW)
    def diff_algorithm_option
      {
        name: :diff_algorithm,
        type: :enum,
        values: %w[dom semantic],
        default: :dom,
        cli_flag: "--diff-algorithm",
        aliases: ["-a"],
        description: "Diff algorithm: dom (positional) or semantic (tree-based)",
        applies_to: %i[xml html json yaml],
      }
    end

    # Diff mode option (replaces --by-line flag)
    def diff_mode_option
      {
        name: :diff_mode,
        type: :enum,
        values: %w[by_line by_object],
        default: :by_object,
        format_defaults: {
          html: :by_line,
        },
        cli_flag: "--diff-mode",
        description: "Diff output mode: by_line or by_object",
        applies_to: %i[xml html json yaml],
      }
    end

    # Match profile option; the allowed values are the keys of
    # MatchOptions::MATCH_PROFILES.
    def match_profile_option
      {
        name: :match_profile,
        type: :enum,
        values: Canon::Comparison::MatchOptions::MATCH_PROFILES.keys.map(&:to_s),
        default: nil,
        cli_flag: "--match-profile",
        aliases: ["-p"],
        description: "Match profile: strict, rendered, spec_friendly, or content_only",
        applies_to: %i[xml html json yaml],
      }
    end

    # Match dimension options: one definition per entry of
    # MatchOptions::MATCH_DIMENSIONS.
    def match_dimension_options
      Canon::Comparison::MatchOptions::MATCH_DIMENSIONS.map do |dim|
        # Computed once; used for both :values and the description text.
        behaviors = behaviors_for_dimension(dim)
        {
          name: dim,
          type: :enum,
          values: behaviors,
          default: nil,
          format_defaults: format_defaults_for_dimension(dim),
          cli_flag: "--#{dim.to_s.tr('_', '-')}",
          description: "#{dimension_description(dim)}: #{behaviors.join(', ')}",
          applies_to: applicable_formats_for_dimension(dim),
        }
      end
    end

    # Diff formatting options
    def diff_formatting_options
      [
        {
          name: :color,
          type: :boolean,
          default: true,
          cli_flag: "--color",
          description: "Colorize diff output",
          applies_to: %i[xml html json yaml],
        },
        {
          name: :verbose,
          type: :boolean,
          default: false,
          cli_flag: "--verbose",
          aliases: ["-v"],
          description: "Show detailed differences",
          applies_to: %i[xml html json yaml],
        },
        {
          name: :context_lines,
          type: :numeric,
          default: 3,
          cli_flag: "--context-lines",
          description: "Number of context lines around changes",
          applies_to: %i[xml html json yaml],
        },
        {
          name: :diff_grouping_lines,
          type: :numeric,
          default: nil,
          cli_flag: "--diff-grouping-lines",
          description: "Group diffs within N lines into context blocks",
          applies_to: %i[xml html json yaml],
        },
      ]
    end

    # Get valid behaviors for a dimension: ordering/structure dimensions
    # are strict-or-ignore; every other dimension also allows normalize.
    def behaviors_for_dimension(dimension)
      case dimension
      when :key_order, :attribute_order,
           :element_structure, :element_position, :element_hierarchy
        %w[strict ignore]
      else
        %w[strict normalize ignore]
      end
    end

    # Get format defaults for a dimension from MatchOptions::FORMAT_DEFAULTS,
    # keeping only the formats that define a non-nil value for it.
    def format_defaults_for_dimension(dimension)
      Canon::Comparison::MatchOptions::FORMAT_DEFAULTS
        .transform_values { |defaults| defaults[dimension] }
        .compact
    end

    # Get applicable formats for a dimension
    def applicable_formats_for_dimension(dimension)
      case dimension
      when :attribute_whitespace, :attribute_order, :attribute_values
        %i[xml html]
      when :key_order
        %i[json yaml]
      else
        %i[xml html json yaml]
      end
    end

    # Get human-readable description for a dimension; unknown dimensions
    # fall back to the capitalized name with underscores as spaces.
    def dimension_description(dimension)
      case dimension
      when :text_content
        "Text content matching"
      when :structural_whitespace
        "Structural whitespace matching"
      when :attribute_whitespace
        "Attribute whitespace matching (XML/HTML only)"
      when :attribute_order
        "Attribute ordering (XML/HTML only)"
      when :attribute_values
        "Attribute value matching (XML/HTML only)"
      when :key_order
        "Key ordering (JSON/YAML only)"
      when :comments
        "Comment matching"
      when :element_structure
        "Element type/structure matching (semantic diff)"
      when :element_position
        "Element position/order matching (semantic diff)"
      when :element_hierarchy
        "Element hierarchy/parent-child matching (semantic diff)"
      else
        dimension.to_s.tr("_", " ").capitalize
      end
    end
  end
end
|
|
233
|
+
end
|
|
234
|
+
end
|
data/lib/canon/rspec_matchers.rb
CHANGED
|
@@ -44,12 +44,23 @@ module Canon
|
|
|
44
44
|
# This is a THIN WRAPPER around Canon::Comparison API
|
|
45
45
|
class SerializationMatcher
|
|
46
46
|
def initialize(expected, format = nil, match_profile: nil,
|
|
47
|
-
match: nil, preprocessing: nil
|
|
47
|
+
match: nil, preprocessing: nil, diff_algorithm: nil,
|
|
48
|
+
show_diffs: nil)
|
|
48
49
|
@expected = expected
|
|
49
50
|
@format = format&.to_sym
|
|
50
51
|
@match_profile = match_profile
|
|
51
52
|
@match = match
|
|
52
53
|
@preprocessing = preprocessing
|
|
54
|
+
@diff_algorithm = diff_algorithm
|
|
55
|
+
@show_diffs = show_diffs
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
# Chain method for controlling diff display
|
|
59
|
+
# @param value [Symbol, String] :all, :normative, or :informative
|
|
60
|
+
# @return [SerializationMatcher] self for chaining
|
|
61
|
+
def show_diffs(value)
|
|
62
|
+
@show_diffs = value.to_sym
|
|
63
|
+
self
|
|
53
64
|
end
|
|
54
65
|
|
|
55
66
|
def matches?(target)
|
|
@@ -134,6 +145,8 @@ module Canon
|
|
|
134
145
|
opts[:match_profile] = @match_profile if @match_profile
|
|
135
146
|
opts[:match] = @match if @match
|
|
136
147
|
opts[:preprocessing] = @preprocessing if @preprocessing
|
|
148
|
+
opts[:diff_algorithm] = @diff_algorithm if @diff_algorithm
|
|
149
|
+
opts[:show_diffs] = @show_diffs if @show_diffs
|
|
137
150
|
|
|
138
151
|
# Add global configuration from Canon::Config (lower priority)
|
|
139
152
|
if @format
|
|
@@ -151,6 +164,8 @@ module Canon
|
|
|
151
164
|
format_config.match.options
|
|
152
165
|
end
|
|
153
166
|
opts[:preprocessing] ||= format_config.preprocessing
|
|
167
|
+
# Add diff algorithm from config if not explicitly set
|
|
168
|
+
opts[:diff_algorithm] ||= format_config.diff.algorithm if format_config.diff.algorithm
|
|
154
169
|
elsif !%i[xml html html4 html5 json yaml
|
|
155
170
|
string].include?(@format)
|
|
156
171
|
# Unsupported format - raise error early
|
|
@@ -211,27 +226,30 @@ module Canon
|
|
|
211
226
|
# Matcher methods
|
|
212
227
|
def be_serialization_equivalent_to(expected, format: :xml,
|
|
213
228
|
match_profile: nil, match: nil,
|
|
214
|
-
preprocessing: nil)
|
|
229
|
+
preprocessing: nil, diff_algorithm: nil)
|
|
215
230
|
SerializationMatcher.new(expected, format,
|
|
216
231
|
match_profile: match_profile,
|
|
217
232
|
match: match,
|
|
218
|
-
preprocessing: preprocessing
|
|
233
|
+
preprocessing: preprocessing,
|
|
234
|
+
diff_algorithm: diff_algorithm)
|
|
219
235
|
end
|
|
220
236
|
|
|
221
237
|
def be_analogous_with(expected, match_profile: nil, match: nil,
|
|
222
|
-
preprocessing: nil)
|
|
238
|
+
preprocessing: nil, diff_algorithm: nil)
|
|
223
239
|
SerializationMatcher.new(expected, :xml,
|
|
224
240
|
match_profile: match_profile,
|
|
225
241
|
match: match,
|
|
226
|
-
preprocessing: preprocessing
|
|
242
|
+
preprocessing: preprocessing,
|
|
243
|
+
diff_algorithm: diff_algorithm)
|
|
227
244
|
end
|
|
228
245
|
|
|
229
246
|
def be_xml_equivalent_to(expected, match_profile: nil, match: nil,
|
|
230
|
-
preprocessing: nil)
|
|
247
|
+
preprocessing: nil, diff_algorithm: nil)
|
|
231
248
|
SerializationMatcher.new(expected, :xml,
|
|
232
249
|
match_profile: match_profile,
|
|
233
250
|
match: match,
|
|
234
|
-
preprocessing: preprocessing
|
|
251
|
+
preprocessing: preprocessing,
|
|
252
|
+
diff_algorithm: diff_algorithm)
|
|
235
253
|
end
|
|
236
254
|
|
|
237
255
|
def be_yaml_equivalent_to(expected)
|
|
@@ -243,27 +261,30 @@ module Canon
|
|
|
243
261
|
end
|
|
244
262
|
|
|
245
263
|
def be_html_equivalent_to(expected, match_profile: nil, match: nil,
|
|
246
|
-
preprocessing: nil)
|
|
264
|
+
preprocessing: nil, diff_algorithm: nil)
|
|
247
265
|
SerializationMatcher.new(expected, :html,
|
|
248
266
|
match_profile: match_profile,
|
|
249
267
|
match: match,
|
|
250
|
-
preprocessing: preprocessing
|
|
268
|
+
preprocessing: preprocessing,
|
|
269
|
+
diff_algorithm: diff_algorithm)
|
|
251
270
|
end
|
|
252
271
|
|
|
253
272
|
def be_html4_equivalent_to(expected, match_profile: nil, match: nil,
|
|
254
|
-
preprocessing: nil)
|
|
273
|
+
preprocessing: nil, diff_algorithm: nil)
|
|
255
274
|
SerializationMatcher.new(expected, :html4,
|
|
256
275
|
match_profile: match_profile,
|
|
257
276
|
match: match,
|
|
258
|
-
preprocessing: preprocessing
|
|
277
|
+
preprocessing: preprocessing,
|
|
278
|
+
diff_algorithm: diff_algorithm)
|
|
259
279
|
end
|
|
260
280
|
|
|
261
281
|
def be_html5_equivalent_to(expected, match_profile: nil, match: nil,
|
|
262
|
-
preprocessing: nil)
|
|
282
|
+
preprocessing: nil, diff_algorithm: nil)
|
|
263
283
|
SerializationMatcher.new(expected, :html5,
|
|
264
284
|
match_profile: match_profile,
|
|
265
285
|
match: match,
|
|
266
|
-
preprocessing: preprocessing
|
|
286
|
+
preprocessing: preprocessing,
|
|
287
|
+
diff_algorithm: diff_algorithm)
|
|
267
288
|
end
|
|
268
289
|
|
|
269
290
|
def be_equivalent_to(expected)
|
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
|
|
5
|
+
module Canon
|
|
6
|
+
module TreeDiff
|
|
7
|
+
module Adapters
|
|
8
|
+
# HTMLAdapter converts Nokogiri HTML documents to TreeNode structures
|
|
9
|
+
# and back, enabling semantic tree diffing on HTML documents.
|
|
10
|
+
#
|
|
11
|
+
# This adapter:
|
|
12
|
+
# - Converts Nokogiri::HTML::Document to TreeNode tree
|
|
13
|
+
# - Preserves element names, text content, and attributes
|
|
14
|
+
# - Handles HTML-specific elements (script, style, etc.)
|
|
15
|
+
# - Maintains document structure for round-trip conversion
|
|
16
|
+
#
|
|
17
|
+
# @example Convert HTML to TreeNode
|
|
18
|
+
# html = Nokogiri::HTML("<html><body><p>text</p></body></html>")
|
|
19
|
+
# adapter = HTMLAdapter.new
|
|
20
|
+
# tree = adapter.to_tree(html)
|
|
21
|
+
#
|
|
22
|
+
class HTMLAdapter
  # HTML elements where whitespace is significant; text inside them is
  # joined without an inserted separator.
  WHITESPACE_SENSITIVE_TAGS = %w[pre code textarea script style].freeze

  attr_reader :match_options

  # Initialize adapter with match options
  #
  # @param match_options [Hash] Match options for text/attribute normalization
  def initialize(match_options: {})
    @match_options = match_options
  end

  # Convert Nokogiri HTML document/element or Canon::Xml::Node to TreeNode
  #
  # Canon node types are checked first; Nokogiri types are the legacy
  # fallback. Documents are converted starting at their <html> element
  # (or root), fragments get a synthetic "fragment" root.
  #
  # @param node [Nokogiri::HTML::Document, Nokogiri::XML::Element, Nokogiri::HTML::DocumentFragment, Canon::Xml::Node] HTML node
  # @return [Core::TreeNode, nil] Root tree node; nil for an empty
  #   document/root node or an empty Canon text node
  # @raise [ArgumentError] when the node type is not supported
  def to_tree(node)
    case node
    when Canon::Xml::Nodes::RootNode
      to_tree_from_canon_root(node)
    when Canon::Xml::Nodes::ElementNode
      to_tree_from_canon_element(node)
    when Canon::Xml::Nodes::TextNode
      to_tree_from_canon_text(node)
    when Canon::Xml::Nodes::CommentNode
      to_tree_from_canon_comment(node)
    when Nokogiri::HTML::Document, Nokogiri::HTML4::Document, Nokogiri::HTML5::Document
      # Start from html element or root element
      root = node.at_css("html") || node.root
      root ? to_tree(root) : nil
    when Nokogiri::HTML4::DocumentFragment, Nokogiri::HTML5::DocumentFragment, Nokogiri::XML::DocumentFragment
      convert_fragment(node)
    when Nokogiri::XML::Element
      convert_element(node)
    else
      raise ArgumentError, "Unsupported node type: #{node.class}"
    end
  end

  # Convert TreeNode back to Nokogiri HTML
  #
  # @param tree_node [Core::TreeNode] Root tree node
  # @param doc [Nokogiri::HTML::Document] Optional document to use
  # @return [Nokogiri::HTML::Document, Nokogiri::XML::Element] the
  #   document when the built element became its root, otherwise the
  #   element itself
  def from_tree(tree_node, doc = nil)
    doc ||= Nokogiri::HTML::Document.new
    element = build_element(tree_node, doc)

    # A document that already has a root is left untouched.
    return element unless doc.root.nil?

    doc.root = element
    doc
  end

  private

  # True for an XML namespace declaration attribute name: exactly
  # "xmlns" or an "xmlns:"-prefixed name. Ordinary attributes whose
  # names merely begin with the letters "xmlns" are not declarations.
  #
  # @param name [String, Symbol] attribute name
  # @return [Boolean]
  def namespace_declaration?(name)
    attr_name = name.to_s
    attr_name == "xmlns" || attr_name.start_with?("xmlns:")
  end

  # Convert a DocumentFragment to TreeNode
  #
  # Creates a synthetic root node (label "fragment", nil value) holding
  # the converted element children of the fragment. Only element
  # children are converted here.
  #
  # @param fragment [Nokogiri::HTML::DocumentFragment] HTML fragment
  # @return [Core::TreeNode] Root tree node
  def convert_fragment(fragment)
    wrapper = Core::TreeNode.new(
      label: "fragment",
      value: nil,
      attributes: {},
      source_node: fragment,
    )

    fragment.element_children.each do |child|
      wrapper.add_child(convert_element(child))
    end

    wrapper
  end

  # Convert a Nokogiri element to TreeNode
  #
  # Attributes keep their original order so the tree diff can detect
  # ordering differences. Namespace declarations (xmlns / xmlns:*) are
  # dropped: they are typically added by parsers and keeping them
  # causes false mismatches for the whole subtree.
  #
  # @param element [Nokogiri::XML::Element] HTML element
  # @return [Core::TreeNode] Tree node
  def convert_element(element)
    attributes = element.attributes.each_with_object({}) do |(name, attr), kept|
      kept[name] = attr.value unless namespace_declaration?(name)
    end

    tree_node = Core::TreeNode.new(
      label: element.name.downcase, # lowercase for HTML
      value: extract_text_value(element), # direct text only
      attributes: attributes,
      source_node: element,
    )

    element.element_children.each do |child|
      tree_node.add_child(convert_element(child))
    end

    tree_node
  end

  # Extract direct text content from element
  #
  # The text is returned unstripped; normalization is left to the
  # comparison stage. For mixed content (text plus child elements) the
  # text nodes are joined with a single space so that text separated by
  # an element such as <br/> does not run together
  # ("Text<br/>More" => "Text More"). Whitespace-sensitive elements are
  # joined without a separator.
  #
  # @param element [Nokogiri::XML::Element] HTML element
  # @return [String, nil] Text content, or nil when there is none
  def extract_text_value(element)
    # Only direct text nodes, not text from nested elements
    pieces = element.children.select(&:text?).map(&:text)

    mixed_content = element.element_children.any?
    separator = mixed_content && !whitespace_sensitive?(element) ? " " : ""

    text = pieces.join(separator)
    text.empty? ? nil : text
  end

  # Check if an element is whitespace-sensitive
  #
  # @param element [Nokogiri::XML::Element] Element to check
  # @return [Boolean] True if element is whitespace-sensitive
  def whitespace_sensitive?(element)
    return false unless element.respond_to?(:name)

    WHITESPACE_SENSITIVE_TAGS.include?(element.name.downcase)
  end

  # Build Nokogiri element from TreeNode
  #
  # Attributes are copied first, then the node's value (if non-empty)
  # becomes the element content, then children are appended recursively.
  #
  # @param tree_node [Core::TreeNode] Tree node
  # @param doc [Nokogiri::HTML::Document] Document
  # @return [Nokogiri::XML::Element] HTML element
  def build_element(tree_node, doc)
    element = Nokogiri::XML::Element.new(tree_node.label, doc)

    tree_node.attributes.each do |name, value|
      element[name] = value
    end

    text = tree_node.value
    element.content = text if text && !text.empty?

    tree_node.children.each do |child|
      element.add_child(build_element(child, doc))
    end

    element
  end

  # Convert Canon::Xml::Nodes::RootNode to TreeNode
  #
  # Only the first child of the root node is converted.
  #
  # @param root_node [Canon::Xml::Nodes::RootNode] Root node
  # @return [Core::TreeNode, nil] Tree node for first child (document
  #   element), or nil when the root has no children
  def to_tree_from_canon_root(root_node)
    return nil if root_node.children.empty?

    to_tree(root_node.children.first)
  end

  # Convert Canon::Xml::Nodes::ElementNode to TreeNode
  #
  # Children are converted through #to_tree; children that convert to
  # nil (e.g. empty text) are skipped.
  #
  # @param element_node [Canon::Xml::Nodes::ElementNode] Element node
  # @return [Core::TreeNode] Tree node
  def to_tree_from_canon_element(element_node)
    tree_node = Core::TreeNode.new(
      label: element_node.name.downcase, # Lowercase for HTML
      value: nil, # Elements don't have values
      attributes: extract_canon_attributes(element_node),
      children: [],
      source_node: element_node, # Preserve reference to Canon node
    )

    element_node.children.each do |child|
      converted = to_tree(child)
      tree_node.add_child(converted) if converted
    end

    tree_node
  end

  # Convert Canon::Xml::Nodes::TextNode to TreeNode
  #
  # The text is not stripped; only a truly empty string yields nil.
  #
  # @param text_node [Canon::Xml::Nodes::TextNode] Text node
  # @return [Core::TreeNode, nil] Tree node or nil for empty text
  def to_tree_from_canon_text(text_node)
    text_value = text_node.value.to_s
    return nil if text_value.empty?

    Core::TreeNode.new(
      label: "text",
      value: text_value,
      attributes: {},
      children: [],
      source_node: text_node,
    )
  end

  # Convert Canon::Xml::Nodes::CommentNode to TreeNode
  #
  # @param comment_node [Canon::Xml::Nodes::CommentNode] Comment node
  # @return [Core::TreeNode] Tree node
  def to_tree_from_canon_comment(comment_node)
    Core::TreeNode.new(
      label: "comment",
      value: comment_node.value,
      attributes: {},
      children: [],
      source_node: comment_node,
    )
  end

  # Extract attributes from Canon::Xml::Nodes::ElementNode
  #
  # @param element_node [Canon::Xml::Nodes::ElementNode] Element node
  # @return [Hash] Attributes hash (preserves order, filters xmlns
  #   namespace declarations like the Nokogiri path)
  def extract_canon_attributes(element_node)
    element_node.attribute_nodes.each_with_object({}) do |attr, kept|
      kept[attr.name] = attr.value unless namespace_declaration?(attr.name)
    end
  end
end
|
|
314
|
+
end
|
|
315
|
+
end
|
|
316
|
+
end
|