canon 0.1.5 → 0.1.7
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +163 -67
- data/README.adoc +400 -7
- data/docs/Gemfile +9 -0
- data/docs/INDEX.adoc +99 -182
- data/docs/_config.yml +100 -0
- data/docs/advanced/diff-classification.adoc +547 -0
- data/docs/advanced/diff-pipeline.adoc +358 -0
- data/docs/advanced/index.adoc +214 -0
- data/docs/advanced/semantic-diff-report.adoc +390 -0
- data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
- data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
- data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
- data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
- data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
- data/docs/features/diff-formatting/display-filtering.adoc +472 -0
- data/docs/features/diff-formatting/index.adoc +140 -0
- data/docs/features/environment-configuration/index.adoc +327 -0
- data/docs/features/environment-configuration/override-system.adoc +436 -0
- data/docs/features/environment-configuration/size-limits.adoc +273 -0
- data/docs/features/index.adoc +173 -0
- data/docs/features/input-validation/index.adoc +521 -0
- data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
- data/docs/features/match-options/html-policies.adoc +312 -0
- data/docs/features/match-options/index.adoc +621 -0
- data/docs/getting-started/index.adoc +83 -0
- data/docs/getting-started/quick-start.adoc +76 -0
- data/docs/guides/choosing-configuration.adoc +689 -0
- data/docs/guides/index.adoc +181 -0
- data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
- data/docs/interfaces/index.adoc +101 -0
- data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
- data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
- data/docs/lychee.toml +65 -0
- data/docs/reference/cli-options.adoc +418 -0
- data/docs/reference/environment-variables.adoc +375 -0
- data/docs/reference/index.adoc +204 -0
- data/docs/reference/options-across-interfaces.adoc +417 -0
- data/docs/understanding/algorithms/dom-diff.adoc +389 -0
- data/docs/understanding/algorithms/index.adoc +314 -0
- data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
- data/docs/understanding/architecture.adoc +447 -0
- data/docs/understanding/comparison-pipeline.adoc +317 -0
- data/docs/understanding/formats/html.adoc +380 -0
- data/docs/understanding/formats/index.adoc +261 -0
- data/docs/understanding/formats/json.adoc +390 -0
- data/docs/understanding/formats/xml.adoc +366 -0
- data/docs/understanding/formats/yaml.adoc +504 -0
- data/docs/understanding/index.adoc +130 -0
- data/lib/canon/cli.rb +42 -1
- data/lib/canon/commands/diff_command.rb +108 -23
- data/lib/canon/comparison/compare_profile.rb +101 -0
- data/lib/canon/comparison/comparison_result.rb +41 -2
- data/lib/canon/comparison/html_comparator.rb +292 -71
- data/lib/canon/comparison/html_compare_profile.rb +117 -0
- data/lib/canon/comparison/match_options.rb +42 -4
- data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
- data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
- data/lib/canon/comparison/xml_comparator.rb +695 -91
- data/lib/canon/comparison.rb +207 -2
- data/lib/canon/config/env_provider.rb +71 -0
- data/lib/canon/config/env_schema.rb +58 -0
- data/lib/canon/config/override_resolver.rb +55 -0
- data/lib/canon/config/type_converter.rb +59 -0
- data/lib/canon/config.rb +158 -29
- data/lib/canon/data_model.rb +29 -0
- data/lib/canon/diff/diff_classifier.rb +74 -14
- data/lib/canon/diff/diff_context_builder.rb +41 -0
- data/lib/canon/diff/diff_line.rb +18 -2
- data/lib/canon/diff/diff_node.rb +18 -3
- data/lib/canon/diff/diff_node_mapper.rb +71 -12
- data/lib/canon/diff/formatting_detector.rb +53 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
- data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
- data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
- data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
- data/lib/canon/diff_formatter/debug_output.rb +7 -1
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
- data/lib/canon/diff_formatter/legend.rb +42 -0
- data/lib/canon/diff_formatter.rb +78 -9
- data/lib/canon/errors.rb +56 -0
- data/lib/canon/formatters/html_formatter_base.rb +35 -1
- data/lib/canon/formatters/json_formatter.rb +3 -0
- data/lib/canon/formatters/yaml_formatter.rb +3 -0
- data/lib/canon/html/data_model.rb +229 -0
- data/lib/canon/html.rb +9 -0
- data/lib/canon/options/cli_generator.rb +70 -0
- data/lib/canon/options/registry.rb +234 -0
- data/lib/canon/rspec_matchers.rb +34 -13
- data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
- data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
- data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
- data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
- data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
- data/lib/canon/tree_diff/core/matching.rb +241 -0
- data/lib/canon/tree_diff/core/node_signature.rb +164 -0
- data/lib/canon/tree_diff/core/node_weight.rb +135 -0
- data/lib/canon/tree_diff/core/tree_node.rb +450 -0
- data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
- data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
- data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
- data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
- data/lib/canon/tree_diff/operation_converter.rb +631 -0
- data/lib/canon/tree_diff/operations/operation.rb +92 -0
- data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
- data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
- data/lib/canon/tree_diff.rb +33 -0
- data/lib/canon/validators/json_validator.rb +3 -1
- data/lib/canon/validators/yaml_validator.rb +3 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +22 -23
- data/lib/canon/xml/element_matcher.rb +128 -20
- data/lib/canon/xml/namespace_helper.rb +110 -0
- data/lib/canon.rb +3 -0
- metadata +81 -23
- data/_config.yml +0 -116
- data/docs/ADVANCED_TOPICS.adoc +0 -20
- data/docs/BASIC_USAGE.adoc +0 -16
- data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/docs/DIFF_FORMATTING.adoc +0 -540
- data/docs/FORMATS.adoc +0 -447
- data/docs/INPUT_VALIDATION.adoc +0 -477
- data/docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/docs/MATCH_OPTIONS.adoc +0 -719
- data/docs/MODES.adoc +0 -432
- data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/docs/OPTIONS.adoc +0 -1387
- data/docs/PREPROCESSING.adoc +0 -491
- data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
- data/docs/UNDERSTANDING_CANON.adoc +0 -17
|
data/lib/canon/comparison/html_comparator.rb +292 -71
@@ -1,11 +1,16 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "nokogiri"
|
|
4
|
+
require_relative "../comparison" # Load base module with constants first
|
|
4
5
|
require_relative "xml_comparator"
|
|
5
6
|
require_relative "match_options"
|
|
6
7
|
require_relative "comparison_result"
|
|
8
|
+
require_relative "compare_profile"
|
|
9
|
+
require_relative "html_compare_profile"
|
|
7
10
|
require_relative "../diff/diff_node"
|
|
8
11
|
require_relative "../diff/diff_classifier"
|
|
12
|
+
require_relative "strategies/match_strategy_factory"
|
|
13
|
+
require_relative "../html/data_model"
|
|
9
14
|
|
|
10
15
|
module Canon
|
|
11
16
|
module Comparison
|
|
@@ -59,24 +64,41 @@ module Canon
|
|
|
59
64
|
global_options: opts[:global_options],
|
|
60
65
|
)
|
|
61
66
|
|
|
67
|
+
# Parse nodes to detect HTML version before creating profile
|
|
68
|
+
# We need to parse early to know if we're dealing with HTML4 or HTML5
|
|
69
|
+
node1 = parse_node(html1, match_opts_hash[:preprocessing],
|
|
70
|
+
match_opts_hash)
|
|
71
|
+
node2 = parse_node(html2, match_opts_hash[:preprocessing],
|
|
72
|
+
match_opts_hash)
|
|
73
|
+
|
|
74
|
+
# Detect HTML version from parsed nodes
|
|
75
|
+
html_version = detect_html_version_from_node(node1)
|
|
76
|
+
|
|
77
|
+
# Create HTML-specific compare profile
|
|
78
|
+
compare_profile = HtmlCompareProfile.new(
|
|
79
|
+
match_opts_hash,
|
|
80
|
+
html_version: html_version,
|
|
81
|
+
)
|
|
82
|
+
|
|
62
83
|
# Wrap in ResolvedMatchOptions for DiffClassifier
|
|
63
84
|
match_opts = Canon::Comparison::ResolvedMatchOptions.new(
|
|
64
85
|
match_opts_hash,
|
|
65
86
|
format: :html,
|
|
87
|
+
compare_profile: compare_profile,
|
|
66
88
|
)
|
|
67
89
|
|
|
68
90
|
# Store resolved match options hash for use in comparison logic
|
|
69
91
|
opts[:match_opts] = match_opts_hash
|
|
70
92
|
|
|
93
|
+
# Use tree diff if semantic_diff option is enabled
|
|
94
|
+
if match_opts.semantic_diff?
|
|
95
|
+
return perform_semantic_tree_diff(html1, html2, opts,
|
|
96
|
+
match_opts_hash)
|
|
97
|
+
end
|
|
98
|
+
|
|
71
99
|
# Create child_opts with resolved options
|
|
72
100
|
child_opts = opts.merge(child_opts)
|
|
73
101
|
|
|
74
|
-
# Parse nodes if they are strings, applying preprocessing if needed
|
|
75
|
-
node1 = parse_node(html1, match_opts_hash[:preprocessing],
|
|
76
|
-
match_opts_hash)
|
|
77
|
-
node2 = parse_node(html2, match_opts_hash[:preprocessing],
|
|
78
|
-
match_opts_hash)
|
|
79
|
-
|
|
80
102
|
# Serialize preprocessed nodes for diff display (avoid re-preprocessing)
|
|
81
103
|
preprocessed_str1 = serialize_for_display(node1)
|
|
82
104
|
preprocessed_str2 = serialize_for_display(node2)
|
|
@@ -86,11 +108,19 @@ module Canon
|
|
|
86
108
|
|
|
87
109
|
# DocumentFragment nodes need special handling - compare their children
|
|
88
110
|
# instead of the fragment nodes themselves
|
|
89
|
-
if node1.is_a?(Nokogiri::HTML4::DocumentFragment)
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
111
|
+
if (node1.is_a?(Nokogiri::HTML4::DocumentFragment) ||
|
|
112
|
+
node1.is_a?(Nokogiri::XML::DocumentFragment)) &&
|
|
113
|
+
(node2.is_a?(Nokogiri::HTML4::DocumentFragment) ||
|
|
114
|
+
node2.is_a?(Nokogiri::XML::DocumentFragment))
|
|
115
|
+
# Compare children of fragments - filter them first
|
|
116
|
+
all_children1 = node1.children.to_a
|
|
117
|
+
all_children2 = node2.children.to_a
|
|
118
|
+
|
|
119
|
+
# Filter children based on match options (e.g., ignore comments)
|
|
120
|
+
children1 = XmlComparator.send(:filter_children, all_children1,
|
|
121
|
+
opts)
|
|
122
|
+
children2 = XmlComparator.send(:filter_children, all_children2,
|
|
123
|
+
opts)
|
|
94
124
|
|
|
95
125
|
if children1.length != children2.length
|
|
96
126
|
result = Comparison::UNEQUAL_ELEMENTS
|
|
@@ -129,79 +159,205 @@ module Canon
|
|
|
129
159
|
format: :html,
|
|
130
160
|
html_version: detect_html_version_from_node(node1),
|
|
131
161
|
match_options: match_opts_hash,
|
|
162
|
+
algorithm: :dom,
|
|
132
163
|
)
|
|
164
|
+
elsif result != Comparison::EQUIVALENT && !differences.empty?
|
|
165
|
+
# Non-verbose mode: check equivalence
|
|
166
|
+
# If comparison found differences, classify them to determine if normative
|
|
167
|
+
classifier = Canon::Diff::DiffClassifier.new(match_opts)
|
|
168
|
+
classifier.classify_all(differences.select do |d|
|
|
169
|
+
d.is_a?(Canon::Diff::DiffNode)
|
|
170
|
+
end)
|
|
171
|
+
# Equivalent if no normative differences (matches semantic algorithm)
|
|
172
|
+
differences.none?(&:normative?)
|
|
133
173
|
else
|
|
174
|
+
# Either equivalent or no differences tracked
|
|
134
175
|
result == Comparison::EQUIVALENT
|
|
135
176
|
end
|
|
136
177
|
end
|
|
137
178
|
|
|
138
179
|
private
|
|
139
180
|
|
|
181
|
+
# Perform semantic tree diff using SemanticTreeMatchStrategy
|
|
182
|
+
#
|
|
183
|
+
# @param html1 [String, Nokogiri::HTML::Document] First HTML
|
|
184
|
+
# @param html2 [String, Nokogiri::HTML::Document] Second HTML
|
|
185
|
+
# @param opts [Hash] Comparison options
|
|
186
|
+
# @param match_opts_hash [Hash] Resolved match options
|
|
187
|
+
# @return [Boolean, ComparisonResult] Result of tree diff comparison
|
|
188
|
+
def perform_semantic_tree_diff(html1, html2, opts, match_opts_hash)
|
|
189
|
+
# Parse to Canon::Xml::Node (preserves preprocessing)
|
|
190
|
+
# For HTML, we parse as XML to get Canon::Xml::Node structure
|
|
191
|
+
node1 = parse_node_for_semantic(html1,
|
|
192
|
+
match_opts_hash[:preprocessing])
|
|
193
|
+
node2 = parse_node_for_semantic(html2,
|
|
194
|
+
match_opts_hash[:preprocessing])
|
|
195
|
+
|
|
196
|
+
# Create strategy using factory
|
|
197
|
+
strategy = Strategies::MatchStrategyFactory.create(
|
|
198
|
+
format: :html,
|
|
199
|
+
match_options: match_opts_hash,
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
# Pass Canon::Xml::Node directly - adapter now handles it
|
|
203
|
+
differences = strategy.match(node1, node2)
|
|
204
|
+
|
|
205
|
+
# Return based on verbose mode
|
|
206
|
+
if opts[:verbose]
|
|
207
|
+
# Get preprocessed strings for display
|
|
208
|
+
preprocessed = strategy.preprocess_for_display(node1, node2)
|
|
209
|
+
|
|
210
|
+
# Detect HTML version (default to HTML5 for Canon nodes)
|
|
211
|
+
html_version = :html5
|
|
212
|
+
|
|
213
|
+
# Return ComparisonResult with strategy metadata
|
|
214
|
+
ComparisonResult.new(
|
|
215
|
+
differences: differences,
|
|
216
|
+
preprocessed_strings: preprocessed,
|
|
217
|
+
format: :html,
|
|
218
|
+
html_version: html_version,
|
|
219
|
+
match_options: match_opts_hash.merge(strategy.metadata),
|
|
220
|
+
algorithm: :semantic,
|
|
221
|
+
)
|
|
222
|
+
else
|
|
223
|
+
# Simple boolean result - equivalent if no normative differences
|
|
224
|
+
differences.none?(&:normative?)
|
|
225
|
+
end
|
|
226
|
+
end
|
|
227
|
+
|
|
228
|
+
# Parse node as fragment to preserve actual content
|
|
229
|
+
# Uses HTML4.fragment or HTML5.fragment based on content detection
|
|
230
|
+
#
|
|
231
|
+
# @param node [String, Nokogiri node] Node to parse
|
|
232
|
+
# @param preprocessing [Symbol] Preprocessing mode
|
|
233
|
+
# @param match_opts [Hash] Match options
|
|
234
|
+
# @return [Nokogiri::HTML::DocumentFragment] Parsed fragment
|
|
235
|
+
def parse_node_as_fragment(node, preprocessing = :none, match_opts = {})
|
|
236
|
+
# If already an XML fragment (no meta tags), return it
|
|
237
|
+
if node.is_a?(Nokogiri::XML::DocumentFragment)
|
|
238
|
+
return node
|
|
239
|
+
end
|
|
240
|
+
|
|
241
|
+
# Convert HTML fragments to string and re-parse as XML to remove phantom tags
|
|
242
|
+
# This handles cases where pre-parsed HTML4/HTML5 fragments have auto-inserted meta
|
|
243
|
+
html_string = if node.is_a?(Nokogiri::HTML4::DocumentFragment) ||
|
|
244
|
+
node.is_a?(Nokogiri::HTML5::DocumentFragment)
|
|
245
|
+
node.to_s # Use to_s to avoid re-inserting meta tags
|
|
246
|
+
elsif node.is_a?(String)
|
|
247
|
+
node
|
|
248
|
+
else
|
|
249
|
+
node.to_html
|
|
250
|
+
end
|
|
251
|
+
|
|
252
|
+
# Use XML fragment parser to preserve structure without auto-generated elements
|
|
253
|
+
# This avoids both HTML4's meta tag insertion and HTML5's tag stripping
|
|
254
|
+
# See: https://stackoverflow.com/questions/25998824/stop-nokogiri-from-adding-doctype-and-meta-tags
|
|
255
|
+
frag = Nokogiri::XML.fragment(html_string)
|
|
256
|
+
|
|
257
|
+
# Apply preprocessing if needed
|
|
258
|
+
if preprocessing == :rendered
|
|
259
|
+
normalize_html_style_script_comments(frag)
|
|
260
|
+
normalize_rendered_whitespace(frag, match_opts)
|
|
261
|
+
remove_whitespace_only_text_nodes(frag)
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
frag
|
|
265
|
+
end
|
|
266
|
+
|
|
267
|
+
# Parse HTML for semantic tree diff using Canon::Html::DataModel
|
|
268
|
+
# Returns Canon::Xml::Node for preprocessing preservation
|
|
269
|
+
#
|
|
270
|
+
# @param html [String, Object] HTML to parse
|
|
271
|
+
# @param preprocessing [Symbol] Preprocessing mode
|
|
272
|
+
# @return [Canon::Xml::Node] Parsed Canon node
|
|
273
|
+
def parse_node_for_semantic(html, preprocessing = :none)
|
|
274
|
+
# If already a Canon::Xml::Node, return as-is
|
|
275
|
+
return html if html.is_a?(Canon::Xml::Node)
|
|
276
|
+
|
|
277
|
+
# Convert to string if needed
|
|
278
|
+
html_string = if html.is_a?(String)
|
|
279
|
+
html
|
|
280
|
+
elsif html.respond_to?(:to_html)
|
|
281
|
+
html.to_html
|
|
282
|
+
elsif html.respond_to?(:to_s)
|
|
283
|
+
html.to_s
|
|
284
|
+
else
|
|
285
|
+
raise Canon::Error,
|
|
286
|
+
"Unable to convert HTML to string: #{html.class}"
|
|
287
|
+
end
|
|
288
|
+
|
|
289
|
+
# Strip DOCTYPE for consistent parsing
|
|
290
|
+
html_string = html_string.gsub(/<!DOCTYPE[^>]*>/i, "").strip
|
|
291
|
+
|
|
292
|
+
# Apply preprocessing to HTML string before parsing
|
|
293
|
+
processed_html = case preprocessing
|
|
294
|
+
when :normalize
|
|
295
|
+
# Normalize whitespace
|
|
296
|
+
html_string.lines.map(&:strip).reject(&:empty?).join("\n")
|
|
297
|
+
when :c14n
|
|
298
|
+
# Canonicalize
|
|
299
|
+
Canon::Xml::C14n.canonicalize(html_string,
|
|
300
|
+
with_comments: false)
|
|
301
|
+
when :format
|
|
302
|
+
# Pretty format
|
|
303
|
+
Canon.format(html_string, :html)
|
|
304
|
+
else
|
|
305
|
+
# :none or unrecognized
|
|
306
|
+
html_string
|
|
307
|
+
end
|
|
308
|
+
|
|
309
|
+
# Parse using Canon::Html::DataModel to get Canon::Xml::Node
|
|
310
|
+
# HTML parsing with proper HTML-specific handling
|
|
311
|
+
Canon::Html::DataModel.from_html(processed_html)
|
|
312
|
+
end
|
|
313
|
+
|
|
140
314
|
# Parse a node from string or return as-is
|
|
141
315
|
# Applies preprocessing transformation before parsing if specified
|
|
316
|
+
# For DOM comparison, returns Nokogiri nodes (not Canon::Xml::Node)
|
|
142
317
|
def parse_node(node, preprocessing = :none, match_opts = {})
|
|
143
318
|
# If already a Nokogiri node, check for incompatible XML documents
|
|
144
|
-
# Only raise error for non-string incompatible formats
|
|
145
319
|
unless node.is_a?(String)
|
|
146
320
|
# Detect if this is an XML document (not HTML)
|
|
147
|
-
# Strings are allowed since they can be wrapped/parsed as needed
|
|
148
321
|
if is_xml_document?(node)
|
|
149
322
|
raise Canon::CompareFormatMismatchError.new(:xml, :html)
|
|
150
323
|
end
|
|
151
324
|
|
|
152
|
-
#
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
325
|
+
# Normalize HTML documents to fragments to avoid DTD differences
|
|
326
|
+
# This ensures comparing string with document works correctly
|
|
327
|
+
if node.is_a?(Nokogiri::HTML::Document) ||
|
|
328
|
+
node.is_a?(Nokogiri::HTML4::Document) ||
|
|
329
|
+
node.is_a?(Nokogiri::HTML5::Document)
|
|
330
|
+
# Get root element and create fragment from its outer HTML
|
|
331
|
+
# This avoids DOCTYPE and other document-level nodes
|
|
332
|
+
root = node.at_css("html") || node.root
|
|
333
|
+
if root
|
|
334
|
+
node = Nokogiri::XML.fragment(root.to_html)
|
|
162
335
|
end
|
|
336
|
+
end
|
|
163
337
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
338
|
+
# For :rendered preprocessing with Nokogiri nodes
|
|
339
|
+
if preprocessing == :rendered
|
|
340
|
+
# Normalize and return
|
|
341
|
+
frag = node.is_a?(Nokogiri::XML::DocumentFragment) ? node : Nokogiri::XML.fragment(node.to_html)
|
|
342
|
+
normalize_html_style_script_comments(frag)
|
|
343
|
+
normalize_rendered_whitespace(frag, match_opts)
|
|
344
|
+
remove_whitespace_only_text_nodes(frag)
|
|
345
|
+
return frag
|
|
168
346
|
end
|
|
169
347
|
|
|
170
|
-
#
|
|
348
|
+
# Return Nokogiri node (now normalized if it was a document)
|
|
171
349
|
return node
|
|
172
350
|
end
|
|
173
351
|
|
|
174
352
|
# Check if string contains XML declaration but is actually HTML
|
|
175
|
-
# Nokogiri::HTML4.to_s adds <?xml...?> but the content is still HTML
|
|
176
|
-
# Check if this is actually HTML content after the declaration
|
|
177
|
-
# Look for <html tag which indicates HTML
|
|
178
353
|
if node.strip.start_with?("<?xml") && !node.match?(/<html[\s>]/i)
|
|
179
354
|
# No <html> tag, this is likely pure XML
|
|
180
355
|
raise Canon::CompareFormatMismatchError.new(:xml, :html)
|
|
181
356
|
end
|
|
182
357
|
|
|
183
|
-
#
|
|
184
|
-
# (
|
|
185
|
-
|
|
186
|
-
# For :rendered preprocessing, handle separately to avoid double-parsing
|
|
187
|
-
if preprocessing == :rendered
|
|
188
|
-
# Check if this is a full HTML document or a fragment
|
|
189
|
-
# Use full document parsing if it has <html> tag
|
|
190
|
-
if node.match?(/<html[\s>]/i)
|
|
191
|
-
doc = Nokogiri::HTML(node, &:noblanks)
|
|
192
|
-
normalize_html_style_script_comments(doc)
|
|
193
|
-
normalize_rendered_whitespace(doc, match_opts)
|
|
194
|
-
remove_whitespace_only_text_nodes(doc)
|
|
195
|
-
return doc
|
|
196
|
-
else
|
|
197
|
-
# Use fragment for partial HTML
|
|
198
|
-
frag = Nokogiri::HTML4.fragment(node)
|
|
199
|
-
normalize_html_style_script_comments(frag)
|
|
200
|
-
normalize_rendered_whitespace(frag, match_opts)
|
|
201
|
-
remove_whitespace_only_text_nodes(frag)
|
|
202
|
-
return frag
|
|
203
|
-
end
|
|
204
|
-
end
|
|
358
|
+
# Strip DOCTYPE declarations from HTML strings
|
|
359
|
+
# This normalizes parsed HTML (which includes DOCTYPE) with raw HTML strings
|
|
360
|
+
node = node.gsub(/<!DOCTYPE[^>]*>/i, "").strip
|
|
205
361
|
|
|
206
362
|
# Apply preprocessing to HTML string before parsing
|
|
207
363
|
html_string = case preprocessing
|
|
@@ -216,15 +372,52 @@ module Canon
|
|
|
216
372
|
# Pretty format the HTML
|
|
217
373
|
Canon.format(node, :html)
|
|
218
374
|
else
|
|
219
|
-
# :none or unrecognized - use as-is
|
|
375
|
+
# :none, :rendered or unrecognized - use as-is
|
|
220
376
|
node
|
|
221
377
|
end
|
|
222
378
|
|
|
223
|
-
#
|
|
224
|
-
# Use
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
379
|
+
# Parse as Nokogiri fragment for DOM comparison
|
|
380
|
+
# Use XML fragment parser to avoid auto-inserted meta tags
|
|
381
|
+
frag = Nokogiri::XML.fragment(html_string)
|
|
382
|
+
|
|
383
|
+
# Apply :rendered preprocessing if needed
|
|
384
|
+
if preprocessing == :rendered
|
|
385
|
+
normalize_html_style_script_comments(frag)
|
|
386
|
+
normalize_rendered_whitespace(frag, match_opts)
|
|
387
|
+
remove_whitespace_only_text_nodes(frag)
|
|
388
|
+
end
|
|
389
|
+
|
|
390
|
+
frag
|
|
391
|
+
end
|
|
392
|
+
|
|
393
|
+
# Normalize HTML comments within style and script tags for DataModel nodes
|
|
394
|
+
def normalize_html_style_script_comments_datamodel(root)
|
|
395
|
+
# Walk the tree to find style/script elements
|
|
396
|
+
find_and_normalize_style_script(root)
|
|
397
|
+
end
|
|
398
|
+
|
|
399
|
+
def find_and_normalize_style_script(node)
|
|
400
|
+
return unless node.respond_to?(:children)
|
|
401
|
+
|
|
402
|
+
node.children.each do |child|
|
|
403
|
+
next unless child.is_a?(Canon::Xml::Nodes::ElementNode)
|
|
404
|
+
|
|
405
|
+
# If this is a style or script element, normalize its text content
|
|
406
|
+
if %w[style script].include?(child.name.downcase)
|
|
407
|
+
# Get text children and remove HTML comments from them
|
|
408
|
+
child.children.each do |text_child|
|
|
409
|
+
next unless text_child.is_a?(Canon::Xml::Nodes::TextNode)
|
|
410
|
+
|
|
411
|
+
# Remove HTML comments from text content
|
|
412
|
+
normalized = text_child.value.gsub(/<!--.*?-->/m, "").strip
|
|
413
|
+
# Update the text value
|
|
414
|
+
text_child.instance_variable_set(:@value, normalized)
|
|
415
|
+
end
|
|
416
|
+
end
|
|
417
|
+
|
|
418
|
+
# Recursively process children
|
|
419
|
+
find_and_normalize_style_script(child)
|
|
420
|
+
end
|
|
228
421
|
end
|
|
229
422
|
|
|
230
423
|
# Detect HTML version from content
|
|
@@ -244,12 +437,12 @@ module Canon
|
|
|
244
437
|
end
|
|
245
438
|
end
|
|
246
439
|
|
|
247
|
-
# Detect HTML version from
|
|
440
|
+
# Detect HTML version from node
|
|
248
441
|
#
|
|
249
|
-
# @param node [Nokogiri::XML::Node]
|
|
442
|
+
# @param node [Canon::Xml::Node, Nokogiri::XML::Node] HTML node
|
|
250
443
|
# @return [Symbol] :html5 or :html4
|
|
251
444
|
def detect_html_version_from_node(node)
|
|
252
|
-
# Check node type
|
|
445
|
+
# Check node type for Nokogiri
|
|
253
446
|
if node.is_a?(Nokogiri::HTML5::Document) ||
|
|
254
447
|
node.is_a?(Nokogiri::HTML5::DocumentFragment)
|
|
255
448
|
:html5
|
|
@@ -257,20 +450,27 @@ module Canon
|
|
|
257
450
|
node.is_a?(Nokogiri::HTML4::DocumentFragment)
|
|
258
451
|
:html4
|
|
259
452
|
else
|
|
260
|
-
# Default to
|
|
261
|
-
:
|
|
453
|
+
# Default to HTML5 for Canon::Xml::Node and unknown types
|
|
454
|
+
:html5
|
|
262
455
|
end
|
|
263
456
|
end
|
|
264
457
|
|
|
265
458
|
# Serialize node to string for diff display
|
|
266
459
|
# This ensures the displayed diff matches what was compared
|
|
267
460
|
#
|
|
268
|
-
# @param node [Nokogiri::HTML::Document] Parsed
|
|
461
|
+
# @param node [Canon::Xml::Node, Nokogiri::HTML::Document] Parsed node
|
|
269
462
|
# @return [String] Serialized HTML string
|
|
270
463
|
def serialize_for_display(node)
|
|
271
|
-
#
|
|
272
|
-
|
|
273
|
-
|
|
464
|
+
# Use XmlComparator's serializer for Canon::Xml::Node
|
|
465
|
+
if node.is_a?(Canon::Xml::Node)
|
|
466
|
+
XmlComparator.send(:serialize_node_to_xml, node)
|
|
467
|
+
elsif node.respond_to?(:to_html)
|
|
468
|
+
node.to_html
|
|
469
|
+
elsif node.respond_to?(:to_xml)
|
|
470
|
+
node.to_xml
|
|
471
|
+
else
|
|
472
|
+
node.to_s
|
|
473
|
+
end
|
|
274
474
|
end
|
|
275
475
|
|
|
276
476
|
# Normalize HTML comments within style and script tags
|
|
@@ -301,14 +501,25 @@ module Canon
|
|
|
301
501
|
#
|
|
302
502
|
# @param doc [Nokogiri::HTML::Document] Document to normalize
|
|
303
503
|
# @param match_opts [Hash] Match options to respect during normalization
|
|
304
|
-
|
|
504
|
+
# @param compare_profile [HtmlCompareProfile] Optional profile for whitespace rules
|
|
505
|
+
def normalize_rendered_whitespace(doc, match_opts = {},
|
|
506
|
+
compare_profile = nil)
|
|
305
507
|
# If text_content is :strict, don't normalize ANY text content
|
|
306
508
|
# This allows users to explicitly request strict text matching
|
|
307
509
|
return if match_opts[:text_content] == :strict
|
|
308
510
|
|
|
309
511
|
# Elements where whitespace is significant - don't normalize
|
|
310
|
-
#
|
|
311
|
-
preserve_whitespace =
|
|
512
|
+
# Use profile if available, otherwise use default list
|
|
513
|
+
preserve_whitespace = if compare_profile.is_a?(HtmlCompareProfile)
|
|
514
|
+
# Profile handles HTML-specific whitespace rules
|
|
515
|
+
%w[pre code textarea script
|
|
516
|
+
style].select do |elem|
|
|
517
|
+
compare_profile.preserve_whitespace?(elem)
|
|
518
|
+
end
|
|
519
|
+
else
|
|
520
|
+
# Fallback to default list
|
|
521
|
+
%w[pre code textarea script style]
|
|
522
|
+
end
|
|
312
523
|
|
|
313
524
|
# Walk all text nodes
|
|
314
525
|
doc.xpath(".//text()").each do |text_node|
|
|
@@ -360,8 +571,18 @@ module Canon
|
|
|
360
571
|
# Remove whitespace-only text nodes from the document
|
|
361
572
|
# These are typically insignificant in HTML rendering (e.g., between
|
|
362
573
|
# block elements)
|
|
574
|
+
#
|
|
575
|
+
# CRITICAL: Do NOT remove whitespace-only text nodes from whitespace-sensitive
|
|
576
|
+
# elements like <pre>, <code>, <textarea>, <script>, <style>
|
|
363
577
|
def remove_whitespace_only_text_nodes(doc)
|
|
578
|
+
# Elements where whitespace is significant - don't remove whitespace-only nodes
|
|
579
|
+
preserve_whitespace = %w[pre code textarea script style]
|
|
580
|
+
|
|
364
581
|
doc.xpath(".//text()").each do |text_node|
|
|
582
|
+
# CRITICAL: Skip if this text node is inside a whitespace-preserving element
|
|
583
|
+
parent = text_node.parent
|
|
584
|
+
next if ancestor_preserves_whitespace?(parent, preserve_whitespace)
|
|
585
|
+
|
|
365
586
|
# Remove if the text is only whitespace (after normalization)
|
|
366
587
|
if text_node.content.strip.empty?
|
|
367
588
|
text_node.remove
|
|
data/lib/canon/comparison/html_compare_profile.rb +117 -0
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "compare_profile"
|
|
4
|
+
|
|
5
|
+
module Canon
|
|
6
|
+
module Comparison
|
|
7
|
+
# HtmlCompareProfile extends CompareProfile with HTML-specific comparison policies
|
|
8
|
+
#
|
|
9
|
+
# HTML has different semantics than XML:
|
|
10
|
+
# 1. Comments are presentational (default to :ignore unless explicitly :strict)
|
|
11
|
+
# 2. Whitespace preservation required in specific elements
|
|
12
|
+
# 3. Case sensitivity differs between HTML4 and HTML5
|
|
13
|
+
# 4. Self-closing tags handled differently
|
|
14
|
+
#
|
|
15
|
+
# This class provides HTML-specific policy decisions while maintaining
|
|
16
|
+
# the separation of concerns established by CompareProfile.
|
|
17
|
+
class HtmlCompareProfile < CompareProfile
|
|
18
|
+
attr_reader :html_version
|
|
19
|
+
|
|
20
|
+
# @param match_options [ResolvedMatchOptions, Hash] The match options to use
|
|
21
|
+
# @param html_version [Symbol] The HTML version (:html4 or :html5)
|
|
22
|
+
def initialize(match_options, html_version: :html5)
|
|
23
|
+
super(match_options)
|
|
24
|
+
@html_version = html_version
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
# Override for HTML-specific comment handling
|
|
28
|
+
#
|
|
29
|
+
# In HTML, comments are presentational content (not part of the DOM semantics)
|
|
30
|
+
# unless explicitly set to :strict. This differs from XML where comments
|
|
31
|
+
# may carry semantic meaning.
|
|
32
|
+
#
|
|
33
|
+
# HTML default for comments is :ignore, so comments don't affect equivalence
|
|
34
|
+
# unless the user explicitly sets comments: :strict
|
|
35
|
+
#
|
|
36
|
+
# @param dimension [Symbol] The match dimension to check
|
|
37
|
+
# @return [Boolean] true if differences affect equivalence
|
|
38
|
+
def affects_equivalence?(dimension)
|
|
39
|
+
# Comments in HTML: default is :ignore (presentational)
|
|
40
|
+
# Only affect equivalence if explicitly set to :strict
|
|
41
|
+
if dimension == :comments
|
|
42
|
+
# Check if comments key exists in options
|
|
43
|
+
if match_options.is_a?(Hash)
|
|
44
|
+
# If comments key doesn't exist, default to false (HTML default: ignore)
|
|
45
|
+
return false unless match_options.key?(:comments)
|
|
46
|
+
|
|
47
|
+
# If key exists, check if it's :strict
|
|
48
|
+
return match_options[:comments] == :strict
|
|
49
|
+
elsif match_options.respond_to?(:behavior_for)
|
|
50
|
+
behavior = behavior_for(dimension)
|
|
51
|
+
# In HTML, only :strict makes comments affect equivalence
|
|
52
|
+
return behavior == :strict
|
|
53
|
+
end
|
|
54
|
+
# Default: comments don't affect equivalence in HTML
|
|
55
|
+
return false
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
# All other dimensions use base class behavior
|
|
59
|
+
super
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
# Check if whitespace should be preserved for a given element
|
|
63
|
+
#
|
|
64
|
+
# HTML has specific elements where whitespace is significant:
|
|
65
|
+
# <pre>, <code>, <textarea>, <script>, <style>
|
|
66
|
+
#
|
|
67
|
+
# @param element_name [String] The element name to check
|
|
68
|
+
# @return [Boolean] true if whitespace should be preserved
|
|
69
|
+
def preserve_whitespace?(element_name)
|
|
70
|
+
whitespace_sensitive_elements.include?(element_name.to_s.downcase)
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
# Check if element names should be compared case-sensitively
|
|
74
|
+
#
|
|
75
|
+
# HTML4 is case-insensitive, HTML5 is case-sensitive
|
|
76
|
+
#
|
|
77
|
+
# @return [Boolean] true if case-sensitive comparison
|
|
78
|
+
def case_sensitive?
|
|
79
|
+
@html_version == :html5
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
private
|
|
83
|
+
|
|
84
|
+
# Elements where whitespace is semantically significant in HTML
|
|
85
|
+
# @return [Array<String>] List of element names
|
|
86
|
+
def whitespace_sensitive_elements
|
|
87
|
+
%w[pre code textarea script style]
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
# Check if a dimension is explicitly set to :strict
|
|
91
|
+
# @param dimension [Symbol] The match dimension
|
|
92
|
+
# @return [Boolean] true if explicitly :strict
|
|
93
|
+
def explicitly_strict?(dimension)
|
|
94
|
+
behavior_for(dimension) == :strict
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
# Check if an option was explicitly provided in match_options
|
|
98
|
+
# @param dimension [Symbol] The match dimension
|
|
99
|
+
# @return [Boolean] true if option was explicitly set
|
|
100
|
+
def has_explicit_option?(dimension)
|
|
101
|
+
if match_options.is_a?(Hash)
|
|
102
|
+
match_options.key?(dimension)
|
|
103
|
+
elsif match_options.respond_to?(:[])
|
|
104
|
+
# For ResolvedMatchOptions, check if key exists
|
|
105
|
+
begin
|
|
106
|
+
match_options[dimension]
|
|
107
|
+
true
|
|
108
|
+
rescue StandardError
|
|
109
|
+
false
|
|
110
|
+
end
|
|
111
|
+
else
|
|
112
|
+
false
|
|
113
|
+
end
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
end
|