canon 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +163 -67
- data/README.adoc +400 -7
- data/docs/Gemfile +9 -0
- data/docs/INDEX.adoc +99 -182
- data/docs/_config.yml +100 -0
- data/docs/advanced/diff-classification.adoc +547 -0
- data/docs/advanced/diff-pipeline.adoc +358 -0
- data/docs/advanced/index.adoc +214 -0
- data/docs/advanced/semantic-diff-report.adoc +390 -0
- data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
- data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
- data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
- data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
- data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
- data/docs/features/diff-formatting/display-filtering.adoc +472 -0
- data/docs/features/diff-formatting/index.adoc +140 -0
- data/docs/features/environment-configuration/index.adoc +327 -0
- data/docs/features/environment-configuration/override-system.adoc +436 -0
- data/docs/features/environment-configuration/size-limits.adoc +273 -0
- data/docs/features/index.adoc +173 -0
- data/docs/features/input-validation/index.adoc +521 -0
- data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
- data/docs/features/match-options/html-policies.adoc +312 -0
- data/docs/features/match-options/index.adoc +621 -0
- data/docs/getting-started/index.adoc +83 -0
- data/docs/getting-started/quick-start.adoc +76 -0
- data/docs/guides/choosing-configuration.adoc +689 -0
- data/docs/guides/index.adoc +181 -0
- data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
- data/docs/interfaces/index.adoc +101 -0
- data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
- data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
- data/docs/lychee.toml +65 -0
- data/docs/reference/cli-options.adoc +418 -0
- data/docs/reference/environment-variables.adoc +375 -0
- data/docs/reference/index.adoc +204 -0
- data/docs/reference/options-across-interfaces.adoc +417 -0
- data/docs/understanding/algorithms/dom-diff.adoc +389 -0
- data/docs/understanding/algorithms/index.adoc +314 -0
- data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
- data/docs/understanding/architecture.adoc +447 -0
- data/docs/understanding/comparison-pipeline.adoc +317 -0
- data/docs/understanding/formats/html.adoc +380 -0
- data/docs/understanding/formats/index.adoc +261 -0
- data/docs/understanding/formats/json.adoc +390 -0
- data/docs/understanding/formats/xml.adoc +366 -0
- data/docs/understanding/formats/yaml.adoc +504 -0
- data/docs/understanding/index.adoc +130 -0
- data/lib/canon/cli.rb +42 -1
- data/lib/canon/commands/diff_command.rb +108 -23
- data/lib/canon/comparison/compare_profile.rb +101 -0
- data/lib/canon/comparison/comparison_result.rb +41 -2
- data/lib/canon/comparison/html_comparator.rb +292 -71
- data/lib/canon/comparison/html_compare_profile.rb +117 -0
- data/lib/canon/comparison/match_options.rb +42 -4
- data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
- data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
- data/lib/canon/comparison/xml_comparator.rb +695 -91
- data/lib/canon/comparison.rb +207 -2
- data/lib/canon/config/env_provider.rb +71 -0
- data/lib/canon/config/env_schema.rb +58 -0
- data/lib/canon/config/override_resolver.rb +55 -0
- data/lib/canon/config/type_converter.rb +59 -0
- data/lib/canon/config.rb +158 -29
- data/lib/canon/data_model.rb +29 -0
- data/lib/canon/diff/diff_classifier.rb +74 -14
- data/lib/canon/diff/diff_context_builder.rb +41 -0
- data/lib/canon/diff/diff_line.rb +18 -2
- data/lib/canon/diff/diff_node.rb +18 -3
- data/lib/canon/diff/diff_node_mapper.rb +71 -12
- data/lib/canon/diff/formatting_detector.rb +53 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
- data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
- data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
- data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
- data/lib/canon/diff_formatter/debug_output.rb +7 -1
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
- data/lib/canon/diff_formatter/legend.rb +42 -0
- data/lib/canon/diff_formatter.rb +78 -9
- data/lib/canon/errors.rb +56 -0
- data/lib/canon/formatters/html_formatter_base.rb +35 -1
- data/lib/canon/formatters/json_formatter.rb +3 -0
- data/lib/canon/formatters/yaml_formatter.rb +3 -0
- data/lib/canon/html/data_model.rb +229 -0
- data/lib/canon/html.rb +9 -0
- data/lib/canon/options/cli_generator.rb +70 -0
- data/lib/canon/options/registry.rb +234 -0
- data/lib/canon/rspec_matchers.rb +34 -13
- data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
- data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
- data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
- data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
- data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
- data/lib/canon/tree_diff/core/matching.rb +241 -0
- data/lib/canon/tree_diff/core/node_signature.rb +164 -0
- data/lib/canon/tree_diff/core/node_weight.rb +135 -0
- data/lib/canon/tree_diff/core/tree_node.rb +450 -0
- data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
- data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
- data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
- data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
- data/lib/canon/tree_diff/operation_converter.rb +631 -0
- data/lib/canon/tree_diff/operations/operation.rb +92 -0
- data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
- data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
- data/lib/canon/tree_diff.rb +33 -0
- data/lib/canon/validators/json_validator.rb +3 -1
- data/lib/canon/validators/yaml_validator.rb +3 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +22 -23
- data/lib/canon/xml/element_matcher.rb +128 -20
- data/lib/canon/xml/namespace_helper.rb +110 -0
- data/lib/canon.rb +3 -0
- metadata +81 -23
- data/_config.yml +0 -116
- data/docs/ADVANCED_TOPICS.adoc +0 -20
- data/docs/BASIC_USAGE.adoc +0 -16
- data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/docs/DIFF_FORMATTING.adoc +0 -540
- data/docs/FORMATS.adoc +0 -447
- data/docs/INPUT_VALIDATION.adoc +0 -477
- data/docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/docs/MATCH_OPTIONS.adoc +0 -719
- data/docs/MODES.adoc +0 -432
- data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/docs/OPTIONS.adoc +0 -1387
- data/docs/PREPROCESSING.adoc +0 -491
- data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
- data/docs/UNDERSTANDING_CANON.adoc +0 -17
|
@@ -88,6 +88,48 @@ module Canon
|
|
|
88
88
|
output.join("\n")
|
|
89
89
|
end
|
|
90
90
|
|
|
91
|
+
# Build diff symbol legend
|
|
92
|
+
#
|
|
93
|
+
# @param use_color [Boolean] Whether to use colors
|
|
94
|
+
# @return [String] Formatted diff symbol legend
|
|
95
|
+
def self.build_diff_symbol_legend(use_color: true)
|
|
96
|
+
output = []
|
|
97
|
+
separator = "━" * 60
|
|
98
|
+
|
|
99
|
+
output << colorize("Diff Symbol Legend:", :cyan, :bold, use_color)
|
|
100
|
+
output << colorize(separator, :cyan, :bold, use_color)
|
|
101
|
+
|
|
102
|
+
# Formatting-only changes
|
|
103
|
+
output << colorize("Formatting Changes (cosmetic only):", :yellow,
|
|
104
|
+
:bold, use_color)
|
|
105
|
+
output << " #{colorize('[', :black, :bold,
|
|
106
|
+
use_color)}: Line removed (formatting only - dark gray)"
|
|
107
|
+
output << " #{colorize(']', :white, :bold,
|
|
108
|
+
use_color)}: Line added (formatting only - light gray)"
|
|
109
|
+
output << ""
|
|
110
|
+
|
|
111
|
+
# Informative changes
|
|
112
|
+
output << colorize("Informative Changes (do not affect equivalence):",
|
|
113
|
+
:yellow, :bold, use_color)
|
|
114
|
+
output << " #{colorize('<', :blue, :bold,
|
|
115
|
+
use_color)}: Line removed (informative - blue)"
|
|
116
|
+
output << " #{colorize('>', :cyan, :bold,
|
|
117
|
+
use_color)}: Line added (informative - cyan)"
|
|
118
|
+
output << ""
|
|
119
|
+
|
|
120
|
+
# Normative changes
|
|
121
|
+
output << colorize("Normative Changes (affect equivalence):", :yellow,
|
|
122
|
+
:bold, use_color)
|
|
123
|
+
output << " #{colorize('-', :red, :bold,
|
|
124
|
+
use_color)}: Line removed (normative difference - red)"
|
|
125
|
+
output << " #{colorize('+', :green, :bold,
|
|
126
|
+
use_color)}: Line added (normative difference - green)"
|
|
127
|
+
output << ""
|
|
128
|
+
|
|
129
|
+
output << colorize(separator, :cyan, :bold, use_color)
|
|
130
|
+
output.join("\n")
|
|
131
|
+
end
|
|
132
|
+
|
|
91
133
|
# Format character name for display
|
|
92
134
|
#
|
|
93
135
|
# @param name [String] Unicode character name
|
data/lib/canon/diff_formatter.rb
CHANGED
|
@@ -300,6 +300,18 @@ module Canon
|
|
|
300
300
|
|
|
301
301
|
output = []
|
|
302
302
|
|
|
303
|
+
# Display the algorithm being used
|
|
304
|
+
if comparison_result.is_a?(Canon::Comparison::ComparisonResult)
|
|
305
|
+
algorithm_name = case comparison_result.algorithm
|
|
306
|
+
when :semantic
|
|
307
|
+
"SEMANTIC TREE DIFF"
|
|
308
|
+
else
|
|
309
|
+
"DOM DIFF"
|
|
310
|
+
end
|
|
311
|
+
output << colorize("Algorithm: #{algorithm_name}", :cyan, :bold)
|
|
312
|
+
output << "" # Blank line for spacing
|
|
313
|
+
end
|
|
314
|
+
|
|
303
315
|
# 1. CANON VERBOSE tables (ONLY if CANON_VERBOSE=1)
|
|
304
316
|
verbose_tables = DebugOutput.verbose_tables_only(
|
|
305
317
|
comparison_result,
|
|
@@ -317,12 +329,21 @@ module Canon
|
|
|
317
329
|
)
|
|
318
330
|
end
|
|
319
331
|
|
|
332
|
+
# 2.5. Original Input Strings (ONLY if verbose_diff is enabled)
|
|
333
|
+
if @verbose_diff && comparison_result.is_a?(Canon::Comparison::ComparisonResult)
|
|
334
|
+
original1, original2 = comparison_result.original_strings
|
|
335
|
+
if original1 && original2
|
|
336
|
+
output << format_original_strings(original1, original2)
|
|
337
|
+
end
|
|
338
|
+
end
|
|
339
|
+
|
|
320
340
|
# 3. Main diff output (by-line or by-object) - ALWAYS
|
|
321
341
|
|
|
322
342
|
# Check if comparison result is a ComparisonResult object
|
|
323
343
|
if comparison_result.is_a?(Canon::Comparison::ComparisonResult)
|
|
324
|
-
# Use
|
|
325
|
-
|
|
344
|
+
# Use original strings for line diff to show actual formatting/namespace differences
|
|
345
|
+
# Use preprocessed strings for semantic comparison only
|
|
346
|
+
doc1, doc2 = comparison_result.original_strings
|
|
326
347
|
differences = comparison_result.differences
|
|
327
348
|
html_version = comparison_result.html_version
|
|
328
349
|
elsif comparison_result.is_a?(Hash) && comparison_result[:preprocessed]
|
|
@@ -361,7 +382,7 @@ module Canon
|
|
|
361
382
|
/></, ">\n<"
|
|
362
383
|
),
|
|
363
384
|
Canon::Xml::C14n.canonicalize(actual, with_comments: false).gsub(
|
|
364
|
-
|
|
385
|
+
/>\s+$/, ""
|
|
365
386
|
),
|
|
366
387
|
]
|
|
367
388
|
when :html
|
|
@@ -407,6 +428,43 @@ module Canon
|
|
|
407
428
|
html.to_s
|
|
408
429
|
end
|
|
409
430
|
|
|
431
|
+
# Format original input strings for display (RSpec-style)
|
|
432
|
+
# Shows the actual strings that were passed in before any preprocessing
|
|
433
|
+
#
|
|
434
|
+
# @param original1 [String] First original input string
|
|
435
|
+
# @param original2 [String] Second original input string
|
|
436
|
+
# @return [String] Formatted display of original strings
|
|
437
|
+
def format_original_strings(original1, original2)
|
|
438
|
+
return "" if original1.nil? || original2.nil?
|
|
439
|
+
|
|
440
|
+
output = []
|
|
441
|
+
output << ""
|
|
442
|
+
output << colorize("=" * 70, :cyan, :bold)
|
|
443
|
+
output << colorize(" ORIGINAL INPUT STRINGS", :cyan, :bold)
|
|
444
|
+
output << colorize("=" * 70, :cyan, :bold)
|
|
445
|
+
output << ""
|
|
446
|
+
|
|
447
|
+
# Format expected
|
|
448
|
+
output << colorize("Expected (as string):", :yellow, :bold)
|
|
449
|
+
original1.each_line.with_index do |line, idx|
|
|
450
|
+
output << " #{colorize(sprintf('%4d', idx + 1),
|
|
451
|
+
:blue)} | #{line.chomp}"
|
|
452
|
+
end
|
|
453
|
+
output << ""
|
|
454
|
+
|
|
455
|
+
# Format actual
|
|
456
|
+
output << colorize("Actual (as string):", :yellow, :bold)
|
|
457
|
+
original2.each_line.with_index do |line, idx|
|
|
458
|
+
output << " #{colorize(sprintf('%4d', idx + 1),
|
|
459
|
+
:blue)} | #{line.chomp}"
|
|
460
|
+
end
|
|
461
|
+
output << ""
|
|
462
|
+
output << colorize("=" * 70, :cyan, :bold)
|
|
463
|
+
output << ""
|
|
464
|
+
|
|
465
|
+
output.join("\n")
|
|
466
|
+
end
|
|
467
|
+
|
|
410
468
|
# Build the final visualization map from various customization options
|
|
411
469
|
#
|
|
412
470
|
# @param visualization_map [Hash, nil] Complete custom visualization map
|
|
@@ -455,19 +513,25 @@ module Canon
|
|
|
455
513
|
# Generate by-object diff with tree visualization
|
|
456
514
|
# Delegates to format-specific by-object formatters
|
|
457
515
|
def by_object_diff(differences, format)
|
|
458
|
-
require_relative "diff_formatter/by_object/base_formatter"
|
|
459
|
-
|
|
460
516
|
output = []
|
|
461
517
|
output << colorize("Visual Diff:", :cyan, :bold)
|
|
462
518
|
|
|
519
|
+
# Extract differences array from ComparisonResult if needed
|
|
520
|
+
diffs_array = if differences.is_a?(Canon::Comparison::ComparisonResult)
|
|
521
|
+
differences.differences
|
|
522
|
+
else
|
|
523
|
+
differences
|
|
524
|
+
end
|
|
525
|
+
|
|
463
526
|
# Delegate to format-specific formatter
|
|
464
527
|
formatter = ByObject::BaseFormatter.for_format(
|
|
465
528
|
format,
|
|
466
529
|
use_color: @use_color,
|
|
467
530
|
visualization_map: @visualization_map,
|
|
531
|
+
show_diffs: @show_diffs,
|
|
468
532
|
)
|
|
469
533
|
|
|
470
|
-
output << formatter.format(
|
|
534
|
+
output << formatter.format(diffs_array, format)
|
|
471
535
|
|
|
472
536
|
output.join("\n")
|
|
473
537
|
end
|
|
@@ -476,8 +540,6 @@ module Canon
|
|
|
476
540
|
# Delegates to format-specific by-line formatters
|
|
477
541
|
def by_line_diff(doc1, doc2, format: :xml, html_version: nil,
|
|
478
542
|
differences: [])
|
|
479
|
-
require_relative "diff_formatter/by_line/base_formatter"
|
|
480
|
-
|
|
481
543
|
# For HTML format, use html_version if provided, otherwise default to :html4
|
|
482
544
|
if format == :html && html_version
|
|
483
545
|
format = html_version # Use :html4 or :html5
|
|
@@ -492,6 +554,13 @@ differences: [])
|
|
|
492
554
|
|
|
493
555
|
return output.join("\n") if doc1.nil? || doc2.nil?
|
|
494
556
|
|
|
557
|
+
# Extract differences array from ComparisonResult if needed
|
|
558
|
+
diffs_array = if differences.is_a?(Canon::Comparison::ComparisonResult)
|
|
559
|
+
differences.differences
|
|
560
|
+
else
|
|
561
|
+
differences
|
|
562
|
+
end
|
|
563
|
+
|
|
495
564
|
# Delegate to format-specific formatter
|
|
496
565
|
formatter = ByLine::BaseFormatter.for_format(
|
|
497
566
|
format,
|
|
@@ -500,7 +569,7 @@ differences: [])
|
|
|
500
569
|
diff_grouping_lines: @diff_grouping_lines,
|
|
501
570
|
visualization_map: @visualization_map,
|
|
502
571
|
show_diffs: @show_diffs,
|
|
503
|
-
differences:
|
|
572
|
+
differences: diffs_array,
|
|
504
573
|
)
|
|
505
574
|
|
|
506
575
|
output << formatter.format(doc1, doc2)
|
data/lib/canon/errors.rb
CHANGED
|
@@ -53,4 +53,60 @@ module Canon
|
|
|
53
53
|
parts.join("\n")
|
|
54
54
|
end
|
|
55
55
|
end
|
|
56
|
+
|
|
57
|
+
# Error raised when input exceeds size limits
|
|
58
|
+
#
|
|
59
|
+
# This error is raised when input files or trees exceed configured size
|
|
60
|
+
# limits to prevent performance issues or hangs.
|
|
61
|
+
class SizeLimitExceededError < Error
|
|
62
|
+
attr_reader :limit_type, :actual, :limit
|
|
63
|
+
|
|
64
|
+
# Initialize a new SizeLimitExceededError
|
|
65
|
+
#
|
|
66
|
+
# @param limit_type [Symbol] The type of limit exceeded (:file_size,
|
|
67
|
+
# :node_count, :diff_lines)
|
|
68
|
+
# @param actual [Integer] The actual size that exceeded the limit
|
|
69
|
+
# @param limit [Integer] The configured limit
|
|
70
|
+
def initialize(limit_type, actual, limit)
|
|
71
|
+
@limit_type = limit_type
|
|
72
|
+
@actual = actual
|
|
73
|
+
@limit = limit
|
|
74
|
+
super(build_message)
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
private
|
|
78
|
+
|
|
79
|
+
# Build a descriptive error message
|
|
80
|
+
#
|
|
81
|
+
# @return [String] The formatted error message
|
|
82
|
+
def build_message
|
|
83
|
+
case limit_type
|
|
84
|
+
when :file_size
|
|
85
|
+
"File size (#{format_bytes(actual)}) exceeds limit (#{format_bytes(limit)}). " \
|
|
86
|
+
"Increase limit via CANON_MAX_FILE_SIZE or config.diff.max_file_size"
|
|
87
|
+
when :node_count
|
|
88
|
+
"Tree node count (#{actual}) exceeds limit (#{limit}). " \
|
|
89
|
+
"Increase limit via CANON_MAX_NODE_COUNT or config.diff.max_node_count"
|
|
90
|
+
when :diff_lines
|
|
91
|
+
"Diff output (#{actual} lines) exceeds limit (#{limit} lines). " \
|
|
92
|
+
"Output truncated. Increase limit via CANON_MAX_DIFF_LINES or config.diff.max_diff_lines"
|
|
93
|
+
else
|
|
94
|
+
"Size limit exceeded: #{limit_type} (#{actual} > #{limit})"
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
# Format bytes into human-readable size
|
|
99
|
+
#
|
|
100
|
+
# @param bytes [Integer] Size in bytes
|
|
101
|
+
# @return [String] Formatted size string
|
|
102
|
+
def format_bytes(bytes)
|
|
103
|
+
if bytes < 1024
|
|
104
|
+
"#{bytes} bytes"
|
|
105
|
+
elsif bytes < 1_048_576
|
|
106
|
+
"#{(bytes / 1024.0).round(2)} KB"
|
|
107
|
+
else
|
|
108
|
+
"#{(bytes / 1_048_576.0).round(2)} MB"
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
end
|
|
56
112
|
end
|
|
@@ -49,6 +49,12 @@ module Canon
|
|
|
49
49
|
header hgroup hr li main nav ol p pre section table tbody
|
|
50
50
|
td tfoot th thead tr ul
|
|
51
51
|
].freeze
|
|
52
|
+
|
|
53
|
+
# HTML elements where whitespace is semantically significant
|
|
54
|
+
# and should NOT be normalized
|
|
55
|
+
WHITESPACE_SENSITIVE_ELEMENTS = %w[
|
|
56
|
+
pre code textarea script style
|
|
57
|
+
].freeze
|
|
52
58
|
# Format HTML using canonical form
|
|
53
59
|
# @param html [String] HTML document to canonicalize
|
|
54
60
|
# @return [String] Canonical form of HTML
|
|
@@ -108,6 +114,13 @@ module Canon
|
|
|
108
114
|
doc.traverse do |node|
|
|
109
115
|
next unless node.text?
|
|
110
116
|
|
|
117
|
+
# CRITICAL: Skip normalization for whitespace-sensitive elements
|
|
118
|
+
# In elements like <pre>, <code>, etc., whitespace is semantically
|
|
119
|
+
# significant and must be preserved exactly as-is
|
|
120
|
+
if whitespace_sensitive_element?(node.parent)
|
|
121
|
+
next
|
|
122
|
+
end
|
|
123
|
+
|
|
111
124
|
# Handle whitespace-only text nodes
|
|
112
125
|
if node.text.strip.empty? && node.parent&.element?
|
|
113
126
|
# Check if this text node is between block-level elements
|
|
@@ -156,8 +169,29 @@ module Canon
|
|
|
156
169
|
node&.element? && BLOCK_ELEMENTS.include?(node.name.downcase)
|
|
157
170
|
end
|
|
158
171
|
|
|
172
|
+
# Check if a node is a whitespace-sensitive element
|
|
173
|
+
# @param node [Nokogiri::XML::Node, nil] Node to check
|
|
174
|
+
# @return [Boolean] true if node is whitespace-sensitive
|
|
175
|
+
def self.whitespace_sensitive_element?(node)
|
|
176
|
+
return false unless node&.element?
|
|
177
|
+
|
|
178
|
+
# Check if this element or any ancestor is whitespace-sensitive
|
|
179
|
+
current = node
|
|
180
|
+
while current
|
|
181
|
+
if current.element? && WHITESPACE_SENSITIVE_ELEMENTS.include?(current.name.downcase)
|
|
182
|
+
return true
|
|
183
|
+
end
|
|
184
|
+
# Stop at document root - documents don't have parents
|
|
185
|
+
break if current.is_a?(Nokogiri::XML::Document) || current.is_a?(Nokogiri::HTML5::Document)
|
|
186
|
+
|
|
187
|
+
current = current.parent
|
|
188
|
+
end
|
|
189
|
+
false
|
|
190
|
+
end
|
|
191
|
+
|
|
159
192
|
private_class_method :sort_attributes, :normalize_whitespace,
|
|
160
|
-
:ensure_block_element_spacing, :block_element
|
|
193
|
+
:ensure_block_element_spacing, :block_element?,
|
|
194
|
+
:whitespace_sensitive_element?
|
|
161
195
|
end
|
|
162
196
|
end
|
|
163
197
|
end
|
|
@@ -15,6 +15,9 @@ module Canon
|
|
|
15
15
|
def self.parse(yaml)
|
|
16
16
|
# Validate before parsing
|
|
17
17
|
Canon::Validators::YamlValidator.validate!(yaml)
|
|
18
|
+
# Return as-is if already parsed
|
|
19
|
+
return yaml if yaml.is_a?(Hash) || yaml.is_a?(Array)
|
|
20
|
+
|
|
18
21
|
YAML.safe_load(yaml, permitted_classes: [Symbol, Date, Time])
|
|
19
22
|
end
|
|
20
23
|
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
require_relative "../data_model"
|
|
5
|
+
require_relative "../xml/nodes/root_node"
|
|
6
|
+
require_relative "../xml/nodes/element_node"
|
|
7
|
+
require_relative "../xml/nodes/namespace_node"
|
|
8
|
+
require_relative "../xml/nodes/attribute_node"
|
|
9
|
+
require_relative "../xml/nodes/text_node"
|
|
10
|
+
require_relative "../xml/nodes/comment_node"
|
|
11
|
+
require_relative "../xml/nodes/processing_instruction_node"
|
|
12
|
+
|
|
13
|
+
module Canon
|
|
14
|
+
module Html
|
|
15
|
+
# Builds XPath data model from HTML
|
|
16
|
+
# HTML-specific parsing with lowercase element/attribute names,
|
|
17
|
+
# whitespace-sensitive element handling, and fragment parsing
|
|
18
|
+
class DataModel < Canon::DataModel
|
|
19
|
+
# Build XPath data model from HTML string
|
|
20
|
+
#
|
|
21
|
+
# @param html_string [String] HTML content to parse
|
|
22
|
+
# @param version [Symbol] HTML version (:html4 or :html5)
|
|
23
|
+
# @return [Canon::Xml::Nodes::RootNode] Root of the data model tree
|
|
24
|
+
def self.from_html(html_string, version: :html4)
|
|
25
|
+
# Detect if this is a full document (has <html> tag) or fragment
|
|
26
|
+
# Full documents should use document parser to preserve structure
|
|
27
|
+
# Fragments should use fragment parser to avoid adding implicit wrappers
|
|
28
|
+
is_full_document = html_string.match?(/<html[\s>]/i)
|
|
29
|
+
|
|
30
|
+
# Parse with Nokogiri using appropriate parser
|
|
31
|
+
doc = if is_full_document
|
|
32
|
+
# Full document - use fragment parser to avoid Nokogiri's phantom tag insertion
|
|
33
|
+
# The fragment parser avoids auto-inserted meta tags in HTML4
|
|
34
|
+
if version == :html5
|
|
35
|
+
Nokogiri::HTML5.fragment(html_string)
|
|
36
|
+
else
|
|
37
|
+
Nokogiri::HTML4.fragment(html_string)
|
|
38
|
+
end
|
|
39
|
+
elsif version == :html5
|
|
40
|
+
# Fragment - use fragment parser to avoid implicit wrappers
|
|
41
|
+
Nokogiri::HTML5.fragment(html_string)
|
|
42
|
+
else
|
|
43
|
+
Nokogiri::HTML4.fragment(html_string)
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
# HTML doesn't have strict namespace requirements like XML,
|
|
47
|
+
# so skip the relative namespace URI check
|
|
48
|
+
|
|
49
|
+
# Convert to XPath data model (reuse XML infrastructure)
|
|
50
|
+
build_from_nokogiri(doc)
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
# Alias for compatibility
|
|
54
|
+
def self.parse(html_string, version: :html4)
|
|
55
|
+
from_html(html_string, version: version)
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
# Serialize HTML node to string
|
|
59
|
+
def self.serialize(node)
|
|
60
|
+
# HTML nodes use the same serialization as XML
|
|
61
|
+
# Delegate to XML serialization implementation
|
|
62
|
+
require_relative "../xml/data_model"
|
|
63
|
+
Canon::Xml::DataModel.serialize(node)
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
# Build XPath data model from Nokogiri document or fragment
|
|
67
|
+
def self.build_from_nokogiri(nokogiri_doc)
|
|
68
|
+
root = Canon::Xml::Nodes::RootNode.new
|
|
69
|
+
|
|
70
|
+
if nokogiri_doc.respond_to?(:root) && nokogiri_doc.root
|
|
71
|
+
# For Documents (HTML4, HTML5): process the root element
|
|
72
|
+
root.add_child(build_element_node(nokogiri_doc.root))
|
|
73
|
+
|
|
74
|
+
# Process PIs and comments outside doc element
|
|
75
|
+
nokogiri_doc.children.each do |child|
|
|
76
|
+
next if child == nokogiri_doc.root
|
|
77
|
+
next if child.is_a?(Nokogiri::XML::DTD)
|
|
78
|
+
|
|
79
|
+
node = build_node_from_nokogiri(child)
|
|
80
|
+
root.add_child(node) if node
|
|
81
|
+
end
|
|
82
|
+
else
|
|
83
|
+
# For DocumentFragments: process all children directly
|
|
84
|
+
# Fragments don't have a single .root, they contain multiple top-level nodes
|
|
85
|
+
nokogiri_doc.children.each do |child|
|
|
86
|
+
next if child.is_a?(Nokogiri::XML::DTD)
|
|
87
|
+
|
|
88
|
+
node = build_node_from_nokogiri(child)
|
|
89
|
+
root.add_child(node) if node
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
root
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
# Build node from Nokogiri node
|
|
97
|
+
def self.build_node_from_nokogiri(nokogiri_node)
|
|
98
|
+
case nokogiri_node
|
|
99
|
+
when Nokogiri::XML::Element
|
|
100
|
+
build_element_node(nokogiri_node)
|
|
101
|
+
when Nokogiri::XML::Text
|
|
102
|
+
build_text_node(nokogiri_node)
|
|
103
|
+
when Nokogiri::XML::Comment
|
|
104
|
+
build_comment_node(nokogiri_node)
|
|
105
|
+
when Nokogiri::XML::ProcessingInstruction
|
|
106
|
+
build_pi_node(nokogiri_node)
|
|
107
|
+
end
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
# Build element node from Nokogiri element
|
|
111
|
+
def self.build_element_node(nokogiri_element)
|
|
112
|
+
element = Canon::Xml::Nodes::ElementNode.new(
|
|
113
|
+
name: nokogiri_element.name,
|
|
114
|
+
namespace_uri: nokogiri_element.namespace&.href,
|
|
115
|
+
prefix: nokogiri_element.namespace&.prefix,
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
# Build namespace nodes (includes inherited namespaces)
|
|
119
|
+
build_namespace_nodes(nokogiri_element, element)
|
|
120
|
+
|
|
121
|
+
# Build attribute nodes
|
|
122
|
+
build_attribute_nodes(nokogiri_element, element)
|
|
123
|
+
|
|
124
|
+
# Build child nodes
|
|
125
|
+
nokogiri_element.children.each do |child|
|
|
126
|
+
node = build_node_from_nokogiri(child)
|
|
127
|
+
element.add_child(node) if node
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
element
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
# Build namespace nodes for an element
|
|
134
|
+
def self.build_namespace_nodes(nokogiri_element, element)
|
|
135
|
+
# Collect all in-scope namespaces
|
|
136
|
+
namespaces = collect_in_scope_namespaces(nokogiri_element)
|
|
137
|
+
|
|
138
|
+
namespaces.each do |prefix, uri|
|
|
139
|
+
ns_node = Canon::Xml::Nodes::NamespaceNode.new(
|
|
140
|
+
prefix: prefix,
|
|
141
|
+
uri: uri,
|
|
142
|
+
)
|
|
143
|
+
element.add_namespace(ns_node)
|
|
144
|
+
end
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
# Collect all in-scope namespaces for an element
|
|
148
|
+
def self.collect_in_scope_namespaces(nokogiri_element)
|
|
149
|
+
namespaces = {}
|
|
150
|
+
|
|
151
|
+
# Walk up the tree to collect all namespace declarations
|
|
152
|
+
current = nokogiri_element
|
|
153
|
+
while current && !current.is_a?(Nokogiri::XML::Document)
|
|
154
|
+
if current.is_a?(Nokogiri::XML::Element)
|
|
155
|
+
current.namespace_definitions.each do |ns|
|
|
156
|
+
prefix = ns.prefix || ""
|
|
157
|
+
# Only add if not already defined (child overrides parent)
|
|
158
|
+
unless namespaces.key?(prefix)
|
|
159
|
+
namespaces[prefix] = ns.href
|
|
160
|
+
end
|
|
161
|
+
end
|
|
162
|
+
end
|
|
163
|
+
current = current.parent
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
# Always include xml namespace
|
|
167
|
+
namespaces["xml"] ||= "http://www.w3.org/XML/1998/namespace"
|
|
168
|
+
|
|
169
|
+
namespaces
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
# Build attribute nodes for an element
|
|
173
|
+
def self.build_attribute_nodes(nokogiri_element, element)
|
|
174
|
+
nokogiri_element.attributes.each do |name, attr|
|
|
175
|
+
next if name.start_with?("xmlns")
|
|
176
|
+
|
|
177
|
+
attr_node = Canon::Xml::Nodes::AttributeNode.new(
|
|
178
|
+
name: attr.name,
|
|
179
|
+
value: attr.value,
|
|
180
|
+
namespace_uri: attr.namespace&.href,
|
|
181
|
+
prefix: attr.namespace&.prefix,
|
|
182
|
+
)
|
|
183
|
+
element.add_attribute(attr_node)
|
|
184
|
+
end
|
|
185
|
+
end
|
|
186
|
+
|
|
187
|
+
# Build text node from Nokogiri text node
|
|
188
|
+
# HTML-specific: handles whitespace-sensitive elements (pre, code, textarea, script, style)
|
|
189
|
+
def self.build_text_node(nokogiri_text)
|
|
190
|
+
# Skip text nodes that are only whitespace between elements
|
|
191
|
+
# EXCEPT in whitespace-sensitive elements (pre, code, textarea, script, style)
|
|
192
|
+
# where whitespace is semantically significant
|
|
193
|
+
content = nokogiri_text.content
|
|
194
|
+
|
|
195
|
+
if content.strip.empty? && nokogiri_text.parent.is_a?(Nokogiri::XML::Element)
|
|
196
|
+
# Check if parent is whitespace-sensitive
|
|
197
|
+
parent_name = nokogiri_text.parent.name.downcase
|
|
198
|
+
whitespace_sensitive_tags = %w[pre code textarea script style]
|
|
199
|
+
|
|
200
|
+
# Skip whitespace-only text UNLESS in whitespace-sensitive element
|
|
201
|
+
return nil unless whitespace_sensitive_tags.include?(parent_name)
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
# Nokogiri already handles CDATA conversion and entity resolution
|
|
205
|
+
Canon::Xml::Nodes::TextNode.new(value: content)
|
|
206
|
+
end
|
|
207
|
+
|
|
208
|
+
# Build comment node from Nokogiri comment
|
|
209
|
+
def self.build_comment_node(nokogiri_comment)
|
|
210
|
+
Canon::Xml::Nodes::CommentNode.new(value: nokogiri_comment.content)
|
|
211
|
+
end
|
|
212
|
+
|
|
213
|
+
# Build PI node from Nokogiri PI
|
|
214
|
+
def self.build_pi_node(nokogiri_pi)
|
|
215
|
+
Canon::Xml::Nodes::ProcessingInstructionNode.new(
|
|
216
|
+
target: nokogiri_pi.name,
|
|
217
|
+
data: nokogiri_pi.content,
|
|
218
|
+
)
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
class << self
|
|
222
|
+
private :build_from_nokogiri, :build_node_from_nokogiri,
|
|
223
|
+
:build_element_node, :build_namespace_nodes,
|
|
224
|
+
:collect_in_scope_namespaces, :build_attribute_nodes,
|
|
225
|
+
:build_text_node, :build_comment_node, :build_pi_node
|
|
226
|
+
end
|
|
227
|
+
end
|
|
228
|
+
end
|
|
229
|
+
end
|
data/lib/canon/html.rb
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "registry"
|
|
4
|
+
|
|
5
|
+
module Canon
|
|
6
|
+
module Options
|
|
7
|
+
# Generates Thor CLI options from the Options Registry
|
|
8
|
+
# This ensures CLI options stay in sync with the centralized registry
|
|
9
|
+
module CliGenerator
|
|
10
|
+
class << self
|
|
11
|
+
# Generate Thor method_option calls for diff command
|
|
12
|
+
def generate_diff_options
|
|
13
|
+
lambda do |thor_class|
|
|
14
|
+
Canon::Options::Registry.all_options.each do |opt|
|
|
15
|
+
add_thor_option(thor_class, opt)
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
private
|
|
21
|
+
|
|
22
|
+
# Add a single Thor option
|
|
23
|
+
def add_thor_option(thor_class, opt)
|
|
24
|
+
thor_opts = build_thor_opts(opt)
|
|
25
|
+
|
|
26
|
+
thor_class.method_option(
|
|
27
|
+
opt[:name],
|
|
28
|
+
**thor_opts,
|
|
29
|
+
)
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
# Build Thor option hash from registry option
|
|
33
|
+
def build_thor_opts(opt)
|
|
34
|
+
result = {}
|
|
35
|
+
|
|
36
|
+
# Add aliases if present
|
|
37
|
+
result[:aliases] = opt[:aliases] if opt[:aliases]
|
|
38
|
+
|
|
39
|
+
# Map type
|
|
40
|
+
result[:type] = map_type(opt[:type])
|
|
41
|
+
|
|
42
|
+
# Add enum values for enum types
|
|
43
|
+
result[:enum] = opt[:values] if opt[:type] == :enum
|
|
44
|
+
|
|
45
|
+
# Add default if present
|
|
46
|
+
result[:default] = opt[:default] if opt[:default]
|
|
47
|
+
|
|
48
|
+
# Add description
|
|
49
|
+
result[:desc] = opt[:description]
|
|
50
|
+
|
|
51
|
+
result
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
# Map registry type to Thor type
|
|
55
|
+
def map_type(registry_type)
|
|
56
|
+
case registry_type
|
|
57
|
+
when :enum
|
|
58
|
+
:string
|
|
59
|
+
when :numeric
|
|
60
|
+
:numeric
|
|
61
|
+
when :boolean
|
|
62
|
+
:boolean
|
|
63
|
+
else
|
|
64
|
+
:string
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|