canon 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +163 -67
  3. data/README.adoc +400 -7
  4. data/docs/Gemfile +9 -0
  5. data/docs/INDEX.adoc +99 -182
  6. data/docs/_config.yml +100 -0
  7. data/docs/advanced/diff-classification.adoc +547 -0
  8. data/docs/advanced/diff-pipeline.adoc +358 -0
  9. data/docs/advanced/index.adoc +214 -0
  10. data/docs/advanced/semantic-diff-report.adoc +390 -0
  11. data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
  12. data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
  13. data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
  14. data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
  15. data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
  16. data/docs/features/diff-formatting/display-filtering.adoc +472 -0
  17. data/docs/features/diff-formatting/index.adoc +140 -0
  18. data/docs/features/environment-configuration/index.adoc +327 -0
  19. data/docs/features/environment-configuration/override-system.adoc +436 -0
  20. data/docs/features/environment-configuration/size-limits.adoc +273 -0
  21. data/docs/features/index.adoc +173 -0
  22. data/docs/features/input-validation/index.adoc +521 -0
  23. data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
  24. data/docs/features/match-options/html-policies.adoc +312 -0
  25. data/docs/features/match-options/index.adoc +621 -0
  26. data/docs/getting-started/index.adoc +83 -0
  27. data/docs/getting-started/quick-start.adoc +76 -0
  28. data/docs/guides/choosing-configuration.adoc +689 -0
  29. data/docs/guides/index.adoc +181 -0
  30. data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
  31. data/docs/interfaces/index.adoc +101 -0
  32. data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
  33. data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
  34. data/docs/lychee.toml +65 -0
  35. data/docs/reference/cli-options.adoc +418 -0
  36. data/docs/reference/environment-variables.adoc +375 -0
  37. data/docs/reference/index.adoc +204 -0
  38. data/docs/reference/options-across-interfaces.adoc +417 -0
  39. data/docs/understanding/algorithms/dom-diff.adoc +389 -0
  40. data/docs/understanding/algorithms/index.adoc +314 -0
  41. data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
  42. data/docs/understanding/architecture.adoc +447 -0
  43. data/docs/understanding/comparison-pipeline.adoc +317 -0
  44. data/docs/understanding/formats/html.adoc +380 -0
  45. data/docs/understanding/formats/index.adoc +261 -0
  46. data/docs/understanding/formats/json.adoc +390 -0
  47. data/docs/understanding/formats/xml.adoc +366 -0
  48. data/docs/understanding/formats/yaml.adoc +504 -0
  49. data/docs/understanding/index.adoc +130 -0
  50. data/lib/canon/cli.rb +42 -1
  51. data/lib/canon/commands/diff_command.rb +108 -23
  52. data/lib/canon/comparison/compare_profile.rb +101 -0
  53. data/lib/canon/comparison/comparison_result.rb +41 -2
  54. data/lib/canon/comparison/html_comparator.rb +292 -71
  55. data/lib/canon/comparison/html_compare_profile.rb +117 -0
  56. data/lib/canon/comparison/match_options.rb +42 -4
  57. data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
  58. data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
  59. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
  60. data/lib/canon/comparison/xml_comparator.rb +695 -91
  61. data/lib/canon/comparison.rb +207 -2
  62. data/lib/canon/config/env_provider.rb +71 -0
  63. data/lib/canon/config/env_schema.rb +58 -0
  64. data/lib/canon/config/override_resolver.rb +55 -0
  65. data/lib/canon/config/type_converter.rb +59 -0
  66. data/lib/canon/config.rb +158 -29
  67. data/lib/canon/data_model.rb +29 -0
  68. data/lib/canon/diff/diff_classifier.rb +74 -14
  69. data/lib/canon/diff/diff_context_builder.rb +41 -0
  70. data/lib/canon/diff/diff_line.rb +18 -2
  71. data/lib/canon/diff/diff_node.rb +18 -3
  72. data/lib/canon/diff/diff_node_mapper.rb +71 -12
  73. data/lib/canon/diff/formatting_detector.rb +53 -0
  74. data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
  75. data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
  76. data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
  77. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
  78. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
  79. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
  80. data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
  81. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
  82. data/lib/canon/diff_formatter/debug_output.rb +7 -1
  83. data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
  84. data/lib/canon/diff_formatter/legend.rb +42 -0
  85. data/lib/canon/diff_formatter.rb +78 -9
  86. data/lib/canon/errors.rb +56 -0
  87. data/lib/canon/formatters/html_formatter_base.rb +35 -1
  88. data/lib/canon/formatters/json_formatter.rb +3 -0
  89. data/lib/canon/formatters/yaml_formatter.rb +3 -0
  90. data/lib/canon/html/data_model.rb +229 -0
  91. data/lib/canon/html.rb +9 -0
  92. data/lib/canon/options/cli_generator.rb +70 -0
  93. data/lib/canon/options/registry.rb +234 -0
  94. data/lib/canon/rspec_matchers.rb +34 -13
  95. data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
  96. data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
  97. data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
  98. data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
  99. data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
  100. data/lib/canon/tree_diff/core/matching.rb +241 -0
  101. data/lib/canon/tree_diff/core/node_signature.rb +164 -0
  102. data/lib/canon/tree_diff/core/node_weight.rb +135 -0
  103. data/lib/canon/tree_diff/core/tree_node.rb +450 -0
  104. data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
  105. data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
  106. data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
  107. data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
  108. data/lib/canon/tree_diff/operation_converter.rb +631 -0
  109. data/lib/canon/tree_diff/operations/operation.rb +92 -0
  110. data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
  111. data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
  112. data/lib/canon/tree_diff.rb +33 -0
  113. data/lib/canon/validators/json_validator.rb +3 -1
  114. data/lib/canon/validators/yaml_validator.rb +3 -1
  115. data/lib/canon/version.rb +1 -1
  116. data/lib/canon/xml/data_model.rb +22 -23
  117. data/lib/canon/xml/element_matcher.rb +128 -20
  118. data/lib/canon/xml/namespace_helper.rb +110 -0
  119. data/lib/canon.rb +3 -0
  120. metadata +81 -23
  121. data/_config.yml +0 -116
  122. data/docs/ADVANCED_TOPICS.adoc +0 -20
  123. data/docs/BASIC_USAGE.adoc +0 -16
  124. data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  125. data/docs/DIFF_ARCHITECTURE.adoc +0 -435
  126. data/docs/DIFF_FORMATTING.adoc +0 -540
  127. data/docs/FORMATS.adoc +0 -447
  128. data/docs/INPUT_VALIDATION.adoc +0 -477
  129. data/docs/MATCH_ARCHITECTURE.adoc +0 -463
  130. data/docs/MATCH_OPTIONS.adoc +0 -719
  131. data/docs/MODES.adoc +0 -432
  132. data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  133. data/docs/OPTIONS.adoc +0 -1387
  134. data/docs/PREPROCESSING.adoc +0 -491
  135. data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
  136. data/docs/UNDERSTANDING_CANON.adoc +0 -17
@@ -88,6 +88,48 @@ module Canon
88
88
  output.join("\n")
89
89
  end
90
90
 
91
# Render the legend that explains the diff marker symbols.
#
# The legend has three sections (formatting-only, informative, normative);
# each section lists the marker used for a removed and for an added line.
#
# @param use_color [Boolean] Passed through to +colorize+ for every
#   colored span
# @return [String] Legend lines joined with newlines
def self.build_diff_symbol_legend(use_color: true)
  rule = "━" * 60

  # heading => [[marker, marker color, explanation], ...]
  sections = [
    ["Formatting Changes (cosmetic only):",
     [["[", :black, "Line removed (formatting only - dark gray)"],
      ["]", :white, "Line added (formatting only - light gray)"]]],
    ["Informative Changes (do not affect equivalence):",
     [["<", :blue, "Line removed (informative - blue)"],
      [">", :cyan, "Line added (informative - cyan)"]]],
    ["Normative Changes (affect equivalence):",
     [["-", :red, "Line removed (normative difference - red)"],
      ["+", :green, "Line added (normative difference - green)"]]],
  ]

  lines = [
    colorize("Diff Symbol Legend:", :cyan, :bold, use_color),
    colorize(rule, :cyan, :bold, use_color),
  ]

  sections.each do |heading, markers|
    lines << colorize(heading, :yellow, :bold, use_color)
    markers.each do |marker, color, meaning|
      lines << " #{colorize(marker, color, :bold, use_color)}: #{meaning}"
    end
    lines << "" # blank line closes every section
  end

  lines << colorize(rule, :cyan, :bold, use_color)
  lines.join("\n")
end
132
+
91
133
  # Format character name for display
92
134
  #
93
135
  # @param name [String] Unicode character name
@@ -300,6 +300,18 @@ module Canon
300
300
 
301
301
  output = []
302
302
 
303
+ # Display the algorithm being used
304
+ if comparison_result.is_a?(Canon::Comparison::ComparisonResult)
305
+ algorithm_name = case comparison_result.algorithm
306
+ when :semantic
307
+ "SEMANTIC TREE DIFF"
308
+ else
309
+ "DOM DIFF"
310
+ end
311
+ output << colorize("Algorithm: #{algorithm_name}", :cyan, :bold)
312
+ output << "" # Blank line for spacing
313
+ end
314
+
303
315
  # 1. CANON VERBOSE tables (ONLY if CANON_VERBOSE=1)
304
316
  verbose_tables = DebugOutput.verbose_tables_only(
305
317
  comparison_result,
@@ -317,12 +329,21 @@ module Canon
317
329
  )
318
330
  end
319
331
 
332
+ # 2.5. Original Input Strings (ONLY if verbose_diff is enabled)
333
+ if @verbose_diff && comparison_result.is_a?(Canon::Comparison::ComparisonResult)
334
+ original1, original2 = comparison_result.original_strings
335
+ if original1 && original2
336
+ output << format_original_strings(original1, original2)
337
+ end
338
+ end
339
+
320
340
  # 3. Main diff output (by-line or by-object) - ALWAYS
321
341
 
322
342
  # Check if comparison result is a ComparisonResult object
323
343
  if comparison_result.is_a?(Canon::Comparison::ComparisonResult)
324
- # Use preprocessed strings from comparison - avoids re-preprocessing
325
- doc1, doc2 = comparison_result.preprocessed_strings
344
+ # Use original strings for line diff to show actual formatting/namespace differences
345
+ # Use preprocessed strings for semantic comparison only
346
+ doc1, doc2 = comparison_result.original_strings
326
347
  differences = comparison_result.differences
327
348
  html_version = comparison_result.html_version
328
349
  elsif comparison_result.is_a?(Hash) && comparison_result[:preprocessed]
@@ -361,7 +382,7 @@ module Canon
361
382
  /></, ">\n<"
362
383
  ),
363
384
  Canon::Xml::C14n.canonicalize(actual, with_comments: false).gsub(
364
- /></, ">\n<"
385
+ />\s+$/, ""
365
386
  ),
366
387
  ]
367
388
  when :html
@@ -407,6 +428,43 @@ module Canon
407
428
  html.to_s
408
429
  end
409
430
 
431
# Render both raw inputs, line-numbered, under an "ORIGINAL INPUT STRINGS"
# banner (RSpec-style). The strings are shown exactly as passed in, i.e.
# before any preprocessing.
#
# @param original1 [String, nil] Expected input as originally supplied
# @param original2 [String, nil] Actual input as originally supplied
# @return [String] The formatted section, or "" when either input is nil
def format_original_strings(original1, original2)
  return "" if [original1, original2].any?(&:nil?)

  banner = "=" * 70

  # Right-aligns the 1-based line number in a 4-character column.
  numbered = lambda do |text|
    text.each_line.with_index(1).map do |line, number|
      " #{colorize(number.to_s.rjust(4), :blue)} | #{line.chomp}"
    end
  end

  [
    "",
    colorize(banner, :cyan, :bold),
    colorize(" ORIGINAL INPUT STRINGS", :cyan, :bold),
    colorize(banner, :cyan, :bold),
    "",
    colorize("Expected (as string):", :yellow, :bold),
    *numbered.call(original1),
    "",
    colorize("Actual (as string):", :yellow, :bold),
    *numbered.call(original2),
    "",
    colorize(banner, :cyan, :bold),
    "",
  ].join("\n")
end
467
+
410
468
  # Build the final visualization map from various customization options
411
469
  #
412
470
  # @param visualization_map [Hash, nil] Complete custom visualization map
@@ -455,19 +513,25 @@ module Canon
455
513
  # Generate by-object diff with tree visualization
456
514
  # Delegates to format-specific by-object formatters
457
515
  def by_object_diff(differences, format)
458
- require_relative "diff_formatter/by_object/base_formatter"
459
-
460
516
  output = []
461
517
  output << colorize("Visual Diff:", :cyan, :bold)
462
518
 
519
+ # Extract differences array from ComparisonResult if needed
520
+ diffs_array = if differences.is_a?(Canon::Comparison::ComparisonResult)
521
+ differences.differences
522
+ else
523
+ differences
524
+ end
525
+
463
526
  # Delegate to format-specific formatter
464
527
  formatter = ByObject::BaseFormatter.for_format(
465
528
  format,
466
529
  use_color: @use_color,
467
530
  visualization_map: @visualization_map,
531
+ show_diffs: @show_diffs,
468
532
  )
469
533
 
470
- output << formatter.format(differences, format)
534
+ output << formatter.format(diffs_array, format)
471
535
 
472
536
  output.join("\n")
473
537
  end
@@ -476,8 +540,6 @@ module Canon
476
540
  # Delegates to format-specific by-line formatters
477
541
  def by_line_diff(doc1, doc2, format: :xml, html_version: nil,
478
542
  differences: [])
479
- require_relative "diff_formatter/by_line/base_formatter"
480
-
481
543
  # For HTML format, use html_version if provided, otherwise default to :html4
482
544
  if format == :html && html_version
483
545
  format = html_version # Use :html4 or :html5
@@ -492,6 +554,13 @@ differences: [])
492
554
 
493
555
  return output.join("\n") if doc1.nil? || doc2.nil?
494
556
 
557
+ # Extract differences array from ComparisonResult if needed
558
+ diffs_array = if differences.is_a?(Canon::Comparison::ComparisonResult)
559
+ differences.differences
560
+ else
561
+ differences
562
+ end
563
+
495
564
  # Delegate to format-specific formatter
496
565
  formatter = ByLine::BaseFormatter.for_format(
497
566
  format,
@@ -500,7 +569,7 @@ differences: [])
500
569
  diff_grouping_lines: @diff_grouping_lines,
501
570
  visualization_map: @visualization_map,
502
571
  show_diffs: @show_diffs,
503
- differences: differences,
572
+ differences: diffs_array,
504
573
  )
505
574
 
506
575
  output << formatter.format(doc1, doc2)
data/lib/canon/errors.rb CHANGED
@@ -53,4 +53,60 @@ module Canon
53
53
  parts.join("\n")
54
54
  end
55
55
  end
56
+
57
# Raised when an input file, a parsed tree, or the generated diff is larger
# than the configured size limit.
class SizeLimitExceededError < Error
  attr_reader :limit_type, :actual, :limit

  # @param limit_type [Symbol] Which limit was hit (:file_size,
  #   :node_count, :diff_lines); any other value yields a generic message
  # @param actual [Integer] The measured size
  # @param limit [Integer] The configured maximum
  def initialize(limit_type, actual, limit)
    @limit_type = limit_type
    @actual = actual
    @limit = limit
    super(build_message)
  end

  private

  # Compose the exception message: what was exceeded, followed by the
  # environment variable / config key that raises the limit.
  #
  # @return [String] The formatted error message
  def build_message
    summary, hint =
      case limit_type
      when :file_size
        ["File size (#{format_bytes(actual)}) exceeds limit (#{format_bytes(limit)}).",
         "Increase limit via CANON_MAX_FILE_SIZE or config.diff.max_file_size"]
      when :node_count
        ["Tree node count (#{actual}) exceeds limit (#{limit}).",
         "Increase limit via CANON_MAX_NODE_COUNT or config.diff.max_node_count"]
      when :diff_lines
        ["Diff output (#{actual} lines) exceeds limit (#{limit} lines).",
         "Output truncated. Increase limit via CANON_MAX_DIFF_LINES or config.diff.max_diff_lines"]
      else
        return "Size limit exceeded: #{limit_type} (#{actual} > #{limit})"
      end

    "#{summary} #{hint}"
  end

  # Express a byte count in bytes, KB or MB (1024-based, two decimals).
  #
  # @param bytes [Integer] Size in bytes
  # @return [String] Human-readable size
  def format_bytes(bytes)
    return "#{bytes} bytes" if bytes < 1024
    return "#{(bytes / 1024.0).round(2)} KB" if bytes < 1_048_576

    "#{(bytes / 1_048_576.0).round(2)} MB"
  end
end
56
112
  end
@@ -49,6 +49,12 @@ module Canon
49
49
  header hgroup hr li main nav ol p pre section table tbody
50
50
  td tfoot th thead tr ul
51
51
  ].freeze
52
+
53
+ # HTML elements where whitespace is semantically significant
54
+ # and should NOT be normalized
55
+ WHITESPACE_SENSITIVE_ELEMENTS = %w[
56
+ pre code textarea script style
57
+ ].freeze
52
58
  # Format HTML using canonical form
53
59
  # @param html [String] HTML document to canonicalize
54
60
  # @return [String] Canonical form of HTML
@@ -108,6 +114,13 @@ module Canon
108
114
  doc.traverse do |node|
109
115
  next unless node.text?
110
116
 
117
+ # CRITICAL: Skip normalization for whitespace-sensitive elements
118
+ # In elements like <pre>, <code>, etc., whitespace is semantically
119
+ # significant and must be preserved exactly as-is
120
+ if whitespace_sensitive_element?(node.parent)
121
+ next
122
+ end
123
+
111
124
  # Handle whitespace-only text nodes
112
125
  if node.text.strip.empty? && node.parent&.element?
113
126
  # Check if this text node is between block-level elements
@@ -156,8 +169,29 @@ module Canon
156
169
  node&.element? && BLOCK_ELEMENTS.include?(node.name.downcase)
157
170
  end
158
171
 
172
# Check whether a node is, or is nested inside, a whitespace-sensitive
# element (see WHITESPACE_SENSITIVE_ELEMENTS).
#
# The walk goes from the node up through its parents and stops at the
# document node or when there is no parent left (e.g. a detached node).
# A single Nokogiri::XML::Document check is enough to detect the document:
# the HTML4/HTML5 document classes descend from it, and not naming
# Nokogiri::HTML5 here keeps the method usable where that constant is not
# defined.
#
# @param node [Nokogiri::XML::Node, nil] Node to check
# @return [Boolean] true if the node or one of its ancestors is
#   whitespace-sensitive
def self.whitespace_sensitive_element?(node)
  return false unless node&.element?

  current = node
  until current.nil? || current.is_a?(Nokogiri::XML::Document)
    if current.element? &&
        WHITESPACE_SENSITIVE_ELEMENTS.include?(current.name.downcase)
      return true
    end

    current = current.parent
  end

  false
end
+ end
191
+
159
192
  private_class_method :sort_attributes, :normalize_whitespace,
160
- :ensure_block_element_spacing, :block_element?
193
+ :ensure_block_element_spacing, :block_element?,
194
+ :whitespace_sensitive_element?
161
195
  end
162
196
  end
163
197
  end
@@ -15,6 +15,9 @@ module Canon
15
15
  def self.parse(json)
16
16
  # Validate before parsing
17
17
  Canon::Validators::JsonValidator.validate!(json)
18
+ # Return as-is if already parsed
19
+ return json if json.is_a?(Hash) || json.is_a?(Array)
20
+
18
21
  JSON.parse(json)
19
22
  end
20
23
 
@@ -15,6 +15,9 @@ module Canon
15
15
  def self.parse(yaml)
16
16
  # Validate before parsing
17
17
  Canon::Validators::YamlValidator.validate!(yaml)
18
+ # Return as-is if already parsed
19
+ return yaml if yaml.is_a?(Hash) || yaml.is_a?(Array)
20
+
18
21
  YAML.safe_load(yaml, permitted_classes: [Symbol, Date, Time])
19
22
  end
20
23
 
@@ -0,0 +1,229 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+ require_relative "../data_model"
5
+ require_relative "../xml/nodes/root_node"
6
+ require_relative "../xml/nodes/element_node"
7
+ require_relative "../xml/nodes/namespace_node"
8
+ require_relative "../xml/nodes/attribute_node"
9
+ require_relative "../xml/nodes/text_node"
10
+ require_relative "../xml/nodes/comment_node"
11
+ require_relative "../xml/nodes/processing_instruction_node"
12
+
13
module Canon
  module Html
    # Builds the XPath-style data model (Canon::Xml::Nodes) from HTML.
    #
    # HTML input is parsed with Nokogiri's fragment parsers and converted
    # to the same node classes the XML data model uses. Whitespace-only
    # text is dropped except inside whitespace-sensitive elements.
    class DataModel < Canon::DataModel
      # Elements in which whitespace is significant; whitespace-only text
      # located anywhere below one of these is kept in the model.
      WHITESPACE_SENSITIVE_TAGS = %w[pre code textarea script style].freeze

      # URI reported for the "xml" prefix, which is always in scope.
      XML_NAMESPACE_URI = "http://www.w3.org/XML/1998/namespace"

      # Build the data model from an HTML string.
      #
      # Both fragments and full documents go through the *fragment*
      # parser: it avoids the implicit wrappers and auto-inserted tags
      # (e.g. HTML4 meta tags) that the document parser adds.
      #
      # @param html_string [String] HTML content to parse
      # @param version [Symbol] :html5 selects the HTML5 parser; any other
      #   value (default :html4) selects the HTML4 parser
      # @return [Canon::Xml::Nodes::RootNode] Root of the data model tree
      def self.from_html(html_string, version: :html4)
        parser = version == :html5 ? Nokogiri::HTML5 : Nokogiri::HTML4

        # HTML has no strict namespace requirements like XML, so no
        # relative-namespace-URI check is performed here.
        build_from_nokogiri(parser.fragment(html_string))
      end

      # Alias of {from_html}, kept for compatibility.
      #
      # @param html_string [String] HTML content to parse
      # @param version [Symbol] HTML version (:html4 or :html5)
      # @return [Canon::Xml::Nodes::RootNode] Root of the data model tree
      def self.parse(html_string, version: :html4)
        from_html(html_string, version: version)
      end

      # Serialize a data model node to a string by delegating to the XML
      # data model's serializer.
      def self.serialize(node)
        # Loaded on demand, as in the original implementation.
        require_relative "../xml/data_model"
        Canon::Xml::DataModel.serialize(node)
      end

      # Convert a Nokogiri document or fragment into a RootNode tree.
      #
      # For a document, the root element is added first, followed by the
      # remaining top-level nodes (comments, PIs). A fragment has no
      # single root, so all of its children are added in order. DTD nodes
      # are skipped in both cases.
      def self.build_from_nokogiri(nokogiri_doc)
        root = Canon::Xml::Nodes::RootNode.new

        document_root = nokogiri_doc.root if nokogiri_doc.respond_to?(:root)
        root.add_child(build_element_node(document_root)) if document_root

        nokogiri_doc.children.each do |child|
          next if document_root && child == document_root
          next if child.is_a?(Nokogiri::XML::DTD)

          node = build_node_from_nokogiri(child)
          root.add_child(node) if node
        end

        root
      end

      # Dispatch on the Nokogiri node class. Returns nil for node types
      # that are not represented in the data model (and for text nodes
      # dropped by {build_text_node}).
      def self.build_node_from_nokogiri(nokogiri_node)
        case nokogiri_node
        when Nokogiri::XML::Element
          build_element_node(nokogiri_node)
        when Nokogiri::XML::Text
          build_text_node(nokogiri_node)
        when Nokogiri::XML::Comment
          build_comment_node(nokogiri_node)
        when Nokogiri::XML::ProcessingInstruction
          build_pi_node(nokogiri_node)
        end
      end

      # Build an ElementNode with its namespaces, attributes and children.
      def self.build_element_node(nokogiri_element)
        element = Canon::Xml::Nodes::ElementNode.new(
          name: nokogiri_element.name,
          namespace_uri: nokogiri_element.namespace&.href,
          prefix: nokogiri_element.namespace&.prefix,
        )

        build_namespace_nodes(nokogiri_element, element)
        build_attribute_nodes(nokogiri_element, element)

        nokogiri_element.children.each do |child|
          node = build_node_from_nokogiri(child)
          element.add_child(node) if node
        end

        element
      end

      # Attach one NamespaceNode per in-scope namespace (inherited ones
      # included) to +element+.
      def self.build_namespace_nodes(nokogiri_element, element)
        collect_in_scope_namespaces(nokogiri_element).each do |prefix, uri|
          element.add_namespace(
            Canon::Xml::Nodes::NamespaceNode.new(prefix: prefix, uri: uri),
          )
        end
      end

      # Collect prefix => URI for every namespace in scope on an element.
      #
      # Walks from the element towards the root; the first declaration
      # seen for a prefix wins, so a nearer declaration overrides an
      # outer one. The default namespace is stored under "". The "xml"
      # prefix is always present.
      def self.collect_in_scope_namespaces(nokogiri_element)
        namespaces = {}

        current = nokogiri_element
        while current && !current.is_a?(Nokogiri::XML::Document)
          if current.is_a?(Nokogiri::XML::Element)
            current.namespace_definitions.each do |ns|
              prefix = ns.prefix || ""
              namespaces[prefix] = ns.href unless namespaces.key?(prefix)
            end
          end
          current = current.parent
        end

        namespaces["xml"] ||= XML_NAMESPACE_URI
        namespaces
      end

      # Attach an AttributeNode for every attribute except namespace
      # declarations.
      #
      # Only "xmlns" and "xmlns:<prefix>" are treated as declarations; an
      # ordinary attribute whose name merely begins with those letters is
      # kept.
      def self.build_attribute_nodes(nokogiri_element, element)
        nokogiri_element.attributes.each do |name, attr|
          next if name == "xmlns" || name.start_with?("xmlns:")

          element.add_attribute(
            Canon::Xml::Nodes::AttributeNode.new(
              name: attr.name,
              value: attr.value,
              namespace_uri: attr.namespace&.href,
              prefix: attr.namespace&.prefix,
            ),
          )
        end
      end

      # Build a TextNode, or return nil for ignorable whitespace.
      #
      # Whitespace-only text whose parent is an element is dropped unless
      # that parent, or any element above it, is whitespace-sensitive
      # (e.g. the blank inside <pre><span> </span></pre> is kept).
      def self.build_text_node(nokogiri_text)
        content = nokogiri_text.content
        parent = nokogiri_text.parent

        if content.strip.empty? && parent.is_a?(Nokogiri::XML::Element) &&
            !inside_whitespace_sensitive_element?(parent)
          return nil
        end

        # Nokogiri has already resolved entities and CDATA sections.
        Canon::Xml::Nodes::TextNode.new(value: content)
      end

      # True if +nokogiri_node+ or one of its element ancestors is listed
      # in WHITESPACE_SENSITIVE_TAGS. The walk ends at the first
      # non-element parent (fragment, document or nil).
      def self.inside_whitespace_sensitive_element?(nokogiri_node)
        current = nokogiri_node
        while current.is_a?(Nokogiri::XML::Element)
          return true if WHITESPACE_SENSITIVE_TAGS.include?(current.name.downcase)

          current = current.parent
        end
        false
      end

      # Build a CommentNode from a Nokogiri comment.
      def self.build_comment_node(nokogiri_comment)
        Canon::Xml::Nodes::CommentNode.new(value: nokogiri_comment.content)
      end

      # Build a ProcessingInstructionNode from a Nokogiri PI.
      def self.build_pi_node(nokogiri_pi)
        Canon::Xml::Nodes::ProcessingInstructionNode.new(
          target: nokogiri_pi.name,
          data: nokogiri_pi.content,
        )
      end

      private_class_method :build_from_nokogiri, :build_node_from_nokogiri,
                           :build_element_node, :build_namespace_nodes,
                           :collect_in_scope_namespaces,
                           :build_attribute_nodes, :build_text_node,
                           :inside_whitespace_sensitive_element?,
                           :build_comment_node, :build_pi_node
    end
  end
end
data/lib/canon/html.rb ADDED
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "html/data_model"
4
+
5
+ module Canon
6
+ # HTML-specific functionality for Canon
7
+ module Html
8
+ end
9
+ end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "registry"
4
+
5
module Canon
  module Options
    # Generates Thor CLI options from the Options Registry so the CLI stays
    # in sync with the centralized option definitions.
    module CliGenerator
      class << self
        # Build a callable that declares every registry option on a Thor
        # class.
        #
        # @return [Proc] Lambda taking the Thor class; it calls
        #   +method_option+ on it once per entry of
        #   Canon::Options::Registry.all_options
        def generate_diff_options
          lambda do |thor_class|
            Canon::Options::Registry.all_options.each do |opt|
              add_thor_option(thor_class, opt)
            end
          end
        end

        private

        # Declare a single registry option on the Thor class.
        def add_thor_option(thor_class, opt)
          thor_class.method_option(opt[:name], **build_thor_opts(opt))
        end

        # Translate a registry option hash into Thor option keywords.
        #
        # Reads :aliases, :type, :values, :default and :description from
        # +opt+. A default is forwarded whenever it is not nil, so an
        # explicit +false+ default (e.g. on a boolean flag) is preserved.
        #
        # @param opt [Hash] Registry option definition
        # @return [Hash] Keyword arguments for Thor's +method_option+
        def build_thor_opts(opt)
          result = {}

          result[:aliases] = opt[:aliases] if opt[:aliases]
          result[:type] = map_type(opt[:type])

          # Enum options are declared as strings restricted to :values.
          result[:enum] = opt[:values] if opt[:type] == :enum

          result[:default] = opt[:default] unless opt[:default].nil?
          result[:desc] = opt[:description]

          result
        end

        # Map a registry type to a Thor type: :numeric and :boolean map to
        # themselves, everything else (including :enum) to :string.
        def map_type(registry_type)
          case registry_type
          when :numeric, :boolean then registry_type
          else :string
          end
        end
      end
    end
  end
end
+ end