canon 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +112 -25
  3. data/docs/Gemfile +1 -0
  4. data/docs/_config.yml +90 -1
  5. data/docs/advanced/diff-classification.adoc +82 -2
  6. data/docs/features/match-options/index.adoc +239 -1
  7. data/lib/canon/comparison/format_detector.rb +2 -1
  8. data/lib/canon/comparison/html_comparator.rb +19 -8
  9. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  10. data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
  11. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  12. data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
  13. data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
  14. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
  15. data/lib/canon/comparison/xml_comparator.rb +48 -23
  16. data/lib/canon/comparison/xml_node_comparison.rb +25 -3
  17. data/lib/canon/diff/diff_classifier.rb +101 -2
  18. data/lib/canon/diff/formatting_detector.rb +1 -1
  19. data/lib/canon/rspec_matchers.rb +37 -8
  20. data/lib/canon/version.rb +1 -1
  21. data/lib/canon/xml/data_model.rb +24 -13
  22. metadata +3 -78
  23. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
  24. data/false_positive_analysis.txt +0 -0
  25. data/file1.html +0 -1
  26. data/file2.html +0 -1
  27. data/old-docs/ADVANCED_TOPICS.adoc +0 -20
  28. data/old-docs/BASIC_USAGE.adoc +0 -16
  29. data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
  30. data/old-docs/CLI.adoc +0 -497
  31. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  32. data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
  33. data/old-docs/DIFF_FORMATTING.adoc +0 -540
  34. data/old-docs/DIFF_PARAMETERS.adoc +0 -261
  35. data/old-docs/DOM_DIFF.adoc +0 -1017
  36. data/old-docs/ENV_CONFIG.adoc +0 -876
  37. data/old-docs/FORMATS.adoc +0 -867
  38. data/old-docs/INPUT_VALIDATION.adoc +0 -477
  39. data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
  40. data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
  41. data/old-docs/MATCH_OPTIONS.adoc +0 -912
  42. data/old-docs/MODES.adoc +0 -432
  43. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  44. data/old-docs/OPTIONS.adoc +0 -1387
  45. data/old-docs/PREPROCESSING.adoc +0 -491
  46. data/old-docs/README.old.adoc +0 -2831
  47. data/old-docs/RSPEC.adoc +0 -814
  48. data/old-docs/RUBY_API.adoc +0 -485
  49. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
  50. data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
  51. data/old-docs/STRING_COMPARE.adoc +0 -345
  52. data/old-docs/TMP.adoc +0 -3384
  53. data/old-docs/TREE_DIFF.adoc +0 -1080
  54. data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
  55. data/old-docs/VERBOSE.adoc +0 -482
  56. data/old-docs/VISUALIZATION_MAP.adoc +0 -625
  57. data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
  58. data/scripts/analyze_current_state.rb +0 -85
  59. data/scripts/analyze_false_positives.rb +0 -114
  60. data/scripts/analyze_remaining_failures.rb +0 -105
  61. data/scripts/compare_current_failures.rb +0 -95
  62. data/scripts/compare_dom_tree_diff.rb +0 -158
  63. data/scripts/compare_failures.rb +0 -151
  64. data/scripts/debug_attribute_extraction.rb +0 -66
  65. data/scripts/debug_blocks_839.rb +0 -115
  66. data/scripts/debug_meta_matching.rb +0 -52
  67. data/scripts/debug_p_matching.rb +0 -192
  68. data/scripts/debug_signature_matching.rb +0 -118
  69. data/scripts/debug_sourcecode_124.rb +0 -32
  70. data/scripts/debug_whitespace_sensitive.rb +0 -192
  71. data/scripts/extract_false_positives.rb +0 -138
  72. data/scripts/find_actual_false_positives.rb +0 -125
  73. data/scripts/investigate_all_false_positives.rb +0 -161
  74. data/scripts/investigate_batch1.rb +0 -127
  75. data/scripts/investigate_classification.rb +0 -150
  76. data/scripts/investigate_classification_detailed.rb +0 -190
  77. data/scripts/investigate_common_failures.rb +0 -342
  78. data/scripts/investigate_false_negative.rb +0 -80
  79. data/scripts/investigate_false_positive.rb +0 -83
  80. data/scripts/investigate_false_positives.rb +0 -227
  81. data/scripts/investigate_false_positives_batch.rb +0 -163
  82. data/scripts/investigate_mixed_content.rb +0 -125
  83. data/scripts/investigate_remaining_16.rb +0 -214
  84. data/scripts/run_single_test.rb +0 -29
  85. data/scripts/test_all_false_positives.rb +0 -95
  86. data/scripts/test_attribute_details.rb +0 -61
  87. data/scripts/test_both_algorithms.rb +0 -49
  88. data/scripts/test_both_simple.rb +0 -49
  89. data/scripts/test_enhanced_semantic_output.rb +0 -125
  90. data/scripts/test_readme_examples.rb +0 -131
  91. data/scripts/test_semantic_tree_diff.rb +0 -99
  92. data/scripts/test_semantic_ux_improvements.rb +0 -135
  93. data/scripts/test_single_false_positive.rb +0 -119
  94. data/scripts/test_size_limits.rb +0 -99
  95. data/test_html_1.html +0 -21
  96. data/test_html_2.html +0 -21
  97. data/test_nokogiri.rb +0 -33
  98. data/test_normalize.rb +0 -45
@@ -45,10 +45,37 @@ Match dimensions are orthogonal aspects that can be configured independently.
45
45
 
46
46
  `:strict`:: Text must match exactly, character-for-character including all whitespace
47
47
 
48
- `:normalize`:: Whitespace is normalized (collapsed/trimmed) before comparison
48
+ `:normalize`:: Whitespace is normalized (collapsed/trimmed) before comparison.
49
+ Formatting-only differences (e.g., extra spaces around text) are classified as
50
+ *informative* rather than normative. This means documents with only whitespace
51
+ differences in text content are considered equivalent.
49
52
 
50
53
  `:ignore`:: Text content is completely ignored in comparison
51
54
 
55
+ .Using text_content: :normalize
56
+ [example]
57
+ ====
58
+ [source,ruby]
59
+ ----
60
+ # These are equivalent with :normalize
61
+ # Whitespace differences are formatting-only (informative)
62
+ Canon.equivalent?(
63
+ '<p> text </p>',
64
+ '<p>text</p>',
65
+ match: { text_content: :normalize }
66
+ )
67
+ # => true
68
+
69
+ # These differ in :strict mode
70
+ Canon.equivalent?(
71
+ '<p> text </p>',
72
+ '<p>text</p>',
73
+ match: { text_content: :strict }
74
+ )
75
+ # => false
76
+ ----
77
+ ====
78
+
52
79
  === structural_whitespace
53
80
 
54
81
  **Applies to**: All formats
@@ -63,6 +90,200 @@ Match dimensions are orthogonal aspects that can be configured independently.
63
90
 
64
91
  `:ignore`:: Structural whitespace is completely ignored
65
92
 
93
+
94
+ === Whitespace sensitivity at element level
95
+
96
+ ==== General
97
+
98
+ In XML, whitespace sensitivity can vary by schema and element:
99
+
100
+ * Elements that apply `xml:space="preserve"` are whitespace-sensitive.
101
+
102
+ * Other elements may be defined as sensitive by schema (e.g.
103
+ the `xs:whiteSpace` facet with `value="preserve"` in XML Schema) or unannounced conventions, such as
104
+ for mixed content.
105
+
106
+ In HTML, elements like `<pre>` and `<code>` preserve whitespace, while others
107
+ like `<div>` and `<p>` do not.
108
+
109
+ In the unannounced cases, the developer must indicate which elements are
110
+ whitespace-sensitive.
111
+
112
+ In Canon, you can control whitespace sensitivity at the element level using
113
+ `structural_whitespace: :strict` or `text_content: :normalize`.
114
+
115
+ Element-level sensitivity controls both:
116
+
117
+ * `structural_whitespace`: Whether whitespace between child elements of the element is
118
+ preserved
119
+
120
+ * `text_content`: Whether whitespace within text nodes of the element is
121
+ normalized
122
+
123
+ Options for controlling element-level sensitivity include:
124
+
125
+ * **xml:space attribute** - XML standard for declaring whitespace sensitivity in documents
126
+ * **whitelist/blacklist options** - User-specified element lists
127
+ * **Format defaults** - HTML has built-in sensitive elements
128
+ * **respect_xml_space option** - Control whether xml:space is honored
129
+
130
+ For elements marked as sensitive, whitespace differences are always normative.
131
+
132
+ For non-sensitive elements using `text_content: :normalize`, whitespace
133
+ differences are classified as formatting-only (informative).
134
+
135
+
136
+ ==== xml:space attribute support
137
+
138
+ The `xml:space` attribute is the XML standard way to declare whitespace
139
+ sensitivity in XML instance documents:
140
+
141
+ [source,xml]
142
+ ----
143
+ <!-- Preserve whitespace in this element -->
144
+ <code xml:space="preserve">
145
+ Indentation and newlines matter here
146
+ </code>
147
+
148
+ <!-- Use default behavior -->
149
+ <text xml:space="default">
150
+ Whitespace handling follows configured behavior
151
+ </text>
152
+ ----
153
+
154
+ ==== Whitelist and blacklist options
155
+
156
+ You can explicitly specify which elements are whitespace-sensitive:
157
+
158
+ [source,ruby]
159
+ ----
160
+ # Specify elements that preserve whitespace
161
+ Canon::Comparison.equivalent?(xml1, xml2,
162
+ match: {
163
+ structural_whitespace: :strict,
164
+ whitespace_sensitive_elements: [:pre, :code, :sample],
165
+ whitespace_insensitive_elements: [:p, :div] # Override defaults/whitelist
166
+ }
167
+ )
168
+ ----
169
+
170
+ ==== respect_xml_space option
171
+
172
+ Control whether xml:space attributes in the document are honored:
173
+
174
+ [source,ruby]
175
+ ----
176
+ # Honor xml:space (default)
177
+ Canon::Comparison.equivalent?(xml1, xml2,
178
+ match: {
179
+ structural_whitespace: :strict,
180
+ respect_xml_space: true # Use xml:space attributes in document
181
+ }
182
+ )
183
+
184
+ # Ignore xml:space, use only user configuration
185
+ Canon::Comparison.equivalent?(xml1, xml2,
186
+ match: {
187
+ structural_whitespace: :strict,
188
+ respect_xml_space: false # Override document declarations
189
+ }
190
+ )
191
+ ----
192
+
193
+ ==== Priority order
194
+
195
+ When determining if an element is whitespace-sensitive, Canon uses this priority:
196
+
197
+ [source]
198
+ ----
199
+ 1. respect_xml_space: false → User config only (ignore xml:space)
200
+
201
+ 2. User whitelist → Use whitelist (user explicitly declared)
202
+
203
+ 3. Format defaults → HTML: [:pre, :code, :textarea, :script, :style], XML: []
204
+
205
+ 4. User blacklist → Remove from defaults/whitelist
206
+
207
+ 5. xml:space="preserve" → Element is sensitive
208
+
209
+ 6. xml:space="default" → Use steps 1-4
210
+ ----
211
+
212
+ ==== Format-specific defaults
213
+
214
+ **HTML**:: `[:pre, :code, :textarea, :script, :style]` - These elements are treated as whitespace-sensitive by default for HTML
215
+ **XML**:: `[]` - No default whitespace-sensitive elements, purely user-controlled
216
+
217
+ ==== Examples
218
+
219
+ .Using xml:space attribute
220
+ [source,ruby]
221
+ ----
222
+ xml1 = '<root><code xml:space="preserve"> indented </code></root>'
223
+ xml2 = '<root><code xml:space="preserve">indented</code></root>'
224
+
225
+ # These are NOT equivalent (whitespace matters in xml:space="preserve")
226
+ Canon::Comparison.equivalent?(xml1, xml2,
227
+ match: { structural_whitespace: :strict }
228
+ )
229
+ # => false
230
+ ----
231
+
232
+ .Using whitelist
233
+ [source,ruby]
234
+ ----
235
+ # Make <p> elements whitespace-sensitive
236
+ Canon::Comparison.equivalent?(xml1, xml2,
237
+ match: {
238
+ structural_whitespace: :strict,
239
+ whitespace_sensitive_elements: [:p, :pre]
240
+ }
241
+ )
242
+ ----
243
+
244
+ .Overriding HTML defaults
245
+ [source,ruby]
246
+ ----
247
+ # Make <script> NOT whitespace-sensitive (override HTML default)
248
+ Canon::Comparison.equivalent?(html1, html2,
249
+ format: :html,
250
+ match: {
251
+ structural_whitespace: :strict,
252
+ whitespace_insensitive_elements: [:script]
253
+ }
254
+ )
255
+ ----
256
+
257
+ .Using text_content: :normalize with whitespace_insensitive_elements
258
+ [source,ruby]
259
+ ----
260
+ # HTML defaults: [:pre, :code, :textarea, :script, :style]
261
+ # Excluding :code means it's no longer whitespace-sensitive
262
+ html1 = '<root><pre> indented </pre><code> code </code></root>'
263
+ html2 = '<root><pre> indented </pre><code>code</code></root>'
264
+
265
+ # With :code blacklisted, whitespace in <code> is normalized (formatting-only)
266
+ # HTML uses text_content: :normalize by default
267
+ Canon::Comparison.equivalent?(html1, html2,
268
+ format: :html,
269
+ match: {
270
+ whitespace_insensitive_elements: [:code],
271
+ }
272
+ )
273
+ # => true (whitespace differences in <code> are formatting-only)
274
+
275
+ # Without blacklisting, <code> is sensitive (whitespace matters)
276
+ Canon::Comparison.equivalent?(html1, html2,
277
+ format: :html,
278
+ match: {
279
+ structural_whitespace: :strict,
280
+ }
281
+ )
282
+ # => false (whitespace in <code> is normative)
283
+ ----
284
+
285
+
286
+
66
287
  === attribute_whitespace
67
288
 
68
289
  **Applies to**: XML, HTML only
@@ -414,6 +635,23 @@ expect(actual).to be_xml_equivalent_to(expected,
414
635
  element_position: :ignore,
415
636
  element_hierarchy: :ignore
416
637
  )
638
+
639
+ # Element-level whitespace sensitivity
640
+ expect(actual).to be_xml_equivalent_to(expected,
641
+ match: { structural_whitespace: :strict }
642
+ )
643
+ .with_options(
644
+ whitespace_sensitive_elements: [:pre, :code, :sample],
645
+ respect_xml_space: true
646
+ )
647
+
648
+ # Override HTML default whitespace-sensitive elements
649
+ expect(html).to be_html_equivalent_to(expected,
650
+ match: { structural_whitespace: :strict }
651
+ )
652
+ .with_options(
653
+ whitespace_insensitive_elements: [:script, :style]
654
+ )
417
655
  ====
418
656
 
419
657
  == Comments dimension
@@ -72,7 +72,8 @@ module Canon
72
72
  return :json if trimmed.start_with?("{", "[")
73
73
 
74
74
  # HTML indicators
75
- return :html if trimmed.start_with?("<!DOCTYPE html", "<html", "<HTML")
75
+ return :html if trimmed.start_with?("<!DOCTYPE html", "<html",
76
+ "<HTML")
76
77
 
77
78
  # XML indicators - must start with < and end with >
78
79
  return :xml if trimmed.start_with?("<") && trimmed.end_with?(">")
@@ -13,6 +13,8 @@ require_relative "../diff/diff_classifier"
13
13
  require_relative "strategies/match_strategy_factory"
14
14
  require_relative "../html/data_model"
15
15
  require_relative "xml_node_comparison"
16
+ # Whitespace sensitivity module (single source of truth for sensitive elements)
17
+ require_relative "whitespace_sensitivity"
16
18
 
17
19
  module Canon
18
20
  module Comparison
@@ -542,16 +544,22 @@ compare_profile = nil)
542
544
  return if match_opts[:text_content] == :strict
543
545
 
544
546
  # Elements where whitespace is significant - don't normalize
545
- # Use profile if available, otherwise use default list
547
+ # SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_sensitive_elements
548
+ # This ensures consistency between preprocessing and comparison logic
549
+ # SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_sensitive_elements
550
+ # This ensures consistency between preprocessing and comparison logic
546
551
  preserve_whitespace = if compare_profile.is_a?(HtmlCompareProfile)
547
552
  # Profile handles HTML-specific whitespace rules
548
- %w[pre code textarea script
549
- style].select do |elem|
550
- compare_profile.preserve_whitespace?(elem)
551
- end
553
+ # Get default list and filter by profile
554
+ WhitespaceSensitivity
555
+ .format_default_sensitive_elements(match_opts)
556
+ .select do |elem|
557
+ compare_profile.preserve_whitespace?(elem.to_s)
558
+ end
559
+ .map(&:to_s)
552
560
  else
553
- # Fallback to default list
554
- %w[pre code textarea script style]
561
+ # Use default list from WhitespaceSensitivity (single source of truth)
562
+ WhitespaceSensitivity.format_default_sensitive_elements(match_opts).map(&:to_s)
555
563
  end
556
564
 
557
565
  # Walk all text nodes
@@ -607,9 +615,12 @@ compare_profile = nil)
607
615
  #
608
616
  # CRITICAL: Do NOT remove whitespace-only text nodes from whitespace-sensitive
609
617
  # elements like <pre>, <code>, <textarea>, <script>, <style>
618
+ #
619
+ # SINGLE SOURCE OF TRUTH: Uses WhitespaceSensitivity.format_default_sensitive_elements
610
620
  def remove_whitespace_only_text_nodes(doc)
611
621
  # Elements where whitespace is significant - don't remove whitespace-only nodes
612
- preserve_whitespace = %w[pre code textarea script style]
622
+ # SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_sensitive_elements
623
+ preserve_whitespace = WhitespaceSensitivity.format_default_sensitive_elements(format: :html).map(&:to_s)
613
624
 
614
625
  doc.xpath(".//text()").each do |text_node|
615
626
  # CRITICAL: Skip if this text node is inside a whitespace-preserving element
@@ -1,6 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "compare_profile"
4
+ # Whitespace sensitivity module (single source of truth for sensitive elements)
5
+ require_relative "whitespace_sensitivity"
4
6
 
5
7
  module Canon
6
8
  module Comparison
@@ -82,9 +84,13 @@ module Canon
82
84
  private
83
85
 
84
86
  # Elements where whitespace is semantically significant in HTML
85
- # @return [Array<String>] List of element names
87
+ #
88
+ # SINGLE SOURCE OF TRUTH: Delegates to WhitespaceSensitivity.format_default_sensitive_elements
89
+ # This ensures consistency across the codebase.
90
+ #
91
+ # @return [Array<String>] List of element names (as strings)
86
92
  def whitespace_sensitive_elements
87
- %w[pre code textarea script style]
93
+ WhitespaceSensitivity.format_default_sensitive_elements(format: @html_version).map(&:to_s)
88
94
  end
89
95
 
90
96
  # Check if a dimension is explicitly set to :strict
@@ -27,6 +27,9 @@ module Canon
27
27
  # Start with format-specific defaults
28
28
  options = format_defaults(format).dup
29
29
 
30
+ # Store format for later use (e.g., WhitespaceSensitivity needs it)
31
+ options[:format] = format
32
+
30
33
  # Apply global profile if specified
31
34
  if global_profile
32
35
  profile_opts = get_profile_options(global_profile)
@@ -111,12 +114,16 @@ module Canon
111
114
  def validate_match_options!(match_options)
112
115
  # Special options that don't need validation as dimensions
113
116
  special_options = %i[
117
+ format
114
118
  preprocessing
115
119
  semantic_diff
116
120
  similarity_threshold
117
121
  hash_matching
118
122
  similarity_matching
119
123
  propagation
124
+ whitespace_sensitive_elements
125
+ whitespace_insensitive_elements
126
+ respect_xml_space
120
127
  ]
121
128
 
122
129
  match_options.each do |dimension, behavior|
@@ -0,0 +1,208 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Canon
4
+ module Comparison
5
+ # Whitespace sensitivity utilities for element-level control
6
+ #
7
+ # This module provides logic to determine whether whitespace should be
8
+ # preserved during comparison based on:
9
+ # - Format-specific defaults (HTML has built-in sensitive elements)
10
+ # - User-configured whitelist (elements that care about whitespace)
11
+ # - User-configured blacklist (elements that don't care about whitespace)
12
+ # - xml:space attribute in the document itself
13
+ # - respect_xml_space flag (whether to honor or override xml:space)
14
+ #
15
+ # == Priority Order
16
+ #
17
+ # 1. respect_xml_space: false → User config only (ignore xml:space)
18
+ # 2. User whitelist → Use whitelist (user explicitly declared)
19
+ # 3. Format defaults → HTML: [:pre, :code, :textarea, :script, :style], XML: []
20
+ # 4. User blacklist → Remove from defaults/whitelist
21
+ # 5. xml:space="preserve" → Element is sensitive
22
+ # 6. xml:space="default" → Use steps 1-4
23
+ #
24
+ # == Usage
25
+ #
26
+ # WhitespaceSensitivity.element_sensitive?(node, opts)
27
+ # => true if whitespace should be preserved for this element
28
+ module WhitespaceSensitivity
29
+ class << self
30
+ # Check if an element is whitespace-sensitive based on configuration
31
+ #
32
+ # @param node [Object] The element node to check
33
+ # @param opts [Hash] Comparison options containing match_opts
34
+ # @return [Boolean] true if whitespace should be preserved for this element
35
+ def element_sensitive?(node, opts)
36
+ match_opts = opts[:match_opts]
37
+ return false unless match_opts
38
+ return false unless text_node_parent?(node)
39
+
40
+ parent = node.parent
41
+
42
+ # 1. Check if we should ignore xml:space (user override)
43
+ if !respect_xml_space?(match_opts)
44
+ return user_config_sensitive?(parent, match_opts)
45
+ end
46
+
47
+ # 2. Check xml:space="preserve" (document declaration)
48
+ return true if xml_space_preserve?(parent)
49
+
50
+ # 3. Check xml:space="default" (treated as not sensitive here; whitelist/defaults are not consulted)
51
+ return false if xml_space_default?(parent)
52
+
53
+ # 4. Use user configuration + format defaults
54
+ configured_sensitive?(parent, match_opts)
55
+ end
56
+
57
+ # Check if whitespace-only text node should be filtered
58
+ #
59
+ # @param node [Object] The text node to check
60
+ # @param opts [Hash] Comparison options
61
+ # @return [Boolean] true if node should be preserved (not filtered)
62
+ def preserve_whitespace_node?(node, opts)
63
+ return false unless node.respond_to?(:parent)
64
+ return false unless node.parent
65
+
66
+ element_sensitive?(node, opts)
67
+ end
68
+
69
+ # Get format-specific default sensitive elements
70
+ #
71
+ # This is the SINGLE SOURCE OF TRUTH for default whitespace-sensitive
72
+ # elements. All other code should use this method to get the list.
73
+ #
74
+ # @param match_opts [Hash] Resolved match options
75
+ # @return [Array<Symbol>] Default sensitive element names
76
+ def format_default_sensitive_elements(match_opts)
77
+ format = match_opts[:format] || :xml
78
+
79
+ case format
80
+ when :html, :html4, :html5
81
+ # HTML specification: these elements preserve whitespace
82
+ %i[pre code textarea script style].freeze
83
+ when :xml
84
+ # XML has no default sensitive elements - purely user-controlled
85
+ [].freeze
86
+ else
87
+ [].freeze
88
+ end
89
+ end
90
+
91
+ # Check if an element is in the default sensitive list for its format
92
+ #
93
+ # Convenience method for checking element sensitivity without building
94
+ # the full list first.
95
+ #
96
+ # @param element_name [String, Symbol] The element name to check
97
+ # @param match_opts [Hash] Resolved match options
98
+ # @return [Boolean] true if element is in default sensitive list
99
+ def default_sensitive_element?(element_name, match_opts)
100
+ format_default_sensitive_elements(match_opts)
101
+ .include?(element_name.to_sym)
102
+ end
103
+
104
+ private
105
+
106
+ # Check if we should respect xml:space attribute
107
+ #
108
+ # @param match_opts [Hash] Resolved match options
109
+ # @return [Boolean] true if xml:space should be respected
110
+ def respect_xml_space?(match_opts)
111
+ if match_opts.key?(:respect_xml_space)
112
+ match_opts[:respect_xml_space]
113
+ else
114
+ true
115
+ end
116
+ end
117
+
118
+ # Check if xml:space="preserve" is set
119
+ #
120
+ # @param element [Object] The element to check
121
+ # @return [Boolean] true if xml:space="preserve"
122
+ def xml_space_preserve?(element)
123
+ if element.is_a?(Canon::Xml::Nodes::ElementNode)
124
+ # Check attribute_nodes for xml:space attribute
125
+ # xml:space is stored with name="space" and namespace_uri="http://www.w3.org/XML/1998/namespace"
126
+ element.attribute_nodes.any? do |attr|
127
+ attr.name == "space" &&
128
+ attr.namespace_uri == "http://www.w3.org/XML/1998/namespace" &&
129
+ attr.value == "preserve"
130
+ end
131
+ elsif element.respond_to?(:[])
132
+ element["xml:space"] == "preserve"
133
+ else
134
+ false
135
+ end
136
+ end
137
+
138
+ # Check if xml:space="default" is set
139
+ #
140
+ # @param element [Object] The element to check
141
+ # @return [Boolean] true if xml:space="default"
142
+ def xml_space_default?(element)
143
+ if element.is_a?(Canon::Xml::Nodes::ElementNode)
144
+ # Check attribute_nodes for xml:space attribute
145
+ # xml:space is stored with name="space" and namespace_uri="http://www.w3.org/XML/1998/namespace"
146
+ element.attribute_nodes.any? do |attr|
147
+ attr.name == "space" &&
148
+ attr.namespace_uri == "http://www.w3.org/XML/1998/namespace" &&
149
+ attr.value == "default"
150
+ end
151
+ elsif element.respond_to?(:[])
152
+ element["xml:space"] == "default"
153
+ else
154
+ false
155
+ end
156
+ end
157
+
158
+ # Check sensitivity based on user configuration
159
+ #
160
+ # @param element [Object] The element to check
161
+ # @param match_opts [Hash] Resolved match options
162
+ # @return [Boolean] true if element is in whitelist
163
+ def user_config_sensitive?(element, match_opts)
164
+ return false unless match_opts[:whitespace_sensitive_elements]
165
+
166
+ match_opts[:whitespace_sensitive_elements].include?(element.name.to_sym)
167
+ end
168
+
169
+ # Check sensitivity based on user config + format defaults
170
+ #
171
+ # @param element [Object] The element to check
172
+ # @param match_opts [Hash] Resolved match options
173
+ # @return [Boolean] true if element should be sensitive
174
+ def configured_sensitive?(element, match_opts)
175
+ # Start with format defaults
176
+ sensitive = format_default_sensitive_elements(match_opts).to_set
177
+
178
+ # Apply whitelist (adds to defaults)
179
+ if match_opts[:whitespace_sensitive_elements]
180
+ sensitive |= match_opts[:whitespace_sensitive_elements]
181
+ end
182
+
183
+ # Apply blacklist (removes from everything)
184
+ if match_opts[:whitespace_insensitive_elements]
185
+ sensitive -= match_opts[:whitespace_insensitive_elements]
186
+ end
187
+
188
+ sensitive.include?(element.name.to_sym)
189
+ end
190
+
191
+ # Check if node has a parent that's an element (not document root)
192
+ #
193
+ # @param node [Object] The node to check
194
+ # @return [Boolean] true if node has an element parent
195
+ def text_node_parent?(node)
196
+ return false unless node.respond_to?(:parent)
197
+ return false unless node.parent
198
+
199
+ parent = node.parent
200
+ return true if parent.respond_to?(:element?) && parent.element?
201
+
202
+ # Nokogiri compatibility
203
+ parent.respond_to?(:node_type) && parent.node_type == :element
204
+ end
205
+ end
206
+ end
207
+ end
208
+ end
@@ -26,7 +26,8 @@ module Canon
26
26
  # @param diff_children [Boolean] Whether to diff children
27
27
  # @param differences [Array] Array to collect differences
28
28
  # @return [Integer] Comparison result code
29
- def compare(node1, node2, comparator, opts, child_opts, diff_children, differences)
29
+ def compare(node1, node2, comparator, opts, child_opts,
30
+ diff_children, differences)
30
31
  children1 = comparator.send(:filter_children, node1.children, opts)
31
32
  children2 = comparator.send(:filter_children, node2.children, opts)
32
33
 
@@ -51,7 +52,9 @@ module Canon
51
52
  # method that returns symbols, and only works with element nodes.
52
53
  def can_use_element_matcher?(children1, children2)
53
54
  !children1.empty? && !children2.empty? &&
54
- children1.all? { |c| c.is_a?(Canon::Xml::Node) && c.node_type == :element } &&
55
+ children1.all? do |c|
56
+ c.is_a?(Canon::Xml::Node) && c.node_type == :element
57
+ end &&
55
58
  children2.all? { |c| c.is_a?(Canon::Xml::Node) && c.node_type == :element }
56
59
  end
57
60
 
@@ -140,7 +143,8 @@ module Canon
140
143
  opts, child_opts, diff_children, differences)
141
144
  # Length check
142
145
  unless children1.length == children2.length
143
- dimension = determine_dimension_for_mismatch(children1, children2, comparator)
146
+ dimension = determine_dimension_for_mismatch(children1,
147
+ children2, comparator)
144
148
  comparator.send(:add_difference, parent_node, parent_node,
145
149
  Comparison::MISSING_NODE, Comparison::MISSING_NODE,
146
150
  dimension, opts, differences)
@@ -167,15 +171,19 @@ module Canon
167
171
  (0...max_len).each do |i|
168
172
  if i >= children1.length
169
173
  # Extra child in children2
170
- dimension = comparator.send(:determine_node_dimension, children2[i])
174
+ dimension = comparator.send(:determine_node_dimension,
175
+ children2[i])
171
176
  break
172
177
  elsif i >= children2.length
173
178
  # Extra child in children1
174
- dimension = comparator.send(:determine_node_dimension, children1[i])
179
+ dimension = comparator.send(:determine_node_dimension,
180
+ children1[i])
175
181
  break
176
- elsif !comparator.send(:same_node_type?, children1[i], children2[i])
182
+ elsif !comparator.send(:same_node_type?, children1[i],
183
+ children2[i])
177
184
  # Different node types at same position
178
- dimension = comparator.send(:determine_node_dimension, children1[i])
185
+ dimension = comparator.send(:determine_node_dimension,
186
+ children1[i])
179
187
  break
180
188
  end
181
189
  end
@@ -13,21 +13,24 @@ module Canon
13
13
  #
14
14
  # @param node [String, Object] Node to parse
15
15
  # @param preprocessing [Symbol] Preprocessing mode (:none, :normalize, :c14n, :format)
16
+ # @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
16
17
  # @return [Canon::Xml::Node] Parsed node
17
- def self.parse(node, preprocessing = :none)
18
+ def self.parse(node, preprocessing = :none, preserve_whitespace: false)
18
19
  # If already a Canon::Xml::Node, return as-is
19
20
  return node if node.is_a?(Canon::Xml::Node)
20
21
 
21
22
  # If it's a Nokogiri or Moxml node, convert to DataModel
22
23
  unless node.is_a?(String)
23
- return convert_from_node(node)
24
+ return convert_from_node(node,
25
+ preserve_whitespace: preserve_whitespace)
24
26
  end
25
27
 
26
28
  # Apply preprocessing to XML string before parsing
27
29
  xml_string = apply_preprocessing(node, preprocessing)
28
30
 
29
31
  # Use Canon::Xml::DataModel for parsing to get Canon::Xml::Node instances
30
- Canon::Xml::DataModel.from_xml(xml_string)
32
+ Canon::Xml::DataModel.from_xml(xml_string,
33
+ preserve_whitespace: preserve_whitespace)
31
34
  end
32
35
 
33
36
  # Apply preprocessing transformation to XML string
@@ -55,8 +58,9 @@ module Canon
55
58
  # Convert from Nokogiri/Moxml node to Canon::Xml::Node
56
59
  #
57
60
  # @param node [Object] Nokogiri or Moxml node
61
+ # @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
58
62
  # @return [Canon::Xml::Node] Converted node
59
- def self.convert_from_node(node)
63
+ def self.convert_from_node(node, preserve_whitespace: false)
60
64
  # Convert to XML string then parse through DataModel
61
65
  xml_str = if node.respond_to?(:to_xml)
62
66
  node.to_xml
@@ -66,7 +70,8 @@ module Canon
66
70
  raise Canon::Error,
67
71
  "Unable to convert node to string: #{node.class}"
68
72
  end
69
- Canon::Xml::DataModel.from_xml(xml_str)
73
+ Canon::Xml::DataModel.from_xml(xml_str,
74
+ preserve_whitespace: preserve_whitespace)
70
75
  end
71
76
  end
72
77
  end