canon 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +112 -25
  3. data/docs/Gemfile +1 -0
  4. data/docs/_config.yml +90 -1
  5. data/docs/advanced/diff-classification.adoc +82 -2
  6. data/docs/features/match-options/index.adoc +239 -1
  7. data/lib/canon/comparison/format_detector.rb +2 -1
  8. data/lib/canon/comparison/html_comparator.rb +19 -8
  9. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  10. data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
  11. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  12. data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
  13. data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
  14. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
  15. data/lib/canon/comparison/xml_comparator.rb +48 -23
  16. data/lib/canon/comparison/xml_node_comparison.rb +25 -3
  17. data/lib/canon/diff/diff_classifier.rb +101 -2
  18. data/lib/canon/diff/formatting_detector.rb +1 -1
  19. data/lib/canon/rspec_matchers.rb +37 -8
  20. data/lib/canon/version.rb +1 -1
  21. data/lib/canon/xml/data_model.rb +24 -13
  22. metadata +3 -78
  23. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
  24. data/false_positive_analysis.txt +0 -0
  25. data/file1.html +0 -1
  26. data/file2.html +0 -1
  27. data/old-docs/ADVANCED_TOPICS.adoc +0 -20
  28. data/old-docs/BASIC_USAGE.adoc +0 -16
  29. data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
  30. data/old-docs/CLI.adoc +0 -497
  31. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  32. data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
  33. data/old-docs/DIFF_FORMATTING.adoc +0 -540
  34. data/old-docs/DIFF_PARAMETERS.adoc +0 -261
  35. data/old-docs/DOM_DIFF.adoc +0 -1017
  36. data/old-docs/ENV_CONFIG.adoc +0 -876
  37. data/old-docs/FORMATS.adoc +0 -867
  38. data/old-docs/INPUT_VALIDATION.adoc +0 -477
  39. data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
  40. data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
  41. data/old-docs/MATCH_OPTIONS.adoc +0 -912
  42. data/old-docs/MODES.adoc +0 -432
  43. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  44. data/old-docs/OPTIONS.adoc +0 -1387
  45. data/old-docs/PREPROCESSING.adoc +0 -491
  46. data/old-docs/README.old.adoc +0 -2831
  47. data/old-docs/RSPEC.adoc +0 -814
  48. data/old-docs/RUBY_API.adoc +0 -485
  49. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
  50. data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
  51. data/old-docs/STRING_COMPARE.adoc +0 -345
  52. data/old-docs/TMP.adoc +0 -3384
  53. data/old-docs/TREE_DIFF.adoc +0 -1080
  54. data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
  55. data/old-docs/VERBOSE.adoc +0 -482
  56. data/old-docs/VISUALIZATION_MAP.adoc +0 -625
  57. data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
  58. data/scripts/analyze_current_state.rb +0 -85
  59. data/scripts/analyze_false_positives.rb +0 -114
  60. data/scripts/analyze_remaining_failures.rb +0 -105
  61. data/scripts/compare_current_failures.rb +0 -95
  62. data/scripts/compare_dom_tree_diff.rb +0 -158
  63. data/scripts/compare_failures.rb +0 -151
  64. data/scripts/debug_attribute_extraction.rb +0 -66
  65. data/scripts/debug_blocks_839.rb +0 -115
  66. data/scripts/debug_meta_matching.rb +0 -52
  67. data/scripts/debug_p_matching.rb +0 -192
  68. data/scripts/debug_signature_matching.rb +0 -118
  69. data/scripts/debug_sourcecode_124.rb +0 -32
  70. data/scripts/debug_whitespace_sensitive.rb +0 -192
  71. data/scripts/extract_false_positives.rb +0 -138
  72. data/scripts/find_actual_false_positives.rb +0 -125
  73. data/scripts/investigate_all_false_positives.rb +0 -161
  74. data/scripts/investigate_batch1.rb +0 -127
  75. data/scripts/investigate_classification.rb +0 -150
  76. data/scripts/investigate_classification_detailed.rb +0 -190
  77. data/scripts/investigate_common_failures.rb +0 -342
  78. data/scripts/investigate_false_negative.rb +0 -80
  79. data/scripts/investigate_false_positive.rb +0 -83
  80. data/scripts/investigate_false_positives.rb +0 -227
  81. data/scripts/investigate_false_positives_batch.rb +0 -163
  82. data/scripts/investigate_mixed_content.rb +0 -125
  83. data/scripts/investigate_remaining_16.rb +0 -214
  84. data/scripts/run_single_test.rb +0 -29
  85. data/scripts/test_all_false_positives.rb +0 -95
  86. data/scripts/test_attribute_details.rb +0 -61
  87. data/scripts/test_both_algorithms.rb +0 -49
  88. data/scripts/test_both_simple.rb +0 -49
  89. data/scripts/test_enhanced_semantic_output.rb +0 -125
  90. data/scripts/test_readme_examples.rb +0 -131
  91. data/scripts/test_semantic_tree_diff.rb +0 -99
  92. data/scripts/test_semantic_ux_improvements.rb +0 -135
  93. data/scripts/test_single_false_positive.rb +0 -119
  94. data/scripts/test_size_limits.rb +0 -99
  95. data/test_html_1.html +0 -21
  96. data/test_html_2.html +0 -21
  97. data/test_nokogiri.rb +0 -33
  98. data/test_normalize.rb +0 -45
data/lib/canon/comparison/xml_comparator/node_type_comparator.rb CHANGED
@@ -23,7 +23,8 @@ module Canon
23
23
  # @param diff_children [Boolean] Whether to diff children
24
24
  # @param differences [Array] Array to collect differences
25
25
  # @return [Integer] Comparison result code
26
- def compare(node1, node2, comparator, opts, child_opts, diff_children, differences)
26
+ def compare(node1, node2, comparator, opts, child_opts,
27
+ diff_children, differences)
27
28
  # Dispatch based on node type
28
29
  # Canon::Xml::Node types use .node_type method that returns symbols
29
30
  # Nokogiri also has .node_type but returns integers, so check for Symbol
@@ -51,11 +52,14 @@ module Canon
51
52
  comparator.send(:compare_element_nodes, node1, node2, opts, child_opts,
52
53
  diff_children, differences)
53
54
  when :text
54
- comparator.send(:compare_text_nodes, node1, node2, opts, differences)
55
+ comparator.send(:compare_text_nodes, node1, node2, opts,
56
+ differences)
55
57
  when :comment
56
- comparator.send(:compare_comment_nodes, node1, node2, opts, differences)
58
+ comparator.send(:compare_comment_nodes, node1, node2, opts,
59
+ differences)
57
60
  when :cdata
58
- comparator.send(:compare_text_nodes, node1, node2, opts, differences)
61
+ comparator.send(:compare_text_nodes, node1, node2, opts,
62
+ differences)
59
63
  when :processing_instruction
60
64
  comparator.send(:compare_processing_instruction_nodes, node1, node2, opts,
61
65
  differences)
@@ -71,11 +75,14 @@ module Canon
71
75
  comparator.send(:compare_element_nodes, node1, node2, opts, child_opts,
72
76
  diff_children, differences)
73
77
  elsif node1.respond_to?(:text?) && node1.text?
74
- comparator.send(:compare_text_nodes, node1, node2, opts, differences)
78
+ comparator.send(:compare_text_nodes, node1, node2, opts,
79
+ differences)
75
80
  elsif node1.respond_to?(:comment?) && node1.comment?
76
- comparator.send(:compare_comment_nodes, node1, node2, opts, differences)
81
+ comparator.send(:compare_comment_nodes, node1, node2, opts,
82
+ differences)
77
83
  elsif node1.respond_to?(:cdata?) && node1.cdata?
78
- comparator.send(:compare_text_nodes, node1, node2, opts, differences)
84
+ comparator.send(:compare_text_nodes, node1, node2, opts,
85
+ differences)
79
86
  elsif node1.respond_to?(:processing_instruction?) &&
80
87
  node1.processing_instruction?
81
88
  comparator.send(:compare_processing_instruction_nodes, node1, node2, opts,
data/lib/canon/comparison/xml_comparator.rb CHANGED
@@ -18,6 +18,8 @@ require_relative "xml_comparator/namespace_comparator"
18
18
  require_relative "xml_comparator/node_type_comparator"
19
19
  require_relative "xml_comparator/child_comparison"
20
20
  require_relative "xml_comparator/diff_node_builder"
21
+ # Whitespace sensitivity module
22
+ require_relative "whitespace_sensitivity"
21
23
 
22
24
  module Canon
23
25
  module Comparison
@@ -90,9 +92,15 @@ module Canon
90
92
  # Create child_opts with resolved options
91
93
  child_opts = opts.merge(child_opts)
92
94
 
95
+ # Determine if we should preserve whitespace during parsing
96
+ # When structural_whitespace is :strict, preserve all whitespace-only text nodes
97
+ preserve_whitespace = match_opts_hash[:structural_whitespace] == :strict
98
+
93
99
  # Parse nodes if they are strings, applying preprocessing if needed
94
- node1 = parse_node(n1, match_opts_hash[:preprocessing])
95
- node2 = parse_node(n2, match_opts_hash[:preprocessing])
100
+ node1 = parse_node(n1, match_opts_hash[:preprocessing],
101
+ preserve_whitespace: preserve_whitespace)
102
+ node2 = parse_node(n2, match_opts_hash[:preprocessing],
103
+ preserve_whitespace: preserve_whitespace)
96
104
 
97
105
  # Store original strings for line diff display (before preprocessing)
98
106
  original1 = if n1.is_a?(String)
@@ -209,8 +217,9 @@ module Canon
209
217
  # Parse a node from string or return as-is
210
218
  # Applies preprocessing transformation before parsing if specified
211
219
  # Delegates to NodeParser module
212
- def parse_node(node, preprocessing = :none)
213
- XmlComparatorHelpers::NodeParser.parse(node, preprocessing)
220
+ def parse_node(node, preprocessing = :none, preserve_whitespace: false)
221
+ XmlComparatorHelpers::NodeParser.parse(node, preprocessing,
222
+ preserve_whitespace: preserve_whitespace)
214
223
  end
215
224
 
216
225
  # Main comparison dispatcher
@@ -331,7 +340,8 @@ module Canon
331
340
 
332
341
  # For HTML, check if text node is inside whitespace-preserving element
333
342
  # If so, always use strict comparison regardless of text_content setting
334
- if should_preserve_whitespace_strictly?(n1, n2)
343
+ sensitive_element = should_preserve_whitespace_strictly?(n1, n2, opts)
344
+ if sensitive_element
335
345
  behavior = :strict
336
346
  end
337
347
 
@@ -344,15 +354,23 @@ module Canon
344
354
 
345
355
  # Determine the correct dimension for this difference
346
356
  # - If text_content is :strict, ALL differences use :text_content dimension
347
- # - If text_content is :normalize, whitespace-only diffs use :structural_whitespace
357
+ # - If text_content is :normalize, whitespace-only diffs could use :structural_whitespace
358
+ # but we keep :text_content to ensure correct classification behavior
348
359
  # - Otherwise use :text_content
349
- dimension = if behavior == :normalize && whitespace_only_difference?(
350
- text1, text2
351
- )
352
- :structural_whitespace
353
- else
354
- :text_content
355
- end
360
+ # However, if element is whitespace-sensitive (like <pre> in HTML),
361
+ # always use :text_content dimension regardless of behavior
362
+ #
363
+ # NOTE: We keep the dimension as :text_content even for whitespace-only diffs
364
+ # when text_content: :normalize. This ensures that the classification uses
365
+ # the text_content behavior (:normalize) instead of structural_whitespace
366
+ # behavior (:strict for XML), which would incorrectly mark the diff as normative.
367
+ if sensitive_element
368
+ # Whitespace-sensitive element: always use :text_content dimension
369
+ else
370
+ # Always use :text_content for text differences
371
+ # This ensures correct classification based on text_content behavior
372
+ end
373
+ dimension = :text_content
356
374
 
357
375
  # Create DiffNode in verbose mode when raw content differs
358
376
  # This ensures informative diffs are created even for :ignore/:normalize
@@ -368,17 +386,23 @@ module Canon
368
386
 
369
387
  # Check if whitespace should be preserved strictly for these text nodes
370
388
  # This applies to HTML elements like pre, code, textarea, script, style
371
- def should_preserve_whitespace_strictly?(n1, n2)
372
- # Only applies to Nokogiri nodes (HTML)
373
- return false unless n1.respond_to?(:parent) && n2.respond_to?(:parent)
374
- return false unless n1.parent.respond_to?(:name) && n2.parent.respond_to?(:name)
389
+ # and elements with xml:space="preserve" or in user-configured whitelist
390
+ def should_preserve_whitespace_strictly?(n1, n2, opts)
391
+ # Use WhitespaceSensitivity module to check if element is sensitive
392
+ # Check both n1 and n2 - if either is in a sensitive element, preserve strictly
393
+ if n1.respond_to?(:parent)
394
+ sensitivity_opts = { match_opts: opts[:match_opts] }
395
+ return true if WhitespaceSensitivity.element_sensitive?(n1,
396
+ sensitivity_opts)
397
+ end
375
398
 
376
- # Elements where whitespace must be preserved in HTML
377
- preserve_elements = %w[pre code textarea script style]
399
+ if n2.respond_to?(:parent)
400
+ sensitivity_opts = { match_opts: opts[:match_opts] }
401
+ return true if WhitespaceSensitivity.element_sensitive?(n2,
402
+ sensitivity_opts)
403
+ end
378
404
 
379
- # Check if either node is inside a whitespace-preserving element
380
- in_preserve_element?(n1, preserve_elements) ||
381
- in_preserve_element?(n2, preserve_elements)
405
+ false
382
406
  end
383
407
 
384
408
  # Check if a node is inside a whitespace-preserving element
@@ -469,7 +493,8 @@ module Canon
469
493
  #
470
494
  # Delegates to ChildComparison module which handles both ElementMatcher
471
495
  # (semantic matching) and simple positional comparison.
472
- def compare_children(n1, n2, opts, child_opts, diff_children, differences)
496
+ def compare_children(n1, n2, opts, child_opts, diff_children,
497
+ differences)
473
498
  XmlComparatorHelpers::ChildComparison.compare(
474
499
  n1, n2, self, opts, child_opts, diff_children, differences
475
500
  )
data/lib/canon/comparison/xml_node_comparison.rb CHANGED
@@ -139,9 +139,13 @@ diff_children, differences)
139
139
 
140
140
  # Check structural_whitespace match option
141
141
  match_opts = opts[:match_opts]
142
- # Filter out whitespace-only text nodes
143
- if match_opts && %i[ignore
144
- normalize].include?(match_opts[:structural_whitespace]) && text_node?(node)
142
+ return false unless match_opts
143
+
144
+ # Filter out whitespace-only text nodes based on structural_whitespace setting
145
+ # - :ignore or :normalize: Filter all whitespace-only text nodes
146
+ # - :strict: Preserve all whitespace-only text nodes (don't filter any)
147
+ if text_node?(node) && %i[ignore
148
+ normalize].include?(match_opts[:structural_whitespace])
145
149
  text = node_text(node)
146
150
  return true if MatchOptions.normalize_text(text).empty?
147
151
  end
@@ -184,6 +188,24 @@ diff_children, differences)
184
188
  node.respond_to?(:node_type) && node.node_type == :text
185
189
  end
186
190
 
191
+ # Extract text content from a node
192
+ #
193
+ # @param node [Object] Node to extract text from
194
+ # @return [String] Text content
195
+ def self.node_text(node)
196
+ return "" unless node
197
+
198
+ if node.respond_to?(:content)
199
+ node.content.to_s
200
+ elsif node.respond_to?(:text)
201
+ node.text.to_s
202
+ elsif node.respond_to?(:value)
203
+ node.value.to_s
204
+ else
205
+ ""
206
+ end
207
+ end
208
+
187
209
  # Dispatch by Canon::Xml::Node type
188
210
  def self.dispatch_canon_node_type(node1, node2, opts, child_opts,
189
211
  diff_children, differences)
data/lib/canon/diff/diff_classifier.rb CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  require_relative "formatting_detector"
4
4
  require_relative "../comparison/compare_profile"
5
+ require_relative "../comparison/whitespace_sensitivity"
5
6
 
6
7
  module Canon
7
8
  module Diff
@@ -28,6 +29,28 @@ module Canon
28
29
  # @param diff_node [DiffNode] The diff node to classify
29
30
  # @return [DiffNode] The same diff node with normative/formatting attributes set
30
31
  def classify(diff_node)
32
+ # SPECIAL CASE: text_content with :normalize behavior
33
+ # When text_content is :normalize and the difference is formatting-only,
34
+ # it should be marked as non-normative (informative)
35
+ # This ensures that verbose and non-verbose modes give consistent results
36
+ #
37
+ # EXCEPTION: If the text node is inside a whitespace-sensitive element
38
+ # (like <pre>, <code>, <textarea> in HTML), don't apply formatting detection
39
+ # because whitespace should be preserved in these elements
40
+ #
41
+ # This check must come FIRST, before normative_dimension? is called,
42
+ # because normative_dimension? returns true for text_content: :normalize
43
+ # (since the dimension affects equivalence), which would prevent formatting
44
+ # detection from being applied.
45
+ if diff_node.dimension == :text_content &&
46
+ profile.send(:behavior_for, :text_content) == :normalize &&
47
+ !inside_whitespace_sensitive_element?(diff_node) &&
48
+ formatting_only_diff?(diff_node)
49
+ diff_node.formatting = true
50
+ diff_node.normative = false
51
+ return diff_node
52
+ end
53
+
31
54
  # FIRST: Determine if this dimension is normative based on CompareProfile
32
55
  # This respects the policy settings (strict/normalize/ignore)
33
56
  is_normative = profile.normative_dimension?(diff_node.dimension)
@@ -45,7 +68,7 @@ module Canon
45
68
  return diff_node
46
69
  end
47
70
 
48
- # Otherwise, use the normative determination from CompareProfile
71
+ # THIRD: Apply the normative determination from CompareProfile
49
72
  diff_node.formatting = false
50
73
  diff_node.normative = is_normative
51
74
 
@@ -65,10 +88,86 @@ module Canon
65
88
  # @param diff_node [DiffNode] The diff node to check
66
89
  # @return [Boolean] true if formatting-only
67
90
  def formatting_only_diff?(diff_node)
91
+ # Only apply formatting detection to actual text content differences
92
+ # If the nodes are not text nodes (e.g., element nodes), don't apply formatting detection
93
+ node1 = diff_node.node1
94
+ node2 = diff_node.node2
95
+
96
+ # Check if both nodes are text nodes
97
+ # If not, this is not a formatting-only difference
98
+ return false unless text_node?(node1) && text_node?(node2)
99
+
68
100
  text1 = extract_text_content(diff_node.node1)
69
101
  text2 = extract_text_content(diff_node.node2)
70
102
 
71
- FormattingDetector.formatting_only?(text1, text2)
103
+ # For text_content dimension, use normalized text comparison
104
+ # This handles cases like "" vs " " (both normalize to "")
105
+ if diff_node.dimension == :text_content
106
+ normalized_equivalent?(text1, text2)
107
+ else
108
+ FormattingDetector.formatting_only?(text1, text2)
109
+ end
110
+ end
111
+
112
+ # Check if two texts are equivalent after normalization
113
+ # This detects formatting-only differences where normalized texts match
114
+ # @param text1 [String, nil] First text
115
+ # @param text2 [String, nil] Second text
116
+ # @return [Boolean] true if normalized texts are equivalent
117
+ def normalized_equivalent?(text1, text2)
118
+ return false if text1.nil? && text2.nil?
119
+ return false if text1.nil? || text2.nil?
120
+
121
+ # Use MatchOptions.normalize_text for consistency
122
+ normalized1 = Canon::Comparison::MatchOptions.normalize_text(text1)
123
+ normalized2 = Canon::Comparison::MatchOptions.normalize_text(text2)
124
+
125
+ # If normalized texts are equivalent but originals are different,
126
+ # it's a formatting-only difference
127
+ normalized1 == normalized2 && text1 != text2
128
+ end
129
+
130
+ # Check if a node is a text node
131
+ # @param node [Object] The node to check
132
+ # @return [Boolean] true if the node is a text node
133
+ def text_node?(node)
134
+ return false if node.nil?
135
+
136
+ # Canon::Xml::Nodes::TextNode
137
+ return true if node.is_a?(Canon::Xml::Nodes::TextNode)
138
+
139
+ # Nokogiri text nodes (node_type returns integer constant like 3)
140
+ return true if node.respond_to?(:node_type) &&
141
+ node.node_type.is_a?(Integer) &&
142
+ node.node_type == Nokogiri::XML::Node::TEXT_NODE
143
+
144
+ # Moxml text nodes (node_type returns symbol)
145
+ return true if node.respond_to?(:node_type) && node.node_type == :text
146
+
147
+ # String
148
+ return true if node.is_a?(String)
149
+
150
+ # Test doubles or objects with text node-like interface
151
+ # Check if it has a value method (contains text content)
152
+ return true if node.respond_to?(:value)
153
+
154
+ false
155
+ end
156
+
157
+ # Check if the text node is inside a whitespace-sensitive element
158
+ # @param diff_node [DiffNode] The diff node to check
159
+ # @return [Boolean] true if inside a whitespace-sensitive element
160
+ def inside_whitespace_sensitive_element?(diff_node)
161
+ # Get the text node (not the parent element)
162
+ node = diff_node.node1 || diff_node.node2
163
+ return false unless node
164
+
165
+ # WhitespaceSensitivity.element_sensitive? expects a text node
166
+ # and checks its parent element
167
+ # We need to pass the full options structure with :match_opts key
168
+ opts = { match_opts: @match_options.options }
169
+
170
+ Canon::Comparison::WhitespaceSensitivity.element_sensitive?(node, opts)
72
171
  end
73
172
 
74
173
  # Extract text content from a node for formatting comparison
data/lib/canon/diff/formatting_detector.rb CHANGED
@@ -11,7 +11,7 @@ module Canon
11
11
  # @param line2 [String, nil] Second line to compare
12
12
  # @return [Boolean] true if lines differ only in formatting
13
13
  def self.formatting_only?(line1, line2)
14
- # If both are nil or empty, not a formatting diff
14
+ # If both are nil or empty, not a formatting diff (no difference)
15
15
  return false if blank?(line1) && blank?(line2)
16
16
 
17
17
  # If only one is blank, it's not just formatting
data/lib/canon/rspec_matchers.rb CHANGED
@@ -63,6 +63,15 @@ module Canon
63
63
  self
64
64
  end
65
65
 
66
+ # Chain method for setting match options
67
+ # @param match_opts [Hash] match options
68
+ # @return [SerializationMatcher] self for chaining
69
+ def with_match(**match_opts)
70
+ @match ||= {}
71
+ @match = @match.merge(match_opts)
72
+ self
73
+ end
74
+
66
75
  def matches?(target)
67
76
  @target = target
68
77
 
@@ -252,12 +261,22 @@ module Canon
252
261
  diff_algorithm: diff_algorithm)
253
262
  end
254
263
 
255
- def be_yaml_equivalent_to(expected)
256
- SerializationMatcher.new(expected, :yaml)
264
+ def be_yaml_equivalent_to(expected, match_profile: nil, match: nil,
265
+ preprocessing: nil, diff_algorithm: nil)
266
+ SerializationMatcher.new(expected, :yaml,
267
+ match_profile: match_profile,
268
+ match: match,
269
+ preprocessing: preprocessing,
270
+ diff_algorithm: diff_algorithm)
257
271
  end
258
272
 
259
- def be_json_equivalent_to(expected)
260
- SerializationMatcher.new(expected, :json)
273
+ def be_json_equivalent_to(expected, match_profile: nil, match: nil,
274
+ preprocessing: nil, diff_algorithm: nil)
275
+ SerializationMatcher.new(expected, :json,
276
+ match_profile: match_profile,
277
+ match: match,
278
+ preprocessing: preprocessing,
279
+ diff_algorithm: diff_algorithm)
261
280
  end
262
281
 
263
282
  def be_html_equivalent_to(expected, match_profile: nil, match: nil,
@@ -287,12 +306,22 @@ module Canon
287
306
  diff_algorithm: diff_algorithm)
288
307
  end
289
308
 
290
- def be_equivalent_to(expected)
291
- SerializationMatcher.new(expected, nil)
309
+ def be_equivalent_to(expected, match_profile: nil, match: nil,
310
+ preprocessing: nil, diff_algorithm: nil)
311
+ SerializationMatcher.new(expected, nil,
312
+ match_profile: match_profile,
313
+ match: match,
314
+ preprocessing: preprocessing,
315
+ diff_algorithm: diff_algorithm)
292
316
  end
293
317
 
294
- def be_string_equivalent_to(expected)
295
- SerializationMatcher.new(expected, :string)
318
+ def be_string_equivalent_to(expected, match_profile: nil, match: nil,
319
+ preprocessing: nil, diff_algorithm: nil)
320
+ SerializationMatcher.new(expected, :string,
321
+ match_profile: match_profile,
322
+ match: match,
323
+ preprocessing: preprocessing,
324
+ diff_algorithm: diff_algorithm)
296
325
  end
297
326
 
298
327
  if defined?(::RSpec) && ::RSpec.respond_to?(:configure)
data/lib/canon/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Canon
4
- VERSION = "0.1.8"
4
+ VERSION = "0.1.9"
5
5
  end
data/lib/canon/xml/data_model.rb CHANGED
@@ -18,8 +18,9 @@ module Canon
18
18
  # Build XPath data model from XML string
19
19
  #
20
20
  # @param xml_string [String] XML content to parse
21
+ # @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
21
22
  # @return [Nodes::RootNode] Root of the data model tree
22
- def self.from_xml(xml_string)
23
+ def self.from_xml(xml_string, preserve_whitespace: false)
23
24
  # Parse with Nokogiri
24
25
  doc = Nokogiri::XML(xml_string) do |config|
25
26
  config.nonet # Disable network access
@@ -30,7 +31,7 @@ module Canon
30
31
  check_for_relative_namespace_uris(doc)
31
32
 
32
33
  # Convert to XPath data model
33
- build_from_nokogiri(doc)
34
+ build_from_nokogiri(doc, preserve_whitespace: preserve_whitespace)
34
35
  end
35
36
 
36
37
  # Alias for compatibility with base class interface
@@ -74,19 +75,21 @@ module Canon
74
75
 
75
76
  # Build XPath data model from Nokogiri document or fragment
76
77
  # rubocop:disable Metrics/MethodLength
77
- def self.build_from_nokogiri(nokogiri_doc)
78
+ def self.build_from_nokogiri(nokogiri_doc, preserve_whitespace: false)
78
79
  root = Nodes::RootNode.new
79
80
 
80
81
  if nokogiri_doc.respond_to?(:root) && nokogiri_doc.root
81
82
  # For Documents (XML, HTML4, HTML5, Moxml): process the root element
82
- root.add_child(build_element_node(nokogiri_doc.root))
83
+ root.add_child(build_element_node(nokogiri_doc.root,
84
+ preserve_whitespace: preserve_whitespace))
83
85
 
84
86
  # Process PIs and comments outside doc element
85
87
  nokogiri_doc.children.each do |child|
86
88
  next if child == nokogiri_doc.root
87
89
  next if child.is_a?(Nokogiri::XML::DTD)
88
90
 
89
- node = build_node_from_nokogiri(child)
91
+ node = build_node_from_nokogiri(child,
92
+ preserve_whitespace: preserve_whitespace)
90
93
  root.add_child(node) if node
91
94
  end
92
95
  else
@@ -95,7 +98,8 @@ module Canon
95
98
  nokogiri_doc.children.each do |child|
96
99
  next if child.is_a?(Nokogiri::XML::DTD)
97
100
 
98
- node = build_node_from_nokogiri(child)
101
+ node = build_node_from_nokogiri(child,
102
+ preserve_whitespace: preserve_whitespace)
99
103
  root.add_child(node) if node
100
104
  end
101
105
  end
@@ -104,12 +108,15 @@ module Canon
104
108
  end
105
109
 
106
110
  # Build node from Nokogiri node
107
- def self.build_node_from_nokogiri(nokogiri_node)
111
+ def self.build_node_from_nokogiri(nokogiri_node,
112
+ preserve_whitespace: false)
108
113
  case nokogiri_node
109
114
  when Nokogiri::XML::Element
110
- build_element_node(nokogiri_node)
115
+ build_element_node(nokogiri_node,
116
+ preserve_whitespace: preserve_whitespace)
111
117
  when Nokogiri::XML::Text
112
- build_text_node(nokogiri_node)
118
+ build_text_node(nokogiri_node,
119
+ preserve_whitespace: preserve_whitespace)
113
120
  when Nokogiri::XML::Comment
114
121
  build_comment_node(nokogiri_node)
115
122
  when Nokogiri::XML::ProcessingInstruction
@@ -119,7 +126,7 @@ module Canon
119
126
 
120
127
  # Build element node from Nokogiri element
121
128
  # rubocop:disable Metrics/MethodLength
122
- def self.build_element_node(nokogiri_element)
129
+ def self.build_element_node(nokogiri_element, preserve_whitespace: false)
123
130
  element = Nodes::ElementNode.new(
124
131
  name: nokogiri_element.name,
125
132
  namespace_uri: nokogiri_element.namespace&.href,
@@ -134,7 +141,8 @@ module Canon
134
141
 
135
142
  # Build child nodes
136
143
  nokogiri_element.children.each do |child|
137
- node = build_node_from_nokogiri(child)
144
+ node = build_node_from_nokogiri(child,
145
+ preserve_whitespace: preserve_whitespace)
138
146
  element.add_child(node) if node
139
147
  end
140
148
 
@@ -195,13 +203,16 @@ module Canon
195
203
  end
196
204
 
197
205
  # Build text node from Nokogiri text node
198
- def self.build_text_node(nokogiri_text)
206
+ def self.build_text_node(nokogiri_text, preserve_whitespace: false)
199
207
  # XML text nodes: preserve all content including whitespace
200
208
  # Unlike HTML, XML treats all whitespace as significant
201
209
  content = nokogiri_text.content
202
210
 
203
211
  # Skip empty text nodes between elements (common formatting whitespace)
204
- return nil if content.strip.empty? && nokogiri_text.parent.is_a?(Nokogiri::XML::Element)
212
+ # UNLESS preserve_whitespace is true (for structural_whitespace: :strict)
213
+ if !preserve_whitespace && content.strip.empty? && nokogiri_text.parent.is_a?(Nokogiri::XML::Element)
214
+ return nil
215
+ end
205
216
 
206
217
  # Nokogiri already handles CDATA conversion and entity resolution
207
218
  Nodes::TextNode.new(value: content)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: canon
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.8
4
+ version: 0.1.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-01-18 00:00:00.000000000 Z
11
+ date: 2026-01-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: diff-lcs
@@ -174,7 +174,6 @@ files:
174
174
  - docs/internals/diffnode-enrichment.adoc
175
175
  - docs/internals/index.adoc
176
176
  - docs/lychee.toml
177
- - docs/plans/2025-01-17-html-parser-selection-fix.adoc
178
177
  - docs/reference/cli-options.adoc
179
178
  - docs/reference/environment-variables.adoc
180
179
  - docs/reference/index.adoc
@@ -191,9 +190,6 @@ files:
191
190
  - docs/understanding/formats/yaml.adoc
192
191
  - docs/understanding/index.adoc
193
192
  - exe/canon
194
- - false_positive_analysis.txt
195
- - file1.html
196
- - file2.html
197
193
  - lib/canon.rb
198
194
  - lib/canon/cache.rb
199
195
  - lib/canon/cli.rb
@@ -230,6 +226,7 @@ files:
230
226
  - lib/canon/comparison/strategies/base_match_strategy.rb
231
227
  - lib/canon/comparison/strategies/match_strategy_factory.rb
232
228
  - lib/canon/comparison/strategies/semantic_tree_match_strategy.rb
229
+ - lib/canon/comparison/whitespace_sensitivity.rb
233
230
  - lib/canon/comparison/xml_comparator.rb
234
231
  - lib/canon/comparison/xml_comparator/attribute_comparator.rb
235
232
  - lib/canon/comparison/xml_comparator/attribute_filter.rb
@@ -344,79 +341,7 @@ files:
344
341
  - lib/canon/xml/whitespace_normalizer.rb
345
342
  - lib/canon/xml/xml_base_handler.rb
346
343
  - lib/xml-c14n.rb
347
- - old-docs/ADVANCED_TOPICS.adoc
348
- - old-docs/BASIC_USAGE.adoc
349
- - old-docs/CHARACTER_VISUALIZATION.adoc
350
- - old-docs/CLI.adoc
351
- - old-docs/CUSTOMIZING_BEHAVIOR.adoc
352
- - old-docs/DIFF_ARCHITECTURE.adoc
353
- - old-docs/DIFF_FORMATTING.adoc
354
- - old-docs/DIFF_PARAMETERS.adoc
355
- - old-docs/DOM_DIFF.adoc
356
- - old-docs/ENV_CONFIG.adoc
357
- - old-docs/FORMATS.adoc
358
- - old-docs/INPUT_VALIDATION.adoc
359
- - old-docs/MATCHER_BEHAVIOR.adoc
360
- - old-docs/MATCH_ARCHITECTURE.adoc
361
- - old-docs/MATCH_OPTIONS.adoc
362
- - old-docs/MODES.adoc
363
- - old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc
364
- - old-docs/OPTIONS.adoc
365
- - old-docs/PREPROCESSING.adoc
366
- - old-docs/README.old.adoc
367
- - old-docs/RSPEC.adoc
368
- - old-docs/RUBY_API.adoc
369
- - old-docs/SEMANTIC_DIFF_REPORT.adoc
370
- - old-docs/SEMANTIC_TREE_DIFF.adoc
371
- - old-docs/STRING_COMPARE.adoc
372
- - old-docs/TMP.adoc
373
- - old-docs/TREE_DIFF.adoc
374
- - old-docs/UNDERSTANDING_CANON.adoc
375
- - old-docs/VERBOSE.adoc
376
- - old-docs/VISUALIZATION_MAP.adoc
377
- - old-docs/WHITESPACE_TREATMENT.adoc
378
- - scripts/analyze_current_state.rb
379
- - scripts/analyze_false_positives.rb
380
- - scripts/analyze_remaining_failures.rb
381
- - scripts/compare_current_failures.rb
382
- - scripts/compare_dom_tree_diff.rb
383
- - scripts/compare_failures.rb
384
- - scripts/debug_attribute_extraction.rb
385
- - scripts/debug_blocks_839.rb
386
- - scripts/debug_meta_matching.rb
387
- - scripts/debug_p_matching.rb
388
- - scripts/debug_signature_matching.rb
389
- - scripts/debug_sourcecode_124.rb
390
- - scripts/debug_whitespace_sensitive.rb
391
- - scripts/extract_false_positives.rb
392
- - scripts/find_actual_false_positives.rb
393
- - scripts/investigate_all_false_positives.rb
394
- - scripts/investigate_batch1.rb
395
- - scripts/investigate_classification.rb
396
- - scripts/investigate_classification_detailed.rb
397
- - scripts/investigate_common_failures.rb
398
- - scripts/investigate_false_negative.rb
399
- - scripts/investigate_false_positive.rb
400
- - scripts/investigate_false_positives.rb
401
- - scripts/investigate_false_positives_batch.rb
402
- - scripts/investigate_mixed_content.rb
403
- - scripts/investigate_remaining_16.rb
404
- - scripts/run_single_test.rb
405
- - scripts/test_all_false_positives.rb
406
- - scripts/test_attribute_details.rb
407
- - scripts/test_both_algorithms.rb
408
- - scripts/test_both_simple.rb
409
- - scripts/test_enhanced_semantic_output.rb
410
- - scripts/test_readme_examples.rb
411
- - scripts/test_semantic_tree_diff.rb
412
- - scripts/test_semantic_ux_improvements.rb
413
- - scripts/test_single_false_positive.rb
414
- - scripts/test_size_limits.rb
415
344
  - sig/xml/c14n.rbs
416
- - test_html_1.html
417
- - test_html_2.html
418
- - test_nokogiri.rb
419
- - test_normalize.rb
420
345
  homepage: https://github.com/lutaml/canon
421
346
  licenses:
422
347
  - BSD-2-Clause