canon 0.1.17 → 0.1.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4bf32847db2d5c418daebb9ad5221646edecaf4b67b4f25c4e2a9e8a68167a8e
4
- data.tar.gz: 6e595f08701e61f73ad62dc5aec3ec3b95da8f41fc75d579e70721f2d9af42e5
3
+ metadata.gz: fae901023e2945c8ee14c48a6de4ce793d2735d9f2b098ba9f727b9c0f10e8ad
4
+ data.tar.gz: 84ed342a12b39a77394275e6159eb16cc60f331c80ed161a6cd4fccc957dc06d
5
5
  SHA512:
6
- metadata.gz: 42a21e5e1badd2c1b96b1b86dce89551ee5b0794150fd2844b345fcabeb3d9bb484ca3beb423209e0bd455887d3597aa7d5973aaa0985ee77c450f20ff755866
7
- data.tar.gz: 8799d74f6a3738317387336308a3f95ffabaa5779d96dbde0ee9bccc424d360131230752031b0a0ee907af5907134b1ca8dec75e8cd0024fb600e090d3b681b7
6
+ metadata.gz: d88d544b3b961dfa5c0f9fb806f51a473e29b0a018a22dbc9ea2aebaaf459a3aa6317d1cb22c2d5dd32d69eb6162389be274a06886566ab9b63f83e613c4b276
7
+ data.tar.gz: cc409487c2c38791ec915584a8ebca85672ac7add362c78b5ace99bfd1a1657c6907ac387f321dbc12eda2613ffe998ae7fecdfce6895ff5e285f1f6022d250f
data/.rubocop_todo.yml CHANGED
@@ -1,6 +1,6 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2026-03-24 03:04:40 UTC using RuboCop version 1.85.1.
3
+ # on 2026-03-24 10:43:04 UTC using RuboCop version 1.85.1.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
@@ -11,13 +11,76 @@ Gemspec/RequiredRubyVersion:
11
11
  Exclude:
12
12
  - 'canon.gemspec'
13
13
 
14
- # Offense count: 802
14
+ # Offense count: 2
15
+ # This cop supports safe autocorrection (--autocorrect).
16
+ # Configuration parameters: EnforcedStyle, IndentationWidth.
17
+ # SupportedStyles: with_first_argument, with_fixed_indentation
18
+ Layout/ArgumentAlignment:
19
+ Exclude:
20
+ - 'lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb'
21
+
22
+ # Offense count: 1
23
+ # This cop supports safe autocorrection (--autocorrect).
24
+ # Configuration parameters: EnforcedStyle, IndentationWidth.
25
+ # SupportedStyles: with_first_element, with_fixed_indentation
26
+ Layout/ArrayAlignment:
27
+ Exclude:
28
+ - 'lib/canon/diff/path_builder.rb'
29
+
30
+ # Offense count: 6
31
+ # This cop supports safe autocorrection (--autocorrect).
32
+ Layout/ElseAlignment:
33
+ Exclude:
34
+ - 'lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb'
35
+
36
+ # Offense count: 2
37
+ # This cop supports safe autocorrection (--autocorrect).
38
+ # Configuration parameters: EnforcedStyleAlignWith.
39
+ # SupportedStylesAlignWith: keyword, variable, start_of_line
40
+ Layout/EndAlignment:
41
+ Exclude:
42
+ - 'lib/canon/diff/path_builder.rb'
43
+ - 'lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb'
44
+
45
+ # Offense count: 1
46
+ # This cop supports safe autocorrection (--autocorrect).
47
+ # Configuration parameters: AllowForAlignment, AllowBeforeTrailingComments, ForceEqualSignAlignment.
48
+ Layout/ExtraSpacing:
49
+ Exclude:
50
+ - 'lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb'
51
+
52
+ # Offense count: 1
53
+ # This cop supports safe autocorrection (--autocorrect).
54
+ # Configuration parameters: EnforcedStyle.
55
+ # SupportedStyles: normal, indented_internal_methods
56
+ Layout/IndentationConsistency:
57
+ Exclude:
58
+ - 'lib/canon/diff/path_builder.rb'
59
+
60
+ # Offense count: 8
61
+ # This cop supports safe autocorrection (--autocorrect).
62
+ # Configuration parameters: Width, EnforcedStyleAlignWith, AllowedPatterns.
63
+ # SupportedStylesAlignWith: start_of_line, relative_to_receiver
64
+ Layout/IndentationWidth:
65
+ Exclude:
66
+ - 'lib/canon/diff/path_builder.rb'
67
+ - 'lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb'
68
+
69
+ # Offense count: 841
15
70
  # This cop supports safe autocorrection (--autocorrect).
16
71
  # Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
17
72
  # URISchemes: http, https
18
73
  Layout/LineLength:
19
74
  Enabled: false
20
75
 
76
+ # Offense count: 3
77
+ # This cop supports safe autocorrection (--autocorrect).
78
+ # Configuration parameters: AllowInHeredoc.
79
+ Layout/TrailingWhitespace:
80
+ Exclude:
81
+ - 'lib/canon/diff/path_builder.rb'
82
+ - 'lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb'
83
+
21
84
  # Offense count: 49
22
85
  # Configuration parameters: IgnoreLiteralBranches, IgnoreConstantBranches, IgnoreDuplicateElseBranch.
23
86
  Lint/DuplicateBranch:
@@ -58,7 +121,7 @@ Lint/UnusedMethodArgument:
58
121
  - 'lib/canon/diff_formatter/by_line/xml_formatter.rb'
59
122
  - 'lib/canon/diff_formatter/by_object/base_formatter.rb'
60
123
 
61
- # Offense count: 235
124
+ # Offense count: 238
62
125
  # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
63
126
  Metrics/AbcSize:
64
127
  Enabled: false
@@ -69,12 +132,12 @@ Metrics/AbcSize:
69
132
  Metrics/BlockLength:
70
133
  Max: 84
71
134
 
72
- # Offense count: 192
135
+ # Offense count: 196
73
136
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
74
137
  Metrics/CyclomaticComplexity:
75
138
  Enabled: false
76
139
 
77
- # Offense count: 401
140
+ # Offense count: 405
78
141
  # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
79
142
  Metrics/MethodLength:
80
143
  Max: 95
@@ -84,7 +147,7 @@ Metrics/MethodLength:
84
147
  Metrics/ParameterLists:
85
148
  Max: 9
86
149
 
87
- # Offense count: 158
150
+ # Offense count: 162
88
151
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
89
152
  Metrics/PerceivedComplexity:
90
153
  Enabled: false
@@ -121,7 +184,7 @@ Performance/CollectionLiteralInLoop:
121
184
  RSpec/ContextWording:
122
185
  Enabled: false
123
186
 
124
- # Offense count: 27
187
+ # Offense count: 30
125
188
  # Configuration parameters: IgnoredMetadata.
126
189
  RSpec/DescribeClass:
127
190
  Enabled: false
@@ -132,7 +195,7 @@ RSpec/DescribeMethod:
132
195
  - 'spec/canon/comparison/multiple_differences_spec.rb'
133
196
  - 'spec/canon/diff_formatter/character_map_customization_spec.rb'
134
197
 
135
- # Offense count: 695
198
+ # Offense count: 699
136
199
  # Configuration parameters: CountAsOne.
137
200
  RSpec/ExampleLength:
138
201
  Max: 43
@@ -184,7 +247,7 @@ RSpec/MultipleDescribes:
184
247
  Exclude:
185
248
  - 'spec/canon/comparison/match_options_spec.rb'
186
249
 
187
- # Offense count: 536
250
+ # Offense count: 537
188
251
  RSpec/MultipleExpectations:
189
252
  Max: 15
190
253
 
@@ -217,6 +280,11 @@ RSpec/NoExpectationExample:
217
280
  - 'spec/canon/isodoc_blockquotes_spec.rb'
218
281
  - 'spec/canon/match_scenarios_spec.rb'
219
282
 
283
+ # Offense count: 2
284
+ RSpec/RepeatedExample:
285
+ Exclude:
286
+ - 'spec/canon/comparison/encoding_normalization_spec.rb'
287
+
220
288
  # Offense count: 7
221
289
  # Configuration parameters: CustomTransform, IgnoreMethods, IgnoreMetadata, InflectorPath, EnforcedInflector.
222
290
  # SupportedInflectors: default, active_support
@@ -230,7 +298,7 @@ RSpec/SpecFilePathFormat:
230
298
  - 'spec/canon/yaml/formatter_spec.rb'
231
299
  - 'spec/xml_c14n_spec.rb'
232
300
 
233
- # Offense count: 126
301
+ # Offense count: 128
234
302
  # Configuration parameters: IgnoreNameless, IgnoreSymbolicNames.
235
303
  RSpec/VerifiedDoubles:
236
304
  Exclude:
@@ -239,6 +307,7 @@ RSpec/VerifiedDoubles:
239
307
  - 'spec/canon/diff/diff_classifier_spec.rb'
240
308
  - 'spec/canon/diff/path_builder_spec.rb'
241
309
  - 'spec/canon/diff/xml_serialization_formatter_spec.rb'
310
+ - 'spec/canon/diff_formatter/diff_detail_formatter_spec.rb'
242
311
  - 'spec/canon/tree_diff/operation_converter_spec.rb'
243
312
 
244
313
  # Offense count: 1
@@ -263,9 +332,37 @@ Style/IdenticalConditionalBranches:
263
332
  - 'lib/canon/diff_formatter/by_object/base_formatter.rb'
264
333
  - 'lib/canon/diff_formatter/legend.rb'
265
334
 
335
+ # Offense count: 2
336
+ # This cop supports safe autocorrection (--autocorrect).
337
+ # Configuration parameters: AllowMethodComparison, ComparisonsThreshold.
338
+ Style/MultipleComparison:
339
+ Exclude:
340
+ - 'lib/canon/diff/path_builder.rb'
341
+
266
342
  # Offense count: 1
267
343
  # Configuration parameters: AllowedMethods.
268
344
  # AllowedMethods: respond_to_missing?
269
345
  Style/OptionalBooleanParameter:
270
346
  Exclude:
271
347
  - 'lib/canon/diff_formatter/debug_output.rb'
348
+
349
+ # Offense count: 1
350
+ # This cop supports safe autocorrection (--autocorrect).
351
+ Style/RedundantParentheses:
352
+ Exclude:
353
+ - 'lib/canon/diff/path_builder.rb'
354
+
355
+ # Offense count: 1
356
+ # This cop supports safe autocorrection (--autocorrect).
357
+ # Configuration parameters: AllowModifier.
358
+ Style/SoleNestedConditional:
359
+ Exclude:
360
+ - 'lib/canon/diff/path_builder.rb'
361
+
362
+ # Offense count: 3
363
+ # This cop supports safe autocorrection (--autocorrect).
364
+ # Configuration parameters: EnforcedStyleForMultiline.
365
+ # SupportedStylesForMultiline: comma, consistent_comma, diff_comma, no_comma
366
+ Style/TrailingCommaInArguments:
367
+ Exclude:
368
+ - 'spec/canon/diff_formatter/diff_detail_formatter_spec.rb'
data/README.adoc CHANGED
@@ -214,6 +214,7 @@ Compare documents based on meaning, not formatting:
214
214
  * Comment handling with display control
215
215
  * Multiple match dimensions with behaviors
216
216
  * Predefined match profiles (strict, rendered, spec_friendly, content_only)
217
+ * **Cross-encoding comparison**: Compare XML documents with different character encodings (UTF-8, Shift_JIS, ISO-8859-1, UTF-16) — Canon automatically normalizes to UTF-8 before comparison
217
218
 
218
219
  See link:docs/MATCH_OPTIONS[Match options] for details.
219
220
 
@@ -151,6 +151,43 @@ sensitivity in XML instance documents:
151
151
  </text>
152
152
  ----
153
153
 
154
+ The `xml:space` attribute affects both structural whitespace and text content:
155
+
156
+ * **Structural whitespace** (whitespace-only text nodes between child elements)
157
+ * **Text content whitespace** (whitespace within text nodes)
158
+
159
+ .xml:space with structural_whitespace
160
+ [example]
161
+ ====
162
+ [source,ruby]
163
+ ----
164
+ # With xml:space="preserve", structural whitespace is preserved
165
+ xml1 = "<root xml:space='preserve'>\n <text>Hello</text>\n</root>"
166
+ xml2 = "<root xml:space='preserve'><text>Hello</text></root>"
167
+
168
+ # These are NOT equivalent (structural whitespace differs)
169
+ Canon::Comparison.equivalent?(xml1, xml2)
170
+ # => false
171
+ ----
172
+ ====
173
+
174
+ .xml:space with text_content
175
+ [example]
176
+ ====
177
+ [source,ruby]
178
+ ----
179
+ # With xml:space="preserve", text content whitespace is preserved
180
+ xml1 = '<root xml:space="preserve"><code> indented </code></root>'
181
+ xml2 = '<root xml:space="preserve"><code>indented</code></root>'
182
+
183
+ # These are NOT equivalent (text whitespace differs)
184
+ Canon::Comparison.equivalent?(xml1, xml2,
185
+ match: { text_content: :strict }
186
+ )
187
+ # => false
188
+ ----
189
+ ====
190
+
154
191
  ==== Whitelist and blacklist options
155
192
 
156
193
  You can explicitly specify which elements are whitespace-sensitive using either short or long option names:
@@ -260,29 +297,44 @@ Canon::Comparison.equivalent?(xml1, xml2,
260
297
 
261
298
  ==== Examples
262
299
 
263
- .Using xml:space attribute
300
+ .Using xml:space="preserve" for structural whitespace
301
+ [source,ruby]
302
+ ----
303
+ xml1 = "<root xml:space='preserve'>\n <text>Hello</text>\n</root>"
304
+ xml2 = "<root xml:space='preserve'><text>Hello</text></root>"
305
+
306
+ # Structural whitespace differs - NOT equivalent
307
+ Canon::Comparison.equivalent?(xml1, xml2)
308
+ # => false
309
+ ----
310
+
311
+ .Using xml:space="preserve" for text content
264
312
  [source,ruby]
265
313
  ----
266
- xml1 = '<root><code xml:space="preserve"> indented </code></root>'
267
- xml2 = '<root><code xml:space="preserve">indented</code></root>'
314
+ xml1 = '<root><code xml:space="preserve"> multiple spaces </code></root>'
315
+ xml2 = '<root><code xml:space="preserve">multiple spaces</code></root>'
268
316
 
269
- # These are NOT equivalent (whitespace matters in xml:space="preserve")
317
+ # Text content whitespace differs - NOT equivalent with text_content: :strict
270
318
  Canon::Comparison.equivalent?(xml1, xml2,
271
- match: { structural_whitespace: :strict }
319
+ match: { text_content: :strict }
272
320
  )
273
321
  # => false
274
322
  ----
275
323
 
276
- .Using whitelist
324
+ .Using sensitive_elements whitelist
277
325
  [source,ruby]
278
326
  ----
279
- # Make <p> elements whitespace-sensitive (strings, not symbols)
327
+ # Make <sample> elements whitespace-sensitive (strings, not symbols)
328
+ xml1 = "<sample>\n content\n</sample>"
329
+ xml2 = "<sample>content</sample>"
330
+
280
331
  Canon::Comparison.equivalent?(xml1, xml2,
281
332
  match: {
282
333
  structural_whitespace: :strict,
283
- sensitive_elements: ["p", "pre"]
334
+ sensitive_elements: ["sample"]
284
335
  }
285
336
  )
337
+ # => false (structural whitespace differs in <sample>)
286
338
  ----
287
339
 
288
340
  .Overriding HTML defaults
@@ -340,6 +340,44 @@ Special attributes like `xml:lang`, `xml:space`, `xml:id`, and `xml:base` are pr
340
340
  When `xml:space="preserve"` is set, whitespace is preserved in descendants.
341
341
  ----
342
342
 
343
+ === Cross-encoding comparison
344
+
345
+ Canon automatically normalizes XML character encodings before comparison, enabling
346
+ cross-encoding comparisons to work correctly.
347
+
348
+ **Supported encodings**: UTF-8, UTF-16 (all variants), Shift_JIS, EUC-JP, ISO-8859-1, and more.
349
+
350
+ **How it works**:
351
+
352
+ 1. Extract the declared encoding from the XML declaration (e.g., `encoding="Shift_JIS"`)
353
+ 2. If declared encoding differs from UTF-8, transcode to UTF-8
354
+ 3. Handle cases where the declared encoding doesn't match actual bytes
355
+ 4. Use safe transcoding with replacement characters for invalid sequences
356
+
357
+ .Cross-encoding comparison example
358
+ [example]
359
+ ====
360
+ [source,ruby]
361
+ ----
362
+ # UTF-8 vs Shift_JIS - automatically normalized
363
+ xml1 = "<root>日本語</root>" # UTF-8
364
+ xml2 = "<root>日本語</root>".encode("Shift_JIS") # Shift_JIS
365
+
366
+ Canon::Comparison.equivalent?(xml1, xml2)
367
+ # => true (automatically transcoded to UTF-8 before comparison)
368
+
369
+ # ASCII content works across all encodings
370
+ xml3 = "<root>hello</root>"
371
+ xml4 = "<root>hello</root>".encode("ISO-8859-1")
372
+
373
+ Canon::Comparison.equivalent?(xml3, xml4)
374
+ # => true
375
+ ----
376
+ ====
377
+
378
+ This means you can compare XML files from different sources or systems without
379
+ worrying about their native encoding.
380
+
343
381
  == Usage examples
344
382
 
345
383
  === Basic XML comparison
data/lib/canon/cache.rb CHANGED
@@ -89,7 +89,8 @@ module Canon
89
89
  # @return [String] Cache key
90
90
  def key_for_format_detection(content)
91
91
  # Use first 101 chars (indices 0..100) for quick key, plus length
92
- preview = content[0..100]
92
+ # Force to binary to avoid encoding compatibility issues
93
+ preview = content[0..100].b
93
94
  digest = Digest::SHA256.hexdigest(preview + content.length.to_s)
94
95
  "fmt:#{digest[0..16]}"
95
96
  end
@@ -62,7 +62,21 @@ module Canon
62
62
  # @param str [String] String to detect format of
63
63
  # @return [Symbol] Format type
64
64
  def detect_string_uncached(str)
65
- trimmed = str.strip
65
+ # Convert to UTF-8 for consistent handling if possible
66
+ # This handles cases like UTF-16 encoded XML that would otherwise fail string operations
67
+ str_utf8 = if ["UTF-16", "UTF-16BE",
68
+ "UTF-16LE"].include?(str.encoding.name)
69
+ begin
70
+ str.encode("UTF-8", str.encoding, invalid: :replace,
71
+ undef: :replace, replace: "?")
72
+ rescue EncodingError
73
+ str.dup.force_encoding("BINARY").encode("UTF-8")
74
+ end
75
+ else
76
+ str
77
+ end
78
+
79
+ trimmed = str_utf8.strip
66
80
 
67
81
  # YAML indicators
68
82
  return :yaml if trimmed.start_with?("---")
@@ -89,6 +89,15 @@ module Canon
89
89
  insensitive = (insensitive_raw || []).map(&:to_s)
90
90
  return false if insensitive.include?(elem_name)
91
91
 
92
+ # Consult xml:space only when the match options say to respect it (user can override)
93
+ if respect_xml_space?(match_opts)
94
+ # Check xml:space="preserve" (document declaration)
95
+ return true if xml_space_preserve?(element)
96
+
97
+ # Check xml:space="default" (use configured behavior)
98
+ return false if xml_space_default?(element)
99
+ end
100
+
92
101
  # Whitelist: preserve whitespace
93
102
  sensitive = resolved_sensitive_elements(match_opts)
94
103
  return true if sensitive.include?(elem_name)
@@ -25,6 +25,9 @@ module Canon
25
25
  preserve_whitespace: preserve_whitespace)
26
26
  end
27
27
 
28
+ # Normalize encoding before preprocessing (UTF-16 strings can't use strip, etc.)
29
+ node = Canon::Xml::DataModel.normalize_encoding(node)
30
+
28
31
  # Apply preprocessing to XML string before parsing
29
32
  xml_string = apply_preprocessing(node, preprocessing).strip
30
33
 
@@ -83,6 +83,20 @@ module Canon
83
83
  # Get ordinal index (position among siblings with same label)
84
84
  index = ordinal_index(tree_node)
85
85
 
86
+ # For text nodes, use parent element name for clarity
87
+ # e.g., instead of "/p/#text[0]" use "/p/text()[0]"
88
+ if ["text",
89
+ "#text"].include?(label) && tree_node.respond_to?(:parent) && tree_node.parent
90
+ parent_name = if tree_node.parent.respond_to?(:label)
91
+ tree_node.parent.label
92
+ elsif tree_node.parent.respond_to?(:name)
93
+ tree_node.parent.name
94
+ end
95
+ if parent_name && parent_name != "#document" && parent_name != "#document-fragment"
96
+ return "#{parent_name}/text()[#{index}]"
97
+ end
98
+ end
99
+
86
100
  "#{label}[#{index}]"
87
101
  end
88
102
 
@@ -340,10 +340,24 @@ module Canon
340
340
  TextUtils.visualize_whitespace(text2), :green, use_color
341
341
  )
342
342
  else
343
- detail1 = ColorHelper.colorize(format_json_value(text1), :red,
344
- use_color)
345
- detail2 = ColorHelper.colorize(format_json_value(text2), :green,
346
- use_color)
343
+ # Escape non-ASCII characters for better terminal display
344
+ # JSON.generate doesn't escape chars like NBSP (U+00A0) or em-dash (U+2014)
345
+ detail1 = if TextUtils.needs_escaping?(text1)
346
+ ColorHelper.colorize(
347
+ TextUtils.escape_for_display(text1), :red, use_color
348
+ )
349
+ else
350
+ ColorHelper.colorize(format_json_value(text1), :red,
351
+ use_color)
352
+ end
353
+ detail2 = if TextUtils.needs_escaping?(text2)
354
+ ColorHelper.colorize(
355
+ TextUtils.escape_for_display(text2), :green, use_color
356
+ )
357
+ else
358
+ ColorHelper.colorize(format_json_value(text2), :green,
359
+ use_color)
360
+ end
347
361
  end
348
362
 
349
363
  changes = "Content differs: #{detail1} → #{detail2}"
@@ -16,7 +16,12 @@ module Canon
16
16
  def self.extract_location(diff)
17
17
  return "" unless diff
18
18
 
19
- # Get the appropriate node based on diff type
19
+ # Prefer pre-computed path if available (populated by MetadataEnricher)
20
+ if diff.respond_to?(:path) && !diff.path.nil? && !diff.path.empty?
21
+ return "Location: #{diff.path}"
22
+ end
23
+
24
+ # Fall back to extracting from nodes
20
25
  node = if diff.respond_to?(:node1)
21
26
  diff.node1 || diff.node2
22
27
  elsif diff.is_a?(Hash)
@@ -159,21 +159,47 @@ module Canon
159
159
  def self.get_node_text(node)
160
160
  return "" unless node
161
161
 
162
- if node.respond_to?(:text)
163
- node.text
164
- elsif node.respond_to?(:content)
165
- node.content
166
- elsif node.respond_to?(:inner_text)
167
- node.inner_text
168
- elsif node.respond_to?(:value)
169
- node.value
170
- elsif node.respond_to?(:node_info)
171
- node.node_info
172
- elsif node.respond_to?(:to_s)
173
- node.to_s
174
- else
175
- ""
176
- end.to_s.strip
162
+ text = if node.respond_to?(:text)
163
+ node.text
164
+ elsif node.respond_to?(:content)
165
+ node.content
166
+ elsif node.respond_to?(:inner_text)
167
+ node.inner_text
168
+ elsif node.respond_to?(:value)
169
+ node.value
170
+ elsif node.respond_to?(:node_info)
171
+ node.node_info
172
+ elsif node.respond_to?(:to_s)
173
+ node.to_s
174
+ else
175
+ ""
176
+ end
177
+
178
+ strip_ascii_whitespace(text.to_s)
179
+ end
180
+
181
+ # Strip only ASCII whitespace (space, tab, CR, LF) but preserve Unicode
182
+ # whitespace like non-breaking space (\u00A0). Unlike Ruby's String#strip,
183
+ # NUL, vertical tab (\v) and form feed (\f) are also left in place.
184
+ #
185
+ # @param str [String] String to strip
186
+ # @return [String] String with leading/trailing ASCII whitespace removed
187
+ ASCII_WHITESPACE_BYTES = [32, 9, 13, 10].freeze # ' ', '\t', '\r', '\n'
188
+
189
+ def self.strip_ascii_whitespace(str)
190
+ return "" if str.nil?
191
+ return str if str.empty?
192
+
193
+ # Find first non-ASCII-whitespace character position
194
+ first_pos = str.index(/[^ \t\r\n]/)
195
+ return "" unless first_pos
196
+
197
+ # Find last non-ASCII-whitespace character position (from end)
198
+ # Use reverse and index, then convert back to forward position
199
+ reversed_pos = str.reverse.index(/[^ \t\r\n]/)
200
+ last_pos = str.length - 1 - reversed_pos
201
+
202
+ str[first_pos..last_pos]
177
203
  end
178
204
 
179
205
  # Get element name for display
@@ -20,7 +20,8 @@ module Canon
20
20
 
21
21
  # Visualize whitespace characters in text
22
22
  #
23
- # Shows spaces as ·, tabs as →, newlines as ¬
23
+ # Shows spaces as ·, tabs as →, newlines as ¬, and Unicode whitespace
24
+ # like non-breaking space as <NBSP>, etc.
24
25
  #
25
26
  # @param text [String] Text to visualize
26
27
  # @return [String] Text with visible whitespace
@@ -31,6 +32,9 @@ module Canon
31
32
  .gsub(" ", "·")
32
33
  .gsub("\t", "→")
33
34
  .gsub("\n", "¬")
35
+ .gsub("\u00A0", "<NBSP>") # Non-breaking space
36
+ .gsub("\u2028", "<LSEP>") # Line separator
37
+ .gsub("\u2029", "<PSEP>") # Paragraph separator
34
38
  end
35
39
 
36
40
  # Extract a content preview from a node
@@ -55,6 +59,42 @@ module Canon
55
59
  text = text.strip.gsub(/\s+/, " ")
56
60
  truncate_text(text, max_length)
57
61
  end
62
+
63
+ # Escape non-ASCII and non-printable characters for display
64
+ #
65
+ # Converts characters outside the printable ASCII range (32-126), as well as
66
+ # double-quote and backslash, to \uXXXX escapes. This ensures characters like
67
+ # non-breaking space (\u00A0) and em-dash (\u2014) are visible in
68
+ # terminal output.
69
+ #
70
+ # @param text [String] Text to escape
71
+ # @return [String] Escaped text safe for terminal display
72
+ def self.escape_for_display(text)
73
+ return "" if text.nil?
74
+
75
+ text.chars.map do |c|
76
+ codepoint = c.ord
77
+ if codepoint < 32 || codepoint >= 127 || codepoint == 34 || codepoint == 92
78
+ # Escape control characters, non-ASCII, double-quote, and backslash
79
+ "\\u#{codepoint.to_s(16).upcase.rjust(4, '0')}"
80
+ else
81
+ c
82
+ end
83
+ end.join
84
+ end
85
+
86
+ # Check if text contains non-ASCII or non-printable characters, a double-quote, or a backslash
87
+ #
88
+ # @param text [String] Text to check
89
+ # @return [Boolean] true if text needs escaping for display
90
+ def self.needs_escaping?(text)
91
+ return false if text.nil?
92
+
93
+ text.each_char.any? do |c|
94
+ codepoint = c.ord
95
+ codepoint < 32 || codepoint >= 127 || codepoint == 34 || codepoint == 92
96
+ end
97
+ end
58
98
  end
59
99
  end
60
100
  end
data/lib/canon/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Canon
4
- VERSION = "0.1.17"
4
+ VERSION = "0.1.19"
5
5
  end
@@ -21,8 +21,11 @@ module Canon
21
21
  # @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
22
22
  # @return [Nodes::RootNode] Root of the data model tree
23
23
  def self.from_xml(xml_string, preserve_whitespace: false)
24
+ # Normalize encoding before parsing
25
+ normalized_xml = normalize_encoding(xml_string)
26
+
24
27
  # Parse with Nokogiri
25
- doc = Nokogiri::XML(xml_string, &:nonet)
28
+ doc = Nokogiri::XML(normalized_xml, &:nonet)
26
29
 
27
30
  # Check for relative namespace URIs (prohibited by C14N 1.1)
28
31
  check_for_relative_namespace_uris(doc)
@@ -31,6 +34,132 @@ module Canon
31
34
  build_from_nokogiri(doc, preserve_whitespace: preserve_whitespace)
32
35
  end
33
36
 
37
+ # Normalize XML string encoding to UTF-8
38
+ #
39
+ # Handles cases where:
40
+ # 1. The XML declaration specifies an encoding that doesn't match the actual encoding
41
+ # 2. The string's internal encoding is non-UTF-8 (without a declaration)
42
+ #
43
+ # For case 1, we check if the declared encoding matches the actual bytes.
44
+ # If bytes are valid UTF-8 despite the declaration, we update the declaration to UTF-8.
45
+ #
46
+ # @param xml_string [String] XML string to normalize
47
+ # @return [String] Normalized XML string with UTF-8 encoding
48
+ def self.normalize_encoding(xml_string)
49
+ return xml_string unless xml_string.is_a?(String)
50
+
51
+ # Extract declared encoding from XML declaration
52
+ declared_encoding = extract_xml_encoding(xml_string)
53
+
54
+ if declared_encoding
55
+ # Case 1: XML has a declaration
56
+ if declared_encoding.upcase != "UTF-8"
57
+ # Check if bytes are actually valid UTF-8 despite the declaration
58
+ utf8_reinterpreted = try_utf8_reinterpretation(xml_string)
59
+ if utf8_reinterpreted
60
+ # Bytes are valid UTF-8 - update declaration to UTF-8
61
+ return update_xml_declaration(xml_string, "UTF-8")
62
+ end
63
+
64
+ # Bytes aren't valid UTF-8 - must really be in declared encoding
65
+ return transcode_to_utf8(xml_string, declared_encoding)
66
+ end
67
+ elsif xml_string.encoding.name != "UTF-8"
68
+ # Case 2: No declaration but string encoding is non-UTF-8
69
+ # First, try to re-interpret bytes as UTF-8 (handles mislabeled strings)
70
+ reinterpreted = try_utf8_reinterpretation(xml_string)
71
+ return reinterpreted if reinterpreted
72
+
73
+ # If re-interpretation fails, try transcoding with the labeled encoding
74
+ return transcode_to_utf8(xml_string, xml_string.encoding.name)
75
+ end
76
+
77
+ xml_string
78
+ end
79
+
80
+ # Update the encoding declaration in an XML string
81
+ #
82
+ # @param xml_string [String] XML string
83
+ # @param new_encoding [String] New encoding to declare
84
+ # @return [String] XML string with updated declaration
85
+ def self.update_xml_declaration(xml_string, new_encoding)
86
+ xml_string.sub(/\bencoding\s*=\s*["'][^"']+["']/i) do |_match|
87
+ %(encoding="#{new_encoding}")
88
+ end
89
+ end
90
+
91
+ # Transcode string to UTF-8
92
+ #
93
+ # @param xml_string [String] String to transcode
94
+ # @param source_encoding [String] Source encoding to interpret bytes as
95
+ # @return [String] UTF-8 transcoded string
96
+ def self.transcode_to_utf8(xml_string, source_encoding)
97
+ # First, check if the bytes are actually valid UTF-8 despite the declared encoding
98
+ # If so, just re-interpret as UTF-8 (common case: declaration is wrong)
99
+ if source_encoding != "UTF-8"
100
+ # Force the bytes to be interpreted as the declared encoding, then check validity
101
+ forced = xml_string.dup.force_encoding(source_encoding)
102
+ if forced.valid_encoding?
103
+ # Now check if the same bytes are valid UTF-8
104
+ utf8_check = xml_string.dup.force_encoding("UTF-8")
105
+ if utf8_check.valid_encoding?
106
+ # Bytes are valid UTF-8 - the declaration is likely wrong
107
+ # Return the string as UTF-8 (already is)
108
+ return xml_string.dup.force_encoding("UTF-8")
109
+ end
110
+
111
+ # Bytes aren't valid UTF-8, so they must really be in source_encoding
112
+ # Proceed with transcoding
113
+ return forced.encode("UTF-8", source_encoding,
114
+ invalid: :replace,
115
+ undef: :replace,
116
+ replace: "?")
117
+ end
118
+ end
119
+
120
+ # Source is UTF-8, or the bytes are not valid in source_encoding: relabel as UTF-8 without transcoding
121
+ xml_string.dup.force_encoding("UTF-8")
122
+ rescue EncodingError
123
+ xml_string
124
+ end
125
+
126
+ # Attempt to re-interpret string as UTF-8 if bytes are valid UTF-8
127
+ #
128
+ # This handles the case where a string was incorrectly labeled with a different
129
+ # encoding (e.g., `.force_encoding("Shift_JIS")` on a UTF-8 string) but the actual
130
+ # bytes are valid UTF-8.
131
+ #
132
+ # @param xml_string [String] XML string to check
133
+ # @return [String, nil] UTF-8 re-interpreted string, or nil if not possible
134
+ def self.try_utf8_reinterpretation(xml_string)
135
+ return xml_string if xml_string.encoding.name == "UTF-8"
136
+
137
+ # Try forcing to UTF-8 and see if it's valid
138
+ forced = xml_string.dup.force_encoding("UTF-8")
139
+ return forced if forced.valid_encoding?
140
+
141
+ nil
142
+ end
143
+
144
+ # Extract encoding from XML declaration
145
+ #
146
+ # @param xml_string [String] XML string
147
+ # @return [String, nil] Declared encoding or nil if not found
148
+ def self.extract_xml_encoding(xml_string)
149
+ # Match XML declaration with encoding attribute
150
+ # Handles: <?xml version="1.0" encoding="UTF-8"?>
151
+ # and: <?xml version='1.0' encoding='UTF-8'?>
152
+ #
153
+ # Use binary encoding to avoid encoding compatibility issues
154
+ # when the string has non-ASCII compatible encoding (e.g., UTF-16)
155
+ binary_string = xml_string.dup.force_encoding("BINARY")
156
+ if binary_string =~ /\A\s*<\?xml[^>]*\bencoding\s*=\s*["']([^"']+)["'][^>]*\?>/i
157
+ return Regexp.last_match(1)
158
+ end
159
+
160
+ nil
161
+ end
162
+
34
163
  # Alias for compatibility with base class interface
35
164
  def self.parse(xml_string)
36
165
  from_xml(xml_string)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: canon
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.17
4
+ version: 0.1.19
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.