canon 0.1.17 → 0.1.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4bf32847db2d5c418daebb9ad5221646edecaf4b67b4f25c4e2a9e8a68167a8e
4
- data.tar.gz: 6e595f08701e61f73ad62dc5aec3ec3b95da8f41fc75d579e70721f2d9af42e5
3
+ metadata.gz: 34ab9a64b52d8598690536908941e38950bb4071fb6222b50d1cb584236e5286
4
+ data.tar.gz: 2dffcc8e29fcd1f75d78595ef73350208e7b369f2183c2df93672b94df7a6376
5
5
  SHA512:
6
- metadata.gz: 42a21e5e1badd2c1b96b1b86dce89551ee5b0794150fd2844b345fcabeb3d9bb484ca3beb423209e0bd455887d3597aa7d5973aaa0985ee77c450f20ff755866
7
- data.tar.gz: 8799d74f6a3738317387336308a3f95ffabaa5779d96dbde0ee9bccc424d360131230752031b0a0ee907af5907134b1ca8dec75e8cd0024fb600e090d3b681b7
6
+ metadata.gz: 41d784c820a7bbafd9874bf369ef303376578e74f7171e8f00a7ed1ed0b8800576ca67f51272f1464f6f36cbaec35b6f1f1e9806916e7a8c59bc6c04e827ee79
7
+ data.tar.gz: d3e354615ed4b40447ae0be7de64bf6250c40650e25f897a5155f52a74f57a8601003efd37d539abf9f514d5cbe614298a2185db7d72b68b4ff95db6da1b5b99
data/.rubocop_todo.yml CHANGED
@@ -1,6 +1,6 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2026-03-24 03:04:40 UTC using RuboCop version 1.85.1.
3
+ # on 2026-03-24 08:58:24 UTC using RuboCop version 1.85.1.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
@@ -11,13 +11,63 @@ Gemspec/RequiredRubyVersion:
11
11
  Exclude:
12
12
  - 'canon.gemspec'
13
13
 
14
- # Offense count: 802
14
+ # Offense count: 10
15
+ # This cop supports safe autocorrection (--autocorrect).
16
+ # Configuration parameters: EnforcedStyle, IndentationWidth.
17
+ # SupportedStyles: with_first_argument, with_fixed_indentation
18
+ Layout/ArgumentAlignment:
19
+ Exclude:
20
+ - 'lib/canon/xml/data_model.rb'
21
+ - 'spec/canon/comparison/encoding_normalization_spec.rb'
22
+ - 'spec/canon/comparison/xml_whitespace_spec.rb'
23
+
24
+ # Offense count: 1
25
+ # This cop supports safe autocorrection (--autocorrect).
26
+ # Configuration parameters: EnforcedStyle, IndentationWidth.
27
+ # SupportedStyles: with_first_element, with_fixed_indentation
28
+ Layout/ArrayAlignment:
29
+ Exclude:
30
+ - 'lib/canon/comparison/format_detector.rb'
31
+
32
+ # Offense count: 1
33
+ # This cop supports safe autocorrection (--autocorrect).
34
+ Layout/EmptyLineAfterGuardClause:
35
+ Exclude:
36
+ - 'lib/canon/xml/data_model.rb'
37
+
38
+ # Offense count: 1
39
+ # This cop supports safe autocorrection (--autocorrect).
40
+ # Configuration parameters: AllowMultipleStyles, EnforcedHashRocketStyle, EnforcedColonStyle, EnforcedLastArgumentHashStyle.
41
+ # SupportedHashRocketStyles: key, separator, table
42
+ # SupportedColonStyles: key, separator, table
43
+ # SupportedLastArgumentHashStyles: always_inspect, always_ignore, ignore_implicit, ignore_explicit
44
+ Layout/HashAlignment:
45
+ Exclude:
46
+ - 'lib/canon/comparison/format_detector.rb'
47
+
48
+ # Offense count: 831
15
49
  # This cop supports safe autocorrection (--autocorrect).
16
50
  # Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
17
51
  # URISchemes: http, https
18
52
  Layout/LineLength:
19
53
  Enabled: false
20
54
 
55
+ # Offense count: 9
56
+ # This cop supports safe autocorrection (--autocorrect).
57
+ # Configuration parameters: EnforcedStyle.
58
+ # SupportedStyles: symmetrical, new_line, same_line
59
+ Layout/MultilineMethodCallBraceLayout:
60
+ Exclude:
61
+ - 'spec/canon/comparison/encoding_normalization_spec.rb'
62
+ - 'spec/canon/comparison/xml_whitespace_spec.rb'
63
+
64
+ # Offense count: 2
65
+ # This cop supports safe autocorrection (--autocorrect).
66
+ # Configuration parameters: AllowInHeredoc.
67
+ Layout/TrailingWhitespace:
68
+ Exclude:
69
+ - 'lib/canon/comparison/format_detector.rb'
70
+
21
71
  # Offense count: 49
22
72
  # Configuration parameters: IgnoreLiteralBranches, IgnoreConstantBranches, IgnoreDuplicateElseBranch.
23
73
  Lint/DuplicateBranch:
@@ -47,6 +97,13 @@ Lint/UnreachableCode:
47
97
  Exclude:
48
98
  - 'lib/canon/diff_formatter/debug_output.rb'
49
99
 
100
+ # Offense count: 1
101
+ # This cop supports safe autocorrection (--autocorrect).
102
+ # Configuration parameters: IgnoreEmptyBlocks, AllowUnusedKeywordArguments.
103
+ Lint/UnusedBlockArgument:
104
+ Exclude:
105
+ - 'lib/canon/xml/data_model.rb'
106
+
50
107
  # Offense count: 6
51
108
  # This cop supports safe autocorrection (--autocorrect).
52
109
  # Configuration parameters: AllowUnusedKeywordArguments, IgnoreEmptyMethods, IgnoreNotImplementedMethods, NotImplementedExceptions.
@@ -58,7 +115,7 @@ Lint/UnusedMethodArgument:
58
115
  - 'lib/canon/diff_formatter/by_line/xml_formatter.rb'
59
116
  - 'lib/canon/diff_formatter/by_object/base_formatter.rb'
60
117
 
61
- # Offense count: 235
118
+ # Offense count: 236
62
119
  # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
63
120
  Metrics/AbcSize:
64
121
  Enabled: false
@@ -69,12 +126,12 @@ Metrics/AbcSize:
69
126
  Metrics/BlockLength:
70
127
  Max: 84
71
128
 
72
- # Offense count: 192
129
+ # Offense count: 193
73
130
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
74
131
  Metrics/CyclomaticComplexity:
75
132
  Enabled: false
76
133
 
77
- # Offense count: 401
134
+ # Offense count: 403
78
135
  # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
79
136
  Metrics/MethodLength:
80
137
  Max: 95
@@ -84,7 +141,7 @@ Metrics/MethodLength:
84
141
  Metrics/ParameterLists:
85
142
  Max: 9
86
143
 
87
- # Offense count: 158
144
+ # Offense count: 160
88
145
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
89
146
  Metrics/PerceivedComplexity:
90
147
  Enabled: false
@@ -115,13 +172,19 @@ Performance/CollectionLiteralInLoop:
115
172
  - 'lib/canon/comparison/html_comparator.rb'
116
173
  - 'lib/canon/xml/xml_base_handler.rb'
117
174
 
175
+ # Offense count: 1
176
+ # This cop supports unsafe autocorrection (--autocorrect-all).
177
+ Performance/UnfreezeString:
178
+ Exclude:
179
+ - 'spec/canon/comparison/encoding_normalization_spec.rb'
180
+
118
181
  # Offense count: 68
119
182
  # Configuration parameters: Prefixes, AllowedPatterns.
120
183
  # Prefixes: when, with, without
121
184
  RSpec/ContextWording:
122
185
  Enabled: false
123
186
 
124
- # Offense count: 27
187
+ # Offense count: 29
125
188
  # Configuration parameters: IgnoredMetadata.
126
189
  RSpec/DescribeClass:
127
190
  Enabled: false
@@ -217,6 +280,11 @@ RSpec/NoExpectationExample:
217
280
  - 'spec/canon/isodoc_blockquotes_spec.rb'
218
281
  - 'spec/canon/match_scenarios_spec.rb'
219
282
 
283
+ # Offense count: 2
284
+ RSpec/RepeatedExample:
285
+ Exclude:
286
+ - 'spec/canon/comparison/encoding_normalization_spec.rb'
287
+
220
288
  # Offense count: 7
221
289
  # Configuration parameters: CustomTransform, IgnoreMethods, IgnoreMetadata, InflectorPath, EnforcedInflector.
222
290
  # SupportedInflectors: default, active_support
@@ -241,6 +309,17 @@ RSpec/VerifiedDoubles:
241
309
  - 'spec/canon/diff/xml_serialization_formatter_spec.rb'
242
310
  - 'spec/canon/tree_diff/operation_converter_spec.rb'
243
311
 
312
+ # Offense count: 1
313
+ # This cop supports safe autocorrection (--autocorrect).
314
+ # Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
315
+ # SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
316
+ # ProceduralMethods: benchmark, bm, bmbm, create, each_with_object, measure, new, realtime, tap, with_object
317
+ # FunctionalMethods: let, let!, subject, watch
318
+ # AllowedMethods: lambda, proc, it
319
+ Style/BlockDelimiters:
320
+ Exclude:
321
+ - 'spec/canon/comparison/encoding_normalization_spec.rb'
322
+
244
323
  # Offense count: 1
245
324
  # This cop supports safe autocorrection (--autocorrect).
246
325
  # Configuration parameters: EnforcedStyle, AllowComments.
@@ -263,6 +342,13 @@ Style/IdenticalConditionalBranches:
263
342
  - 'lib/canon/diff_formatter/by_object/base_formatter.rb'
264
343
  - 'lib/canon/diff_formatter/legend.rb'
265
344
 
345
+ # Offense count: 1
346
+ # This cop supports safe autocorrection (--autocorrect).
347
+ # Configuration parameters: AllowMethodComparison, ComparisonsThreshold.
348
+ Style/MultipleComparison:
349
+ Exclude:
350
+ - 'lib/canon/comparison/format_detector.rb'
351
+
266
352
  # Offense count: 1
267
353
  # Configuration parameters: AllowedMethods.
268
354
  # AllowedMethods: respond_to_missing?
data/README.adoc CHANGED
@@ -214,6 +214,7 @@ Compare documents based on meaning, not formatting:
214
214
  * Comment handling with display control
215
215
  * Multiple match dimensions with behaviors
216
216
  * Predefined match profiles (strict, rendered, spec_friendly, content_only)
217
+ * **Cross-encoding comparison**: Compare XML documents with different character encodings (UTF-8, Shift_JIS, ISO-8859-1, UTF-16) — Canon automatically normalizes to UTF-8 before comparison
217
218
 
218
219
  See link:docs/MATCH_OPTIONS[Match options] for details.
219
220
 
@@ -151,6 +151,43 @@ sensitivity in XML instance documents:
151
151
  </text>
152
152
  ----
153
153
 
154
+ The `xml:space` attribute affects both structural whitespace and text content:
155
+
156
+ * **Structural whitespace** (whitespace-only text nodes between child elements)
157
+ * **Text content whitespace** (whitespace within text nodes)
158
+
159
+ .xml:space with structural_whitespace
160
+ [example]
161
+ ====
162
+ [source,ruby]
163
+ ----
164
+ # With xml:space="preserve", structural whitespace is preserved
165
+ xml1 = "<root xml:space='preserve'>\n <text>Hello</text>\n</root>"
166
+ xml2 = "<root xml:space='preserve'><text>Hello</text></root>"
167
+
168
+ # These are NOT equivalent (structural whitespace differs)
169
+ Canon::Comparison.equivalent?(xml1, xml2)
170
+ # => false
171
+ ----
172
+ ====
173
+
174
+ .xml:space with text_content
175
+ [example]
176
+ ====
177
+ [source,ruby]
178
+ ----
179
+ # With xml:space="preserve", text content whitespace is preserved
180
+ xml1 = '<root xml:space="preserve"><code> indented </code></root>'
181
+ xml2 = '<root xml:space="preserve"><code>indented</code></root>'
182
+
183
+ # These are NOT equivalent (text whitespace differs)
184
+ Canon::Comparison.equivalent?(xml1, xml2,
185
+ match: { text_content: :strict }
186
+ )
187
+ # => false
188
+ ----
189
+ ====
190
+
154
191
  ==== Whitelist and blacklist options
155
192
 
156
193
  You can explicitly specify which elements are whitespace-sensitive using either short or long option names:
@@ -260,29 +297,44 @@ Canon::Comparison.equivalent?(xml1, xml2,
260
297
 
261
298
  ==== Examples
262
299
 
263
- .Using xml:space attribute
300
+ .Using xml:space="preserve" for structural whitespace
301
+ [source,ruby]
302
+ ----
303
+ xml1 = "<root xml:space='preserve'>\n <text>Hello</text>\n</root>"
304
+ xml2 = "<root xml:space='preserve'><text>Hello</text></root>"
305
+
306
+ # Structural whitespace differs - NOT equivalent
307
+ Canon::Comparison.equivalent?(xml1, xml2)
308
+ # => false
309
+ ----
310
+
311
+ .Using xml:space="preserve" for text content
264
312
  [source,ruby]
265
313
  ----
266
- xml1 = '<root><code xml:space="preserve"> indented </code></root>'
267
- xml2 = '<root><code xml:space="preserve">indented</code></root>'
314
+ xml1 = '<root><code xml:space="preserve"> multiple spaces </code></root>'
315
+ xml2 = '<root><code xml:space="preserve">multiple spaces</code></root>'
268
316
 
269
- # These are NOT equivalent (whitespace matters in xml:space="preserve")
317
+ # Text content whitespace differs - NOT equivalent with text_content: :strict
270
318
  Canon::Comparison.equivalent?(xml1, xml2,
271
- match: { structural_whitespace: :strict }
319
+ match: { text_content: :strict }
272
320
  )
273
321
  # => false
274
322
  ----
275
323
 
276
- .Using whitelist
324
+ .Using sensitive_elements whitelist
277
325
  [source,ruby]
278
326
  ----
279
- # Make <p> elements whitespace-sensitive (strings, not symbols)
327
+ # Make <sample> elements whitespace-sensitive (strings, not symbols)
328
+ xml1 = "<sample>\n content\n</sample>"
329
+ xml2 = "<sample>content</sample>"
330
+
280
331
  Canon::Comparison.equivalent?(xml1, xml2,
281
332
  match: {
282
333
  structural_whitespace: :strict,
283
- sensitive_elements: ["p", "pre"]
334
+ sensitive_elements: ["sample"]
284
335
  }
285
336
  )
337
+ # => false (structural whitespace differs in <sample>)
286
338
  ----
287
339
 
288
340
  .Overriding HTML defaults
@@ -340,6 +340,44 @@ Special attributes like `xml:lang`, `xml:space`, `xml:id`, and `xml:base` are pr
340
340
  When `xml:space="preserve"` is set, whitespace is preserved in descendants.
341
341
  ----
342
342
 
343
+ === Cross-encoding comparison
344
+
345
+ Canon automatically normalizes XML character encodings before comparison, enabling
346
+ cross-encoding comparisons to work correctly.
347
+
348
+ **Supported encodings**: UTF-8, UTF-16 (all variants), Shift_JIS, EUC-JP, ISO-8859-1, and more.
349
+
350
+ **How it works**:
351
+
352
+ 1. Extract the declared encoding from the XML declaration (e.g., `encoding="Shift_JIS"`)
353
+ 2. If the declared encoding is not UTF-8, transcode the document to UTF-8; when there is no declaration, the Ruby string's own encoding is used instead
354
+ 3. If the bytes are already valid UTF-8 even though the declaration says otherwise, keep the bytes and treat the document as UTF-8
355
+ 4. Replace byte sequences that are invalid or cannot be converted with `?` instead of raising an error
356
+
357
+ .Cross-encoding comparison example
358
+ [example]
359
+ ====
360
+ [source,ruby]
361
+ ----
362
+ # UTF-8 vs Shift_JIS - automatically normalized
363
+ xml1 = "<root>日本語</root>" # UTF-8
364
+ xml2 = "<root>日本語</root>".encode("Shift_JIS") # Shift_JIS
365
+
366
+ Canon::Comparison.equivalent?(xml1, xml2)
367
+ # => true (automatically transcoded to UTF-8 before comparison)
368
+
369
+ # ASCII content works across all encodings
370
+ xml3 = "<root>hello</root>"
371
+ xml4 = "<root>hello</root>".encode("ISO-8859-1")
372
+
373
+ Canon::Comparison.equivalent?(xml3, xml4)
374
+ # => true
375
+ ----
376
+ ====
377
+
378
+ This means you can compare XML files from different sources or systems without
379
+ worrying about their native encoding.
380
+
343
381
  == Usage examples
344
382
 
345
383
  === Basic XML comparison
data/lib/canon/cache.rb CHANGED
@@ -89,7 +89,8 @@ module Canon
89
89
  # @return [String] Cache key
90
90
  def key_for_format_detection(content)
91
91
  # Use first 100 chars for quick key, plus length
92
- preview = content[0..100]
92
+ # Force to binary to avoid encoding compatibility issues
93
+ preview = content[0..100].b
93
94
  digest = Digest::SHA256.hexdigest(preview + content.length.to_s)
94
95
  "fmt:#{digest[0..16]}"
95
96
  end
@@ -62,7 +62,21 @@ module Canon
62
62
  # @param str [String] String to detect format of
63
63
  # @return [Symbol] Format type
64
64
  def detect_string_uncached(str)
65
- trimmed = str.strip
65
+ # Convert to UTF-8 for consistent handling if possible
66
+ # This handles cases like UTF-16 encoded XML that would otherwise fail string operations
67
+ str_utf8 = if ["UTF-16", "UTF-16BE",
68
+ "UTF-16LE"].include?(str.encoding.name)
69
+ begin
70
+ str.encode("UTF-8", str.encoding, invalid: :replace,
71
+ undef: :replace, replace: "?")
72
+ rescue EncodingError
73
+ str.dup.force_encoding("BINARY").encode("UTF-8")
74
+ end
75
+ else
76
+ str
77
+ end
78
+
79
+ trimmed = str_utf8.strip
66
80
 
67
81
  # YAML indicators
68
82
  return :yaml if trimmed.start_with?("---")
@@ -89,6 +89,15 @@ module Canon
89
89
  insensitive = (insensitive_raw || []).map(&:to_s)
90
90
  return false if insensitive.include?(elem_name)
91
91
 
92
+ # Honor xml:space unless the user has chosen to ignore it (user override)
93
+ if respect_xml_space?(match_opts)
94
+ # Check xml:space="preserve" (document declaration)
95
+ return true if xml_space_preserve?(element)
96
+
97
+ # Check xml:space="default" (use configured behavior)
98
+ return false if xml_space_default?(element)
99
+ end
100
+
92
101
  # Whitelist: preserve whitespace
93
102
  sensitive = resolved_sensitive_elements(match_opts)
94
103
  return true if sensitive.include?(elem_name)
@@ -25,6 +25,9 @@ module Canon
25
25
  preserve_whitespace: preserve_whitespace)
26
26
  end
27
27
 
28
+ # Normalize encoding before preprocessing (UTF-16 strings can't use strip, etc.)
29
+ node = Canon::Xml::DataModel.normalize_encoding(node)
30
+
28
31
  # Apply preprocessing to XML string before parsing
29
32
  xml_string = apply_preprocessing(node, preprocessing).strip
30
33
 
data/lib/canon/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Canon
4
- VERSION = "0.1.17"
4
+ VERSION = "0.1.18"
5
5
  end
@@ -21,8 +21,11 @@ module Canon
21
21
  # @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
22
22
  # @return [Nodes::RootNode] Root of the data model tree
23
23
  def self.from_xml(xml_string, preserve_whitespace: false)
24
+ # Normalize encoding before parsing
25
+ normalized_xml = normalize_encoding(xml_string)
26
+
24
27
  # Parse with Nokogiri
25
- doc = Nokogiri::XML(xml_string, &:nonet)
28
+ doc = Nokogiri::XML(normalized_xml, &:nonet)
26
29
 
27
30
  # Check for relative namespace URIs (prohibited by C14N 1.1)
28
31
  check_for_relative_namespace_uris(doc)
@@ -31,6 +34,132 @@ module Canon
31
34
  build_from_nokogiri(doc, preserve_whitespace: preserve_whitespace)
32
35
  end
33
36
 
37
# Normalize an XML string to UTF-8 before it is parsed.
#
# Two situations are handled:
# 1. The XML declaration names a non-UTF-8 encoding. If the bytes are in
#    fact valid UTF-8 the declaration is wrong and the bytes are kept;
#    otherwise the bytes are transcoded from the declared encoding.
# 2. No declaration is found but the Ruby string is labelled with a
#    non-UTF-8 encoding. The bytes are re-interpreted as UTF-8 when valid,
#    otherwise transcoded from the labelled encoding.
#
# Whenever a declared encoding was converted, the declaration is rewritten
# to UTF-8 so that it agrees with the bytes handed to the parser.
#
# @param xml_string [String, Object] XML string to normalize; anything that
#   is not a String is returned unchanged
# @return [String, Object] UTF-8 XML string, or the input itself when no
#   normalization applies
def self.normalize_encoding(xml_string)
  return xml_string unless xml_string.is_a?(String)

  declared_encoding = extract_xml_encoding(xml_string)

  if declared_encoding
    # A UTF-8 declaration (in any letter case) needs no work.
    return xml_string if declared_encoding.upcase == "UTF-8"

    # Prefer keeping the bytes when they are genuinely valid UTF-8; the
    # validity check is repeated here so a mislabelled string is never
    # accepted on the strength of its label alone.
    reinterpreted = try_utf8_reinterpretation(xml_string)
    converted = if reinterpreted&.valid_encoding?
                  reinterpreted
                else
                  transcode_to_utf8(xml_string, declared_encoding)
                end

    # If conversion did not yield valid UTF-8 (e.g. unknown encoding name),
    # leave the declaration alone rather than mislabel the document.
    unless converted.encoding == Encoding::UTF_8 && converted.valid_encoding?
      return converted
    end

    # Operate on the converted string (not the original) so the result is
    # UTF-8 in both its bytes and its declaration.
    return update_xml_declaration(converted, "UTF-8")
  end

  return xml_string if xml_string.encoding == Encoding::UTF_8

  # No declaration: fall back to the Ruby string's own encoding label.
  try_utf8_reinterpretation(xml_string) ||
    transcode_to_utf8(xml_string, xml_string.encoding.name)
end
79
+
80
# Rewrite the encoding pseudo-attribute of an XML string.
#
# Only the first `encoding=...` occurrence is replaced, and the replacement
# is always written with double quotes. A string without such an attribute
# is returned unchanged (as a copy).
#
# @param xml_string [String] XML string
# @param new_encoding [String] Encoding name to declare
# @return [String] XML string carrying the new encoding declaration
def self.update_xml_declaration(xml_string, new_encoding)
  encoding_attr = /\bencoding\s*=\s*["'][^"']+["']/i
  replacement = %(encoding="#{new_encoding}")

  # Block form keeps the replacement literal (no backslash expansion).
  xml_string.sub(encoding_attr) { replacement }
end
90
+
91
# Transcode a string to UTF-8.
#
# The bytes are first checked against the claimed source encoding. When the
# same bytes are also valid, NUL-free UTF-8, the claim is assumed to be
# wrong and the bytes are simply relabelled as UTF-8. Otherwise they are
# converted from the source encoding, with invalid or unconvertible
# sequences replaced by "?".
#
# @param xml_string [String] String to transcode
# @param source_encoding [String] Name of the encoding the bytes are
#   believed to be in (typically taken from the document itself)
# @return [String] UTF-8 string; the input itself is returned when the
#   encoding name is unknown or the conversion raises
def self.transcode_to_utf8(xml_string, source_encoding)
  begin
    source = Encoding.find(source_encoding)
  rescue ArgumentError
    # Unknown encoding name: the name comes from the document, so it must
    # not crash normalization. Let the parser deal with the input as-is.
    return xml_string
  end

  utf8_view = xml_string.dup.force_encoding(Encoding::UTF_8)
  return utf8_view if source == Encoding::UTF_8

  labelled = xml_string.dup.force_encoding(source)

  # Bytes that are not valid in the claimed encoding cannot be transcoded
  # meaningfully; hand them back labelled as UTF-8.
  return utf8_view unless labelled.valid_encoding?

  # Valid UTF-8 means the claimed encoding is probably wrong. NUL is not a
  # legal XML character, so a NUL-laden "UTF-8" view is really UTF-16/32
  # text and must be transcoded instead.
  return utf8_view if utf8_view.valid_encoding? && !utf8_view.include?("\0")

  labelled.encode(Encoding::UTF_8, source,
                  invalid: :replace,
                  undef: :replace,
                  replace: "?")
rescue EncodingError
  xml_string
end
125
+
126
# Re-interpret a string's bytes as UTF-8 when they form valid UTF-8 text.
#
# This covers strings whose encoding label is wrong while the bytes are
# UTF-8 (for example after `force_encoding("Shift_JIS")` on UTF-8 data, or
# UTF-8 data read in binary mode). Note that `String#encode` converts the
# bytes and therefore does not produce such a string.
#
# The bytes are always validated, even when the string is already labelled
# UTF-8, and a result containing NUL is rejected: NUL is not a legal XML
# character, so such bytes are UTF-16/32 text rather than UTF-8.
#
# @param xml_string [String] XML string to check
# @return [String, nil] The string as UTF-8 (the same object when it was
#   already labelled UTF-8), or nil when the bytes are not usable as UTF-8
def self.try_utf8_reinterpretation(xml_string)
  candidate = if xml_string.encoding == Encoding::UTF_8
                xml_string
              else
                xml_string.dup.force_encoding(Encoding::UTF_8)
              end

  return nil unless candidate.valid_encoding?
  return nil if candidate.include?("\0")

  candidate
end
143
+
144
# Read the encoding named in a leading XML declaration.
#
# Recognizes both quoting styles, e.g.
#   <?xml version="1.0" encoding="UTF-8"?>
#   <?xml version='1.0' encoding='UTF-8'?>
#
# Matching is done on a binary copy of the input so that no encoding
# compatibility error can be raised, whatever the string's label. The
# pattern is plain ASCII, so a declaration stored as UTF-16 bytes does not
# match and yields nil.
#
# @param xml_string [String] XML string
# @return [String, nil] Declared encoding name, or nil if none is found
def self.extract_xml_encoding(xml_string)
  raw_bytes = xml_string.dup.force_encoding("BINARY")

  # String#[] with a capture index returns the group or nil on no match.
  raw_bytes[/\A\s*<\?xml[^>]*\bencoding\s*=\s*["']([^"']+)["'][^>]*\?>/i, 1]
end
162
+
34
163
  # Alias for compatibility with base class interface
35
164
  def self.parse(xml_string)
36
165
  from_xml(xml_string)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: canon
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.17
4
+ version: 0.1.18
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.