RubyGems - canon - Versions diffs - 0.1.17 → 0.1.19 - Mend

canon 0.1.17 → 0.1.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

checksums.yaml +4 -4
data/.rubocop_todo.yml +107 -10
data/README.adoc +1 -0
data/docs/features/match-options/index.adoc +60 -8
data/docs/understanding/formats/xml.adoc +38 -0
data/lib/canon/cache.rb +2 -1
data/lib/canon/comparison/format_detector.rb +15 -1
data/lib/canon/comparison/whitespace_sensitivity.rb +9 -0
data/lib/canon/comparison/xml_comparator/node_parser.rb +3 -0
data/lib/canon/diff/path_builder.rb +14 -0
data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +18 -4
data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +6 -1
data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +41 -15
data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +41 -1
data/lib/canon/version.rb +1 -1
data/lib/canon/xml/data_model.rb +130 -1
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4bf32847db2d5c418daebb9ad5221646edecaf4b67b4f25c4e2a9e8a68167a8e
-  data.tar.gz: 6e595f08701e61f73ad62dc5aec3ec3b95da8f41fc75d579e70721f2d9af42e5
+  metadata.gz: fae901023e2945c8ee14c48a6de4ce793d2735d9f2b098ba9f727b9c0f10e8ad
+  data.tar.gz: 84ed342a12b39a77394275e6159eb16cc60f331c80ed161a6cd4fccc957dc06d
 SHA512:
-  metadata.gz: 42a21e5e1badd2c1b96b1b86dce89551ee5b0794150fd2844b345fcabeb3d9bb484ca3beb423209e0bd455887d3597aa7d5973aaa0985ee77c450f20ff755866
-  data.tar.gz: 8799d74f6a3738317387336308a3f95ffabaa5779d96dbde0ee9bccc424d360131230752031b0a0ee907af5907134b1ca8dec75e8cd0024fb600e090d3b681b7
+  metadata.gz: d88d544b3b961dfa5c0f9fb806f51a473e29b0a018a22dbc9ea2aebaaf459a3aa6317d1cb22c2d5dd32d69eb6162389be274a06886566ab9b63f83e613c4b276
+  data.tar.gz: cc409487c2c38791ec915584a8ebca85672ac7add362c78b5ace99bfd1a1657c6907ac387f321dbc12eda2613ffe998ae7fecdfce6895ff5e285f1f6022d250f

data/.rubocop_todo.yml CHANGED Viewed

@@ -1,6 +1,6 @@
 # This configuration was generated by
 # `rubocop --auto-gen-config`
-# on 2026-03-24 03:04:40 UTC using RuboCop version 1.85.1.
+# on 2026-03-24 10:43:04 UTC using RuboCop version 1.85.1.
 # The point is for the user to remove these configuration records
 # one by one as the offenses are removed from the code base.
 # Note that changes in the inspected code, or installation of new
@@ -11,13 +11,76 @@ Gemspec/RequiredRubyVersion:
   Exclude:
     - 'canon.gemspec'
-# Offense count: 802
+# Offense count: 2
+# This cop supports safe autocorrection (--autocorrect).
+# Configuration parameters: EnforcedStyle, IndentationWidth.
+# SupportedStyles: with_first_argument, with_fixed_indentation
+Layout/ArgumentAlignment:
+  Exclude:
+    - 'lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb'
+# Offense count: 1
+# This cop supports safe autocorrection (--autocorrect).
+# Configuration parameters: EnforcedStyle, IndentationWidth.
+# SupportedStyles: with_first_element, with_fixed_indentation
+Layout/ArrayAlignment:
+  Exclude:
+    - 'lib/canon/diff/path_builder.rb'
+# Offense count: 6
+# This cop supports safe autocorrection (--autocorrect).
+Layout/ElseAlignment:
+  Exclude:
+    - 'lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb'
+# Offense count: 2
+# This cop supports safe autocorrection (--autocorrect).
+# Configuration parameters: EnforcedStyleAlignWith.
+# SupportedStylesAlignWith: keyword, variable, start_of_line
+Layout/EndAlignment:
+  Exclude:
+    - 'lib/canon/diff/path_builder.rb'
+    - 'lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb'
+# Offense count: 1
+# This cop supports safe autocorrection (--autocorrect).
+# Configuration parameters: AllowForAlignment, AllowBeforeTrailingComments, ForceEqualSignAlignment.
+Layout/ExtraSpacing:
+  Exclude:
+    - 'lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb'
+# Offense count: 1
+# This cop supports safe autocorrection (--autocorrect).
+# Configuration parameters: EnforcedStyle.
+# SupportedStyles: normal, indented_internal_methods
+Layout/IndentationConsistency:
+  Exclude:
+    - 'lib/canon/diff/path_builder.rb'
+# Offense count: 8
+# This cop supports safe autocorrection (--autocorrect).
+# Configuration parameters: Width, EnforcedStyleAlignWith, AllowedPatterns.
+# SupportedStylesAlignWith: start_of_line, relative_to_receiver
+Layout/IndentationWidth:
+  Exclude:
+    - 'lib/canon/diff/path_builder.rb'
+    - 'lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb'
+# Offense count: 841
 # This cop supports safe autocorrection (--autocorrect).
 # Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
 # URISchemes: http, https
 Layout/LineLength:
   Enabled: false
+# Offense count: 3
+# This cop supports safe autocorrection (--autocorrect).
+# Configuration parameters: AllowInHeredoc.
+Layout/TrailingWhitespace:
+  Exclude:
+    - 'lib/canon/diff/path_builder.rb'
+    - 'lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb'
 # Offense count: 49
 # Configuration parameters: IgnoreLiteralBranches, IgnoreConstantBranches, IgnoreDuplicateElseBranch.
 Lint/DuplicateBranch:
@@ -58,7 +121,7 @@ Lint/UnusedMethodArgument:
     - 'lib/canon/diff_formatter/by_line/xml_formatter.rb'
     - 'lib/canon/diff_formatter/by_object/base_formatter.rb'
-# Offense count: 235
+# Offense count: 238
 # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
 Metrics/AbcSize:
   Enabled: false
@@ -69,12 +132,12 @@ Metrics/AbcSize:
 Metrics/BlockLength:
   Max: 84
-# Offense count: 192
+# Offense count: 196
 # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
 Metrics/CyclomaticComplexity:
   Enabled: false
-# Offense count: 401
+# Offense count: 405
 # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
 Metrics/MethodLength:
   Max: 95
@@ -84,7 +147,7 @@ Metrics/MethodLength:
 Metrics/ParameterLists:
   Max: 9
-# Offense count: 158
+# Offense count: 162
 # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
 Metrics/PerceivedComplexity:
   Enabled: false
@@ -121,7 +184,7 @@ Performance/CollectionLiteralInLoop:
 RSpec/ContextWording:
   Enabled: false
-# Offense count: 27
+# Offense count: 30
 # Configuration parameters: IgnoredMetadata.
 RSpec/DescribeClass:
   Enabled: false
@@ -132,7 +195,7 @@ RSpec/DescribeMethod:
     - 'spec/canon/comparison/multiple_differences_spec.rb'
     - 'spec/canon/diff_formatter/character_map_customization_spec.rb'
-# Offense count: 695
+# Offense count: 699
 # Configuration parameters: CountAsOne.
 RSpec/ExampleLength:
   Max: 43
@@ -184,7 +247,7 @@ RSpec/MultipleDescribes:
   Exclude:
     - 'spec/canon/comparison/match_options_spec.rb'
-# Offense count: 536
+# Offense count: 537
 RSpec/MultipleExpectations:
   Max: 15
@@ -217,6 +280,11 @@ RSpec/NoExpectationExample:
     - 'spec/canon/isodoc_blockquotes_spec.rb'
     - 'spec/canon/match_scenarios_spec.rb'
+# Offense count: 2
+RSpec/RepeatedExample:
+  Exclude:
+    - 'spec/canon/comparison/encoding_normalization_spec.rb'
 # Offense count: 7
 # Configuration parameters: CustomTransform, IgnoreMethods, IgnoreMetadata, InflectorPath, EnforcedInflector.
 # SupportedInflectors: default, active_support
@@ -230,7 +298,7 @@ RSpec/SpecFilePathFormat:
     - 'spec/canon/yaml/formatter_spec.rb'
     - 'spec/xml_c14n_spec.rb'
-# Offense count: 126
+# Offense count: 128
 # Configuration parameters: IgnoreNameless, IgnoreSymbolicNames.
 RSpec/VerifiedDoubles:
   Exclude:
@@ -239,6 +307,7 @@ RSpec/VerifiedDoubles:
     - 'spec/canon/diff/diff_classifier_spec.rb'
     - 'spec/canon/diff/path_builder_spec.rb'
     - 'spec/canon/diff/xml_serialization_formatter_spec.rb'
+    - 'spec/canon/diff_formatter/diff_detail_formatter_spec.rb'
     - 'spec/canon/tree_diff/operation_converter_spec.rb'
 # Offense count: 1
@@ -263,9 +332,37 @@ Style/IdenticalConditionalBranches:
     - 'lib/canon/diff_formatter/by_object/base_formatter.rb'
     - 'lib/canon/diff_formatter/legend.rb'
+# Offense count: 2
+# This cop supports safe autocorrection (--autocorrect).
+# Configuration parameters: AllowMethodComparison, ComparisonsThreshold.
+Style/MultipleComparison:
+  Exclude:
+    - 'lib/canon/diff/path_builder.rb'
 # Offense count: 1
 # Configuration parameters: AllowedMethods.
 # AllowedMethods: respond_to_missing?
 Style/OptionalBooleanParameter:
   Exclude:
     - 'lib/canon/diff_formatter/debug_output.rb'
+# Offense count: 1
+# This cop supports safe autocorrection (--autocorrect).
+Style/RedundantParentheses:
+  Exclude:
+    - 'lib/canon/diff/path_builder.rb'
+# Offense count: 1
+# This cop supports safe autocorrection (--autocorrect).
+# Configuration parameters: AllowModifier.
+Style/SoleNestedConditional:
+  Exclude:
+    - 'lib/canon/diff/path_builder.rb'
+# Offense count: 3
+# This cop supports safe autocorrection (--autocorrect).
+# Configuration parameters: EnforcedStyleForMultiline.
+# SupportedStylesForMultiline: comma, consistent_comma, diff_comma, no_comma
+Style/TrailingCommaInArguments:
+  Exclude:
+    - 'spec/canon/diff_formatter/diff_detail_formatter_spec.rb'

data/README.adoc CHANGED Viewed

@@ -214,6 +214,7 @@ Compare documents based on meaning, not formatting:
 * Comment handling with display control
 * Multiple match dimensions with behaviors
 * Predefined match profiles (strict, rendered, spec_friendly, content_only)
+* **Cross-encoding comparison**: Compare XML documents with different character encodings (UTF-8, Shift_JIS, ISO-8859-1, UTF-16) — Canon automatically normalizes to UTF-8 before comparison
 See link:docs/MATCH_OPTIONS[Match options] for details.

data/docs/features/match-options/index.adoc CHANGED Viewed

@@ -151,6 +151,43 @@ sensitivity in XML instance documents:
 </text>
 ----
+The `xml:space` attribute affects both structural whitespace and text content:
+* **Structural whitespace** (whitespace-only text nodes between child elements)
+* **Text content whitespace** (whitespace within text nodes)
+.xml:space with structural_whitespace
+[example]
+====
+[source,ruby]
+----
+# With xml:space="preserve", structural whitespace is preserved
+xml1 = "<root xml:space='preserve'>\n  <text>Hello</text>\n</root>"
+xml2 = "<root xml:space='preserve'><text>Hello</text></root>"
+# These are NOT equivalent (structural whitespace differs)
+Canon::Comparison.equivalent?(xml1, xml2)
+# => false
+----
+====
+.xml:space with text_content
+[example]
+====
+[source,ruby]
+----
+# With xml:space="preserve", text content whitespace is preserved
+xml1 = '<root xml:space="preserve"><code>  indented  </code></root>'
+xml2 = '<root xml:space="preserve"><code>indented</code></root>'
+# These are NOT equivalent (text whitespace differs)
+Canon::Comparison.equivalent?(xml1, xml2,
+  match: { text_content: :strict }
+)
+# => false
+----
+====
 ==== Whitelist and blacklist options
 You can explicitly specify which elements are whitespace-sensitive using either short or long option names:
@@ -260,29 +297,44 @@ Canon::Comparison.equivalent?(xml1, xml2,
 ==== Examples
-.Using xml:space attribute
+.Using xml:space="preserve" for structural whitespace
+[source,ruby]
+----
+xml1 = "<root xml:space='preserve'>\n  <text>Hello</text>\n</root>"
+xml2 = "<root xml:space='preserve'><text>Hello</text></root>"
+# Structural whitespace differs - NOT equivalent
+Canon::Comparison.equivalent?(xml1, xml2)
+# => false
+----
+.Using xml:space="preserve" for text content
 [source,ruby]
 ----
-xml1 = '<root><code xml:space="preserve">  indented  </code></root>'
-xml2 = '<root><code xml:space="preserve">indented</code></root>'
+xml1 = '<root><code xml:space="preserve">  multiple   spaces  </code></root>'
+xml2 = '<root><code xml:space="preserve">multiple spaces</code></root>'
-# These are NOT equivalent (whitespace matters in xml:space="preserve")
+# Text content whitespace differs - NOT equivalent with text_content: :strict
 Canon::Comparison.equivalent?(xml1, xml2,
-  match: { structural_whitespace: :strict }
+  match: { text_content: :strict }
 )
 # => false
 ----
-.Using whitelist
+.Using sensitive_elements whitelist
 [source,ruby]
 ----
-# Make <p> elements whitespace-sensitive (strings, not symbols)
+# Make <sample> elements whitespace-sensitive (strings, not symbols)
+xml1 = "<sample>\n  content\n</sample>"
+xml2 = "<sample>content</sample>"
 Canon::Comparison.equivalent?(xml1, xml2,
   match: {
     structural_whitespace: :strict,
-    sensitive_elements: ["p", "pre"]
+    sensitive_elements: ["sample"]
   }
 )
+# => false (structural whitespace differs in <sample>)
 ----
 .Overriding HTML defaults

data/docs/understanding/formats/xml.adoc CHANGED Viewed

@@ -340,6 +340,44 @@ Special attributes like `xml:lang`, `xml:space`, `xml:id`, and `xml:base` are pr
 When `xml:space="preserve"` is set, whitespace is preserved in descendants.
 ----
+=== Cross-encoding comparison
+Canon automatically normalizes XML character encodings before comparison, enabling
+cross-encoding comparisons to work correctly.
+**Supported encodings**: UTF-8, UTF-16 (all variants), Shift_JIS, EUC-JP, ISO-8859-1, and more.
+**How it works**:
+1. Extract the declared encoding from the XML declaration (e.g., `encoding="Shift_JIS"`)
+2. If declared encoding differs from UTF-8, transcode to UTF-8
+3. Handle cases where the declared encoding doesn't match actual bytes
+4. Use safe transcoding with replacement characters for invalid sequences
+.Cross-encoding comparison example
+[example]
+====
+[source,ruby]
+----
+# UTF-8 vs Shift_JIS - automatically normalized
+xml1 = "<root>日本語</root>"  # UTF-8
+xml2 = "<root>日本語</root>".encode("Shift_JIS")  # Shift_JIS
+Canon::Comparison.equivalent?(xml1, xml2)
+# => true (automatically transcoded to UTF-8 before comparison)
+# ASCII content works across all encodings
+xml3 = "<root>hello</root>"
+xml4 = "<root>hello</root>".encode("ISO-8859-1")
+Canon::Comparison.equivalent?(xml3, xml4)
+# => true
+----
+====
+This means you can compare XML files from different sources or systems without
+worrying about their native encoding.
 == Usage examples
 === Basic XML comparison

data/lib/canon/cache.rb CHANGED Viewed

@@ -89,7 +89,8 @@ module Canon
       # @return [String] Cache key
       def key_for_format_detection(content)
         # Use first 100 chars for quick key, plus length
-        preview = content[0..100]
+        # Force to binary to avoid encoding compatibility issues
+        preview = content[0..100].b
         digest = Digest::SHA256.hexdigest(preview + content.length.to_s)
         "fmt:#{digest[0..16]}"
       end

data/lib/canon/comparison/format_detector.rb CHANGED Viewed

@@ -62,7 +62,21 @@ module Canon
         # @param str [String] String to detect format of
         # @return [Symbol] Format type
         def detect_string_uncached(str)
-          trimmed = str.strip
+          # Convert to UTF-8 for consistent handling if possible
+          # This handles cases like UTF-16 encoded XML that would otherwise fail string operations
+          str_utf8 = if ["UTF-16", "UTF-16BE",
+                         "UTF-16LE"].include?(str.encoding.name)
+                       begin
+                         str.encode("UTF-8", str.encoding, invalid: :replace,
+                                                           undef: :replace, replace: "?")
+                       rescue EncodingError
+                         str.dup.force_encoding("BINARY").encode("UTF-8")
+                       end
+                     else
+                       str
+                     end
+          trimmed = str_utf8.strip
           # YAML indicators
           return :yaml if trimmed.start_with?("---")

data/lib/canon/comparison/whitespace_sensitivity.rb CHANGED Viewed

@@ -89,6 +89,15 @@ module Canon
           insensitive = (insensitive_raw || []).map(&:to_s)
           return false if insensitive.include?(elem_name)
+          # Consult xml:space only when match options say to respect it (user override)
+          if respect_xml_space?(match_opts)
+            # Check xml:space="preserve" (document declaration)
+            return true if xml_space_preserve?(element)
+            # Check xml:space="default" (use configured behavior)
+            return false if xml_space_default?(element)
+          end
           # Whitelist: preserve whitespace
           sensitive = resolved_sensitive_elements(match_opts)
           return true if sensitive.include?(elem_name)

data/lib/canon/comparison/xml_comparator/node_parser.rb CHANGED Viewed

@@ -25,6 +25,9 @@ module Canon
                                      preserve_whitespace: preserve_whitespace)
           end
+          # Normalize encoding before preprocessing (UTF-16 strings can't use strip, etc.)
+          node = Canon::Xml::DataModel.normalize_encoding(node)
           # Apply preprocessing to XML string before parsing
           xml_string = apply_preprocessing(node, preprocessing).strip

data/lib/canon/diff/path_builder.rb CHANGED Viewed

@@ -83,6 +83,20 @@ module Canon
         # Get ordinal index (position among siblings with same label)
         index = ordinal_index(tree_node)
+        # For text nodes, use parent element name for clarity
+        # e.g., instead of "/p/#text[0]" use "/p/text()[0]"
+        if ["text",
+            "#text"].include?(label) && tree_node.respond_to?(:parent) && tree_node.parent
+          parent_name = if tree_node.parent.respond_to?(:label)
+                          tree_node.parent.label
+                        elsif tree_node.parent.respond_to?(:name)
+                          tree_node.parent.name
+                        end
+          if parent_name && parent_name != "#document" && parent_name != "#document-fragment"
+            return "#{parent_name}/text()[#{index}]"
+          end
+        end
         "#{label}[#{index}]"
       end

data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb CHANGED Viewed

@@ -340,10 +340,24 @@ module Canon
               TextUtils.visualize_whitespace(text2), :green, use_color
             )
           else
-            detail1 = ColorHelper.colorize(format_json_value(text1), :red,
-                                           use_color)
-            detail2 = ColorHelper.colorize(format_json_value(text2), :green,
-                                           use_color)
+            # Escape non-ASCII characters for better terminal display
+            # JSON.generate doesn't escape chars like NBSP (U+00A0) or em-dash (U+2014)
+            detail1 = if TextUtils.needs_escaping?(text1)
+                        ColorHelper.colorize(
+                          TextUtils.escape_for_display(text1), :red, use_color
+                        )
+                      else
+                        ColorHelper.colorize(format_json_value(text1), :red,
+                                             use_color)
+                      end
+            detail2 = if TextUtils.needs_escaping?(text2)
+                        ColorHelper.colorize(
+                          TextUtils.escape_for_display(text2), :green, use_color
+                        )
+                      else
+                        ColorHelper.colorize(format_json_value(text2), :green,
+                                             use_color)
+                      end
           end
           changes = "Content differs: #{detail1} → #{detail2}"

data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb CHANGED Viewed

@@ -16,7 +16,12 @@ module Canon
         def self.extract_location(diff)
           return "" unless diff
-          # Get the appropriate node based on diff type
+          # Prefer pre-computed path if available (populated by MetadataEnricher)
+          if diff.respond_to?(:path) && !diff.path.nil? && !diff.path.empty?
+            return "Location: #{diff.path}"
+          end
+          # Fall back to extracting from nodes
           node = if diff.respond_to?(:node1)
                    diff.node1 || diff.node2
                  elsif diff.is_a?(Hash)

data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb CHANGED Viewed

@@ -159,21 +159,47 @@ module Canon
         def self.get_node_text(node)
           return "" unless node
-          if node.respond_to?(:text)
-            node.text
-          elsif node.respond_to?(:content)
-            node.content
-          elsif node.respond_to?(:inner_text)
-            node.inner_text
-          elsif node.respond_to?(:value)
-            node.value
-          elsif node.respond_to?(:node_info)
-            node.node_info
-          elsif node.respond_to?(:to_s)
-            node.to_s
-          else
-            ""
-          end.to_s.strip
+          text = if node.respond_to?(:text)
+                   node.text
+                 elsif node.respond_to?(:content)
+                   node.content
+                 elsif node.respond_to?(:inner_text)
+                   node.inner_text
+                 elsif node.respond_to?(:value)
+                   node.value
+                 elsif node.respond_to?(:node_info)
+                   node.node_info
+                 elsif node.respond_to?(:to_s)
+                   node.to_s
+                 else
+                   ""
+                 end
+          strip_ascii_whitespace(text.to_s)
+        end
+        # Strip only space, tab, CR, and LF from both ends; everything else is
+        # kept, including non-breaking space (\u00A0). Note that String#strip is
+        # also ASCII-only, but it additionally removes NUL, \v, and \f.
+        #
+        # @param str [String] String to strip
+        # @return [String] String with leading/trailing ASCII whitespace removed
+        ASCII_WHITESPACE_BYTES = [32, 9, 13, 10].freeze # ' ', '\t', '\r', '\n'
+        def self.strip_ascii_whitespace(str)
+          return "" if str.nil?
+          return str if str.empty?
+          # Find first non-ASCII-whitespace character position
+          first_pos = str.index(/[^ \t\r\n]/)
+          return "" unless first_pos
+          # Find last non-ASCII-whitespace character position (from end)
+          # Use reverse and index, then convert back to forward position
+          reversed_pos = str.reverse.index(/[^ \t\r\n]/)
+          last_pos = str.length - 1 - reversed_pos
+          str[first_pos..last_pos]
         end
         # Get element name for display

data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb CHANGED Viewed

@@ -20,7 +20,8 @@ module Canon
         # Visualize whitespace characters in text
         #
-        # Shows spaces as ·, tabs as →, newlines as ¬
+        # Shows spaces as ·, tabs as →, newlines as ¬, and Unicode whitespace
+        # like non-breaking space as <NBSP>, etc.
         #
         # @param text [String] Text to visualize
         # @return [String] Text with visible whitespace
@@ -31,6 +32,9 @@ module Canon
             .gsub(" ", "·")
             .gsub("\t", "→")
             .gsub("\n", "¬")
+            .gsub("\u00A0", "<NBSP>") # Non-breaking space
+            .gsub("\u2028", "<LSEP>")    # Line separator
+            .gsub("\u2029", "<PSEP>")    # Paragraph separator
         end
         # Extract a content preview from a node
@@ -55,6 +59,42 @@ module Canon
           text = text.strip.gsub(/\s+/, " ")
           truncate_text(text, max_length)
         end
+        # Escape non-ASCII and non-printable characters for display
+        #
+        # Converts characters outside the printable ASCII range (32-126) to
+        # their \uXXXX escape sequences. This ensures special characters like
+        # non-breaking space (\u00A0) and em-dash (\u2014) are visible in
+        # terminal output.
+        #
+        # @param text [String] Text to escape
+        # @return [String] Escaped text safe for terminal display
+        def self.escape_for_display(text)
+          return "" if text.nil?
+          text.chars.map do |c|
+            codepoint = c.ord
+            if codepoint < 32 || codepoint >= 127 || codepoint == 34 || codepoint == 92
+              # Escape control characters, non-ASCII, double-quote, and backslash
+              "\\u#{codepoint.to_s(16).upcase.rjust(4, '0')}"
+            else
+              c
+            end
+          end.join
+        end
+        # Check if text contains non-ASCII or non-printable characters
+        #
+        # @param text [String] Text to check
+        # @return [Boolean] true if text needs escaping for display
+        def self.needs_escaping?(text)
+          return false if text.nil?
+          text.each_char.any? do |c|
+            codepoint = c.ord
+            codepoint < 32 || codepoint >= 127 || codepoint == 34 || codepoint == 92
+          end
+        end
       end
     end
   end

data/lib/canon/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Canon
-  VERSION = "0.1.17"
+  VERSION = "0.1.19"
 end

data/lib/canon/xml/data_model.rb CHANGED Viewed

@@ -21,8 +21,11 @@ module Canon
       # @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
       # @return [Nodes::RootNode] Root of the data model tree
       def self.from_xml(xml_string, preserve_whitespace: false)
+        # Normalize encoding before parsing
+        normalized_xml = normalize_encoding(xml_string)
         # Parse with Nokogiri
-        doc = Nokogiri::XML(xml_string, &:nonet)
+        doc = Nokogiri::XML(normalized_xml, &:nonet)
         # Check for relative namespace URIs (prohibited by C14N 1.1)
         check_for_relative_namespace_uris(doc)
@@ -31,6 +34,132 @@ module Canon
         build_from_nokogiri(doc, preserve_whitespace: preserve_whitespace)
       end
+      # Normalize XML string encoding to UTF-8
+      #
+      # Handles cases where:
+      # 1. The XML declaration specifies an encoding that doesn't match the actual encoding
+      # 2. The string's internal encoding is non-UTF-8 (without a declaration)
+      #
+      # For case 1, we check if the declared encoding matches the actual bytes.
+      # If bytes are valid UTF-8 despite the declaration, we update the declaration to UTF-8.
+      #
+      # @param xml_string [String] XML string to normalize
+      # @return [String] Normalized XML string with UTF-8 encoding
+      def self.normalize_encoding(xml_string)
+        return xml_string unless xml_string.is_a?(String)
+        # Extract declared encoding from XML declaration
+        declared_encoding = extract_xml_encoding(xml_string)
+        if declared_encoding
+          # Case 1: XML has a declaration
+          if declared_encoding.upcase != "UTF-8"
+            # Check if bytes are actually valid UTF-8 despite the declaration
+            utf8_reinterpreted = try_utf8_reinterpretation(xml_string)
+            if utf8_reinterpreted
+              # Bytes are valid UTF-8 - update declaration to UTF-8
+              return update_xml_declaration(xml_string, "UTF-8")
+            end
+            # Bytes aren't valid UTF-8 - must really be in declared encoding
+            return transcode_to_utf8(xml_string, declared_encoding)
+          end
+        elsif xml_string.encoding.name != "UTF-8"
+          # Case 2: No declaration but string encoding is non-UTF-8
+          # First, try to re-interpret bytes as UTF-8 (handles mislabeled strings)
+          reinterpreted = try_utf8_reinterpretation(xml_string)
+          return reinterpreted if reinterpreted
+          # If re-interpretation fails, try transcoding with the labeled encoding
+          return transcode_to_utf8(xml_string, xml_string.encoding.name)
+        end
+        xml_string
+      end
+      # Update the encoding declaration in an XML string
+      #
+      # @param xml_string [String] XML string
+      # @param new_encoding [String] New encoding to declare
+      # @return [String] XML string with updated declaration
+      def self.update_xml_declaration(xml_string, new_encoding)
+        xml_string.sub(/\bencoding\s*=\s*["'][^"']+["']/i) do |_match|
+          %(encoding="#{new_encoding}")
+        end
+      end
+      # Transcode string to UTF-8
+      #
+      # @param xml_string [String] String to transcode
+      # @param source_encoding [String] Source encoding to interpret bytes as
+      # @return [String] UTF-8 transcoded string
+      def self.transcode_to_utf8(xml_string, source_encoding)
+        # First, check if the bytes are actually valid UTF-8 despite the declared encoding
+        # If so, just re-interpret as UTF-8 (common case: declaration is wrong)
+        if source_encoding != "UTF-8"
+          # Force the bytes to be interpreted as the declared encoding, then check validity
+          forced = xml_string.dup.force_encoding(source_encoding)
+          if forced.valid_encoding?
+            # Now check if the same bytes are valid UTF-8
+            utf8_check = xml_string.dup.force_encoding("UTF-8")
+            if utf8_check.valid_encoding?
+              # Bytes are valid UTF-8 - the declaration is likely wrong
+              # Return the string as UTF-8 (already is)
+              return xml_string.dup.force_encoding("UTF-8")
+            end
+            # Bytes aren't valid UTF-8, so they must really be in source_encoding
+            # Proceed with transcoding
+            return forced.encode("UTF-8", source_encoding,
+                                 invalid: :replace,
+                                 undef: :replace,
+                                 replace: "?")
+          end
+        end
+        # Already UTF-8 or transcoding failed, return as-is
+        xml_string.dup.force_encoding("UTF-8")
+      rescue EncodingError
+        xml_string
+      end
+      # Attempt to re-interpret string as UTF-8 if bytes are valid UTF-8
+      #
+      # This handles the case where a string was incorrectly labeled with a different
+      # encoding (e.g., `.force_encoding("Shift_JIS")` on a UTF-8 string) but the
+      # actual bytes are valid UTF-8.
+      #
+      # @param xml_string [String] XML string to check
+      # @return [String, nil] UTF-8 re-interpreted string, or nil if not possible
+      def self.try_utf8_reinterpretation(xml_string)
+        return xml_string if xml_string.encoding.name == "UTF-8"
+        # Try forcing to UTF-8 and see if it's valid
+        forced = xml_string.dup.force_encoding("UTF-8")
+        return forced if forced.valid_encoding?
+        nil
+      end
+      # Extract encoding from XML declaration
+      #
+      # @param xml_string [String] XML string
+      # @return [String, nil] Declared encoding or nil if not found
+      def self.extract_xml_encoding(xml_string)
+        # Match XML declaration with encoding attribute
+        # Handles: <?xml version="1.0" encoding="UTF-8"?>
+        # and: <?xml version='1.0' encoding='UTF-8'?>
+        #
+        # Use binary encoding to avoid encoding compatibility issues
+        # when the string has non-ASCII compatible encoding (e.g., UTF-16)
+        binary_string = xml_string.dup.force_encoding("BINARY")
+        if binary_string =~ /\A\s*<\?xml[^>]*\bencoding\s*=\s*["']([^"']+)["'][^>]*\?>/i
+          return Regexp.last_match(1)
+        end
+        nil
+      end
       # Alias for compatibility with base class interface
       def self.parse(xml_string)
         from_xml(xml_string)

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: canon
 version: !ruby/object:Gem::Version
-  version: 0.1.17
+  version: 0.1.19
 platform: ruby
 authors:
 - Ribose Inc.