RubyGems - disarm - Versions diffs - 0.10.0-x86_64-linux → 0.11.0-x86_64-linux - Mend

disarm 0.10.0-x86_64-linux → 0.11.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: bb2d53e5dd7345db9edd3342f549d42abb7c5729a674d73af083ec0cbaa3809c
-  data.tar.gz: b6afb09b36cecadf6eb3812d6d5899ee9424361466db6d28070f9d7cd8a457bb
+  metadata.gz: '0359982965e26fe7e2dd9a466af41416ba547af6524517e51b3311271f007248'
+  data.tar.gz: e4de9c08cc791e099176b09c19b08b1876f22fde7954cac5b5291d7758c6db86
 SHA512:
-  metadata.gz: bb034797e17b3fdccf1eb9eac41c155be6678b841d25dd84bdb871bd034bf03ce215b22ea5d196c8d0ca3cc8e9aa1b20b88f4e75d56f1bc7cdf5536cf2b8b34e
-  data.tar.gz: 58189aa9ab24e25d8201a499b950663ffb17854e0b768d02c4d5112f57ee7845369c49876ccdfc800aae5e1c5d5e80e5d6633c4391fa7371d348980e4e72a881
+  metadata.gz: 39c2d59de362a75f6198e59eadfb4ca47a45644b9cce3b303d39c95566e38e29ee2090016897ea856f2d62a5de39c54db4f5476176ecf2ee4047a81cd137d0ed
+  data.tar.gz: 2d250406c451bb1435c790b3508d99a95a4434d743751bc4056f6682ec927706954235921b8a389027b65137bad864b5a5df82746a9067d588e10d43bce13b98

data/README.md CHANGED Viewed

@@ -29,8 +29,10 @@ falls back to compiling from source (needs a Rust toolchain) otherwise.
 require "disarm"
 # Standards-based transliteration to ASCII. `scheme:` is a symbol (or string):
-# :default (general-purpose), :strict_iso9 (ISO 9:1995), :gost7034.
+# :default (general-purpose), :strict_iso9 (ISO 9:1995), :gost7034. `lang:`
+# applies a language profile on top (e.g. "uk" → Київ → "Kyiv").
 Disarm.transliterate("Москва")                       # => "Moskva"
+Disarm.transliterate("Київ", lang: :uk)              # => "Kyiv"
 Disarm.transliterate("Москва", scheme: :strict_iso9)
 # TR39 confusable folding (homoglyph defense). `target:` defaults to :latin.
@@ -48,7 +50,7 @@ Disarm.demojize("👍🏽", strip_modifiers: true)
 # Security presets
 Disarm.strip_obfuscation("Ѕ𝗲𝗰𝗿𝗲𝘁  data")            # deobfuscated
-Disarm.security_clean("…")                           # homoglyph/bidi/zero-width clean
+Disarm.canonicalize("…")                             # homoglyph/bidi/zero-width clean
 # IDN / hostname spoof check (a false result is not a safety guarantee)
 Disarm.suspicious_hostname?("pаypal.com")            # => true (Cyrillic 'а')

data/lib/disarm/3.1/disarm.so CHANGED Viewed

Binary file

data/lib/disarm/3.2/disarm.so CHANGED Viewed

Binary file

data/lib/disarm/3.3/disarm.so CHANGED Viewed

Binary file

data/lib/disarm/version.rb CHANGED Viewed

@@ -2,5 +2,5 @@
 module Disarm
   # Kept in lockstep with the Rust crate / Python package version.
-  VERSION = "0.10.0"
+  VERSION = "0.11.0"
 end

data/lib/disarm.rb CHANGED Viewed

@@ -36,13 +36,20 @@ module Disarm
   class << self
     # Transliterate Unicode text to ASCII. `scheme:` selects the standard:
-    # :default (the general-purpose scheme), :strict_iso9, or :gost7034. Accepts
-    # a String or Symbol.
-    def transliterate(text, scheme: :default)
+    # :default (the general-purpose scheme), :strict_iso9, or :gost7034. `lang:`
+    # applies a language profile on top of the scheme (e.g. "uk" → Київ → "Kyiv",
+    # "de" → ü → "ue"); nil means no profile. Both accept a String or Symbol.
+    def transliterate(text, scheme: :default, lang: nil)
       scheme = scheme.to_s
+      lang = lang&.to_s
       translate_errors do
-        # The bare default keeps the core's borrow-on-no-op fast path.
-        scheme == "default" ? _transliterate(text) : _transliterate_scheme(text, scheme)
+        # The bare default with no profile keeps the core's borrow-on-no-op fast
+        # path; any scheme or lang takes the option-carrying builder path.
+        if lang.nil? && scheme == "default"
+          _transliterate(text)
+        else
+          _transliterate_opts(text, scheme, lang)
+        end
       end
     end
@@ -98,10 +105,40 @@ module Disarm
       translate_errors { _strip_obfuscation(text) }
     end
-    # Aggressive security cleaning: strip obfuscation, control characters, and
-    # other spoofing vectors.
+    # Canonicalize text for security-sensitive comparison: strip obfuscation,
+    # control characters, and other spoofing vectors. The name describes the
+    # mechanism (Unicode canonicalization for matching), not a safety guarantee —
+    # this is not an output sanitizer; encode at the sink.
+    def canonicalize(text)
+      translate_errors { _canonicalize(text) }
+    end
+    # @deprecated Renamed to {#canonicalize} in 0.11 (the +_clean+ name
+    #   overpromised safety); will be removed in 1.0.
     def security_clean(text)
-      translate_errors { _security_clean(text) }
+      warn("[disarm] security_clean is deprecated; use canonicalize (removed in 1.0)", category: :deprecated)
+      canonicalize(text)
+    end
+    # Case/accent/script-insensitive search lookup key. `lang:` applies a
+    # language profile for transliteration (e.g. "ru", "uk"); nil means none.
+    # Raises Disarm::InvalidArgument on an unknown lang.
+    def search_key(text, lang: nil)
+      translate_errors { _search_key(text, lang&.to_s) }
+    end
+    # Collation sort key (like #search_key, but keeps base accented characters
+    # for correct ordering). `lang:` applies a language profile; nil means none.
+    # Raises Disarm::InvalidArgument on an unknown lang.
+    def sort_key(text, lang: nil)
+      translate_errors { _sort_key(text, lang&.to_s) }
+    end
+    # Library catalog deduplication key (search_key plus confusable folding).
+    # `lang:` applies a language profile; `strict_iso9:` selects the ISO 9:1995
+    # Cyrillic scheme. Raises Disarm::InvalidArgument on an unknown lang.
+    def catalog_key(text, lang: nil, strict_iso9: false)
+      translate_errors { _catalog_key(text, lang&.to_s, strict_iso9) }
     end
     # Strip diacritics ("café" → "cafe").
@@ -114,14 +151,275 @@ module Disarm
       translate_errors { _fold_case(text) }
     end
-    # Whether the hostname looks like a mixed-script / confusable IDN spoof. A
-    # false result asserts nothing was *found*, not that the host is safe.
+    # Whether the hostname looks like a mixed-script / confusable / bidi-reorder
+    # IDN spoof. Flags a mixed-script label, a Latin confusable, or a
+    # bidi-direction conflict (see #bidi_conflict?, the "BiDi Swap" precondition).
+    # A false result asserts nothing was *found*, not that the host is safe.
     def suspicious_hostname?(host)
       translate_errors { _suspicious_hostname?(host) }
     end
+    # Apply a Unicode normalization form. `form:` is :nfc (default), :nfd,
+    # :nfkc, or :nfkd (a Symbol or String; case-insensitive).
+    def normalize(text, form: :nfc)
+      translate_errors { _normalize(text, form.to_s.upcase) }
+    end
+    # Whether `text` is already in normalization `form:` (default :nfc).
+    def normalized?(text, form: :nfc)
+      translate_errors { _normalized?(text, form.to_s.upcase) }
+    end
+    # Fold every run of Unicode whitespace to a single ASCII space and trim
+    # leading/trailing whitespace (#433). Folds whitespace ONLY — the line
+    # controls (TAB/LF/VT/FF/CR), the information separators (U+001C–U+001F),
+    # NEL, the Zs/Zl/Zp spaces, and the blank-rendering set (Braille blank,
+    # Hangul fillers) each fold to a single space. It does NOT delete control or
+    # zero-width characters — use `strip_control_chars` / `strip_zero_width_chars`
+    # for that. Folding the line controls (not deleting) means "a\rb" → "a b".
+    def collapse_whitespace(text)
+      translate_errors { _collapse_whitespace(text) }
+    end
+    # Remove C0/C1 control characters (except tab and newline).
+    def strip_control_chars(text)
+      translate_errors { _strip_control_chars(text) }
+    end
+    # Remove zero-width characters (ZWSP, ZWNJ, ZWJ, word joiner).
+    def strip_zero_width_chars(text)
+      translate_errors { _strip_zero_width_chars(text) }
+    end
+    # Remove Unicode bidirectional control characters (a homoglyph/spoof vector).
+    def strip_bidi(text)
+      translate_errors { _strip_bidi(text) }
+    end
+    # Strip the Unicode Tags block (U+E0000-U+E007F) - the "ASCII smuggling"
+    # channel - preserving well-formed emoji subdivision flag sequences (#413).
+    def strip_tags(text)
+      translate_errors { _strip_tags(text) }
+    end
+    # Strip every variation selector (VS1-VS256) - the arbitrary-byte smuggling
+    # channel (#413).
+    def strip_variation_selectors(text)
+      translate_errors { _strip_variation_selectors(text) }
+    end
+    # Strip every Unicode noncharacter (U+FDD0-U+FDEF and U+xFFFE/U+xFFFF) (#413).
+    def strip_noncharacters(text)
+      translate_errors { _strip_noncharacters(text) }
+    end
+    # Strip every Private Use Area code point (BMP and planes 15/16) (#413).
+    def strip_pua(text)
+      translate_errors { _strip_pua(text) }
+    end
+    # Strip "zalgo" combining-mark stacking, keeping at most `max_marks:`
+    # (default 2) combining marks per base character.
+    def strip_zalgo(text, max_marks: 2)
+      translate_errors { _strip_zalgo(text, max_marks) }
+    end
+    # Whether `text` looks like zalgo: any base character carries more than
+    # `threshold:` (default 3) combining marks.
+    def zalgo?(text, threshold: 3)
+      translate_errors { _zalgo?(text, threshold) }
+    end
+    # Number of grapheme clusters (user-perceived characters). Counts an emoji
+    # or flag as one, unlike `String#length` (code points).
+    def grapheme_len(text)
+      translate_errors { _grapheme_len(text) }
+    end
+    # Split `text` into an array of grapheme-cluster strings.
+    def grapheme_split(text)
+      translate_errors { _grapheme_split(text) }
+    end
+    # Truncate `text` to at most `max_graphemes` grapheme clusters, never cutting
+    # through the middle of a cluster.
+    def grapheme_truncate(text, max_graphemes)
+      translate_errors { _grapheme_truncate(text, max_graphemes) }
+    end
+    # Display width (terminal columns) of a single grapheme `cluster` by East
+    # Asian Width. Pass `ambiguous_wide: true` to treat ambiguous-width
+    # characters as 2 columns.
+    def grapheme_width(cluster, ambiguous_wide: false)
+      translate_errors { _grapheme_width(cluster, ambiguous_wide) }
+    end
+    # Total display width (terminal columns) of `text`.
+    def terminal_width(text, ambiguous_wide: false)
+      translate_errors { _terminal_width(text, ambiguous_wide) }
+    end
+    # Turn arbitrary text into a safe filename. `platform:` is :universal
+    # (default), :windows, or :posix; `preserve_extension:` keeps the final
+    # extension when truncating to `max_length:`. Raises Disarm::InvalidArgument
+    # on an unknown platform.
+    def sanitize_filename(text, separator: "_", max_length: 255, platform: :universal,
+                          lang: nil, preserve_extension: true)
+      translate_errors do
+        _sanitize_filename(text, separator.to_s, max_length, platform.to_s,
+                           lang&.to_s, preserve_extension)
+      end
+    end
+    # Reverse-transliterate Latin back to a native script. `lang:` is :el (Greek),
+    # :ru (Russian), or :uk (Ukrainian) — a Symbol or String.
+    def reverse_transliterate(text, lang:)
+      translate_errors { _reverse_transliterate(text, lang.to_s) }
+    end
+    # Every character in `text` with no romanization, as an array of
+    # `{ char:, offset: }` hashes (byte offset), in order of appearance.
+    # `scheme:`/`lang:` mirror #transliterate.
+    def find_untranslatable(text, scheme: :default, lang: nil)
+      translate_errors do
+        _find_untranslatable(text, scheme.to_s, lang&.to_s)
+          .map { |ch, offset| { char: ch, offset: offset } }
+      end
+    end
+    # The Unicode scripts present in `text`, in first-appearance order
+    # (Common/Inherited excluded), as stable UCD identifiers (e.g. "Latin").
+    def detect_scripts(text)
+      translate_errors { _detect_scripts(text) }
+    end
+    # Whether `text` mixes characters from more than one script.
+    def mixed_script?(text)
+      translate_errors { _is_mixed_script?(text) }
+    end
+    # Whether `text` mixes strong left-to-right and strong right-to-left
+    # characters — the precondition for Bidi display-reordering (UAX #9) and the
+    # structural signal behind "BiDi Swap"-style spoofs. It fires on the letters
+    # themselves; no U+202x override control is needed. A false result is not a
+    # safety guarantee.
+    def bidi_conflict?(text)
+      translate_errors { _has_bidi_conflict?(text) }
+    end
+    # Explain how `lang: "auto"` detection resolves `text`: a hash with
+    # `:script`, `:chosen_lang` (both nil if undetected), `:reason`, and
+    # `:discriminators_hit`.
+    def inspect_auto_lang(text)
+      script, chosen_lang, reason, discriminators = translate_errors { _inspect_auto_lang(text) }
+      { script: script, chosen_lang: chosen_lang, reason: reason,
+        discriminators_hit: discriminators }
+    end
+    # Curated metadata for one language `code` (e.g. "de"), as a hash with symbol
+    # keys: `:name`, `:script`, `:region`, and `:context` ("none"/"partial"/"full").
+    # Raises Disarm::InvalidArgument on an unknown code.
+    def lang_info(code)
+      translate_errors { _lang_info(code.to_s) }
+    end
+    # Curated metadata for one script `name` (e.g. "Coptic"), as a hash with symbol
+    # keys: `:name`, `:default_lang` (nil when none), `:example`, and
+    # `:context_aware`. Raises Disarm::InvalidArgument on an unknown script.
+    def script_info(name)
+      translate_errors { _script_info(name.to_s) }
+    end
+    # Every script disarm knows, as stable UCD script identifiers (includes
+    # "Common"/"Inherited"), sorted by name.
+    def list_scripts
+      translate_errors { _list_scripts }
+    end
+    # The language codes with context-aware transliteration support, sorted by code.
+    def list_context_langs
+      translate_errors { _list_context_langs }
+    end
+    # Whether any whitespace token carries out-of-place characters that disguise a
+    # real word — a cross-script homoglyph, leet, segmentation, a zero-width / bidi
+    # control, or zalgo. Reports a technical fact and leaves the malicious-or-not
+    # judgement to the caller. `lexicon` is a common-word collection (Array or Set)
+    # used only by the leet and segmentation branches; it defaults to an empty list
+    # when those branches are not needed. A bare String is rejected — pass an Array
+    # or another Enumerable such as a Set (non-Array input is converted with `#map`).
+    #
+    # For repeated calls over the same word list, build a Disarm::Lexicon once and
+    # pass it here: the native HashSet is then reused rather than rebuilt per call
+    # (HAI-SDLC 6.1).
+    def has_anomalies?(text, lexicon = [])
+      translate_errors do
+        if lexicon.is_a?(Disarm::Lexicon)
+          _has_anomalies_lex(text, lexicon)
+        else
+          _has_anomalies?(text, coerce_lexicon(lexicon))
+        end
+      end
+    end
+    # Full anomaly analysis: a hash with `:anomalous`, `:kinds` (in first-appearance
+    # order), `:findings` (each `{ kind:, token:, start:, end:, detail:, reason: }`,
+    # with byte offsets), and `:reason` (the first finding's reason, or nil).
+    # `lexicon` defaults to an empty list; a bare String is rejected. Pass a
+    # pre-built Disarm::Lexicon to reuse the native HashSet across calls (6.1).
+    def inspect_anomalies(text, lexicon = [])
+      anomalous, kinds, findings, reason =
+        translate_errors do
+          if lexicon.is_a?(Disarm::Lexicon)
+            _inspect_anomalies_lex(text, lexicon)
+          else
+            _inspect_anomalies(text, coerce_lexicon(lexicon))
+          end
+        end
+      {
+        anomalous: anomalous,
+        kinds: kinds,
+        findings: findings.map do |kind, token, start, finish, detail, fr|
+          { kind: kind, token: token, start: start, end: finish, detail: detail, reason: fr }
+        end,
+        reason: reason,
+      }
+    end
+    # Build a reusable Disarm::Pipeline for a named policy `profile` (e.g.
+    # "search_index", "normalize_web_input"). The profile's steps are validated
+    # and assembled once at construction, so the returned handle can be reused
+    # across many `#process` calls without re-resolving the profile each time —
+    # the same reuse pattern as Disarm::Lexicon. Raises Disarm::InvalidArgument
+    # on an unknown profile name.
+    #
+    #   pipe = Disarm.get_pipeline("search_index")
+    #   pipe.process("Café") # => "cafe"
+    #   pipe.process("Köln") # reuse the same handle
+    #
+    # Disarm::Pipeline#process is the Rust-defined instance method on the handle.
+    def get_pipeline(profile)
+      translate_errors { _get_pipeline(profile.to_s) }
+    end
     private
+    # Coerce a lexicon argument to an Array of Strings for the native layer.
+    # Fast-path: an Array already containing only Strings is passed through as-is.
+    # Any other Enumerable (Set, etc.) is mapped to String. A bare String is rejected
+    # with ArgumentError — callers must wrap it in an Array: ["word"].
+    def coerce_lexicon(lexicon)
+      # An explicit nil is treated as an empty lexicon (parity with the `= []`
+      # default and the other bindings' null handling), not an error.
+      return [] if lexicon.nil?
+      raise ::ArgumentError, "lexicon must be an Array or Enumerable, not a String" \
+        if lexicon.is_a?(::String)
+      return lexicon if lexicon.is_a?(::Array) && lexicon.all?(::String)
+      lexicon.map(&:to_s)
+    end
     # Run a native call, re-raising its built-in exception as the matching
     # Disarm::Error subclass so callers can `rescue Disarm::Error` across the
     # whole surface. The original backtrace is preserved (passed as the third

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: disarm
 version: !ruby/object:Gem::Version
-  version: 0.10.0
+  version: 0.11.0
 platform: x86_64-linux
 authors:
 - Richard Quinn
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2026-06-15 00:00:00.000000000 Z
+date: 2026-06-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -52,6 +52,34 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '3.0'
+- !ruby/object:Gem::Dependency
+  name: rubocop
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.65'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.65'
+- !ruby/object:Gem::Dependency
+  name: rubocop-performance
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.21'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.21'
 description: |
   Ruby bindings for the disarm Rust core: TR39 confusable folding, bidi/zalgo/
   zero-width neutralization, Unicode normalization, standards-based