disarm 0.10.0-x86_64-linux → 0.11.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bb2d53e5dd7345db9edd3342f549d42abb7c5729a674d73af083ec0cbaa3809c
4
- data.tar.gz: b6afb09b36cecadf6eb3812d6d5899ee9424361466db6d28070f9d7cd8a457bb
3
+ metadata.gz: '0359982965e26fe7e2dd9a466af41416ba547af6524517e51b3311271f007248'
4
+ data.tar.gz: e4de9c08cc791e099176b09c19b08b1876f22fde7954cac5b5291d7758c6db86
5
5
  SHA512:
6
- metadata.gz: bb034797e17b3fdccf1eb9eac41c155be6678b841d25dd84bdb871bd034bf03ce215b22ea5d196c8d0ca3cc8e9aa1b20b88f4e75d56f1bc7cdf5536cf2b8b34e
7
- data.tar.gz: 58189aa9ab24e25d8201a499b950663ffb17854e0b768d02c4d5112f57ee7845369c49876ccdfc800aae5e1c5d5e80e5d6633c4391fa7371d348980e4e72a881
6
+ metadata.gz: 39c2d59de362a75f6198e59eadfb4ca47a45644b9cce3b303d39c95566e38e29ee2090016897ea856f2d62a5de39c54db4f5476176ecf2ee4047a81cd137d0ed
7
+ data.tar.gz: 2d250406c451bb1435c790b3508d99a95a4434d743751bc4056f6682ec927706954235921b8a389027b65137bad864b5a5df82746a9067d588e10d43bce13b98
data/README.md CHANGED
@@ -29,8 +29,10 @@ falls back to compiling from source (needs a Rust toolchain) otherwise.
29
29
  require "disarm"
30
30
 
31
31
  # Standards-based transliteration to ASCII. `scheme:` is a symbol (or string):
32
- # :default (general-purpose), :strict_iso9 (ISO 9:1995), :gost7034.
32
+ # :default (general-purpose), :strict_iso9 (ISO 9:1995), :gost7034. `lang:`
33
+ # applies a language profile on top (e.g. "uk" → Київ → "Kyiv").
33
34
  Disarm.transliterate("Москва") # => "Moskva"
35
+ Disarm.transliterate("Київ", lang: :uk) # => "Kyiv"
34
36
  Disarm.transliterate("Москва", scheme: :strict_iso9)
35
37
 
36
38
  # TR39 confusable folding (homoglyph defense). `target:` defaults to :latin.
@@ -48,7 +50,7 @@ Disarm.demojize("👍🏽", strip_modifiers: true)
48
50
 
49
51
  # Security presets
50
52
  Disarm.strip_obfuscation("Ѕ𝗲𝗰𝗿𝗲𝘁 ​data") # deobfuscated
51
- Disarm.security_clean("…") # homoglyph/bidi/zero-width clean
53
+ Disarm.canonicalize("…") # homoglyph/bidi/zero-width clean
52
54
 
53
55
  # IDN / hostname spoof check (a false result is not a safety guarantee)
54
56
  Disarm.suspicious_hostname?("pаypal.com") # => true (Cyrillic 'а')
Binary file
Binary file
Binary file
@@ -2,5 +2,5 @@
2
2
 
3
3
  module Disarm
4
4
  # Kept in lockstep with the Rust crate / Python package version.
5
- VERSION = "0.10.0"
5
+ VERSION = "0.11.0"
6
6
  end
data/lib/disarm.rb CHANGED
@@ -36,13 +36,20 @@ module Disarm
36
36
 
37
37
  class << self
38
38
  # Transliterate Unicode text to ASCII. `scheme:` selects the standard:
39
- # :default (the general-purpose scheme), :strict_iso9, or :gost7034. Accepts
40
- # a String or Symbol.
41
- def transliterate(text, scheme: :default)
39
+ # :default (the general-purpose scheme), :strict_iso9, or :gost7034. `lang:`
40
+ # applies a language profile on top of the scheme (e.g. "uk" → Київ → "Kyiv",
41
+ # "de" ü → "ue"); nil means no profile. Both accept a String or Symbol.
42
+ def transliterate(text, scheme: :default, lang: nil)
42
43
  scheme = scheme.to_s
44
+ lang = lang&.to_s
43
45
  translate_errors do
44
- # The bare default keeps the core's borrow-on-no-op fast path.
45
- scheme == "default" ? _transliterate(text) : _transliterate_scheme(text, scheme)
46
+ # The bare default with no profile keeps the core's borrow-on-no-op fast
47
+ # path; any scheme or lang takes the option-carrying builder path.
48
+ if lang.nil? && scheme == "default"
49
+ _transliterate(text)
50
+ else
51
+ _transliterate_opts(text, scheme, lang)
52
+ end
46
53
  end
47
54
  end
48
55
 
@@ -98,10 +105,40 @@ module Disarm
98
105
  translate_errors { _strip_obfuscation(text) }
99
106
  end
100
107
 
101
- # Aggressive security cleaning: strip obfuscation, control characters, and
102
- # other spoofing vectors.
108
+ # Canonicalize text for security-sensitive comparison: strip obfuscation,
109
+ # control characters, and other spoofing vectors. The name describes the
110
+ # mechanism (Unicode canonicalization for matching), not a safety guarantee —
111
+ # this is not an output sanitizer; encode at the sink.
112
+ def canonicalize(text)
113
+ translate_errors { _canonicalize(text) }
114
+ end
115
+
116
+ # @deprecated Renamed to {#canonicalize} in 0.11 (the +_clean+ name
117
+ # overpromised safety); removed in 1.0.
103
118
  def security_clean(text)
104
- translate_errors { _security_clean(text) }
119
+ warn("[disarm] security_clean is deprecated; use canonicalize (removed in 1.0)", category: :deprecated)
120
+ canonicalize(text)
121
+ end
122
+
123
+ # Case/accent/script-insensitive search lookup key. `lang:` applies a
124
+ # language profile for transliteration (e.g. "ru", "uk"); nil means none.
125
+ # Raises Disarm::InvalidArgument on an unknown lang.
126
+ def search_key(text, lang: nil)
127
+ translate_errors { _search_key(text, lang&.to_s) }
128
+ end
129
+
130
+ # Collation sort key (like #search_key, but keeps base accented characters
131
+ # for correct ordering). `lang:` applies a language profile; nil means none.
132
+ # Raises Disarm::InvalidArgument on an unknown lang.
133
+ def sort_key(text, lang: nil)
134
+ translate_errors { _sort_key(text, lang&.to_s) }
135
+ end
136
+
137
+ # Library catalog deduplication key (search_key plus confusable folding).
138
+ # `lang:` applies a language profile; `strict_iso9:` selects the ISO 9:1995
139
+ # Cyrillic scheme. Raises Disarm::InvalidArgument on an unknown lang.
140
+ def catalog_key(text, lang: nil, strict_iso9: false)
141
+ translate_errors { _catalog_key(text, lang&.to_s, strict_iso9) }
105
142
  end
106
143
 
107
144
  # Strip diacritics ("café" → "cafe").
@@ -114,14 +151,275 @@ module Disarm
114
151
  translate_errors { _fold_case(text) }
115
152
  end
116
153
 
117
- # Whether the hostname looks like a mixed-script / confusable IDN spoof. A
118
- # false result asserts nothing was *found*, not that the host is safe.
154
+ # Whether the hostname looks like a mixed-script / confusable / bidi-reorder
155
+ # IDN spoof. Flags a mixed-script label, a Latin confusable, or a
156
+ # bidi-direction conflict (see #bidi_conflict?, the "BiDi Swap" precondition).
157
+ # A false result asserts nothing was *found*, not that the host is safe.
119
158
  def suspicious_hostname?(host)
120
159
  translate_errors { _suspicious_hostname?(host) }
121
160
  end
122
161
 
162
+ # Apply a Unicode normalization form. `form:` is :nfc (default), :nfd,
163
+ # :nfkc, or :nfkd (a Symbol or String; case-insensitive).
164
+ def normalize(text, form: :nfc)
165
+ translate_errors { _normalize(text, form.to_s.upcase) }
166
+ end
167
+
168
+ # Whether `text` is already in normalization `form:` (default :nfc).
169
+ def normalized?(text, form: :nfc)
170
+ translate_errors { _normalized?(text, form.to_s.upcase) }
171
+ end
172
+
173
+ # Fold every run of Unicode whitespace to a single ASCII space and trim
174
+ # leading/trailing whitespace (#433). Folds whitespace ONLY — the line
175
+ # controls (TAB/LF/VT/FF/CR), the information separators (U+001C–U+001F),
176
+ # NEL, the Zs/Zl/Zp spaces, and the blank-rendering set (Braille blank,
177
+ # Hangul fillers) each fold to a single space. It does NOT delete control or
178
+ # zero-width characters — use `strip_control_chars` / `strip_zero_width_chars`
179
+ # for that. Folding the line controls (not deleting) means "a\rb" → "a b".
180
+ def collapse_whitespace(text)
181
+ translate_errors { _collapse_whitespace(text) }
182
+ end
183
+
184
+ # Remove C0/C1 control characters (except tab and newline).
185
+ def strip_control_chars(text)
186
+ translate_errors { _strip_control_chars(text) }
187
+ end
188
+
189
+ # Remove zero-width characters (ZWSP, ZWNJ, ZWJ, word joiner).
190
+ def strip_zero_width_chars(text)
191
+ translate_errors { _strip_zero_width_chars(text) }
192
+ end
193
+
194
+ # Remove Unicode bidirectional control characters (a homoglyph/spoof vector).
195
+ def strip_bidi(text)
196
+ translate_errors { _strip_bidi(text) }
197
+ end
198
+
199
+ # Strip the Unicode Tags block (U+E0000-U+E007F) - the "ASCII smuggling"
200
+ # channel - preserving well-formed emoji subdivision flag sequences (#413).
201
+ def strip_tags(text)
202
+ translate_errors { _strip_tags(text) }
203
+ end
204
+
205
+ # Strip every variation selector (VS1-VS256) - the arbitrary-byte smuggling
206
+ # channel (#413).
207
+ def strip_variation_selectors(text)
208
+ translate_errors { _strip_variation_selectors(text) }
209
+ end
210
+
211
+ # Strip every Unicode noncharacter (U+FDD0-U+FDEF and U+xFFFE/U+xFFFF) (#413).
212
+ def strip_noncharacters(text)
213
+ translate_errors { _strip_noncharacters(text) }
214
+ end
215
+
216
+ # Strip every Private Use Area code point (BMP and planes 15/16) (#413).
217
+ def strip_pua(text)
218
+ translate_errors { _strip_pua(text) }
219
+ end
220
+
221
+ # Strip "zalgo" combining-mark stacking, keeping at most `max_marks:` (2)
222
+ # combining marks per base character.
223
+ def strip_zalgo(text, max_marks: 2)
224
+ translate_errors { _strip_zalgo(text, max_marks) }
225
+ end
226
+
227
+ # Whether `text` looks like zalgo: any base character carries more than
228
+ # `threshold:` (3) combining marks.
229
+ def zalgo?(text, threshold: 3)
230
+ translate_errors { _zalgo?(text, threshold) }
231
+ end
232
+
233
+ # Number of grapheme clusters (user-perceived characters). Counts an emoji
234
+ # or flag as one, unlike `String#length` (code points).
235
+ def grapheme_len(text)
236
+ translate_errors { _grapheme_len(text) }
237
+ end
238
+
239
+ # Split `text` into an array of grapheme-cluster strings.
240
+ def grapheme_split(text)
241
+ translate_errors { _grapheme_split(text) }
242
+ end
243
+
244
+ # Truncate `text` to at most `max_graphemes` grapheme clusters, never cutting
245
+ # through the middle of a cluster.
246
+ def grapheme_truncate(text, max_graphemes)
247
+ translate_errors { _grapheme_truncate(text, max_graphemes) }
248
+ end
249
+
250
+ # Display width (terminal columns) of a single grapheme `cluster` by East
251
+ # Asian Width. Pass `ambiguous_wide: true` to treat ambiguous-width
252
+ # characters as 2 columns.
253
+ def grapheme_width(cluster, ambiguous_wide: false)
254
+ translate_errors { _grapheme_width(cluster, ambiguous_wide) }
255
+ end
256
+
257
+ # Total display width (terminal columns) of `text`.
258
+ def terminal_width(text, ambiguous_wide: false)
259
+ translate_errors { _terminal_width(text, ambiguous_wide) }
260
+ end
261
+
262
+ # Turn arbitrary text into a safe filename. `platform:` is :universal
263
+ # (default), :windows, or :posix; `preserve_extension:` keeps the final
264
+ # extension when truncating to `max_length:`. Raises Disarm::InvalidArgument
265
+ # on an unknown platform.
266
+ def sanitize_filename(text, separator: "_", max_length: 255, platform: :universal,
267
+ lang: nil, preserve_extension: true)
268
+ translate_errors do
269
+ _sanitize_filename(text, separator.to_s, max_length, platform.to_s,
270
+ lang&.to_s, preserve_extension)
271
+ end
272
+ end
273
+
274
+ # Reverse-transliterate Latin back to a native script. `lang:` is :el (Greek),
275
+ # :ru (Russian), or :uk (Ukrainian) — a Symbol or String.
276
+ def reverse_transliterate(text, lang:)
277
+ translate_errors { _reverse_transliterate(text, lang.to_s) }
278
+ end
279
+
280
+ # Every character in `text` with no romanization, as an array of
281
+ # `{ char:, offset: }` hashes (byte offset), in order of appearance.
282
+ # `scheme:`/`lang:` mirror #transliterate.
283
+ def find_untranslatable(text, scheme: :default, lang: nil)
284
+ translate_errors do
285
+ _find_untranslatable(text, scheme.to_s, lang&.to_s)
286
+ .map { |ch, offset| { char: ch, offset: offset } }
287
+ end
288
+ end
289
+
290
+ # The Unicode scripts present in `text`, in first-appearance order
291
+ # (Common/Inherited excluded), as stable UCD identifiers (e.g. "Latin").
292
+ def detect_scripts(text)
293
+ translate_errors { _detect_scripts(text) }
294
+ end
295
+
296
+ # Whether `text` mixes characters from more than one script.
297
+ def mixed_script?(text)
298
+ translate_errors { _is_mixed_script?(text) }
299
+ end
300
+
301
+ # Whether `text` mixes strong left-to-right and strong right-to-left
302
+ # characters — the precondition for Bidi display-reordering (UAX #9) and the
303
+ # structural signal behind "BiDi Swap"-style spoofs. Fires on the real
304
+ # letters (no U+202x override). A false result is not a safety guarantee.
305
+ def bidi_conflict?(text)
306
+ translate_errors { _has_bidi_conflict?(text) }
307
+ end
308
+
309
+ # Explain how `lang: "auto"` detection resolves `text`: a hash with
310
+ # `:script`, `:chosen_lang` (both nil if undetected), `:reason`, and
311
+ # `:discriminators_hit`.
312
+ def inspect_auto_lang(text)
313
+ script, chosen_lang, reason, discriminators = translate_errors { _inspect_auto_lang(text) }
314
+ { script: script, chosen_lang: chosen_lang, reason: reason,
315
+ discriminators_hit: discriminators }
316
+ end
317
+
318
+ # Curated metadata for one language `code` (e.g. "de"), as a hash with symbol
319
+ # keys: `:name`, `:script`, `:region`, and `:context` ("none"/"partial"/"full").
320
+ # Raises Disarm::InvalidArgument on an unknown code.
321
+ def lang_info(code)
322
+ translate_errors { _lang_info(code.to_s) }
323
+ end
324
+
325
+ # Curated metadata for one script `name` (e.g. "Coptic"), as a hash with symbol
326
+ # keys: `:name`, `:default_lang` (nil when none), `:example`, and
327
+ # `:context_aware`. Raises Disarm::InvalidArgument on an unknown script.
328
+ def script_info(name)
329
+ translate_errors { _script_info(name.to_s) }
330
+ end
331
+
332
+ # Every script disarm knows, as stable UCD script identifiers (includes
333
+ # "Common"/"Inherited"), sorted by name.
334
+ def list_scripts
335
+ translate_errors { _list_scripts }
336
+ end
337
+
338
+ # The language codes with context-aware transliteration support, sorted by code.
339
+ def list_context_langs
340
+ translate_errors { _list_context_langs }
341
+ end
342
+
343
+ # Whether any whitespace token carries out-of-place characters that disguise a
344
+ # real word — a cross-script homoglyph, leet, segmentation, a zero-width / bidi
345
+ # control, or zalgo. Reports a technical fact and leaves the malicious-or-not
346
+ # judgement to the caller. `lexicon` is a common-word collection (Array or Set)
347
+ # used only by the leet and segmentation branches; it defaults to an empty list
348
+ # when those branches are not needed. A bare String is rejected — pass an Array
349
+ # or any object responding to `:each`.
350
+ #
351
+ # For repeated calls over the same word list, build a Disarm::Lexicon once and
352
+ # pass it here: the native HashSet is then reused rather than rebuilt per call
353
+ # (HAI-SDLC 6.1).
354
+ def has_anomalies?(text, lexicon = [])
355
+ translate_errors do
356
+ if lexicon.is_a?(Disarm::Lexicon)
357
+ _has_anomalies_lex(text, lexicon)
358
+ else
359
+ _has_anomalies?(text, coerce_lexicon(lexicon))
360
+ end
361
+ end
362
+ end
363
+
364
+ # Full anomaly analysis: a hash with `:anomalous`, `:kinds` (in first-appearance
365
+ # order), `:findings` (each `{ kind:, token:, start:, end:, detail:, reason: }`,
366
+ # with byte offsets), and `:reason` (the first finding's reason, or nil).
367
+ # `lexicon` defaults to an empty list; a bare String is rejected. Pass a
368
+ # pre-built Disarm::Lexicon to reuse the native HashSet across calls (6.1).
369
+ def inspect_anomalies(text, lexicon = [])
370
+ anomalous, kinds, findings, reason =
371
+ translate_errors do
372
+ if lexicon.is_a?(Disarm::Lexicon)
373
+ _inspect_anomalies_lex(text, lexicon)
374
+ else
375
+ _inspect_anomalies(text, coerce_lexicon(lexicon))
376
+ end
377
+ end
378
+ {
379
+ anomalous: anomalous,
380
+ kinds: kinds,
381
+ findings: findings.map do |kind, token, start, finish, detail, fr|
382
+ { kind: kind, token: token, start: start, end: finish, detail: detail, reason: fr }
383
+ end,
384
+ reason: reason,
385
+ }
386
+ end
387
+
388
+ # Build a reusable Disarm::Pipeline for a named policy `profile` (e.g.
389
+ # "search_index", "normalize_web_input"). The profile's steps are validated
390
+ # and assembled once at construction, so the returned handle can be reused
391
+ # across many `#process` calls without re-resolving the profile each time —
392
+ # the same reuse pattern as Disarm::Lexicon. Raises Disarm::InvalidArgument
393
+ # on an unknown profile name.
394
+ #
395
+ # pipe = Disarm.get_pipeline("search_index")
396
+ # pipe.process("Café") # => "cafe"
397
+ # pipe.process("Köln") # reuse the same handle
398
+ #
399
+ # Disarm::Pipeline#process is the Rust-defined instance method on the handle.
400
+ def get_pipeline(profile)
401
+ translate_errors { _get_pipeline(profile.to_s) }
402
+ end
403
+
123
404
  private
124
405
 
406
+ # Coerce a lexicon argument to an Array of Strings for the native layer.
407
+ # Fast-path: an Array already containing only Strings is passed through as-is.
408
+ # Any other Enumerable (Set, etc.) is mapped to String. A bare String is rejected
409
+ # with ArgumentError — callers must wrap it in an Array: ["word"].
410
+ def coerce_lexicon(lexicon)
411
+ # An explicit nil is treated as an empty lexicon (parity with the `= []`
412
+ # default and the other bindings' null handling), not an error.
413
+ return [] if lexicon.nil?
414
+
415
+ raise ::ArgumentError, "lexicon must be an Array or Enumerable, not a String" \
416
+ if lexicon.is_a?(::String)
417
+
418
+ return lexicon if lexicon.is_a?(::Array) && lexicon.all?(::String)
419
+
420
+ lexicon.map(&:to_s)
421
+ end
422
+
125
423
  # Run a native call, re-raising its built-in exception as the matching
126
424
  # Disarm::Error subclass so callers can `rescue Disarm::Error` across the
127
425
  # whole surface. The original backtrace is preserved (passed as the third
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: disarm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.11.0
5
5
  platform: x86_64-linux
6
6
  authors:
7
7
  - Richard Quinn
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-06-15 00:00:00.000000000 Z
11
+ date: 2026-06-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -52,6 +52,34 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rubocop
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.65'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.65'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rubocop-performance
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.21'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.21'
55
83
  description: |
56
84
  Ruby bindings for the disarm Rust core: TR39 confusable folding, bidi/zalgo/
57
85
  zero-width neutralization, Unicode normalization, standards-based