namespace-guard 0.16.1 → 0.17.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -83,6 +83,52 @@ Details:
83
83
  - Suggestion strategies for taken names
84
84
  - CLI for red-team generation, calibration, drift, and CI gates
85
85
 
86
+ ## LLM Pipeline Preprocessing
87
+
88
+ LLM tokenizers process Unicode codepoints, not rendered glyphs. Confusable substitutions can inflate token counts and hide important terms in mixed-script text, especially on smaller models.
89
+
90
+ Use namespace-guard as a deterministic preprocess layer before model calls:
91
+
92
+ ```text
93
+ Document ingestion
94
+ |
95
+ v
96
+ +----------------+
97
+ | namespace- | <-- Detect mixed-script confusable substitution
98
+ | guard | <-- Canonicalise to Latin equivalents
99
+ | (microseconds) | <-- Flag suspicious patterns for review
100
+ +----------------+
101
+ |
102
+ v
103
+ +----------------+
104
+ | LLM API | <-- Any model/provider
105
+ | (GPT/Claude/ | <-- Receives canonicalised text
106
+ | Llama/etc) |
107
+ +----------------+
108
+ |
109
+ v
110
+ Analysis output
111
+ ```
112
+
113
+ ```typescript
114
+ import { canonicalise, scan, isClean } from "namespace-guard";
115
+
116
+ const raw = "The seller аssumes аll liаbility.";
117
+
118
+ const report = scan(raw); // detailed findings + risk level
119
+ const clean = canonicalise(raw); // "The seller assumes all liability."
120
+ const ok = isClean(raw); // false (mixed-script confusable detected)
121
+
122
+ // For known-Latin documents (e.g. English contracts), use strategy: "all"
123
+ // to also catch words where every character was substituted:
124
+ canonicalise("поп-refundable", { strategy: "all" }); // "non-refundable"
125
+ ```
126
+
127
+ Research context:
128
+ - Launch: https://paultendo.github.io/posts/namespace-guard-launch/
129
+ - NFKC/TR39 composability: https://paultendo.github.io/posts/unicode-confusables-nfkc-conflict/
130
+ - Confusable detection without NFKC: https://paultendo.github.io/posts/confusable-detection-without-nfkc/
131
+
86
132
  ## Built-in Profiles
87
133
 
88
134
  Use `createNamespaceGuardWithProfile(profile, overrides, adapter)`:
@@ -187,6 +233,7 @@ Migration guides per adapter: [docs/reference.md#canonical-uniqueness-migration-
187
233
  - Validators (profanity, homoglyph, invisible): [docs/reference.md#async-validators](docs/reference.md#async-validators)
188
234
  - Canonical preflight audit (`audit-canonical`): [docs/reference.md#audit-canonical-command](docs/reference.md#audit-canonical-command)
189
235
  - Anti-spoofing pipeline and composability vectors: [docs/reference.md#how-the-anti-spoofing-pipeline-works](docs/reference.md#how-the-anti-spoofing-pipeline-works)
236
+ - LLM preprocessing (`canonicalise`, `scan`, `isClean`): [docs/reference.md#llm-pipeline-preprocessing](docs/reference.md#llm-pipeline-preprocessing)
190
237
  - Benchmark corpus (`confusable-bench.v1`): [docs/reference.md#confusable-benchmark-corpus-artifact](docs/reference.md#confusable-benchmark-corpus-artifact)
191
238
  - Advanced primitives (`skeleton`, `areConfusable`, `confusableDistance`): [docs/reference.md#advanced-security-primitives](docs/reference.md#advanced-security-primitives)
192
239
  - Confusable weights (SSIM-scored pairs): [docs/reference.md#confusable-weights-subpath](docs/reference.md#confusable-weights-subpath)
package/dist/cli.js CHANGED
@@ -27,6 +27,9 @@ module.exports = __toCommonJS(cli_exports);
27
27
  var import_fs = require("fs");
28
28
  var import_path = require("path");
29
29
 
30
+ // src/llm-confusable-map.ts
31
+ var LLM_CONFUSABLE_MAP_SOURCE_COUNTS = Object.freeze({ tr39: 1425, novel: 793 });
32
+
30
33
  // src/index.ts
31
34
  function asRecord(value) {
32
35
  if (!value || typeof value !== "object") return null;
@@ -2828,6 +2831,21 @@ var SCRIPT_DETECTORS = [
2828
2831
  ["hiragana", /\p{Script=Hiragana}/u],
2829
2832
  ["katakana", /\p{Script=Katakana}/u]
2830
2833
  ];
2834
+ var DEFAULT_SCAN_RISK_TERMS = Object.freeze([
2835
+ "liability",
2836
+ "indemnity",
2837
+ "penalty",
2838
+ "damages",
2839
+ "termination",
2840
+ "breach",
2841
+ "warranty",
2842
+ "payment",
2843
+ "invoice",
2844
+ "governing",
2845
+ "jurisdiction",
2846
+ "arbitration",
2847
+ "confidentiality"
2848
+ ]);
2831
2849
  function clamp(value, min, max) {
2832
2850
  return Math.max(min, Math.min(max, value));
2833
2851
  }
package/dist/cli.mjs CHANGED
@@ -4,6 +4,9 @@
4
4
  import { readFileSync, existsSync } from "fs";
5
5
  import { resolve } from "path";
6
6
 
7
+ // src/llm-confusable-map.ts
8
+ var LLM_CONFUSABLE_MAP_SOURCE_COUNTS = Object.freeze({ tr39: 1425, novel: 793 });
9
+
7
10
  // src/index.ts
8
11
  function asRecord(value) {
9
12
  if (!value || typeof value !== "object") return null;
@@ -2805,6 +2808,21 @@ var SCRIPT_DETECTORS = [
2805
2808
  ["hiragana", /\p{Script=Hiragana}/u],
2806
2809
  ["katakana", /\p{Script=Katakana}/u]
2807
2810
  ];
2811
+ var DEFAULT_SCAN_RISK_TERMS = Object.freeze([
2812
+ "liability",
2813
+ "indemnity",
2814
+ "penalty",
2815
+ "damages",
2816
+ "termination",
2817
+ "breach",
2818
+ "warranty",
2819
+ "payment",
2820
+ "invoice",
2821
+ "governing",
2822
+ "jurisdiction",
2823
+ "arbitration",
2824
+ "confidentiality"
2825
+ ]);
2808
2826
  function clamp(value, min, max) {
2809
2827
  return Math.max(min, Math.min(max, value));
2810
2828
  }
@@ -26,6 +26,9 @@ __export(composability_vectors_exports, {
26
26
  });
27
27
  module.exports = __toCommonJS(composability_vectors_exports);
28
28
 
29
+ // src/llm-confusable-map.ts
30
+ var LLM_CONFUSABLE_MAP_SOURCE_COUNTS = Object.freeze({ tr39: 1425, novel: 793 });
31
+
29
32
  // src/index.ts
30
33
  var PROFANITY_SUBSTITUTE_MAP_BALANCED = {
31
34
  "0": ["o"],
@@ -1845,6 +1848,21 @@ var COMPOSABILITY_VECTORS = NFKC_TR39_DIVERGENCE_VECTORS;
1845
1848
  var COMPOSABILITY_VECTORS_COUNT = COMPOSABILITY_VECTORS.length;
1846
1849
  var DEFAULT_IGNORABLE_RE = /[\u00AD\u034F\u061C\u115F\u1160\u17B4\u17B5\u180B-\u180F\u200B-\u200F\u202A-\u202E\u2060-\u206F\uFE00-\uFE0F\uFEFF\uFFA0\uFFF0-\uFFF8\u{1BCA0}-\u{1BCA3}\u{1D173}-\u{1D17A}\u{E0000}-\u{E0FFF}]/gu;
1847
1850
  var DEFAULT_IGNORABLE_SINGLE_RE = new RegExp(DEFAULT_IGNORABLE_RE.source, "u");
1851
+ var DEFAULT_SCAN_RISK_TERMS = Object.freeze([
1852
+ "liability",
1853
+ "indemnity",
1854
+ "penalty",
1855
+ "damages",
1856
+ "termination",
1857
+ "breach",
1858
+ "warranty",
1859
+ "payment",
1860
+ "invoice",
1861
+ "governing",
1862
+ "jurisdiction",
1863
+ "arbitration",
1864
+ "confidentiality"
1865
+ ]);
1848
1866
  // Annotate the CommonJS export names for ESM import in node:
1849
1867
  0 && (module.exports = {
1850
1868
  COMPOSABILITY_VECTORS,
@@ -1,3 +1,6 @@
1
+ // src/llm-confusable-map.ts
2
+ var LLM_CONFUSABLE_MAP_SOURCE_COUNTS = Object.freeze({ tr39: 1425, novel: 793 });
3
+
1
4
  // src/index.ts
2
5
  var PROFANITY_SUBSTITUTE_MAP_BALANCED = {
3
6
  "0": ["o"],
@@ -1817,6 +1820,21 @@ var COMPOSABILITY_VECTORS = NFKC_TR39_DIVERGENCE_VECTORS;
1817
1820
  var COMPOSABILITY_VECTORS_COUNT = COMPOSABILITY_VECTORS.length;
1818
1821
  var DEFAULT_IGNORABLE_RE = /[\u00AD\u034F\u061C\u115F\u1160\u17B4\u17B5\u180B-\u180F\u200B-\u200F\u202A-\u202E\u2060-\u206F\uFE00-\uFE0F\uFEFF\uFFA0\uFFF0-\uFFF8\u{1BCA0}-\u{1BCA3}\u{1D173}-\u{1D17A}\u{E0000}-\u{E0FFF}]/gu;
1819
1822
  var DEFAULT_IGNORABLE_SINGLE_RE = new RegExp(DEFAULT_IGNORABLE_RE.source, "u");
1823
+ var DEFAULT_SCAN_RISK_TERMS = Object.freeze([
1824
+ "liability",
1825
+ "indemnity",
1826
+ "penalty",
1827
+ "damages",
1828
+ "termination",
1829
+ "breach",
1830
+ "warranty",
1831
+ "payment",
1832
+ "invoice",
1833
+ "governing",
1834
+ "jurisdiction",
1835
+ "arbitration",
1836
+ "confidentiality"
1837
+ ]);
1820
1838
  export {
1821
1839
  COMPOSABILITY_VECTORS,
1822
1840
  COMPOSABILITY_VECTORS_COUNT,
package/dist/index.d.mts CHANGED
@@ -1,3 +1,24 @@
1
+ type LlmConfusableSource = "tr39" | "novel";
2
+ type LlmConfusableMapEntry = {
3
+ latin: string;
4
+ ssimScore: number;
5
+ source: LlmConfusableSource;
6
+ script: string;
7
+ codepoint: string;
8
+ /** Width ratio between source and target at natural rendering size. Null if not measured. */
9
+ widthRatio?: number | null;
10
+ /** Height ratio between source and target at natural rendering size. Null if not measured. */
11
+ heightRatio?: number | null;
12
+ };
13
+ type LlmConfusableMap = Readonly<Record<string, readonly LlmConfusableMapEntry[]>>;
14
+ declare const LLM_CONFUSABLE_MAP: LlmConfusableMap;
15
+ declare const LLM_CONFUSABLE_MAP_PAIR_COUNT = 2218;
16
+ declare const LLM_CONFUSABLE_MAP_CHAR_COUNT = 1947;
17
+ declare const LLM_CONFUSABLE_MAP_SOURCE_COUNTS: Readonly<{
18
+ tr39: 1425;
19
+ novel: 793;
20
+ }>;
21
+
1
22
  /** A database table or model to check for slug/handle collisions. */
2
23
  type NamespaceSource = {
3
24
  /** Table/model name (must match the adapter's lookup key) */
@@ -136,6 +157,87 @@ type InvisibleCharacterValidatorOptions = {
136
157
  /** Reject combining marks (Unicode category `M*`) often used for visual obfuscation (default: `false`). */
137
158
  rejectCombiningMarks?: boolean;
138
159
  };
160
+ /** Options for `canonicalise()` LLM preprocessing. */
161
+ type CanonicaliseOptions = {
162
+ /** Minimum SSIM score required for replacement (default: `0.7`). */
163
+ threshold?: number;
164
+ /** Include confusable-vision novel discoveries in addition to TR39 mappings (default: `true`). */
165
+ includeNovel?: boolean;
166
+ /** Restrict replacement to specific source scripts (case-insensitive, e.g. `["Cyrillic", "Greek"]`). */
167
+ scripts?: string[];
168
+ /**
169
+ * Canonicalisation strategy (default: `"mixed"`).
170
+ *
171
+ * - `"mixed"` -- only replace confusable characters inside tokens that already
172
+ * contain Latin letters. Standalone non-Latin words (e.g. "Москва") are
173
+ * preserved. Safe for multilingual text.
174
+ *
175
+ * - `"all"` -- replace every confusable character regardless of surrounding
176
+ * context. Use this when the document is known to be Latin-script (e.g.
177
+ * an English contract) and you want to catch attackers who substitute
178
+ * every character in a word.
179
+ */
180
+ strategy?: "mixed" | "all";
181
+ /**
182
+ * Maximum allowed width or height ratio between source and target at natural
183
+ * rendering size (default: `3.0`). Pairs where the source character is more
184
+ * than this many times wider or taller than the Latin target are skipped,
185
+ * because the size difference would be visible in running text even if the
186
+ * shapes match after normalisation.
187
+ *
188
+ * Set to `Infinity` to disable size-ratio filtering. Set to `2.0` for
189
+ * stricter filtering. Only applies to novel (non-TR39) pairs that have
190
+ * measured size ratios.
191
+ */
192
+ maxSizeRatio?: number;
193
+ };
194
+ /** Options for `scan()` and `isClean()`. */
195
+ type ScanOptions = CanonicaliseOptions & {
196
+ /** Optional list of high-value terms used to raise `riskLevel` when targeted (default: built-in legal/financial list). */
197
+ riskTerms?: string[];
198
+ };
199
+ /** Single confusable finding returned by `scan()`. */
200
+ type ScanFinding = {
201
+ /** The confusable character found in the input. */
202
+ char: string;
203
+ /** Codepoint label in `U+XXXX` format. */
204
+ codepoint: string;
205
+ /** Script name of the source character. */
206
+ script: string;
207
+ /** Canonical Latin equivalent selected by the lookup table. */
208
+ latinEquivalent: string;
209
+ /** SSIM score used for this mapping. */
210
+ ssimScore: number;
211
+ /** Mapping source (`tr39` baseline or `novel` discovery). */
212
+ source: "tr39" | "novel";
213
+ /** UTF-16 code-unit offset in the input string. */
214
+ index: number;
215
+ /** Token/word containing this character. */
216
+ word: string;
217
+ /** Whether the token mixes Latin and non-Latin letters. */
218
+ mixedScript: boolean;
219
+ };
220
+ /** Structured confusable scan result for LLM preprocessing pipelines. */
221
+ type ScanResult = {
222
+ /** Whether any confusable mapping candidates were detected. */
223
+ hasConfusables: boolean;
224
+ /** Number of findings in `findings`. */
225
+ count: number;
226
+ /** Detailed findings with script/source/position metadata. */
227
+ findings: ScanFinding[];
228
+ /** Aggregate scan summary for policy and logging. */
229
+ summary: {
230
+ /** Number of distinct confusable characters found. */
231
+ distinctChars: number;
232
+ /** Number of distinct words/tokens affected. */
233
+ wordsAffected: number;
234
+ /** Distinct scripts detected among findings. */
235
+ scriptsDetected: string[];
236
+ /** Heuristic risk level from confusable density + targeting. */
237
+ riskLevel: "none" | "low" | "medium" | "high";
238
+ };
239
+ };
240
+
139
241
  /** Options for the `skeleton()` and `areConfusable()` functions. */
140
242
  type SkeletonOptions = {
141
243
  /** Confusable character map to use.
@@ -504,6 +606,32 @@ declare function createHomoglyphValidator(options?: {
504
606
  * @returns An async validator function for use in `config.validators`
505
607
  */
506
608
  declare function createInvisibleCharacterValidator(options?: InvisibleCharacterValidatorOptions): NamespaceValidator;
609
+ /**
610
+ * Canonicalise confusable characters in text for LLM preprocessing.
611
+ *
612
+ * With the default `strategy: "mixed"`, only rewrites characters inside tokens
613
+ * that already contain Latin letters. Standalone non-Latin words are preserved
614
+ * to reduce false positives in multilingual text.
615
+ *
616
+ * With `strategy: "all"`, rewrites every confusable character regardless of
617
+ * context. Use this when the document is known to be Latin-script.
618
+ */
619
+ declare function canonicalise(text: string, options?: CanonicaliseOptions): string;
620
+ /**
621
+ * Scan text for confusable characters and return structured findings + risk summary.
622
+ */
623
+ declare function scan(text: string, options?: ScanOptions): ScanResult;
624
+ /**
625
+ * Fast gate for LLM pipelines.
626
+ *
627
+ * With the default `strategy: "mixed"`, returns `false` as soon as a
628
+ * mixed-script confusable substitution is found. Standalone non-Latin
629
+ * words do not fail this gate.
630
+ *
631
+ * With `strategy: "all"`, returns `false` if any confusable character is
632
+ * found, regardless of surrounding context.
633
+ */
634
+ declare function isClean(text: string, options?: ScanOptions): boolean;
507
635
  /**
508
636
  * Compute the TR39 Section 4 skeleton of a string for confusable comparison.
509
637
  *
@@ -634,4 +762,4 @@ declare function createNamespaceGuard(config: NamespaceConfig, adapter: Namespac
634
762
  /** The guard instance returned by `createNamespaceGuard`. */
635
763
  type NamespaceGuard = ReturnType<typeof createNamespaceGuard>;
636
764
 
637
- export { type AssertClaimableOptions, COMPOSABILITY_VECTORS, COMPOSABILITY_VECTORS_COUNT, COMPOSABILITY_VECTOR_SUITE, CONFUSABLE_MAP, CONFUSABLE_MAP_FULL, type CheckManyOptions, type CheckResult, type CheckRiskOptions, type ClaimOptions, type ClaimResult, type ComposabilityVector, type ConfusableDistanceOptions, type ConfusableDistanceResult, type ConfusableDistanceStep, type ConfusableWeight, type ConfusableWeights, DEFAULT_PROTECTED_TOKENS, type EnforceRiskOptions, type EnforceRiskResult, type FindOneOptions, type InvisibleCharacterValidatorOptions, NAMESPACE_PROFILES, NFKC_TR39_DIVERGENCE_VECTORS, type NamespaceAdapter, type NamespaceConfig, type NamespaceGuard, type NamespaceProfileName, type NamespaceProfilePreset, type NamespaceSource, type NamespaceValidator, type NamespaceValidatorResult, type NfkcTr39DivergenceVector, type OwnershipScope, type PredicateValidatorOptions, type ProfanityValidationMode, type ProfanityValidatorOptions, type ProfanityVariantProfile, type RiskAction, type RiskCheckResult, type RiskLevel, type RiskMatch, type RiskReason, type RiskReasonCode, type SkeletonOptions, type SuggestStrategyName, type UniqueViolationDetector, areConfusable, confusableDistance, createHomoglyphValidator, createInvisibleCharacterValidator, createNamespaceGuard, createNamespaceGuardWithProfile, createPredicateValidator, createProfanityValidator, deriveNfkcTr39DivergenceVectors, isLikelyUniqueViolationError, normalize, skeleton };
765
+ export { type AssertClaimableOptions, COMPOSABILITY_VECTORS, COMPOSABILITY_VECTORS_COUNT, COMPOSABILITY_VECTOR_SUITE, CONFUSABLE_MAP, CONFUSABLE_MAP_FULL, type CanonicaliseOptions, type CheckManyOptions, type CheckResult, type CheckRiskOptions, type ClaimOptions, type ClaimResult, type ComposabilityVector, type ConfusableDistanceOptions, type ConfusableDistanceResult, type ConfusableDistanceStep, type ConfusableWeight, type ConfusableWeights, DEFAULT_PROTECTED_TOKENS, type EnforceRiskOptions, type EnforceRiskResult, type FindOneOptions, type InvisibleCharacterValidatorOptions, LLM_CONFUSABLE_MAP, LLM_CONFUSABLE_MAP_CHAR_COUNT, LLM_CONFUSABLE_MAP_PAIR_COUNT, LLM_CONFUSABLE_MAP_SOURCE_COUNTS, NAMESPACE_PROFILES, NFKC_TR39_DIVERGENCE_VECTORS, type NamespaceAdapter, type NamespaceConfig, type NamespaceGuard, type NamespaceProfileName, type NamespaceProfilePreset, type NamespaceSource, type NamespaceValidator, type NamespaceValidatorResult, type NfkcTr39DivergenceVector, type OwnershipScope, type PredicateValidatorOptions, type ProfanityValidationMode, type ProfanityValidatorOptions, type ProfanityVariantProfile, type RiskAction, type RiskCheckResult, type RiskLevel, type RiskMatch, type RiskReason, type RiskReasonCode, type ScanFinding, type ScanOptions, type ScanResult, type SkeletonOptions, type SuggestStrategyName, type UniqueViolationDetector, areConfusable, canonicalise, confusableDistance, createHomoglyphValidator, createInvisibleCharacterValidator, createNamespaceGuard, createNamespaceGuardWithProfile, createPredicateValidator, createProfanityValidator, deriveNfkcTr39DivergenceVectors, isClean, isLikelyUniqueViolationError, normalize, scan, skeleton };
package/dist/index.d.ts CHANGED
@@ -1,3 +1,24 @@
1
+ type LlmConfusableSource = "tr39" | "novel";
2
+ type LlmConfusableMapEntry = {
3
+ latin: string;
4
+ ssimScore: number;
5
+ source: LlmConfusableSource;
6
+ script: string;
7
+ codepoint: string;
8
+ /** Width ratio between source and target at natural rendering size. Null if not measured. */
9
+ widthRatio?: number | null;
10
+ /** Height ratio between source and target at natural rendering size. Null if not measured. */
11
+ heightRatio?: number | null;
12
+ };
13
+ type LlmConfusableMap = Readonly<Record<string, readonly LlmConfusableMapEntry[]>>;
14
+ declare const LLM_CONFUSABLE_MAP: LlmConfusableMap;
15
+ declare const LLM_CONFUSABLE_MAP_PAIR_COUNT = 2218;
16
+ declare const LLM_CONFUSABLE_MAP_CHAR_COUNT = 1947;
17
+ declare const LLM_CONFUSABLE_MAP_SOURCE_COUNTS: Readonly<{
18
+ tr39: 1425;
19
+ novel: 793;
20
+ }>;
21
+
1
22
  /** A database table or model to check for slug/handle collisions. */
2
23
  type NamespaceSource = {
3
24
  /** Table/model name (must match the adapter's lookup key) */
@@ -136,6 +157,87 @@ type InvisibleCharacterValidatorOptions = {
136
157
  /** Reject combining marks (Unicode category `M*`) often used for visual obfuscation (default: `false`). */
137
158
  rejectCombiningMarks?: boolean;
138
159
  };
160
+ /** Options for `canonicalise()` LLM preprocessing. */
161
+ type CanonicaliseOptions = {
162
+ /** Minimum SSIM score required for replacement (default: `0.7`). */
163
+ threshold?: number;
164
+ /** Include confusable-vision novel discoveries in addition to TR39 mappings (default: `true`). */
165
+ includeNovel?: boolean;
166
+ /** Restrict replacement to specific source scripts (case-insensitive, e.g. `["Cyrillic", "Greek"]`). */
167
+ scripts?: string[];
168
+ /**
169
+ * Canonicalisation strategy (default: `"mixed"`).
170
+ *
171
+ * - `"mixed"` -- only replace confusable characters inside tokens that already
172
+ * contain Latin letters. Standalone non-Latin words (e.g. "Москва") are
173
+ * preserved. Safe for multilingual text.
174
+ *
175
+ * - `"all"` -- replace every confusable character regardless of surrounding
176
+ * context. Use this when the document is known to be Latin-script (e.g.
177
+ * an English contract) and you want to catch attackers who substitute
178
+ * every character in a word.
179
+ */
180
+ strategy?: "mixed" | "all";
181
+ /**
182
+ * Maximum allowed width or height ratio between source and target at natural
183
+ * rendering size (default: `3.0`). Pairs where the source character is more
184
+ * than this many times wider or taller than the Latin target are skipped,
185
+ * because the size difference would be visible in running text even if the
186
+ * shapes match after normalisation.
187
+ *
188
+ * Set to `Infinity` to disable size-ratio filtering. Set to `2.0` for
189
+ * stricter filtering. Only applies to novel (non-TR39) pairs that have
190
+ * measured size ratios.
191
+ */
192
+ maxSizeRatio?: number;
193
+ };
194
+ /** Options for `scan()` and `isClean()`. */
195
+ type ScanOptions = CanonicaliseOptions & {
196
+ /** Optional list of high-value terms used to raise `riskLevel` when targeted (default: built-in legal/financial list). */
197
+ riskTerms?: string[];
198
+ };
199
+ /** Single confusable finding returned by `scan()`. */
200
+ type ScanFinding = {
201
+ /** The confusable character found in the input. */
202
+ char: string;
203
+ /** Codepoint label in `U+XXXX` format. */
204
+ codepoint: string;
205
+ /** Script name of the source character. */
206
+ script: string;
207
+ /** Canonical Latin equivalent selected by the lookup table. */
208
+ latinEquivalent: string;
209
+ /** SSIM score used for this mapping. */
210
+ ssimScore: number;
211
+ /** Mapping source (`tr39` baseline or `novel` discovery). */
212
+ source: "tr39" | "novel";
213
+ /** UTF-16 code-unit offset in the input string. */
214
+ index: number;
215
+ /** Token/word containing this character. */
216
+ word: string;
217
+ /** Whether the token mixes Latin and non-Latin letters. */
218
+ mixedScript: boolean;
219
+ };
220
+ /** Structured confusable scan result for LLM preprocessing pipelines. */
221
+ type ScanResult = {
222
+ /** Whether any confusable mapping candidates were detected. */
223
+ hasConfusables: boolean;
224
+ /** Number of findings in `findings`. */
225
+ count: number;
226
+ /** Detailed findings with script/source/position metadata. */
227
+ findings: ScanFinding[];
228
+ /** Aggregate scan summary for policy and logging. */
229
+ summary: {
230
+ /** Number of distinct confusable characters found. */
231
+ distinctChars: number;
232
+ /** Number of distinct words/tokens affected. */
233
+ wordsAffected: number;
234
+ /** Distinct scripts detected among findings. */
235
+ scriptsDetected: string[];
236
+ /** Heuristic risk level from confusable density + targeting. */
237
+ riskLevel: "none" | "low" | "medium" | "high";
238
+ };
239
+ };
240
+
139
241
  /** Options for the `skeleton()` and `areConfusable()` functions. */
140
242
  type SkeletonOptions = {
141
243
  /** Confusable character map to use.
@@ -504,6 +606,32 @@ declare function createHomoglyphValidator(options?: {
504
606
  * @returns An async validator function for use in `config.validators`
505
607
  */
506
608
  declare function createInvisibleCharacterValidator(options?: InvisibleCharacterValidatorOptions): NamespaceValidator;
609
+ /**
610
+ * Canonicalise confusable characters in text for LLM preprocessing.
611
+ *
612
+ * With the default `strategy: "mixed"`, only rewrites characters inside tokens
613
+ * that already contain Latin letters. Standalone non-Latin words are preserved
614
+ * to reduce false positives in multilingual text.
615
+ *
616
+ * With `strategy: "all"`, rewrites every confusable character regardless of
617
+ * context. Use this when the document is known to be Latin-script.
618
+ */
619
+ declare function canonicalise(text: string, options?: CanonicaliseOptions): string;
620
+ /**
621
+ * Scan text for confusable characters and return structured findings + risk summary.
622
+ */
623
+ declare function scan(text: string, options?: ScanOptions): ScanResult;
624
+ /**
625
+ * Fast gate for LLM pipelines.
626
+ *
627
+ * With the default `strategy: "mixed"`, returns `false` as soon as a
628
+ * mixed-script confusable substitution is found. Standalone non-Latin
629
+ * words do not fail this gate.
630
+ *
631
+ * With `strategy: "all"`, returns `false` if any confusable character is
632
+ * found, regardless of surrounding context.
633
+ */
634
+ declare function isClean(text: string, options?: ScanOptions): boolean;
507
635
  /**
508
636
  * Compute the TR39 Section 4 skeleton of a string for confusable comparison.
509
637
  *
@@ -634,4 +762,4 @@ declare function createNamespaceGuard(config: NamespaceConfig, adapter: Namespac
634
762
  /** The guard instance returned by `createNamespaceGuard`. */
635
763
  type NamespaceGuard = ReturnType<typeof createNamespaceGuard>;
636
764
 
637
- export { type AssertClaimableOptions, COMPOSABILITY_VECTORS, COMPOSABILITY_VECTORS_COUNT, COMPOSABILITY_VECTOR_SUITE, CONFUSABLE_MAP, CONFUSABLE_MAP_FULL, type CheckManyOptions, type CheckResult, type CheckRiskOptions, type ClaimOptions, type ClaimResult, type ComposabilityVector, type ConfusableDistanceOptions, type ConfusableDistanceResult, type ConfusableDistanceStep, type ConfusableWeight, type ConfusableWeights, DEFAULT_PROTECTED_TOKENS, type EnforceRiskOptions, type EnforceRiskResult, type FindOneOptions, type InvisibleCharacterValidatorOptions, NAMESPACE_PROFILES, NFKC_TR39_DIVERGENCE_VECTORS, type NamespaceAdapter, type NamespaceConfig, type NamespaceGuard, type NamespaceProfileName, type NamespaceProfilePreset, type NamespaceSource, type NamespaceValidator, type NamespaceValidatorResult, type NfkcTr39DivergenceVector, type OwnershipScope, type PredicateValidatorOptions, type ProfanityValidationMode, type ProfanityValidatorOptions, type ProfanityVariantProfile, type RiskAction, type RiskCheckResult, type RiskLevel, type RiskMatch, type RiskReason, type RiskReasonCode, type SkeletonOptions, type SuggestStrategyName, type UniqueViolationDetector, areConfusable, confusableDistance, createHomoglyphValidator, createInvisibleCharacterValidator, createNamespaceGuard, createNamespaceGuardWithProfile, createPredicateValidator, createProfanityValidator, deriveNfkcTr39DivergenceVectors, isLikelyUniqueViolationError, normalize, skeleton };
765
+ export { type AssertClaimableOptions, COMPOSABILITY_VECTORS, COMPOSABILITY_VECTORS_COUNT, COMPOSABILITY_VECTOR_SUITE, CONFUSABLE_MAP, CONFUSABLE_MAP_FULL, type CanonicaliseOptions, type CheckManyOptions, type CheckResult, type CheckRiskOptions, type ClaimOptions, type ClaimResult, type ComposabilityVector, type ConfusableDistanceOptions, type ConfusableDistanceResult, type ConfusableDistanceStep, type ConfusableWeight, type ConfusableWeights, DEFAULT_PROTECTED_TOKENS, type EnforceRiskOptions, type EnforceRiskResult, type FindOneOptions, type InvisibleCharacterValidatorOptions, LLM_CONFUSABLE_MAP, LLM_CONFUSABLE_MAP_CHAR_COUNT, LLM_CONFUSABLE_MAP_PAIR_COUNT, LLM_CONFUSABLE_MAP_SOURCE_COUNTS, NAMESPACE_PROFILES, NFKC_TR39_DIVERGENCE_VECTORS, type NamespaceAdapter, type NamespaceConfig, type NamespaceGuard, type NamespaceProfileName, type NamespaceProfilePreset, type NamespaceSource, type NamespaceValidator, type NamespaceValidatorResult, type NfkcTr39DivergenceVector, type OwnershipScope, type PredicateValidatorOptions, type ProfanityValidationMode, type ProfanityValidatorOptions, type ProfanityVariantProfile, type RiskAction, type RiskCheckResult, type RiskLevel, type RiskMatch, type RiskReason, type RiskReasonCode, type ScanFinding, type ScanOptions, type ScanResult, type SkeletonOptions, type SuggestStrategyName, type UniqueViolationDetector, areConfusable, canonicalise, confusableDistance, createHomoglyphValidator, createInvisibleCharacterValidator, createNamespaceGuard, createNamespaceGuardWithProfile, createPredicateValidator, createProfanityValidator, deriveNfkcTr39DivergenceVectors, isClean, isLikelyUniqueViolationError, normalize, scan, skeleton };