namespace-guard 0.16.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -83,6 +83,52 @@ Details:
83
83
  - Suggestion strategies for taken names
84
84
  - CLI for red-team generation, calibration, drift, and CI gates
85
85
 
86
+ ## LLM Pipeline Preprocessing
87
+
88
+ LLM tokenizers process Unicode codepoints, not rendered glyphs. Confusable substitutions can inflate token counts and hide important terms in mixed-script text, especially on smaller models.
89
+
90
+ Use namespace-guard as a deterministic preprocess layer before model calls:
91
+
92
+ ```text
93
+ Document ingestion
94
+ |
95
+ v
96
+ +----------------+
97
+ | namespace- | <-- Detect mixed-script confusable substitution
98
+ | guard | <-- Canonicalise to Latin equivalents
99
+ | (microseconds) | <-- Flag suspicious patterns for review
100
+ +----------------+
101
+ |
102
+ v
103
+ +----------------+
104
+ | LLM API | <-- Any model/provider
105
+ | (GPT/Claude/ | <-- Receives canonicalised text
106
+ | Llama/etc) |
107
+ +----------------+
108
+ |
109
+ v
110
+ Analysis output
111
+ ```
112
+
113
+ ```typescript
114
+ import { canonicalise, scan, isClean } from "namespace-guard";
115
+
116
+ const raw = "The seller аssumes аll liаbility.";
117
+
118
+ const report = scan(raw); // detailed findings + risk level
119
+ const clean = canonicalise(raw); // "The seller assumes all liability."
120
+ const ok = isClean(raw); // false (mixed-script confusable detected)
121
+
122
+ // For known-Latin documents (e.g. English contracts), use strategy: "all"
123
+ // to also catch words where every character was substituted:
124
+ canonicalise("поп-refundable", { strategy: "all" }); // "non-refundable"
125
+ ```
126
+
127
+ Research context:
128
+ - Launch: https://paultendo.github.io/posts/namespace-guard-launch/
129
+ - NFKC/TR39 composability: https://paultendo.github.io/posts/unicode-confusables-nfkc-conflict/
130
+ - Confusable detection without NFKC: https://paultendo.github.io/posts/confusable-detection-without-nfkc/
131
+
86
132
  ## Built-in Profiles
87
133
 
88
134
  Use `createNamespaceGuardWithProfile(profile, overrides, adapter)`:
@@ -147,6 +193,19 @@ areConfusable("paypal", "pa\u0443pal"); // true
147
193
  confusableDistance("paypal", "pa\u0443pal"); // graded similarity + chainDepth + explainable steps
148
194
  ```
149
195
 
196
+ For measured visual scoring, pass the optional weights from confusable-vision (903 SSIM-scored pairs across 230 fonts). The `context` filter restricts the weights used to identifier-valid pairs, domain-valid pairs, or all pairs.
197
+
198
+ ```typescript
199
+ import { confusableDistance } from "namespace-guard";
200
+ import { CONFUSABLE_WEIGHTS } from "namespace-guard/confusable-weights";
201
+
202
+ const result = confusableDistance("paypal", "pa\u0443pal", {
203
+ weights: CONFUSABLE_WEIGHTS,
204
+ context: "identifier",
205
+ });
206
+ // result.similarity, result.steps (including "visual-weight" reason for novel pairs)
207
+ ```
208
+
150
209
  ## Adapter Support
151
210
 
152
211
  - Prisma
@@ -174,8 +233,10 @@ Migration guides per adapter: [docs/reference.md#canonical-uniqueness-migration-
174
233
  - Validators (profanity, homoglyph, invisible): [docs/reference.md#async-validators](docs/reference.md#async-validators)
175
234
  - Canonical preflight audit (`audit-canonical`): [docs/reference.md#audit-canonical-command](docs/reference.md#audit-canonical-command)
176
235
  - Anti-spoofing pipeline and composability vectors: [docs/reference.md#how-the-anti-spoofing-pipeline-works](docs/reference.md#how-the-anti-spoofing-pipeline-works)
236
+ - LLM preprocessing (`canonicalise`, `scan`, `isClean`): [docs/reference.md#llm-pipeline-preprocessing](docs/reference.md#llm-pipeline-preprocessing)
177
237
  - Benchmark corpus (`confusable-bench.v1`): [docs/reference.md#confusable-benchmark-corpus-artifact](docs/reference.md#confusable-benchmark-corpus-artifact)
178
238
  - Advanced primitives (`skeleton`, `areConfusable`, `confusableDistance`): [docs/reference.md#advanced-security-primitives](docs/reference.md#advanced-security-primitives)
239
+ - Confusable weights (SSIM-scored pairs): [docs/reference.md#confusable-weights-subpath](docs/reference.md#confusable-weights-subpath)
179
240
  - CLI reference: [docs/reference.md#cli](docs/reference.md#cli)
180
241
  - API reference: [docs/reference.md#api-reference](docs/reference.md#api-reference)
181
242
  - Framework integration (Next.js/Express/tRPC): [docs/reference.md#framework-integration](docs/reference.md#framework-integration)
package/dist/cli.js CHANGED
@@ -27,6 +27,9 @@ module.exports = __toCommonJS(cli_exports);
27
27
  var import_fs = require("fs");
28
28
  var import_path = require("path");
29
29
 
30
+ // src/llm-confusable-map.ts
31
+ var LLM_CONFUSABLE_MAP_SOURCE_COUNTS = Object.freeze({ tr39: 1425, novel: 793 });
32
+
30
33
  // src/index.ts
31
34
  function asRecord(value) {
32
35
  if (!value || typeof value !== "object") return null;
@@ -2828,6 +2831,21 @@ var SCRIPT_DETECTORS = [
2828
2831
  ["hiragana", /\p{Script=Hiragana}/u],
2829
2832
  ["katakana", /\p{Script=Katakana}/u]
2830
2833
  ];
2834
+ var DEFAULT_SCAN_RISK_TERMS = Object.freeze([
2835
+ "liability",
2836
+ "indemnity",
2837
+ "penalty",
2838
+ "damages",
2839
+ "termination",
2840
+ "breach",
2841
+ "warranty",
2842
+ "payment",
2843
+ "invoice",
2844
+ "governing",
2845
+ "jurisdiction",
2846
+ "arbitration",
2847
+ "confidentiality"
2848
+ ]);
2831
2849
  function clamp(value, min, max) {
2832
2850
  return Math.max(min, Math.min(max, value));
2833
2851
  }
package/dist/cli.mjs CHANGED
@@ -4,6 +4,9 @@
4
4
  import { readFileSync, existsSync } from "fs";
5
5
  import { resolve } from "path";
6
6
 
7
+ // src/llm-confusable-map.ts
8
+ var LLM_CONFUSABLE_MAP_SOURCE_COUNTS = Object.freeze({ tr39: 1425, novel: 793 });
9
+
7
10
  // src/index.ts
8
11
  function asRecord(value) {
9
12
  if (!value || typeof value !== "object") return null;
@@ -2805,6 +2808,21 @@ var SCRIPT_DETECTORS = [
2805
2808
  ["hiragana", /\p{Script=Hiragana}/u],
2806
2809
  ["katakana", /\p{Script=Katakana}/u]
2807
2810
  ];
2811
+ var DEFAULT_SCAN_RISK_TERMS = Object.freeze([
2812
+ "liability",
2813
+ "indemnity",
2814
+ "penalty",
2815
+ "damages",
2816
+ "termination",
2817
+ "breach",
2818
+ "warranty",
2819
+ "payment",
2820
+ "invoice",
2821
+ "governing",
2822
+ "jurisdiction",
2823
+ "arbitration",
2824
+ "confidentiality"
2825
+ ]);
2808
2826
  function clamp(value, min, max) {
2809
2827
  return Math.max(min, Math.min(max, value));
2810
2828
  }
@@ -26,6 +26,9 @@ __export(composability_vectors_exports, {
26
26
  });
27
27
  module.exports = __toCommonJS(composability_vectors_exports);
28
28
 
29
+ // src/llm-confusable-map.ts
30
+ var LLM_CONFUSABLE_MAP_SOURCE_COUNTS = Object.freeze({ tr39: 1425, novel: 793 });
31
+
29
32
  // src/index.ts
30
33
  var PROFANITY_SUBSTITUTE_MAP_BALANCED = {
31
34
  "0": ["o"],
@@ -1845,6 +1848,21 @@ var COMPOSABILITY_VECTORS = NFKC_TR39_DIVERGENCE_VECTORS;
1845
1848
  var COMPOSABILITY_VECTORS_COUNT = COMPOSABILITY_VECTORS.length;
1846
1849
  var DEFAULT_IGNORABLE_RE = /[\u00AD\u034F\u061C\u115F\u1160\u17B4\u17B5\u180B-\u180F\u200B-\u200F\u202A-\u202E\u2060-\u206F\uFE00-\uFE0F\uFEFF\uFFA0\uFFF0-\uFFF8\u{1BCA0}-\u{1BCA3}\u{1D173}-\u{1D17A}\u{E0000}-\u{E0FFF}]/gu;
1847
1850
  var DEFAULT_IGNORABLE_SINGLE_RE = new RegExp(DEFAULT_IGNORABLE_RE.source, "u");
1851
+ var DEFAULT_SCAN_RISK_TERMS = Object.freeze([
1852
+ "liability",
1853
+ "indemnity",
1854
+ "penalty",
1855
+ "damages",
1856
+ "termination",
1857
+ "breach",
1858
+ "warranty",
1859
+ "payment",
1860
+ "invoice",
1861
+ "governing",
1862
+ "jurisdiction",
1863
+ "arbitration",
1864
+ "confidentiality"
1865
+ ]);
1848
1866
  // Annotate the CommonJS export names for ESM import in node:
1849
1867
  0 && (module.exports = {
1850
1868
  COMPOSABILITY_VECTORS,
@@ -1,3 +1,6 @@
1
+ // src/llm-confusable-map.ts
2
+ var LLM_CONFUSABLE_MAP_SOURCE_COUNTS = Object.freeze({ tr39: 1425, novel: 793 });
3
+
1
4
  // src/index.ts
2
5
  var PROFANITY_SUBSTITUTE_MAP_BALANCED = {
3
6
  "0": ["o"],
@@ -1817,6 +1820,21 @@ var COMPOSABILITY_VECTORS = NFKC_TR39_DIVERGENCE_VECTORS;
1817
1820
  var COMPOSABILITY_VECTORS_COUNT = COMPOSABILITY_VECTORS.length;
1818
1821
  var DEFAULT_IGNORABLE_RE = /[\u00AD\u034F\u061C\u115F\u1160\u17B4\u17B5\u180B-\u180F\u200B-\u200F\u202A-\u202E\u2060-\u206F\uFE00-\uFE0F\uFEFF\uFFA0\uFFF0-\uFFF8\u{1BCA0}-\u{1BCA3}\u{1D173}-\u{1D17A}\u{E0000}-\u{E0FFF}]/gu;
1819
1822
  var DEFAULT_IGNORABLE_SINGLE_RE = new RegExp(DEFAULT_IGNORABLE_RE.source, "u");
1823
+ var DEFAULT_SCAN_RISK_TERMS = Object.freeze([
1824
+ "liability",
1825
+ "indemnity",
1826
+ "penalty",
1827
+ "damages",
1828
+ "termination",
1829
+ "breach",
1830
+ "warranty",
1831
+ "payment",
1832
+ "invoice",
1833
+ "governing",
1834
+ "jurisdiction",
1835
+ "arbitration",
1836
+ "confidentiality"
1837
+ ]);
1820
1838
  export {
1821
1839
  COMPOSABILITY_VECTORS,
1822
1840
  COMPOSABILITY_VECTORS_COUNT,
package/dist/index.d.mts CHANGED
@@ -1,3 +1,20 @@
1
+ type LlmConfusableSource = "tr39" | "novel";
2
+ type LlmConfusableMapEntry = {
3
+ latin: string;
4
+ ssimScore: number;
5
+ source: LlmConfusableSource;
6
+ script: string;
7
+ codepoint: string;
8
+ };
9
+ type LlmConfusableMap = Readonly<Record<string, readonly LlmConfusableMapEntry[]>>;
10
+ declare const LLM_CONFUSABLE_MAP: LlmConfusableMap;
11
+ declare const LLM_CONFUSABLE_MAP_PAIR_COUNT = 2218;
12
+ declare const LLM_CONFUSABLE_MAP_CHAR_COUNT = 1947;
13
+ declare const LLM_CONFUSABLE_MAP_SOURCE_COUNTS: Readonly<{
14
+ tr39: 1425;
15
+ novel: 793;
16
+ }>;
17
+
1
18
  /** A database table or model to check for slug/handle collisions. */
2
19
  type NamespaceSource = {
3
20
  /** Table/model name (must match the adapter's lookup key) */
@@ -136,6 +153,75 @@ type InvisibleCharacterValidatorOptions = {
136
153
  /** Reject combining marks (Unicode category `M*`) often used for visual obfuscation (default: `false`). */
137
154
  rejectCombiningMarks?: boolean;
138
155
  };
156
+ /** Options for `canonicalise()` LLM preprocessing. */
157
+ type CanonicaliseOptions = {
158
+ /** Minimum SSIM score required for replacement (default: `0.7`). */
159
+ threshold?: number;
160
+ /** Include confusable-vision novel discoveries in addition to TR39 mappings (default: `true`). */
161
+ includeNovel?: boolean;
162
+ /** Restrict replacement to specific source scripts (case-insensitive, e.g. `["Cyrillic", "Greek"]`). */
163
+ scripts?: string[];
164
+ /**
165
+ * Canonicalisation strategy (default: `"mixed"`).
166
+ *
167
+ * - `"mixed"` -- only replace confusable characters inside tokens that already
168
+ * contain Latin letters. Standalone non-Latin words (e.g. "Москва") are
169
+ * preserved. Safe for multilingual text.
170
+ *
171
+ * - `"all"` -- replace every confusable character regardless of surrounding
172
+ * context. Use this when the document is known to be Latin-script (e.g.
173
+ * an English contract) and you want to catch attackers who substitute
174
+ * every character in a word.
175
+ */
176
+ strategy?: "mixed" | "all";
177
+ };
178
+ /** Options for `scan()` and `isClean()`. */
179
+ type ScanOptions = CanonicaliseOptions & {
180
+ /** Optional list of high-value terms used to raise `riskLevel` when targeted (default: built-in legal/financial list). */
181
+ riskTerms?: string[];
182
+ };
183
+ /** Single confusable finding returned by `scan()`. */
184
+ type ScanFinding = {
185
+ /** The confusable character found in the input. */
186
+ char: string;
187
+ /** Codepoint label in `U+XXXX` format. */
188
+ codepoint: string;
189
+ /** Script name of the source character. */
190
+ script: string;
191
+ /** Canonical Latin equivalent selected by the lookup table. */
192
+ latinEquivalent: string;
193
+ /** SSIM score used for this mapping. */
194
+ ssimScore: number;
195
+ /** Mapping source (`tr39` baseline or `novel` discovery). */
196
+ source: "tr39" | "novel";
197
+ /** UTF-16 code-unit offset in the input string. */
198
+ index: number;
199
+ /** Token/word containing this character. */
200
+ word: string;
201
+ /** Whether the token mixes Latin and non-Latin letters. */
202
+ mixedScript: boolean;
203
+ };
204
+ /** Structured confusable scan result for LLM preprocessing pipelines. */
205
+ type ScanResult = {
206
+ /** Whether any confusable mapping candidates were detected. */
207
+ hasConfusables: boolean;
208
+ /** Number of findings in `findings`. */
209
+ count: number;
210
+ /** Detailed findings with script/source/position metadata. */
211
+ findings: ScanFinding[];
212
+ /** Aggregate scan summary for policy and logging. */
213
+ summary: {
214
+ /** Number of distinct confusable characters found. */
215
+ distinctChars: number;
216
+ /** Number of distinct words/tokens affected. */
217
+ wordsAffected: number;
218
+ /** Distinct scripts detected among findings. */
219
+ scriptsDetected: string[];
220
+ /** Heuristic risk level from confusable density + targeting. */
221
+ riskLevel: "none" | "low" | "medium" | "high";
222
+ };
223
+ };
224
+
139
225
  /** Options for the `skeleton()` and `areConfusable()` functions. */
140
226
  type SkeletonOptions = {
141
227
  /** Confusable character map to use.
@@ -504,6 +590,32 @@ declare function createHomoglyphValidator(options?: {
504
590
  * @returns An async validator function for use in `config.validators`
505
591
  */
506
592
  declare function createInvisibleCharacterValidator(options?: InvisibleCharacterValidatorOptions): NamespaceValidator;
593
+ /**
594
+ * Canonicalise confusable characters in text for LLM preprocessing.
595
+ *
596
+ * With the default `strategy: "mixed"`, only rewrites characters inside tokens
597
+ * that already contain Latin letters. Standalone non-Latin words are preserved
598
+ * to reduce false positives in multilingual text.
599
+ *
600
+ * With `strategy: "all"`, rewrites every confusable character regardless of
601
+ * context. Use this when the document is known to be Latin-script.
602
+ */
603
+ declare function canonicalise(text: string, options?: CanonicaliseOptions): string;
604
+ /**
605
+ * Scan text for confusable characters and return structured findings + risk summary.
606
+ */
607
+ declare function scan(text: string, options?: ScanOptions): ScanResult;
608
+ /**
609
+ * Fast gate for LLM pipelines.
610
+ *
611
+ * With the default `strategy: "mixed"`, returns `false` as soon as a
612
+ * mixed-script confusable substitution is found. Standalone non-Latin
613
+ * words do not fail this gate.
614
+ *
615
+ * With `strategy: "all"`, returns `false` if any confusable character is
616
+ * found, regardless of surrounding context.
617
+ */
618
+ declare function isClean(text: string, options?: ScanOptions): boolean;
507
619
  /**
508
620
  * Compute the TR39 Section 4 skeleton of a string for confusable comparison.
509
621
  *
@@ -634,4 +746,4 @@ declare function createNamespaceGuard(config: NamespaceConfig, adapter: Namespac
634
746
  /** The guard instance returned by `createNamespaceGuard`. */
635
747
  type NamespaceGuard = ReturnType<typeof createNamespaceGuard>;
636
748
 
637
- export { type AssertClaimableOptions, COMPOSABILITY_VECTORS, COMPOSABILITY_VECTORS_COUNT, COMPOSABILITY_VECTOR_SUITE, CONFUSABLE_MAP, CONFUSABLE_MAP_FULL, type CheckManyOptions, type CheckResult, type CheckRiskOptions, type ClaimOptions, type ClaimResult, type ComposabilityVector, type ConfusableDistanceOptions, type ConfusableDistanceResult, type ConfusableDistanceStep, type ConfusableWeight, type ConfusableWeights, DEFAULT_PROTECTED_TOKENS, type EnforceRiskOptions, type EnforceRiskResult, type FindOneOptions, type InvisibleCharacterValidatorOptions, NAMESPACE_PROFILES, NFKC_TR39_DIVERGENCE_VECTORS, type NamespaceAdapter, type NamespaceConfig, type NamespaceGuard, type NamespaceProfileName, type NamespaceProfilePreset, type NamespaceSource, type NamespaceValidator, type NamespaceValidatorResult, type NfkcTr39DivergenceVector, type OwnershipScope, type PredicateValidatorOptions, type ProfanityValidationMode, type ProfanityValidatorOptions, type ProfanityVariantProfile, type RiskAction, type RiskCheckResult, type RiskLevel, type RiskMatch, type RiskReason, type RiskReasonCode, type SkeletonOptions, type SuggestStrategyName, type UniqueViolationDetector, areConfusable, confusableDistance, createHomoglyphValidator, createInvisibleCharacterValidator, createNamespaceGuard, createNamespaceGuardWithProfile, createPredicateValidator, createProfanityValidator, deriveNfkcTr39DivergenceVectors, isLikelyUniqueViolationError, normalize, skeleton };
749
+ export { type AssertClaimableOptions, COMPOSABILITY_VECTORS, COMPOSABILITY_VECTORS_COUNT, COMPOSABILITY_VECTOR_SUITE, CONFUSABLE_MAP, CONFUSABLE_MAP_FULL, type CanonicaliseOptions, type CheckManyOptions, type CheckResult, type CheckRiskOptions, type ClaimOptions, type ClaimResult, type ComposabilityVector, type ConfusableDistanceOptions, type ConfusableDistanceResult, type ConfusableDistanceStep, type ConfusableWeight, type ConfusableWeights, DEFAULT_PROTECTED_TOKENS, type EnforceRiskOptions, type EnforceRiskResult, type FindOneOptions, type InvisibleCharacterValidatorOptions, LLM_CONFUSABLE_MAP, LLM_CONFUSABLE_MAP_CHAR_COUNT, LLM_CONFUSABLE_MAP_PAIR_COUNT, LLM_CONFUSABLE_MAP_SOURCE_COUNTS, NAMESPACE_PROFILES, NFKC_TR39_DIVERGENCE_VECTORS, type NamespaceAdapter, type NamespaceConfig, type NamespaceGuard, type NamespaceProfileName, type NamespaceProfilePreset, type NamespaceSource, type NamespaceValidator, type NamespaceValidatorResult, type NfkcTr39DivergenceVector, type OwnershipScope, type PredicateValidatorOptions, type ProfanityValidationMode, type ProfanityValidatorOptions, type ProfanityVariantProfile, type RiskAction, type RiskCheckResult, type RiskLevel, type RiskMatch, type RiskReason, type RiskReasonCode, type ScanFinding, type ScanOptions, type ScanResult, type SkeletonOptions, type SuggestStrategyName, type UniqueViolationDetector, areConfusable, canonicalise, confusableDistance, createHomoglyphValidator, createInvisibleCharacterValidator, createNamespaceGuard, createNamespaceGuardWithProfile, createPredicateValidator, createProfanityValidator, deriveNfkcTr39DivergenceVectors, isClean, isLikelyUniqueViolationError, normalize, scan, skeleton };
package/dist/index.d.ts CHANGED
@@ -1,3 +1,20 @@
1
+ type LlmConfusableSource = "tr39" | "novel";
2
+ type LlmConfusableMapEntry = {
3
+ latin: string;
4
+ ssimScore: number;
5
+ source: LlmConfusableSource;
6
+ script: string;
7
+ codepoint: string;
8
+ };
9
+ type LlmConfusableMap = Readonly<Record<string, readonly LlmConfusableMapEntry[]>>;
10
+ declare const LLM_CONFUSABLE_MAP: LlmConfusableMap;
11
+ declare const LLM_CONFUSABLE_MAP_PAIR_COUNT = 2218;
12
+ declare const LLM_CONFUSABLE_MAP_CHAR_COUNT = 1947;
13
+ declare const LLM_CONFUSABLE_MAP_SOURCE_COUNTS: Readonly<{
14
+ tr39: 1425;
15
+ novel: 793;
16
+ }>;
17
+
1
18
  /** A database table or model to check for slug/handle collisions. */
2
19
  type NamespaceSource = {
3
20
  /** Table/model name (must match the adapter's lookup key) */
@@ -136,6 +153,75 @@ type InvisibleCharacterValidatorOptions = {
136
153
  /** Reject combining marks (Unicode category `M*`) often used for visual obfuscation (default: `false`). */
137
154
  rejectCombiningMarks?: boolean;
138
155
  };
156
+ /** Options for `canonicalise()` LLM preprocessing. */
157
+ type CanonicaliseOptions = {
158
+ /** Minimum SSIM score required for replacement (default: `0.7`). */
159
+ threshold?: number;
160
+ /** Include confusable-vision novel discoveries in addition to TR39 mappings (default: `true`). */
161
+ includeNovel?: boolean;
162
+ /** Restrict replacement to specific source scripts (case-insensitive, e.g. `["Cyrillic", "Greek"]`). */
163
+ scripts?: string[];
164
+ /**
165
+ * Canonicalisation strategy (default: `"mixed"`).
166
+ *
167
+ * - `"mixed"` -- only replace confusable characters inside tokens that already
168
+ * contain Latin letters. Standalone non-Latin words (e.g. "Москва") are
169
+ * preserved. Safe for multilingual text.
170
+ *
171
+ * - `"all"` -- replace every confusable character regardless of surrounding
172
+ * context. Use this when the document is known to be Latin-script (e.g.
173
+ * an English contract) and you want to catch attackers who substitute
174
+ * every character in a word.
175
+ */
176
+ strategy?: "mixed" | "all";
177
+ };
178
+ /** Options for `scan()` and `isClean()`. */
179
+ type ScanOptions = CanonicaliseOptions & {
180
+ /** Optional list of high-value terms used to raise `riskLevel` when targeted (default: built-in legal/financial list). */
181
+ riskTerms?: string[];
182
+ };
183
+ /** Single confusable finding returned by `scan()`. */
184
+ type ScanFinding = {
185
+ /** The confusable character found in the input. */
186
+ char: string;
187
+ /** Codepoint label in `U+XXXX` format. */
188
+ codepoint: string;
189
+ /** Script name of the source character. */
190
+ script: string;
191
+ /** Canonical Latin equivalent selected by the lookup table. */
192
+ latinEquivalent: string;
193
+ /** SSIM score used for this mapping. */
194
+ ssimScore: number;
195
+ /** Mapping source (`tr39` baseline or `novel` discovery). */
196
+ source: "tr39" | "novel";
197
+ /** UTF-16 code-unit offset in the input string. */
198
+ index: number;
199
+ /** Token/word containing this character. */
200
+ word: string;
201
+ /** Whether the token mixes Latin and non-Latin letters. */
202
+ mixedScript: boolean;
203
+ };
204
+ /** Structured confusable scan result for LLM preprocessing pipelines. */
205
+ type ScanResult = {
206
+ /** Whether any confusable mapping candidates were detected. */
207
+ hasConfusables: boolean;
208
+ /** Number of findings in `findings`. */
209
+ count: number;
210
+ /** Detailed findings with script/source/position metadata. */
211
+ findings: ScanFinding[];
212
+ /** Aggregate scan summary for policy and logging. */
213
+ summary: {
214
+ /** Number of distinct confusable characters found. */
215
+ distinctChars: number;
216
+ /** Number of distinct words/tokens affected. */
217
+ wordsAffected: number;
218
+ /** Distinct scripts detected among findings. */
219
+ scriptsDetected: string[];
220
+ /** Heuristic risk level from confusable density + targeting. */
221
+ riskLevel: "none" | "low" | "medium" | "high";
222
+ };
223
+ };
224
+
139
225
  /** Options for the `skeleton()` and `areConfusable()` functions. */
140
226
  type SkeletonOptions = {
141
227
  /** Confusable character map to use.
@@ -504,6 +590,32 @@ declare function createHomoglyphValidator(options?: {
504
590
  * @returns An async validator function for use in `config.validators`
505
591
  */
506
592
  declare function createInvisibleCharacterValidator(options?: InvisibleCharacterValidatorOptions): NamespaceValidator;
593
+ /**
594
+ * Canonicalise confusable characters in text for LLM preprocessing.
595
+ *
596
+ * With the default `strategy: "mixed"`, only rewrites characters inside tokens
597
+ * that already contain Latin letters. Standalone non-Latin words are preserved
598
+ * to reduce false positives in multilingual text.
599
+ *
600
+ * With `strategy: "all"`, rewrites every confusable character regardless of
601
+ * context. Use this when the document is known to be Latin-script.
602
+ */
603
+ declare function canonicalise(text: string, options?: CanonicaliseOptions): string;
604
+ /**
605
+ * Scan text for confusable characters and return structured findings + risk summary.
606
+ */
607
+ declare function scan(text: string, options?: ScanOptions): ScanResult;
608
+ /**
609
+ * Fast gate for LLM pipelines.
610
+ *
611
+ * With the default `strategy: "mixed"`, returns `false` as soon as a
612
+ * mixed-script confusable substitution is found. Standalone non-Latin
613
+ * words do not fail this gate.
614
+ *
615
+ * With `strategy: "all"`, returns `false` if any confusable character is
616
+ * found, regardless of surrounding context.
617
+ */
618
+ declare function isClean(text: string, options?: ScanOptions): boolean;
507
619
  /**
508
620
  * Compute the TR39 Section 4 skeleton of a string for confusable comparison.
509
621
  *
@@ -634,4 +746,4 @@ declare function createNamespaceGuard(config: NamespaceConfig, adapter: Namespac
634
746
  /** The guard instance returned by `createNamespaceGuard`. */
635
747
  type NamespaceGuard = ReturnType<typeof createNamespaceGuard>;
636
748
 
637
- export { type AssertClaimableOptions, COMPOSABILITY_VECTORS, COMPOSABILITY_VECTORS_COUNT, COMPOSABILITY_VECTOR_SUITE, CONFUSABLE_MAP, CONFUSABLE_MAP_FULL, type CheckManyOptions, type CheckResult, type CheckRiskOptions, type ClaimOptions, type ClaimResult, type ComposabilityVector, type ConfusableDistanceOptions, type ConfusableDistanceResult, type ConfusableDistanceStep, type ConfusableWeight, type ConfusableWeights, DEFAULT_PROTECTED_TOKENS, type EnforceRiskOptions, type EnforceRiskResult, type FindOneOptions, type InvisibleCharacterValidatorOptions, NAMESPACE_PROFILES, NFKC_TR39_DIVERGENCE_VECTORS, type NamespaceAdapter, type NamespaceConfig, type NamespaceGuard, type NamespaceProfileName, type NamespaceProfilePreset, type NamespaceSource, type NamespaceValidator, type NamespaceValidatorResult, type NfkcTr39DivergenceVector, type OwnershipScope, type PredicateValidatorOptions, type ProfanityValidationMode, type ProfanityValidatorOptions, type ProfanityVariantProfile, type RiskAction, type RiskCheckResult, type RiskLevel, type RiskMatch, type RiskReason, type RiskReasonCode, type SkeletonOptions, type SuggestStrategyName, type UniqueViolationDetector, areConfusable, confusableDistance, createHomoglyphValidator, createInvisibleCharacterValidator, createNamespaceGuard, createNamespaceGuardWithProfile, createPredicateValidator, createProfanityValidator, deriveNfkcTr39DivergenceVectors, isLikelyUniqueViolationError, normalize, skeleton };
749
+ export { type AssertClaimableOptions, COMPOSABILITY_VECTORS, COMPOSABILITY_VECTORS_COUNT, COMPOSABILITY_VECTOR_SUITE, CONFUSABLE_MAP, CONFUSABLE_MAP_FULL, type CanonicaliseOptions, type CheckManyOptions, type CheckResult, type CheckRiskOptions, type ClaimOptions, type ClaimResult, type ComposabilityVector, type ConfusableDistanceOptions, type ConfusableDistanceResult, type ConfusableDistanceStep, type ConfusableWeight, type ConfusableWeights, DEFAULT_PROTECTED_TOKENS, type EnforceRiskOptions, type EnforceRiskResult, type FindOneOptions, type InvisibleCharacterValidatorOptions, LLM_CONFUSABLE_MAP, LLM_CONFUSABLE_MAP_CHAR_COUNT, LLM_CONFUSABLE_MAP_PAIR_COUNT, LLM_CONFUSABLE_MAP_SOURCE_COUNTS, NAMESPACE_PROFILES, NFKC_TR39_DIVERGENCE_VECTORS, type NamespaceAdapter, type NamespaceConfig, type NamespaceGuard, type NamespaceProfileName, type NamespaceProfilePreset, type NamespaceSource, type NamespaceValidator, type NamespaceValidatorResult, type NfkcTr39DivergenceVector, type OwnershipScope, type PredicateValidatorOptions, type ProfanityValidationMode, type ProfanityValidatorOptions, type ProfanityVariantProfile, type RiskAction, type RiskCheckResult, type RiskLevel, type RiskMatch, type RiskReason, type RiskReasonCode, type ScanFinding, type ScanOptions, type ScanResult, type SkeletonOptions, type SuggestStrategyName, type UniqueViolationDetector, areConfusable, canonicalise, confusableDistance, createHomoglyphValidator, createInvisibleCharacterValidator, createNamespaceGuard, createNamespaceGuardWithProfile, createPredicateValidator, createProfanityValidator, deriveNfkcTr39DivergenceVectors, isClean, isLikelyUniqueViolationError, normalize, scan, skeleton };