npm - namespace-guard - Versions diffs - 0.16.1 → 0.17.1 - Mend

namespace-guard 0.16.1 → 0.17.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/README.md +47 -0
package/dist/cli.js +18 -0
package/dist/cli.mjs +18 -0
package/dist/composability-vectors.js +18 -0
package/dist/composability-vectors.mjs +18 -0
package/dist/index.d.mts +129 -1
package/dist/index.d.ts +129 -1
package/dist/index.js +6340 -0
package/dist/index.mjs +6331 -0
package/dist/profanity-en.js +18 -0
package/dist/profanity-en.mjs +18 -0
package/package.json +2 -1

package/README.md CHANGED Viewed

@@ -83,6 +83,52 @@ Details:
 - Suggestion strategies for taken names
 - CLI for red-team generation, calibration, drift, and CI gates
+## LLM Pipeline Preprocessing
+LLM tokenizers process Unicode codepoints, not rendered glyphs. Confusable substitutions can inflate token counts and hide important terms in mixed-script text, especially on smaller models.
+Use namespace-guard as a deterministic preprocess layer before model calls:
+```text
+Document ingestion
+       |
+       v
++----------------+
+| namespace-     |  <-- Detect mixed-script confusable substitution
+| guard          |  <-- Canonicalise to Latin equivalents
+| (microseconds) |  <-- Flag suspicious patterns for review
++----------------+
+       |
+       v
++----------------+
+| LLM API        |  <-- Any model/provider
+| (GPT/Claude/   |  <-- Receives canonicalised text
+| Llama/etc)     |
++----------------+
+       |
+       v
+   Analysis output
+```
+```typescript
+import { canonicalise, scan, isClean } from "namespace-guard";
+const raw = "The seller аssumes аll liаbility.";
+const report = scan(raw);        // detailed findings + risk level
+const clean = canonicalise(raw); // "The seller assumes all liability."
+const ok = isClean(raw);         // false (mixed-script confusable detected)
+// For known-Latin documents (e.g. English contracts), use strategy: "all"
+// to also catch words where every character was substituted:
+canonicalise("поп-refundable", { strategy: "all" }); // "non-refundable"
+```
+Research context:
+- Launch: https://paultendo.github.io/posts/namespace-guard-launch/
+- NFKC/TR39 composability: https://paultendo.github.io/posts/unicode-confusables-nfkc-conflict/
+- Confusable detection without NFKC: https://paultendo.github.io/posts/confusable-detection-without-nfkc/
 ## Built-in Profiles
 Use `createNamespaceGuardWithProfile(profile, overrides, adapter)`:
@@ -187,6 +233,7 @@ Migration guides per adapter: [docs/reference.md#canonical-uniqueness-migration-
 - Validators (profanity, homoglyph, invisible): [docs/reference.md#async-validators](docs/reference.md#async-validators)
 - Canonical preflight audit (`audit-canonical`): [docs/reference.md#audit-canonical-command](docs/reference.md#audit-canonical-command)
 - Anti-spoofing pipeline and composability vectors: [docs/reference.md#how-the-anti-spoofing-pipeline-works](docs/reference.md#how-the-anti-spoofing-pipeline-works)
+- LLM preprocessing (`canonicalise`, `scan`, `isClean`): [docs/reference.md#llm-pipeline-preprocessing](docs/reference.md#llm-pipeline-preprocessing)
 - Benchmark corpus (`confusable-bench.v1`): [docs/reference.md#confusable-benchmark-corpus-artifact](docs/reference.md#confusable-benchmark-corpus-artifact)
 - Advanced primitives (`skeleton`, `areConfusable`, `confusableDistance`): [docs/reference.md#advanced-security-primitives](docs/reference.md#advanced-security-primitives)
 - Confusable weights (SSIM-scored pairs): [docs/reference.md#confusable-weights-subpath](docs/reference.md#confusable-weights-subpath)

package/dist/cli.js CHANGED Viewed

@@ -27,6 +27,9 @@ module.exports = __toCommonJS(cli_exports);
 var import_fs = require("fs");
 var import_path = require("path");
+// src/llm-confusable-map.ts
+var LLM_CONFUSABLE_MAP_SOURCE_COUNTS = Object.freeze({ tr39: 1425, novel: 793 });
 // src/index.ts
 function asRecord(value) {
   if (!value || typeof value !== "object") return null;
@@ -2828,6 +2831,21 @@ var SCRIPT_DETECTORS = [
   ["hiragana", /\p{Script=Hiragana}/u],
   ["katakana", /\p{Script=Katakana}/u]
 ];
+var DEFAULT_SCAN_RISK_TERMS = Object.freeze([
+  "liability",
+  "indemnity",
+  "penalty",
+  "damages",
+  "termination",
+  "breach",
+  "warranty",
+  "payment",
+  "invoice",
+  "governing",
+  "jurisdiction",
+  "arbitration",
+  "confidentiality"
+]);
 function clamp(value, min, max) {
   return Math.max(min, Math.min(max, value));
 }

package/dist/cli.mjs CHANGED Viewed

@@ -4,6 +4,9 @@
 import { readFileSync, existsSync } from "fs";
 import { resolve } from "path";
+// src/llm-confusable-map.ts
+var LLM_CONFUSABLE_MAP_SOURCE_COUNTS = Object.freeze({ tr39: 1425, novel: 793 });
 // src/index.ts
 function asRecord(value) {
   if (!value || typeof value !== "object") return null;
@@ -2805,6 +2808,21 @@ var SCRIPT_DETECTORS = [
   ["hiragana", /\p{Script=Hiragana}/u],
   ["katakana", /\p{Script=Katakana}/u]
 ];
+var DEFAULT_SCAN_RISK_TERMS = Object.freeze([
+  "liability",
+  "indemnity",
+  "penalty",
+  "damages",
+  "termination",
+  "breach",
+  "warranty",
+  "payment",
+  "invoice",
+  "governing",
+  "jurisdiction",
+  "arbitration",
+  "confidentiality"
+]);
 function clamp(value, min, max) {
   return Math.max(min, Math.min(max, value));
 }

package/dist/composability-vectors.js CHANGED Viewed

@@ -26,6 +26,9 @@ __export(composability_vectors_exports, {
 });
 module.exports = __toCommonJS(composability_vectors_exports);
+// src/llm-confusable-map.ts
+var LLM_CONFUSABLE_MAP_SOURCE_COUNTS = Object.freeze({ tr39: 1425, novel: 793 });
 // src/index.ts
 var PROFANITY_SUBSTITUTE_MAP_BALANCED = {
   "0": ["o"],
@@ -1845,6 +1848,21 @@ var COMPOSABILITY_VECTORS = NFKC_TR39_DIVERGENCE_VECTORS;
 var COMPOSABILITY_VECTORS_COUNT = COMPOSABILITY_VECTORS.length;
 var DEFAULT_IGNORABLE_RE = /[\u00AD\u034F\u061C\u115F\u1160\u17B4\u17B5\u180B-\u180F\u200B-\u200F\u202A-\u202E\u2060-\u206F\uFE00-\uFE0F\uFEFF\uFFA0\uFFF0-\uFFF8\u{1BCA0}-\u{1BCA3}\u{1D173}-\u{1D17A}\u{E0000}-\u{E0FFF}]/gu;
 var DEFAULT_IGNORABLE_SINGLE_RE = new RegExp(DEFAULT_IGNORABLE_RE.source, "u");
+var DEFAULT_SCAN_RISK_TERMS = Object.freeze([
+  "liability",
+  "indemnity",
+  "penalty",
+  "damages",
+  "termination",
+  "breach",
+  "warranty",
+  "payment",
+  "invoice",
+  "governing",
+  "jurisdiction",
+  "arbitration",
+  "confidentiality"
+]);
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {
   COMPOSABILITY_VECTORS,

package/dist/composability-vectors.mjs CHANGED Viewed

@@ -1,3 +1,6 @@
+// src/llm-confusable-map.ts
+var LLM_CONFUSABLE_MAP_SOURCE_COUNTS = Object.freeze({ tr39: 1425, novel: 793 });
 // src/index.ts
 var PROFANITY_SUBSTITUTE_MAP_BALANCED = {
   "0": ["o"],
@@ -1817,6 +1820,21 @@ var COMPOSABILITY_VECTORS = NFKC_TR39_DIVERGENCE_VECTORS;
 var COMPOSABILITY_VECTORS_COUNT = COMPOSABILITY_VECTORS.length;
 var DEFAULT_IGNORABLE_RE = /[\u00AD\u034F\u061C\u115F\u1160\u17B4\u17B5\u180B-\u180F\u200B-\u200F\u202A-\u202E\u2060-\u206F\uFE00-\uFE0F\uFEFF\uFFA0\uFFF0-\uFFF8\u{1BCA0}-\u{1BCA3}\u{1D173}-\u{1D17A}\u{E0000}-\u{E0FFF}]/gu;
 var DEFAULT_IGNORABLE_SINGLE_RE = new RegExp(DEFAULT_IGNORABLE_RE.source, "u");
+var DEFAULT_SCAN_RISK_TERMS = Object.freeze([
+  "liability",
+  "indemnity",
+  "penalty",
+  "damages",
+  "termination",
+  "breach",
+  "warranty",
+  "payment",
+  "invoice",
+  "governing",
+  "jurisdiction",
+  "arbitration",
+  "confidentiality"
+]);
 export {
   COMPOSABILITY_VECTORS,
   COMPOSABILITY_VECTORS_COUNT,

package/dist/index.d.mts CHANGED Viewed

@@ -1,3 +1,24 @@
+type LlmConfusableSource = "tr39" | "novel";
+type LlmConfusableMapEntry = {
+    latin: string;
+    ssimScore: number;
+    source: LlmConfusableSource;
+    script: string;
+    codepoint: string;
+    /** Width ratio between source and target at natural rendering size. Null if not measured. */
+    widthRatio?: number | null;
+    /** Height ratio between source and target at natural rendering size. Null if not measured. */
+    heightRatio?: number | null;
+};
+type LlmConfusableMap = Readonly<Record<string, readonly LlmConfusableMapEntry[]>>;
+declare const LLM_CONFUSABLE_MAP: LlmConfusableMap;
+declare const LLM_CONFUSABLE_MAP_PAIR_COUNT = 2218;
+declare const LLM_CONFUSABLE_MAP_CHAR_COUNT = 1947;
+declare const LLM_CONFUSABLE_MAP_SOURCE_COUNTS: Readonly<{
+    tr39: 1425;
+    novel: 793;
+}>;
 /** A database table or model to check for slug/handle collisions. */
 type NamespaceSource = {
     /** Table/model name (must match the adapter's lookup key) */
@@ -136,6 +157,87 @@ type InvisibleCharacterValidatorOptions = {
     /** Reject combining marks (Unicode category `M*`) often used for visual obfuscation (default: `false`). */
     rejectCombiningMarks?: boolean;
 };
+/** Options for `canonicalise()` LLM preprocessing. */
+type CanonicaliseOptions = {
+    /** Minimum SSIM score required for replacement (default: `0.7`). */
+    threshold?: number;
+    /** Include confusable-vision novel discoveries in addition to TR39 mappings (default: `true`). */
+    includeNovel?: boolean;
+    /** Restrict replacement to specific source scripts (case-insensitive, e.g. `["Cyrillic", "Greek"]`). */
+    scripts?: string[];
+    /**
+     * Canonicalisation strategy (default: `"mixed"`).
+     *
+     * - `"mixed"` -- only replace confusable characters inside tokens that already
+     *   contain Latin letters. Standalone non-Latin words (e.g. "Москва") are
+     *   preserved.  Safe for multilingual text.
+     *
+     * - `"all"` -- replace every confusable character regardless of surrounding
+     *   context.  Use this when the document is known to be Latin-script (e.g.
+     *   an English contract) and you want to catch attackers who substitute
+     *   every character in a word.
+     */
+    strategy?: "mixed" | "all";
+    /**
+     * Maximum allowed width or height ratio between source and target at natural
+     * rendering size (default: `3.0`). Pairs where the source character is more
+     * than this many times wider or taller than the Latin target are skipped,
+     * because the size difference would be visible in running text even if the
+     * shapes match after normalisation.
+     *
+     * Set to `Infinity` to disable size-ratio filtering. Set to `2.0` for
+     * stricter filtering. Only applies to novel (non-TR39) pairs that have
+     * measured size ratios.
+     */
+    maxSizeRatio?: number;
+};
+/** Options for `scan()` and `isClean()`. */
+type ScanOptions = CanonicaliseOptions & {
+    /** Optional list of high-value terms used to raise `riskLevel` when targeted (default: built-in legal/financial list). */
+    riskTerms?: string[];
+};
+/** Single confusable finding returned by `scan()`. */
+type ScanFinding = {
+    /** The confusable character found in the input. */
+    char: string;
+    /** Codepoint label in `U+XXXX` format. */
+    codepoint: string;
+    /** Script name of the source character. */
+    script: string;
+    /** Canonical Latin equivalent selected by the lookup table. */
+    latinEquivalent: string;
+    /** SSIM score used for this mapping. */
+    ssimScore: number;
+    /** Mapping source (`tr39` baseline or `novel` discovery). */
+    source: "tr39" | "novel";
+    /** UTF-16 code-unit offset in the input string. */
+    index: number;
+    /** Token/word containing this character. */
+    word: string;
+    /** Whether the token mixes Latin and non-Latin letters. */
+    mixedScript: boolean;
+};
+/** Structured confusable scan result for LLM preprocessing pipelines. */
+type ScanResult = {
+    /** Whether any confusable mapping candidates were detected. */
+    hasConfusables: boolean;
+    /** Number of findings in `findings`. */
+    count: number;
+    /** Detailed findings with script/source/position metadata. */
+    findings: ScanFinding[];
+    /** Aggregate scan summary for policy and logging. */
+    summary: {
+        /** Number of distinct confusable characters found. */
+        distinctChars: number;
+        /** Number of distinct words/tokens affected. */
+        wordsAffected: number;
+        /** Distinct scripts detected among findings. */
+        scriptsDetected: string[];
+        /** Heuristic risk level from confusable density + targeting. */
+        riskLevel: "none" | "low" | "medium" | "high";
+    };
+};
 /** Options for the `skeleton()` and `areConfusable()` functions. */
 type SkeletonOptions = {
     /** Confusable character map to use.
@@ -504,6 +606,32 @@ declare function createHomoglyphValidator(options?: {
  * @returns An async validator function for use in `config.validators`
  */
 declare function createInvisibleCharacterValidator(options?: InvisibleCharacterValidatorOptions): NamespaceValidator;
+/**
+ * Canonicalise confusable characters in text for LLM preprocessing.
+ *
+ * With the default `strategy: "mixed"`, only rewrites characters inside tokens
+ * that already contain Latin letters.  Standalone non-Latin words are preserved
+ * to reduce false positives in multilingual text.
+ *
+ * With `strategy: "all"`, rewrites every confusable character regardless of
+ * context.  Use this when the document is known to be Latin-script.
+ */
+declare function canonicalise(text: string, options?: CanonicaliseOptions): string;
+/**
+ * Scan text for confusable characters and return structured findings + risk summary.
+ */
+declare function scan(text: string, options?: ScanOptions): ScanResult;
+/**
+ * Fast gate for LLM pipelines.
+ *
+ * With the default `strategy: "mixed"`, returns `false` as soon as a
+ * mixed-script confusable substitution is found.  Standalone non-Latin
+ * words do not fail this gate.
+ *
+ * With `strategy: "all"`, returns `false` if any confusable character is
+ * found, regardless of surrounding context.
+ */
+declare function isClean(text: string, options?: ScanOptions): boolean;
 /**
  * Compute the TR39 Section 4 skeleton of a string for confusable comparison.
  *
@@ -634,4 +762,4 @@ declare function createNamespaceGuard(config: NamespaceConfig, adapter: Namespac
 /** The guard instance returned by `createNamespaceGuard`. */
 type NamespaceGuard = ReturnType<typeof createNamespaceGuard>;
-export { type AssertClaimableOptions, COMPOSABILITY_VECTORS, COMPOSABILITY_VECTORS_COUNT, COMPOSABILITY_VECTOR_SUITE, CONFUSABLE_MAP, CONFUSABLE_MAP_FULL, type CheckManyOptions, type CheckResult, type CheckRiskOptions, type ClaimOptions, type ClaimResult, type ComposabilityVector, type ConfusableDistanceOptions, type ConfusableDistanceResult, type ConfusableDistanceStep, type ConfusableWeight, type ConfusableWeights, DEFAULT_PROTECTED_TOKENS, type EnforceRiskOptions, type EnforceRiskResult, type FindOneOptions, type InvisibleCharacterValidatorOptions, NAMESPACE_PROFILES, NFKC_TR39_DIVERGENCE_VECTORS, type NamespaceAdapter, type NamespaceConfig, type NamespaceGuard, type NamespaceProfileName, type NamespaceProfilePreset, type NamespaceSource, type NamespaceValidator, type NamespaceValidatorResult, type NfkcTr39DivergenceVector, type OwnershipScope, type PredicateValidatorOptions, type ProfanityValidationMode, type ProfanityValidatorOptions, type ProfanityVariantProfile, type RiskAction, type RiskCheckResult, type RiskLevel, type RiskMatch, type RiskReason, type RiskReasonCode, type SkeletonOptions, type SuggestStrategyName, type UniqueViolationDetector, areConfusable, confusableDistance, createHomoglyphValidator, createInvisibleCharacterValidator, createNamespaceGuard, createNamespaceGuardWithProfile, createPredicateValidator, createProfanityValidator, deriveNfkcTr39DivergenceVectors, isLikelyUniqueViolationError, normalize, skeleton };
+export { type AssertClaimableOptions, COMPOSABILITY_VECTORS, COMPOSABILITY_VECTORS_COUNT, COMPOSABILITY_VECTOR_SUITE, CONFUSABLE_MAP, CONFUSABLE_MAP_FULL, type CanonicaliseOptions, type CheckManyOptions, type CheckResult, type CheckRiskOptions, type ClaimOptions, type ClaimResult, type ComposabilityVector, type ConfusableDistanceOptions, type ConfusableDistanceResult, type ConfusableDistanceStep, type ConfusableWeight, type ConfusableWeights, DEFAULT_PROTECTED_TOKENS, type EnforceRiskOptions, type EnforceRiskResult, type FindOneOptions, type InvisibleCharacterValidatorOptions, LLM_CONFUSABLE_MAP, LLM_CONFUSABLE_MAP_CHAR_COUNT, LLM_CONFUSABLE_MAP_PAIR_COUNT, LLM_CONFUSABLE_MAP_SOURCE_COUNTS, NAMESPACE_PROFILES, NFKC_TR39_DIVERGENCE_VECTORS, type NamespaceAdapter, type NamespaceConfig, type NamespaceGuard, type NamespaceProfileName, type NamespaceProfilePreset, type NamespaceSource, type NamespaceValidator, type NamespaceValidatorResult, type NfkcTr39DivergenceVector, type OwnershipScope, type PredicateValidatorOptions, type ProfanityValidationMode, type ProfanityValidatorOptions, type ProfanityVariantProfile, type RiskAction, type RiskCheckResult, type RiskLevel, type RiskMatch, type RiskReason, type RiskReasonCode, type ScanFinding, type ScanOptions, type ScanResult, type SkeletonOptions, type SuggestStrategyName, type UniqueViolationDetector, areConfusable, canonicalise, confusableDistance, createHomoglyphValidator, createInvisibleCharacterValidator, createNamespaceGuard, createNamespaceGuardWithProfile, createPredicateValidator, createProfanityValidator, deriveNfkcTr39DivergenceVectors, isClean, isLikelyUniqueViolationError, normalize, scan, skeleton };

package/dist/index.d.ts CHANGED Viewed

@@ -1,3 +1,24 @@
+type LlmConfusableSource = "tr39" | "novel";
+type LlmConfusableMapEntry = {
+    latin: string;
+    ssimScore: number;
+    source: LlmConfusableSource;
+    script: string;
+    codepoint: string;
+    /** Width ratio between source and target at natural rendering size. Null if not measured. */
+    widthRatio?: number | null;
+    /** Height ratio between source and target at natural rendering size. Null if not measured. */
+    heightRatio?: number | null;
+};
+type LlmConfusableMap = Readonly<Record<string, readonly LlmConfusableMapEntry[]>>;
+declare const LLM_CONFUSABLE_MAP: LlmConfusableMap;
+declare const LLM_CONFUSABLE_MAP_PAIR_COUNT = 2218;
+declare const LLM_CONFUSABLE_MAP_CHAR_COUNT = 1947;
+declare const LLM_CONFUSABLE_MAP_SOURCE_COUNTS: Readonly<{
+    tr39: 1425;
+    novel: 793;
+}>;
 /** A database table or model to check for slug/handle collisions. */
 type NamespaceSource = {
     /** Table/model name (must match the adapter's lookup key) */
@@ -136,6 +157,87 @@ type InvisibleCharacterValidatorOptions = {
     /** Reject combining marks (Unicode category `M*`) often used for visual obfuscation (default: `false`). */
     rejectCombiningMarks?: boolean;
 };
+/** Options for `canonicalise()` LLM preprocessing. */
+type CanonicaliseOptions = {
+    /** Minimum SSIM score required for replacement (default: `0.7`). */
+    threshold?: number;
+    /** Include confusable-vision novel discoveries in addition to TR39 mappings (default: `true`). */
+    includeNovel?: boolean;
+    /** Restrict replacement to specific source scripts (case-insensitive, e.g. `["Cyrillic", "Greek"]`). */
+    scripts?: string[];
+    /**
+     * Canonicalisation strategy (default: `"mixed"`).
+     *
+     * - `"mixed"` -- only replace confusable characters inside tokens that already
+     *   contain Latin letters. Standalone non-Latin words (e.g. "Москва") are
+     *   preserved.  Safe for multilingual text.
+     *
+     * - `"all"` -- replace every confusable character regardless of surrounding
+     *   context.  Use this when the document is known to be Latin-script (e.g.
+     *   an English contract) and you want to catch attackers who substitute
+     *   every character in a word.
+     */
+    strategy?: "mixed" | "all";
+    /**
+     * Maximum allowed width or height ratio between source and target at natural
+     * rendering size (default: `3.0`). Pairs where the source character is more
+     * than this many times wider or taller than the Latin target are skipped,
+     * because the size difference would be visible in running text even if the
+     * shapes match after normalisation.
+     *
+     * Set to `Infinity` to disable size-ratio filtering. Set to `2.0` for
+     * stricter filtering. Only applies to novel (non-TR39) pairs that have
+     * measured size ratios.
+     */
+    maxSizeRatio?: number;
+};
+/** Options for `scan()` and `isClean()`. */
+type ScanOptions = CanonicaliseOptions & {
+    /** Optional list of high-value terms used to raise `riskLevel` when targeted (default: built-in legal/financial list). */
+    riskTerms?: string[];
+};
+/** Single confusable finding returned by `scan()`. */
+type ScanFinding = {
+    /** The confusable character found in the input. */
+    char: string;
+    /** Codepoint label in `U+XXXX` format. */
+    codepoint: string;
+    /** Script name of the source character. */
+    script: string;
+    /** Canonical Latin equivalent selected by the lookup table. */
+    latinEquivalent: string;
+    /** SSIM score used for this mapping. */
+    ssimScore: number;
+    /** Mapping source (`tr39` baseline or `novel` discovery). */
+    source: "tr39" | "novel";
+    /** UTF-16 code-unit offset in the input string. */
+    index: number;
+    /** Token/word containing this character. */
+    word: string;
+    /** Whether the token mixes Latin and non-Latin letters. */
+    mixedScript: boolean;
+};
+/** Structured confusable scan result for LLM preprocessing pipelines. */
+type ScanResult = {
+    /** Whether any confusable mapping candidates were detected. */
+    hasConfusables: boolean;
+    /** Number of findings in `findings`. */
+    count: number;
+    /** Detailed findings with script/source/position metadata. */
+    findings: ScanFinding[];
+    /** Aggregate scan summary for policy and logging. */
+    summary: {
+        /** Number of distinct confusable characters found. */
+        distinctChars: number;
+        /** Number of distinct words/tokens affected. */
+        wordsAffected: number;
+        /** Distinct scripts detected among findings. */
+        scriptsDetected: string[];
+        /** Heuristic risk level from confusable density + targeting. */
+        riskLevel: "none" | "low" | "medium" | "high";
+    };
+};
 /** Options for the `skeleton()` and `areConfusable()` functions. */
 type SkeletonOptions = {
     /** Confusable character map to use.
@@ -504,6 +606,32 @@ declare function createHomoglyphValidator(options?: {
  * @returns An async validator function for use in `config.validators`
  */
 declare function createInvisibleCharacterValidator(options?: InvisibleCharacterValidatorOptions): NamespaceValidator;
+/**
+ * Canonicalise confusable characters in text for LLM preprocessing.
+ *
+ * With the default `strategy: "mixed"`, only rewrites characters inside tokens
+ * that already contain Latin letters.  Standalone non-Latin words are preserved
+ * to reduce false positives in multilingual text.
+ *
+ * With `strategy: "all"`, rewrites every confusable character regardless of
+ * context.  Use this when the document is known to be Latin-script.
+ */
+declare function canonicalise(text: string, options?: CanonicaliseOptions): string;
+/**
+ * Scan text for confusable characters and return structured findings + risk summary.
+ */
+declare function scan(text: string, options?: ScanOptions): ScanResult;
+/**
+ * Fast gate for LLM pipelines.
+ *
+ * With the default `strategy: "mixed"`, returns `false` as soon as a
+ * mixed-script confusable substitution is found.  Standalone non-Latin
+ * words do not fail this gate.
+ *
+ * With `strategy: "all"`, returns `false` if any confusable character is
+ * found, regardless of surrounding context.
+ */
+declare function isClean(text: string, options?: ScanOptions): boolean;
 /**
  * Compute the TR39 Section 4 skeleton of a string for confusable comparison.
  *
@@ -634,4 +762,4 @@ declare function createNamespaceGuard(config: NamespaceConfig, adapter: Namespac
 /** The guard instance returned by `createNamespaceGuard`. */
 type NamespaceGuard = ReturnType<typeof createNamespaceGuard>;
-export { type AssertClaimableOptions, COMPOSABILITY_VECTORS, COMPOSABILITY_VECTORS_COUNT, COMPOSABILITY_VECTOR_SUITE, CONFUSABLE_MAP, CONFUSABLE_MAP_FULL, type CheckManyOptions, type CheckResult, type CheckRiskOptions, type ClaimOptions, type ClaimResult, type ComposabilityVector, type ConfusableDistanceOptions, type ConfusableDistanceResult, type ConfusableDistanceStep, type ConfusableWeight, type ConfusableWeights, DEFAULT_PROTECTED_TOKENS, type EnforceRiskOptions, type EnforceRiskResult, type FindOneOptions, type InvisibleCharacterValidatorOptions, NAMESPACE_PROFILES, NFKC_TR39_DIVERGENCE_VECTORS, type NamespaceAdapter, type NamespaceConfig, type NamespaceGuard, type NamespaceProfileName, type NamespaceProfilePreset, type NamespaceSource, type NamespaceValidator, type NamespaceValidatorResult, type NfkcTr39DivergenceVector, type OwnershipScope, type PredicateValidatorOptions, type ProfanityValidationMode, type ProfanityValidatorOptions, type ProfanityVariantProfile, type RiskAction, type RiskCheckResult, type RiskLevel, type RiskMatch, type RiskReason, type RiskReasonCode, type SkeletonOptions, type SuggestStrategyName, type UniqueViolationDetector, areConfusable, confusableDistance, createHomoglyphValidator, createInvisibleCharacterValidator, createNamespaceGuard, createNamespaceGuardWithProfile, createPredicateValidator, createProfanityValidator, deriveNfkcTr39DivergenceVectors, isLikelyUniqueViolationError, normalize, skeleton };
+export { type AssertClaimableOptions, COMPOSABILITY_VECTORS, COMPOSABILITY_VECTORS_COUNT, COMPOSABILITY_VECTOR_SUITE, CONFUSABLE_MAP, CONFUSABLE_MAP_FULL, type CanonicaliseOptions, type CheckManyOptions, type CheckResult, type CheckRiskOptions, type ClaimOptions, type ClaimResult, type ComposabilityVector, type ConfusableDistanceOptions, type ConfusableDistanceResult, type ConfusableDistanceStep, type ConfusableWeight, type ConfusableWeights, DEFAULT_PROTECTED_TOKENS, type EnforceRiskOptions, type EnforceRiskResult, type FindOneOptions, type InvisibleCharacterValidatorOptions, LLM_CONFUSABLE_MAP, LLM_CONFUSABLE_MAP_CHAR_COUNT, LLM_CONFUSABLE_MAP_PAIR_COUNT, LLM_CONFUSABLE_MAP_SOURCE_COUNTS, NAMESPACE_PROFILES, NFKC_TR39_DIVERGENCE_VECTORS, type NamespaceAdapter, type NamespaceConfig, type NamespaceGuard, type NamespaceProfileName, type NamespaceProfilePreset, type NamespaceSource, type NamespaceValidator, type NamespaceValidatorResult, type NfkcTr39DivergenceVector, type OwnershipScope, type PredicateValidatorOptions, type ProfanityValidationMode, type ProfanityValidatorOptions, type ProfanityVariantProfile, type RiskAction, type RiskCheckResult, type RiskLevel, type RiskMatch, type RiskReason, type RiskReasonCode, type ScanFinding, type ScanOptions, type ScanResult, type SkeletonOptions, type SuggestStrategyName, type UniqueViolationDetector, areConfusable, canonicalise, confusableDistance, createHomoglyphValidator, createInvisibleCharacterValidator, createNamespaceGuard, createNamespaceGuardWithProfile, createPredicateValidator, createProfanityValidator, deriveNfkcTr39DivergenceVectors, isClean, isLikelyUniqueViolationError, normalize, scan, skeleton };