namespace-guard 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -0
- package/dist/cli.js +18 -0
- package/dist/cli.mjs +18 -0
- package/dist/composability-vectors.js +18 -0
- package/dist/composability-vectors.mjs +18 -0
- package/dist/index.d.mts +113 -1
- package/dist/index.d.ts +113 -1
- package/dist/index.js +6338 -0
- package/dist/index.mjs +6329 -0
- package/dist/profanity-en.js +18 -0
- package/dist/profanity-en.mjs +18 -0
- package/package.json +3 -1
package/README.md
CHANGED
|
@@ -83,6 +83,52 @@ Details:
|
|
|
83
83
|
- Suggestion strategies for taken names
|
|
84
84
|
- CLI for red-team generation, calibration, drift, and CI gates
|
|
85
85
|
|
|
86
|
+
## LLM Pipeline Preprocessing
|
|
87
|
+
|
|
88
|
+
LLM tokenizers process Unicode codepoints, not rendered glyphs. Confusable substitutions can inflate token counts and hide important terms in mixed-script text, especially on smaller models.
|
|
89
|
+
|
|
90
|
+
Use namespace-guard as a deterministic preprocess layer before model calls:
|
|
91
|
+
|
|
92
|
+
```text
|
|
93
|
+
Document ingestion
|
|
94
|
+
|
|
|
95
|
+
v
|
|
96
|
+
+----------------+
|
|
97
|
+
| namespace- | <-- Detect mixed-script confusable substitution
|
|
98
|
+
| guard | <-- Canonicalise to Latin equivalents
|
|
99
|
+
| (microseconds) | <-- Flag suspicious patterns for review
|
|
100
|
+
+----------------+
|
|
101
|
+
|
|
|
102
|
+
v
|
|
103
|
+
+----------------+
|
|
104
|
+
| LLM API | <-- Any model/provider
|
|
105
|
+
| (GPT/Claude/ | <-- Receives canonicalised text
|
|
106
|
+
| Llama/etc) |
|
|
107
|
+
+----------------+
|
|
108
|
+
|
|
|
109
|
+
v
|
|
110
|
+
Analysis output
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
```typescript
|
|
114
|
+
import { canonicalise, scan, isClean } from "namespace-guard";
|
|
115
|
+
|
|
116
|
+
const raw = "The seller аssumes аll liаbility.";
|
|
117
|
+
|
|
118
|
+
const report = scan(raw); // detailed findings + risk level
|
|
119
|
+
const clean = canonicalise(raw); // "The seller assumes all liability."
|
|
120
|
+
const ok = isClean(raw); // false (mixed-script confusable detected)
|
|
121
|
+
|
|
122
|
+
// For known-Latin documents (e.g. English contracts), use strategy: "all"
|
|
123
|
+
// to also catch words where every character was substituted:
|
|
124
|
+
canonicalise("поп-refundable", { strategy: "all" }); // "non-refundable"
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Research context:
|
|
128
|
+
- Launch: https://paultendo.github.io/posts/namespace-guard-launch/
|
|
129
|
+
- NFKC/TR39 composability: https://paultendo.github.io/posts/unicode-confusables-nfkc-conflict/
|
|
130
|
+
- Confusable detection without NFKC: https://paultendo.github.io/posts/confusable-detection-without-nfkc/
|
|
131
|
+
|
|
86
132
|
## Built-in Profiles
|
|
87
133
|
|
|
88
134
|
Use `createNamespaceGuardWithProfile(profile, overrides, adapter)`:
|
|
@@ -147,6 +193,19 @@ areConfusable("paypal", "pa\u0443pal"); // true
|
|
|
147
193
|
confusableDistance("paypal", "pa\u0443pal"); // graded similarity + chainDepth + explainable steps
|
|
148
194
|
```
|
|
149
195
|
|
|
196
|
+
For measured visual scoring, pass the optional weights from confusable-vision (903 SSIM-scored pairs across 230 fonts). The `context` filter restricts to identifier-valid, domain-valid, or all pairs.
|
|
197
|
+
|
|
198
|
+
```typescript
|
|
199
|
+
import { confusableDistance } from "namespace-guard";
|
|
200
|
+
import { CONFUSABLE_WEIGHTS } from "namespace-guard/confusable-weights";
|
|
201
|
+
|
|
202
|
+
const result = confusableDistance("paypal", "pa\u0443pal", {
|
|
203
|
+
weights: CONFUSABLE_WEIGHTS,
|
|
204
|
+
context: "identifier",
|
|
205
|
+
});
|
|
206
|
+
// result.similarity, result.steps (including "visual-weight" reason for novel pairs)
|
|
207
|
+
```
|
|
208
|
+
|
|
150
209
|
## Adapter Support
|
|
151
210
|
|
|
152
211
|
- Prisma
|
|
@@ -174,8 +233,10 @@ Migration guides per adapter: [docs/reference.md#canonical-uniqueness-migration-
|
|
|
174
233
|
- Validators (profanity, homoglyph, invisible): [docs/reference.md#async-validators](docs/reference.md#async-validators)
|
|
175
234
|
- Canonical preflight audit (`audit-canonical`): [docs/reference.md#audit-canonical-command](docs/reference.md#audit-canonical-command)
|
|
176
235
|
- Anti-spoofing pipeline and composability vectors: [docs/reference.md#how-the-anti-spoofing-pipeline-works](docs/reference.md#how-the-anti-spoofing-pipeline-works)
|
|
236
|
+
- LLM preprocessing (`canonicalise`, `scan`, `isClean`): [docs/reference.md#llm-pipeline-preprocessing](docs/reference.md#llm-pipeline-preprocessing)
|
|
177
237
|
- Benchmark corpus (`confusable-bench.v1`): [docs/reference.md#confusable-benchmark-corpus-artifact](docs/reference.md#confusable-benchmark-corpus-artifact)
|
|
178
238
|
- Advanced primitives (`skeleton`, `areConfusable`, `confusableDistance`): [docs/reference.md#advanced-security-primitives](docs/reference.md#advanced-security-primitives)
|
|
239
|
+
- Confusable weights (SSIM-scored pairs): [docs/reference.md#confusable-weights-subpath](docs/reference.md#confusable-weights-subpath)
|
|
179
240
|
- CLI reference: [docs/reference.md#cli](docs/reference.md#cli)
|
|
180
241
|
- API reference: [docs/reference.md#api-reference](docs/reference.md#api-reference)
|
|
181
242
|
- Framework integration (Next.js/Express/tRPC): [docs/reference.md#framework-integration](docs/reference.md#framework-integration)
|
package/dist/cli.js
CHANGED
|
@@ -27,6 +27,9 @@ module.exports = __toCommonJS(cli_exports);
|
|
|
27
27
|
var import_fs = require("fs");
|
|
28
28
|
var import_path = require("path");
|
|
29
29
|
|
|
30
|
+
// src/llm-confusable-map.ts
|
|
31
|
+
var LLM_CONFUSABLE_MAP_SOURCE_COUNTS = Object.freeze({ tr39: 1425, novel: 793 });
|
|
32
|
+
|
|
30
33
|
// src/index.ts
|
|
31
34
|
function asRecord(value) {
|
|
32
35
|
if (!value || typeof value !== "object") return null;
|
|
@@ -2828,6 +2831,21 @@ var SCRIPT_DETECTORS = [
|
|
|
2828
2831
|
["hiragana", /\p{Script=Hiragana}/u],
|
|
2829
2832
|
["katakana", /\p{Script=Katakana}/u]
|
|
2830
2833
|
];
|
|
2834
|
+
var DEFAULT_SCAN_RISK_TERMS = Object.freeze([
|
|
2835
|
+
"liability",
|
|
2836
|
+
"indemnity",
|
|
2837
|
+
"penalty",
|
|
2838
|
+
"damages",
|
|
2839
|
+
"termination",
|
|
2840
|
+
"breach",
|
|
2841
|
+
"warranty",
|
|
2842
|
+
"payment",
|
|
2843
|
+
"invoice",
|
|
2844
|
+
"governing",
|
|
2845
|
+
"jurisdiction",
|
|
2846
|
+
"arbitration",
|
|
2847
|
+
"confidentiality"
|
|
2848
|
+
]);
|
|
2831
2849
|
function clamp(value, min, max) {
|
|
2832
2850
|
return Math.max(min, Math.min(max, value));
|
|
2833
2851
|
}
|
package/dist/cli.mjs
CHANGED
|
@@ -4,6 +4,9 @@
|
|
|
4
4
|
import { readFileSync, existsSync } from "fs";
|
|
5
5
|
import { resolve } from "path";
|
|
6
6
|
|
|
7
|
+
// src/llm-confusable-map.ts
|
|
8
|
+
var LLM_CONFUSABLE_MAP_SOURCE_COUNTS = Object.freeze({ tr39: 1425, novel: 793 });
|
|
9
|
+
|
|
7
10
|
// src/index.ts
|
|
8
11
|
function asRecord(value) {
|
|
9
12
|
if (!value || typeof value !== "object") return null;
|
|
@@ -2805,6 +2808,21 @@ var SCRIPT_DETECTORS = [
|
|
|
2805
2808
|
["hiragana", /\p{Script=Hiragana}/u],
|
|
2806
2809
|
["katakana", /\p{Script=Katakana}/u]
|
|
2807
2810
|
];
|
|
2811
|
+
var DEFAULT_SCAN_RISK_TERMS = Object.freeze([
|
|
2812
|
+
"liability",
|
|
2813
|
+
"indemnity",
|
|
2814
|
+
"penalty",
|
|
2815
|
+
"damages",
|
|
2816
|
+
"termination",
|
|
2817
|
+
"breach",
|
|
2818
|
+
"warranty",
|
|
2819
|
+
"payment",
|
|
2820
|
+
"invoice",
|
|
2821
|
+
"governing",
|
|
2822
|
+
"jurisdiction",
|
|
2823
|
+
"arbitration",
|
|
2824
|
+
"confidentiality"
|
|
2825
|
+
]);
|
|
2808
2826
|
function clamp(value, min, max) {
|
|
2809
2827
|
return Math.max(min, Math.min(max, value));
|
|
2810
2828
|
}
|
package/dist/composability-vectors.js
CHANGED
|
@@ -26,6 +26,9 @@ __export(composability_vectors_exports, {
|
|
|
26
26
|
});
|
|
27
27
|
module.exports = __toCommonJS(composability_vectors_exports);
|
|
28
28
|
|
|
29
|
+
// src/llm-confusable-map.ts
|
|
30
|
+
var LLM_CONFUSABLE_MAP_SOURCE_COUNTS = Object.freeze({ tr39: 1425, novel: 793 });
|
|
31
|
+
|
|
29
32
|
// src/index.ts
|
|
30
33
|
var PROFANITY_SUBSTITUTE_MAP_BALANCED = {
|
|
31
34
|
"0": ["o"],
|
|
@@ -1845,6 +1848,21 @@ var COMPOSABILITY_VECTORS = NFKC_TR39_DIVERGENCE_VECTORS;
|
|
|
1845
1848
|
var COMPOSABILITY_VECTORS_COUNT = COMPOSABILITY_VECTORS.length;
|
|
1846
1849
|
var DEFAULT_IGNORABLE_RE = /[\u00AD\u034F\u061C\u115F\u1160\u17B4\u17B5\u180B-\u180F\u200B-\u200F\u202A-\u202E\u2060-\u206F\uFE00-\uFE0F\uFEFF\uFFA0\uFFF0-\uFFF8\u{1BCA0}-\u{1BCA3}\u{1D173}-\u{1D17A}\u{E0000}-\u{E0FFF}]/gu;
|
|
1847
1850
|
var DEFAULT_IGNORABLE_SINGLE_RE = new RegExp(DEFAULT_IGNORABLE_RE.source, "u");
|
|
1851
|
+
var DEFAULT_SCAN_RISK_TERMS = Object.freeze([
|
|
1852
|
+
"liability",
|
|
1853
|
+
"indemnity",
|
|
1854
|
+
"penalty",
|
|
1855
|
+
"damages",
|
|
1856
|
+
"termination",
|
|
1857
|
+
"breach",
|
|
1858
|
+
"warranty",
|
|
1859
|
+
"payment",
|
|
1860
|
+
"invoice",
|
|
1861
|
+
"governing",
|
|
1862
|
+
"jurisdiction",
|
|
1863
|
+
"arbitration",
|
|
1864
|
+
"confidentiality"
|
|
1865
|
+
]);
|
|
1848
1866
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1849
1867
|
0 && (module.exports = {
|
|
1850
1868
|
COMPOSABILITY_VECTORS,
|
package/dist/composability-vectors.mjs
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
// src/llm-confusable-map.ts
|
|
2
|
+
var LLM_CONFUSABLE_MAP_SOURCE_COUNTS = Object.freeze({ tr39: 1425, novel: 793 });
|
|
3
|
+
|
|
1
4
|
// src/index.ts
|
|
2
5
|
var PROFANITY_SUBSTITUTE_MAP_BALANCED = {
|
|
3
6
|
"0": ["o"],
|
|
@@ -1817,6 +1820,21 @@ var COMPOSABILITY_VECTORS = NFKC_TR39_DIVERGENCE_VECTORS;
|
|
|
1817
1820
|
var COMPOSABILITY_VECTORS_COUNT = COMPOSABILITY_VECTORS.length;
|
|
1818
1821
|
var DEFAULT_IGNORABLE_RE = /[\u00AD\u034F\u061C\u115F\u1160\u17B4\u17B5\u180B-\u180F\u200B-\u200F\u202A-\u202E\u2060-\u206F\uFE00-\uFE0F\uFEFF\uFFA0\uFFF0-\uFFF8\u{1BCA0}-\u{1BCA3}\u{1D173}-\u{1D17A}\u{E0000}-\u{E0FFF}]/gu;
|
|
1819
1822
|
var DEFAULT_IGNORABLE_SINGLE_RE = new RegExp(DEFAULT_IGNORABLE_RE.source, "u");
|
|
1823
|
+
var DEFAULT_SCAN_RISK_TERMS = Object.freeze([
|
|
1824
|
+
"liability",
|
|
1825
|
+
"indemnity",
|
|
1826
|
+
"penalty",
|
|
1827
|
+
"damages",
|
|
1828
|
+
"termination",
|
|
1829
|
+
"breach",
|
|
1830
|
+
"warranty",
|
|
1831
|
+
"payment",
|
|
1832
|
+
"invoice",
|
|
1833
|
+
"governing",
|
|
1834
|
+
"jurisdiction",
|
|
1835
|
+
"arbitration",
|
|
1836
|
+
"confidentiality"
|
|
1837
|
+
]);
|
|
1820
1838
|
export {
|
|
1821
1839
|
COMPOSABILITY_VECTORS,
|
|
1822
1840
|
COMPOSABILITY_VECTORS_COUNT,
|
package/dist/index.d.mts
CHANGED
|
@@ -1,3 +1,20 @@
|
|
|
1
|
+
type LlmConfusableSource = "tr39" | "novel";
|
|
2
|
+
type LlmConfusableMapEntry = {
|
|
3
|
+
latin: string;
|
|
4
|
+
ssimScore: number;
|
|
5
|
+
source: LlmConfusableSource;
|
|
6
|
+
script: string;
|
|
7
|
+
codepoint: string;
|
|
8
|
+
};
|
|
9
|
+
type LlmConfusableMap = Readonly<Record<string, readonly LlmConfusableMapEntry[]>>;
|
|
10
|
+
declare const LLM_CONFUSABLE_MAP: LlmConfusableMap;
|
|
11
|
+
declare const LLM_CONFUSABLE_MAP_PAIR_COUNT = 2218;
|
|
12
|
+
declare const LLM_CONFUSABLE_MAP_CHAR_COUNT = 1947;
|
|
13
|
+
declare const LLM_CONFUSABLE_MAP_SOURCE_COUNTS: Readonly<{
|
|
14
|
+
tr39: 1425;
|
|
15
|
+
novel: 793;
|
|
16
|
+
}>;
|
|
17
|
+
|
|
1
18
|
/** A database table or model to check for slug/handle collisions. */
|
|
2
19
|
type NamespaceSource = {
|
|
3
20
|
/** Table/model name (must match the adapter's lookup key) */
|
|
@@ -136,6 +153,75 @@ type InvisibleCharacterValidatorOptions = {
|
|
|
136
153
|
/** Reject combining marks (Unicode category `M*`) often used for visual obfuscation (default: `false`). */
|
|
137
154
|
rejectCombiningMarks?: boolean;
|
|
138
155
|
};
|
|
156
|
+
/** Options for `canonicalise()` LLM preprocessing. */
|
|
157
|
+
type CanonicaliseOptions = {
|
|
158
|
+
/** Minimum SSIM score required for replacement (default: `0.7`). */
|
|
159
|
+
threshold?: number;
|
|
160
|
+
/** Include confusable-vision novel discoveries in addition to TR39 mappings (default: `true`). */
|
|
161
|
+
includeNovel?: boolean;
|
|
162
|
+
/** Restrict replacement to specific source scripts (case-insensitive, e.g. `["Cyrillic", "Greek"]`). */
|
|
163
|
+
scripts?: string[];
|
|
164
|
+
/**
|
|
165
|
+
* Canonicalisation strategy (default: `"mixed"`).
|
|
166
|
+
*
|
|
167
|
+
* - `"mixed"` -- only replace confusable characters inside tokens that already
|
|
168
|
+
* contain Latin letters. Standalone non-Latin words (e.g. "Москва") are
|
|
169
|
+
* preserved. Safe for multilingual text.
|
|
170
|
+
*
|
|
171
|
+
* - `"all"` -- replace every confusable character regardless of surrounding
|
|
172
|
+
* context. Use this when the document is known to be Latin-script (e.g.
|
|
173
|
+
* an English contract) and you want to catch attackers who substitute
|
|
174
|
+
* every character in a word.
|
|
175
|
+
*/
|
|
176
|
+
strategy?: "mixed" | "all";
|
|
177
|
+
};
|
|
178
|
+
/** Options for `scan()` and `isClean()`. */
|
|
179
|
+
type ScanOptions = CanonicaliseOptions & {
|
|
180
|
+
/** Optional list of high-value terms used to raise `riskLevel` when targeted (default: built-in legal/financial list). */
|
|
181
|
+
riskTerms?: string[];
|
|
182
|
+
};
|
|
183
|
+
/** Single confusable finding returned by `scan()`. */
|
|
184
|
+
type ScanFinding = {
|
|
185
|
+
/** The confusable character found in the input. */
|
|
186
|
+
char: string;
|
|
187
|
+
/** Codepoint label in `U+XXXX` format. */
|
|
188
|
+
codepoint: string;
|
|
189
|
+
/** Script name of the source character. */
|
|
190
|
+
script: string;
|
|
191
|
+
/** Canonical Latin equivalent selected by the lookup table. */
|
|
192
|
+
latinEquivalent: string;
|
|
193
|
+
/** SSIM score used for this mapping. */
|
|
194
|
+
ssimScore: number;
|
|
195
|
+
/** Mapping source (`tr39` baseline or `novel` discovery). */
|
|
196
|
+
source: "tr39" | "novel";
|
|
197
|
+
/** UTF-16 code-unit offset in the input string. */
|
|
198
|
+
index: number;
|
|
199
|
+
/** Token/word containing this character. */
|
|
200
|
+
word: string;
|
|
201
|
+
/** Whether the token mixes Latin and non-Latin letters. */
|
|
202
|
+
mixedScript: boolean;
|
|
203
|
+
};
|
|
204
|
+
/** Structured confusable scan result for LLM preprocessing pipelines. */
|
|
205
|
+
type ScanResult = {
|
|
206
|
+
/** Whether any confusable mapping candidates were detected. */
|
|
207
|
+
hasConfusables: boolean;
|
|
208
|
+
/** Number of findings in `findings`. */
|
|
209
|
+
count: number;
|
|
210
|
+
/** Detailed findings with script/source/position metadata. */
|
|
211
|
+
findings: ScanFinding[];
|
|
212
|
+
/** Aggregate scan summary for policy and logging. */
|
|
213
|
+
summary: {
|
|
214
|
+
/** Number of distinct confusable characters found. */
|
|
215
|
+
distinctChars: number;
|
|
216
|
+
/** Number of distinct words/tokens affected. */
|
|
217
|
+
wordsAffected: number;
|
|
218
|
+
/** Distinct scripts detected among findings. */
|
|
219
|
+
scriptsDetected: string[];
|
|
220
|
+
/** Heuristic risk level from confusable density + targeting. */
|
|
221
|
+
riskLevel: "none" | "low" | "medium" | "high";
|
|
222
|
+
};
|
|
223
|
+
};
|
|
224
|
+
|
|
139
225
|
/** Options for the `skeleton()` and `areConfusable()` functions. */
|
|
140
226
|
type SkeletonOptions = {
|
|
141
227
|
/** Confusable character map to use.
|
|
@@ -504,6 +590,32 @@ declare function createHomoglyphValidator(options?: {
|
|
|
504
590
|
* @returns An async validator function for use in `config.validators`
|
|
505
591
|
*/
|
|
506
592
|
declare function createInvisibleCharacterValidator(options?: InvisibleCharacterValidatorOptions): NamespaceValidator;
|
|
593
|
+
/**
|
|
594
|
+
* Canonicalise confusable characters in text for LLM preprocessing.
|
|
595
|
+
*
|
|
596
|
+
* With the default `strategy: "mixed"`, only rewrites characters inside tokens
|
|
597
|
+
* that already contain Latin letters. Standalone non-Latin words are preserved
|
|
598
|
+
* to reduce false positives in multilingual text.
|
|
599
|
+
*
|
|
600
|
+
* With `strategy: "all"`, rewrites every confusable character regardless of
|
|
601
|
+
* context. Use this when the document is known to be Latin-script.
|
|
602
|
+
*/
|
|
603
|
+
declare function canonicalise(text: string, options?: CanonicaliseOptions): string;
|
|
604
|
+
/**
|
|
605
|
+
* Scan text for confusable characters and return structured findings + risk summary.
|
|
606
|
+
*/
|
|
607
|
+
declare function scan(text: string, options?: ScanOptions): ScanResult;
|
|
608
|
+
/**
|
|
609
|
+
* Fast gate for LLM pipelines.
|
|
610
|
+
*
|
|
611
|
+
* With the default `strategy: "mixed"`, returns `false` as soon as a
|
|
612
|
+
* mixed-script confusable substitution is found. Standalone non-Latin
|
|
613
|
+
* words do not fail this gate.
|
|
614
|
+
*
|
|
615
|
+
* With `strategy: "all"`, returns `false` if any confusable character is
|
|
616
|
+
* found, regardless of surrounding context.
|
|
617
|
+
*/
|
|
618
|
+
declare function isClean(text: string, options?: ScanOptions): boolean;
|
|
507
619
|
/**
|
|
508
620
|
* Compute the TR39 Section 4 skeleton of a string for confusable comparison.
|
|
509
621
|
*
|
|
@@ -634,4 +746,4 @@ declare function createNamespaceGuard(config: NamespaceConfig, adapter: Namespac
|
|
|
634
746
|
/** The guard instance returned by `createNamespaceGuard`. */
|
|
635
747
|
type NamespaceGuard = ReturnType<typeof createNamespaceGuard>;
|
|
636
748
|
|
|
637
|
-
export { type AssertClaimableOptions, COMPOSABILITY_VECTORS, COMPOSABILITY_VECTORS_COUNT, COMPOSABILITY_VECTOR_SUITE, CONFUSABLE_MAP, CONFUSABLE_MAP_FULL, type CheckManyOptions, type CheckResult, type CheckRiskOptions, type ClaimOptions, type ClaimResult, type ComposabilityVector, type ConfusableDistanceOptions, type ConfusableDistanceResult, type ConfusableDistanceStep, type ConfusableWeight, type ConfusableWeights, DEFAULT_PROTECTED_TOKENS, type EnforceRiskOptions, type EnforceRiskResult, type FindOneOptions, type InvisibleCharacterValidatorOptions, NAMESPACE_PROFILES, NFKC_TR39_DIVERGENCE_VECTORS, type NamespaceAdapter, type NamespaceConfig, type NamespaceGuard, type NamespaceProfileName, type NamespaceProfilePreset, type NamespaceSource, type NamespaceValidator, type NamespaceValidatorResult, type NfkcTr39DivergenceVector, type OwnershipScope, type PredicateValidatorOptions, type ProfanityValidationMode, type ProfanityValidatorOptions, type ProfanityVariantProfile, type RiskAction, type RiskCheckResult, type RiskLevel, type RiskMatch, type RiskReason, type RiskReasonCode, type SkeletonOptions, type SuggestStrategyName, type UniqueViolationDetector, areConfusable, confusableDistance, createHomoglyphValidator, createInvisibleCharacterValidator, createNamespaceGuard, createNamespaceGuardWithProfile, createPredicateValidator, createProfanityValidator, deriveNfkcTr39DivergenceVectors, isLikelyUniqueViolationError, normalize, skeleton };
|
|
749
|
+
export { type AssertClaimableOptions, COMPOSABILITY_VECTORS, COMPOSABILITY_VECTORS_COUNT, COMPOSABILITY_VECTOR_SUITE, CONFUSABLE_MAP, CONFUSABLE_MAP_FULL, type CanonicaliseOptions, type CheckManyOptions, type CheckResult, type CheckRiskOptions, type ClaimOptions, type ClaimResult, type ComposabilityVector, type ConfusableDistanceOptions, type ConfusableDistanceResult, type ConfusableDistanceStep, type ConfusableWeight, type ConfusableWeights, DEFAULT_PROTECTED_TOKENS, type EnforceRiskOptions, type EnforceRiskResult, type FindOneOptions, type InvisibleCharacterValidatorOptions, LLM_CONFUSABLE_MAP, LLM_CONFUSABLE_MAP_CHAR_COUNT, LLM_CONFUSABLE_MAP_PAIR_COUNT, LLM_CONFUSABLE_MAP_SOURCE_COUNTS, NAMESPACE_PROFILES, NFKC_TR39_DIVERGENCE_VECTORS, type NamespaceAdapter, type NamespaceConfig, type NamespaceGuard, type NamespaceProfileName, type NamespaceProfilePreset, type NamespaceSource, type NamespaceValidator, type NamespaceValidatorResult, type NfkcTr39DivergenceVector, type OwnershipScope, type PredicateValidatorOptions, type ProfanityValidationMode, type ProfanityValidatorOptions, type ProfanityVariantProfile, type RiskAction, type RiskCheckResult, type RiskLevel, type RiskMatch, type RiskReason, type RiskReasonCode, type ScanFinding, type ScanOptions, type ScanResult, type SkeletonOptions, type SuggestStrategyName, type UniqueViolationDetector, areConfusable, canonicalise, confusableDistance, createHomoglyphValidator, createInvisibleCharacterValidator, createNamespaceGuard, createNamespaceGuardWithProfile, createPredicateValidator, createProfanityValidator, deriveNfkcTr39DivergenceVectors, isClean, isLikelyUniqueViolationError, normalize, scan, skeleton };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,3 +1,20 @@
|
|
|
1
|
+
type LlmConfusableSource = "tr39" | "novel";
|
|
2
|
+
type LlmConfusableMapEntry = {
|
|
3
|
+
latin: string;
|
|
4
|
+
ssimScore: number;
|
|
5
|
+
source: LlmConfusableSource;
|
|
6
|
+
script: string;
|
|
7
|
+
codepoint: string;
|
|
8
|
+
};
|
|
9
|
+
type LlmConfusableMap = Readonly<Record<string, readonly LlmConfusableMapEntry[]>>;
|
|
10
|
+
declare const LLM_CONFUSABLE_MAP: LlmConfusableMap;
|
|
11
|
+
declare const LLM_CONFUSABLE_MAP_PAIR_COUNT = 2218;
|
|
12
|
+
declare const LLM_CONFUSABLE_MAP_CHAR_COUNT = 1947;
|
|
13
|
+
declare const LLM_CONFUSABLE_MAP_SOURCE_COUNTS: Readonly<{
|
|
14
|
+
tr39: 1425;
|
|
15
|
+
novel: 793;
|
|
16
|
+
}>;
|
|
17
|
+
|
|
1
18
|
/** A database table or model to check for slug/handle collisions. */
|
|
2
19
|
type NamespaceSource = {
|
|
3
20
|
/** Table/model name (must match the adapter's lookup key) */
|
|
@@ -136,6 +153,75 @@ type InvisibleCharacterValidatorOptions = {
|
|
|
136
153
|
/** Reject combining marks (Unicode category `M*`) often used for visual obfuscation (default: `false`). */
|
|
137
154
|
rejectCombiningMarks?: boolean;
|
|
138
155
|
};
|
|
156
|
+
/** Options for `canonicalise()` LLM preprocessing. */
|
|
157
|
+
type CanonicaliseOptions = {
|
|
158
|
+
/** Minimum SSIM score required for replacement (default: `0.7`). */
|
|
159
|
+
threshold?: number;
|
|
160
|
+
/** Include confusable-vision novel discoveries in addition to TR39 mappings (default: `true`). */
|
|
161
|
+
includeNovel?: boolean;
|
|
162
|
+
/** Restrict replacement to specific source scripts (case-insensitive, e.g. `["Cyrillic", "Greek"]`). */
|
|
163
|
+
scripts?: string[];
|
|
164
|
+
/**
|
|
165
|
+
* Canonicalisation strategy (default: `"mixed"`).
|
|
166
|
+
*
|
|
167
|
+
* - `"mixed"` -- only replace confusable characters inside tokens that already
|
|
168
|
+
* contain Latin letters. Standalone non-Latin words (e.g. "Москва") are
|
|
169
|
+
* preserved. Safe for multilingual text.
|
|
170
|
+
*
|
|
171
|
+
* - `"all"` -- replace every confusable character regardless of surrounding
|
|
172
|
+
* context. Use this when the document is known to be Latin-script (e.g.
|
|
173
|
+
* an English contract) and you want to catch attackers who substitute
|
|
174
|
+
* every character in a word.
|
|
175
|
+
*/
|
|
176
|
+
strategy?: "mixed" | "all";
|
|
177
|
+
};
|
|
178
|
+
/** Options for `scan()` and `isClean()`. */
|
|
179
|
+
type ScanOptions = CanonicaliseOptions & {
|
|
180
|
+
/** Optional list of high-value terms used to raise `riskLevel` when targeted (default: built-in legal/financial list). */
|
|
181
|
+
riskTerms?: string[];
|
|
182
|
+
};
|
|
183
|
+
/** Single confusable finding returned by `scan()`. */
|
|
184
|
+
type ScanFinding = {
|
|
185
|
+
/** The confusable character found in the input. */
|
|
186
|
+
char: string;
|
|
187
|
+
/** Codepoint label in `U+XXXX` format. */
|
|
188
|
+
codepoint: string;
|
|
189
|
+
/** Script name of the source character. */
|
|
190
|
+
script: string;
|
|
191
|
+
/** Canonical Latin equivalent selected by the lookup table. */
|
|
192
|
+
latinEquivalent: string;
|
|
193
|
+
/** SSIM score used for this mapping. */
|
|
194
|
+
ssimScore: number;
|
|
195
|
+
/** Mapping source (`tr39` baseline or `novel` discovery). */
|
|
196
|
+
source: "tr39" | "novel";
|
|
197
|
+
/** UTF-16 code-unit offset in the input string. */
|
|
198
|
+
index: number;
|
|
199
|
+
/** Token/word containing this character. */
|
|
200
|
+
word: string;
|
|
201
|
+
/** Whether the token mixes Latin and non-Latin letters. */
|
|
202
|
+
mixedScript: boolean;
|
|
203
|
+
};
|
|
204
|
+
/** Structured confusable scan result for LLM preprocessing pipelines. */
|
|
205
|
+
type ScanResult = {
|
|
206
|
+
/** Whether any confusable mapping candidates were detected. */
|
|
207
|
+
hasConfusables: boolean;
|
|
208
|
+
/** Number of findings in `findings`. */
|
|
209
|
+
count: number;
|
|
210
|
+
/** Detailed findings with script/source/position metadata. */
|
|
211
|
+
findings: ScanFinding[];
|
|
212
|
+
/** Aggregate scan summary for policy and logging. */
|
|
213
|
+
summary: {
|
|
214
|
+
/** Number of distinct confusable characters found. */
|
|
215
|
+
distinctChars: number;
|
|
216
|
+
/** Number of distinct words/tokens affected. */
|
|
217
|
+
wordsAffected: number;
|
|
218
|
+
/** Distinct scripts detected among findings. */
|
|
219
|
+
scriptsDetected: string[];
|
|
220
|
+
/** Heuristic risk level from confusable density + targeting. */
|
|
221
|
+
riskLevel: "none" | "low" | "medium" | "high";
|
|
222
|
+
};
|
|
223
|
+
};
|
|
224
|
+
|
|
139
225
|
/** Options for the `skeleton()` and `areConfusable()` functions. */
|
|
140
226
|
type SkeletonOptions = {
|
|
141
227
|
/** Confusable character map to use.
|
|
@@ -504,6 +590,32 @@ declare function createHomoglyphValidator(options?: {
|
|
|
504
590
|
* @returns An async validator function for use in `config.validators`
|
|
505
591
|
*/
|
|
506
592
|
declare function createInvisibleCharacterValidator(options?: InvisibleCharacterValidatorOptions): NamespaceValidator;
|
|
593
|
+
/**
|
|
594
|
+
* Canonicalise confusable characters in text for LLM preprocessing.
|
|
595
|
+
*
|
|
596
|
+
* With the default `strategy: "mixed"`, only rewrites characters inside tokens
|
|
597
|
+
* that already contain Latin letters. Standalone non-Latin words are preserved
|
|
598
|
+
* to reduce false positives in multilingual text.
|
|
599
|
+
*
|
|
600
|
+
* With `strategy: "all"`, rewrites every confusable character regardless of
|
|
601
|
+
* context. Use this when the document is known to be Latin-script.
|
|
602
|
+
*/
|
|
603
|
+
declare function canonicalise(text: string, options?: CanonicaliseOptions): string;
|
|
604
|
+
/**
|
|
605
|
+
* Scan text for confusable characters and return structured findings + risk summary.
|
|
606
|
+
*/
|
|
607
|
+
declare function scan(text: string, options?: ScanOptions): ScanResult;
|
|
608
|
+
/**
|
|
609
|
+
* Fast gate for LLM pipelines.
|
|
610
|
+
*
|
|
611
|
+
* With the default `strategy: "mixed"`, returns `false` as soon as a
|
|
612
|
+
* mixed-script confusable substitution is found. Standalone non-Latin
|
|
613
|
+
* words do not fail this gate.
|
|
614
|
+
*
|
|
615
|
+
* With `strategy: "all"`, returns `false` if any confusable character is
|
|
616
|
+
* found, regardless of surrounding context.
|
|
617
|
+
*/
|
|
618
|
+
declare function isClean(text: string, options?: ScanOptions): boolean;
|
|
507
619
|
/**
|
|
508
620
|
* Compute the TR39 Section 4 skeleton of a string for confusable comparison.
|
|
509
621
|
*
|
|
@@ -634,4 +746,4 @@ declare function createNamespaceGuard(config: NamespaceConfig, adapter: Namespac
|
|
|
634
746
|
/** The guard instance returned by `createNamespaceGuard`. */
|
|
635
747
|
type NamespaceGuard = ReturnType<typeof createNamespaceGuard>;
|
|
636
748
|
|
|
637
|
-
export { type AssertClaimableOptions, COMPOSABILITY_VECTORS, COMPOSABILITY_VECTORS_COUNT, COMPOSABILITY_VECTOR_SUITE, CONFUSABLE_MAP, CONFUSABLE_MAP_FULL, type CheckManyOptions, type CheckResult, type CheckRiskOptions, type ClaimOptions, type ClaimResult, type ComposabilityVector, type ConfusableDistanceOptions, type ConfusableDistanceResult, type ConfusableDistanceStep, type ConfusableWeight, type ConfusableWeights, DEFAULT_PROTECTED_TOKENS, type EnforceRiskOptions, type EnforceRiskResult, type FindOneOptions, type InvisibleCharacterValidatorOptions, NAMESPACE_PROFILES, NFKC_TR39_DIVERGENCE_VECTORS, type NamespaceAdapter, type NamespaceConfig, type NamespaceGuard, type NamespaceProfileName, type NamespaceProfilePreset, type NamespaceSource, type NamespaceValidator, type NamespaceValidatorResult, type NfkcTr39DivergenceVector, type OwnershipScope, type PredicateValidatorOptions, type ProfanityValidationMode, type ProfanityValidatorOptions, type ProfanityVariantProfile, type RiskAction, type RiskCheckResult, type RiskLevel, type RiskMatch, type RiskReason, type RiskReasonCode, type SkeletonOptions, type SuggestStrategyName, type UniqueViolationDetector, areConfusable, confusableDistance, createHomoglyphValidator, createInvisibleCharacterValidator, createNamespaceGuard, createNamespaceGuardWithProfile, createPredicateValidator, createProfanityValidator, deriveNfkcTr39DivergenceVectors, isLikelyUniqueViolationError, normalize, skeleton };
|
|
749
|
+
export { type AssertClaimableOptions, COMPOSABILITY_VECTORS, COMPOSABILITY_VECTORS_COUNT, COMPOSABILITY_VECTOR_SUITE, CONFUSABLE_MAP, CONFUSABLE_MAP_FULL, type CanonicaliseOptions, type CheckManyOptions, type CheckResult, type CheckRiskOptions, type ClaimOptions, type ClaimResult, type ComposabilityVector, type ConfusableDistanceOptions, type ConfusableDistanceResult, type ConfusableDistanceStep, type ConfusableWeight, type ConfusableWeights, DEFAULT_PROTECTED_TOKENS, type EnforceRiskOptions, type EnforceRiskResult, type FindOneOptions, type InvisibleCharacterValidatorOptions, LLM_CONFUSABLE_MAP, LLM_CONFUSABLE_MAP_CHAR_COUNT, LLM_CONFUSABLE_MAP_PAIR_COUNT, LLM_CONFUSABLE_MAP_SOURCE_COUNTS, NAMESPACE_PROFILES, NFKC_TR39_DIVERGENCE_VECTORS, type NamespaceAdapter, type NamespaceConfig, type NamespaceGuard, type NamespaceProfileName, type NamespaceProfilePreset, type NamespaceSource, type NamespaceValidator, type NamespaceValidatorResult, type NfkcTr39DivergenceVector, type OwnershipScope, type PredicateValidatorOptions, type ProfanityValidationMode, type ProfanityValidatorOptions, type ProfanityVariantProfile, type RiskAction, type RiskCheckResult, type RiskLevel, type RiskMatch, type RiskReason, type RiskReasonCode, type ScanFinding, type ScanOptions, type ScanResult, type SkeletonOptions, type SuggestStrategyName, type UniqueViolationDetector, areConfusable, canonicalise, confusableDistance, createHomoglyphValidator, createInvisibleCharacterValidator, createNamespaceGuard, createNamespaceGuardWithProfile, createPredicateValidator, createProfanityValidator, deriveNfkcTr39DivergenceVectors, isClean, isLikelyUniqueViolationError, normalize, scan, skeleton };
|