@raeven-co/sether-ner 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -0
- package/dist/index.cjs +148 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +97 -0
- package/dist/index.d.ts +97 -0
- package/dist/index.js +118 -0
- package/dist/index.js.map +1 -0
- package/package.json +46 -0
package/README.md
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# @raeven-co/sether-ner
|
|
2
|
+
|
|
3
|
+
> Free-text NER redaction — **names, organisations, locations** — for
|
|
4
|
+
> [Sether](https://www.npmjs.com/package/@raeven-co/sether). The part regex
|
|
5
|
+
> can't do, shipped as a separate, lazy-loaded package so the core stays ~35 KB.
|
|
6
|
+
|
|
7
|
+
The core Sether detectors catch *structured* PII (emails, cards, SSNs, keys) and
|
|
8
|
+
*label-anchored* identity (`Name:`, `DOB:`). This package adds the hard part:
|
|
9
|
+
unlabelled people, companies, and places in running prose — the thing a
|
|
10
|
+
customer's own weekend build and the regex-only competitors can't replicate.
|
|
11
|
+
|
|
12
|
+
## Why a separate package
|
|
13
|
+
|
|
14
|
+
- The model plus the ONNX runtime add 30 MB or more. Keeping them out of the core means
|
|
15
|
+
`new Sether()` stays tiny and dependency-light.
|
|
16
|
+
- NER is **async** and runs on **full outbound text** (the prompt you're about to
|
|
17
|
+
send), not the streaming response — so it's a different integration point than
|
|
18
|
+
the sync, chunk-boundary-safe core detectors. This package is honest about that.
|
|
19
|
+
- `@huggingface/transformers` is an **optional peer dependency** — install it only
|
|
20
|
+
if you use the default model. Bring your own inferer (e.g. GLiNER) and you don't
|
|
21
|
+
need it at all.
|
|
22
|
+
|
|
23
|
+
## Install
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
npm install @raeven-co/sether-ner @huggingface/transformers
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Use
|
|
30
|
+
|
|
31
|
+
```ts
|
|
32
|
+
import { Sether, redactSync, basicDetectors } from '@raeven-co/sether';
|
|
33
|
+
import { createNerRedactor } from '@raeven-co/sether-ner';
|
|
34
|
+
|
|
35
|
+
const sether = new Sether();
|
|
36
|
+
const ner = createNerRedactor(); // Xenova/bert-base-NER, lazy-loaded on first call
|
|
37
|
+
|
|
38
|
+
// Outbound path: NER first (names/orgs/locations), then structured PII — one vault.
|
|
39
|
+
const { redacted } = await ner.redact(userPrompt, { vault: sether.vault });
|
|
40
|
+
const safe = redactSync(redacted, { detectors: basicDetectors, vault: sether.vault });
|
|
41
|
+
|
|
42
|
+
// Send `safe` to the LLM. On the reply, sether.restore() swaps BOTH token sets
|
|
43
|
+
// back, because NER tokens use the same `<TYPE_uuid>` format the core restores.
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
NER tokens look like `<NAME_…>`, `<ORG_…>`, `<LOCATION_…>` and restore through the
|
|
47
|
+
core's `restore()` / `createRestoreStream()` with no extra wiring.
|
|
48
|
+
|
|
49
|
+
## Options
|
|
50
|
+
|
|
51
|
+
```ts
|
|
52
|
+
createNerRedactor({
|
|
53
|
+
model: 'Xenova/bert-base-NER', // any transformers.js token-classification model
|
|
54
|
+
threshold: 0.6, // min confidence
|
|
55
|
+
labels: ['NAME', 'ORG'], // restrict which types to redact
|
|
56
|
+
infer: myGlinerInferer, // bring your own model / service / mock
|
|
57
|
+
});
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Bring your own model (e.g. GLiNER)
|
|
61
|
+
|
|
62
|
+
`infer` is `(text) => Promise<RawEntity[]>` where each entity has
|
|
63
|
+
`{ entity_group, score, start, end }` (absolute char offsets). Map your model's
|
|
64
|
+
output to that shape and the rest of the pipeline is identical — which also makes
|
|
65
|
+
the whole redactor unit-testable without downloading a model.
|
|
66
|
+
|
|
67
|
+
## Honest limitations
|
|
68
|
+
|
|
69
|
+
- **First call is slow** — the model downloads (~30 MB+) and warms up. Call
|
|
70
|
+
`ner.warmup()` at boot. ONNX inference runs ~50% slower than PyTorch.
|
|
71
|
+
- **Not on the streaming hot path.** NER is a batched forward pass; it runs on the
|
|
72
|
+
outbound prompt, not per-chunk on the response. Restoration of the response is
|
|
73
|
+
the core's job (sync, chunk-boundary-safe).
|
|
74
|
+
- **Accuracy is model-bound.** `bert-base-NER` is solid for Western names/orgs;
|
|
75
|
+
multilingual text and domain-specific names need a better model (swap via `model`/`infer`).
|
|
76
|
+
We'd rather you know that than oversell it.
|
|
77
|
+
|
|
78
|
+
## License
|
|
79
|
+
|
|
80
|
+
MIT © Godfrey Lebo / Raeven, Inc.
|
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
// Module-interop helper: for every key in `all`, define a property of the same
// name on `target` whose getter is `all[name]`. Properties are enumerable and
// have no setter, so each export value is produced by calling its getter on read.
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
// Module-interop helper: re-expose the own properties of `from` on `to`.
// - Only runs when `from` is a non-null object or a function.
// - Skips keys `to` already owns and the single key named by `except`.
// - Each copied property is a getter that reads `from[key]` at access time.
// - `desc` is used as a scratch variable: the copy is enumerable when `from`
//   has no descriptor for the key, otherwise it mirrors that descriptor's flag.
// Returns `to`.
var __copyProps = (to, from, except, desc) => {
|
|
11
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
+
for (let key of __getOwnPropNames(from))
|
|
13
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
+
}
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
18
|
+
// Module-interop helper: build a fresh object carrying `__esModule: true`
// (set via defineProperty) and copy the properties of `mod` onto it through
// __copyProps. The result is what gets assigned to `module.exports`.
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
+
|
|
20
|
+
// src/index.ts
|
|
21
|
+
var index_exports = {};
|
|
22
|
+
__export(index_exports, {
|
|
23
|
+
NER_LABELS: () => NER_LABELS,
|
|
24
|
+
buildRedactor: () => buildRedactor,
|
|
25
|
+
createNerRedactor: () => createNerRedactor,
|
|
26
|
+
createTransformersInfer: () => createTransformersInfer
|
|
27
|
+
});
|
|
28
|
+
module.exports = __toCommonJS(index_exports);
|
|
29
|
+
|
|
30
|
+
// src/redactor.ts
|
|
31
|
+
var import_node_crypto = require("crypto");
|
|
32
|
+
|
|
33
|
+
// src/types.ts
|
|
34
|
+
var NER_LABELS = ["NAME", "ORG", "LOCATION"];
|
|
35
|
+
|
|
36
|
+
// src/redactor.ts
|
|
37
|
+
// Model label groups (upper-cased, tagging prefix removed) mapped to the three
// NER types this package emits: NAME, ORG and LOCATION.
var LABEL_MAP = {
  PER: "NAME",
  PERSON: "NAME",
  NAME: "NAME",
  ORG: "ORG",
  ORGANIZATION: "ORG",
  ORGANISATION: "ORG",
  LOC: "LOCATION",
  LOCATION: "LOCATION",
  GPE: "LOCATION"
};

/**
 * Translate a model's entity label into one of the package's NER types.
 *
 * Matching is case-insensitive. A single leading tagging-scheme prefix is
 * removed first: `B-`/`I-` (BIO) as before, plus `E-`/`S-`/`L-`/`U-` so that
 * BIOES/BILOU-tagged output from custom inferers maps as well.
 *
 * @param {string} group - Label reported by the inferer, e.g. "PER", "B-org".
 * @returns {"NAME" | "ORG" | "LOCATION" | null} The mapped type, or `null` when
 *   the label is unknown or is not a string (instead of throwing on
 *   `undefined.toUpperCase()`).
 */
function mapLabel(group) {
  // Custom inferers are untyped at runtime; a missing label means "no mapping".
  if (typeof group !== "string") return null;
  const key = group.toUpperCase().replace(/^[BIELSU]-/, "");
  return LABEL_MAP[key] ?? null;
}
|
|
51
|
+
/**
 * Build a redactor around an injected inferer.
 *
 * @param {(text: string) => Promise<object[]>} infer - Returns raw entities
 *   (`entity_group`, `score`, `start`, `end`) for the given text.
 * @param {object} [options]
 * @param {number} [options.threshold=0.6] - Minimum score for a match to be kept.
 * @param {readonly string[]} [options.labels] - Types to redact; defaults to NER_LABELS.
 * @param {() => string} [options.uuid] - Token id generator; defaults to
 *   node:crypto `randomUUID`.
 * @returns {{ detect: Function, redact: Function, warmup: Function }}
 */
function buildRedactor(infer, options = {}) {
  const threshold = options.threshold ?? 0.6;
  const enabled = new Set(options.labels ?? NER_LABELS);
  const uuid = options.uuid ?? import_node_crypto.randomUUID;

  /**
   * Detect entities in `text`. Whitespace-only input short-circuits to `[]`
   * without calling the inferer.
   */
  async function detect(text) {
    if (!text.trim()) return [];
    const raw = await infer(text);
    return resolveOverlaps(toMatches(raw, text, threshold, enabled));
  }

  /**
   * Replace every detected entity with a `<TYPE_id>` token and record
   * token -> original value in `opts.vault`.
   *
   * @throws {TypeError} When `opts.vault` has no `set` method. Checked before
   *   inference so a mis-wired call fails immediately and consistently, rather
   *   than only on inputs that happen to contain an entity.
   */
  async function redact(text, opts) {
    const vault = opts?.vault;
    if (typeof vault?.set !== "function") {
      throw new TypeError(
        "@raeven-co/sether-ner: redact() requires { vault } exposing set(token, value)."
      );
    }
    const matches = await detect(text);
    if (matches.length === 0) return { redacted: text, matches };

    // Matches arrive sorted by start and non-overlapping, so one left-to-right
    // pass can stitch untouched text and tokens together.
    const parts = [];
    let pos = 0;
    for (const m of matches) {
      parts.push(text.slice(pos, m.start));
      const token = `<${m.type}_${uuid()}>`;
      vault.set(token, m.value);
      parts.push(token);
      pos = m.end;
    }
    parts.push(text.slice(pos));
    return { redacted: parts.join(""), matches };
  }

  /** Run one throwaway inference so later calls do not pay the first-call cost. */
  async function warmup() {
    await infer("Warm up the model.");
  }

  return { detect, redact, warmup };
}
|
|
80
|
+
/**
 * Convert raw inferer entities into validated matches.
 *
 * An entity is dropped when its score is below `threshold`, its offsets are
 * missing or not integers, its label does not map to an enabled type, or the
 * covered text is empty/whitespace. Offsets are clamped to `[0, text.length]`
 * before slicing: a negative `start` would otherwise make
 * `String.prototype.slice` count from the end of the string and capture the
 * wrong span, and an `end` past the text would leave an out-of-range match.
 * The value is always sliced from `text`; the entity's own `word` is ignored.
 *
 * @param {object[]} raw - Entities with `entity_group`, `score`, `start`, `end`.
 * @param {string} text - The text the offsets refer to.
 * @param {number} threshold - Minimum accepted score.
 * @param {Set<string>} enabled - Types allowed through.
 * @returns {{type: string, value: string, start: number, end: number, score: number}[]}
 */
function toMatches(raw, text, threshold, enabled) {
  const out = [];
  for (const e of raw) {
    if (e.score < threshold) continue;
    // Also rejects null/undefined offsets, which the original skipped as well.
    if (!Number.isInteger(e.start) || !Number.isInteger(e.end)) continue;
    const start = Math.max(0, e.start);
    const end = Math.min(text.length, e.end);
    if (end <= start) continue;
    const type = mapLabel(e.entity_group);
    if (!type || !enabled.has(type)) continue;
    const value = text.slice(start, end);
    if (!value.trim()) continue;
    out.push({ type, value, start, end, score: e.score });
  }
  return out;
}
|
|
93
|
+
/**
 * Reduce a list of matches to a non-overlapping set, keeping the higher-score
 * (then longer, then earlier) match whenever two spans overlap.
 *
 * Candidates are ranked by that priority and accepted greedily; a candidate is
 * rejected if it intersects anything already accepted. Ranking by score first
 * is what makes "higher score wins" hold for spans with different start
 * offsets — ranking by start first would let an earlier low-score span evict a
 * later high-score one.
 *
 * @param {{start: number, end: number, score: number}[]} matches - Not mutated.
 * @returns {object[]} A new array of the surviving matches, ordered by `start`.
 */
function resolveOverlaps(matches) {
  // Copy before sorting so the caller's array keeps its order.
  const ranked = [...matches].sort(
    (a, b) =>
      b.score - a.score ||
      (b.end - b.start) - (a.end - a.start) ||
      a.start - b.start
  );
  const kept = [];
  for (const m of ranked) {
    const clashes = kept.some((o) => m.start < o.end && o.start < m.end);
    if (!clashes) kept.push(m);
  }
  return kept.sort((a, b) => a.start - b.start);
}
|
|
101
|
+
|
|
102
|
+
// src/infer.ts
|
|
103
|
+
// Deliberate indirection: building the import() call through Function keeps the
// specifier hidden from bundlers, so '@huggingface/transformers' stays an
// optional peer that is only resolved if the default inferer actually runs.
// The specifier is always a literal supplied by this module, never user input.
var dynamicImport = (specifier) => Function("s", "return import(s)")(specifier);

/**
 * Create the default inferer: a lazily loaded, memoized transformers.js
 * token-classification pipeline.
 *
 * Nothing is imported or loaded until the returned function is first called.
 * A successful load is cached for the lifetime of the inferer. A failed load is
 * NOT cached, so a transient failure does not permanently break the inferer —
 * the next call tries again.
 *
 * @param {object} [options]
 * @param {string} [options.model="Xenova/bert-base-NER"] - Model id passed to `pipeline()`.
 * @returns {(text: string) => Promise<object[]>} Inferer resolving to entities
 *   shaped `{ entity_group, score, start, end, word }`.
 * @throws {Error} (asynchronously) when '@huggingface/transformers' cannot be
 *   imported; the underlying failure is attached as `cause`.
 */
function createTransformersInfer(options = {}) {
  const model = options.model ?? "Xenova/bert-base-NER";
  let pipePromise = null;

  // Import the optional peer and construct the pipeline.
  async function loadPipe() {
    let mod;
    try {
      mod = await dynamicImport("@huggingface/transformers");
    } catch (err) {
      // Keep the real reason (missing package, broken install, ...) reachable.
      throw new Error(
        "@raeven-co/sether-ner: install the optional peer '@huggingface/transformers' to use the default model, or pass your own { infer } to createNerRedactor().",
        { cause: err }
      );
    }
    return mod.pipeline("token-classification", model);
  }

  // Return the shared in-flight/settled load, starting one if needed.
  function getPipe() {
    if (!pipePromise) {
      const pending = loadPipe();
      pipePromise = pending;
      // Forget a failed attempt so later calls can retry; callers awaiting
      // `pending` still observe the rejection themselves.
      pending.catch(() => {
        if (pipePromise === pending) pipePromise = null;
      });
    }
    return pipePromise;
  }

  return async (text) => {
    const pipe = await getPipe();
    const out = await pipe(text, { aggregation_strategy: "simple" });
    // NOTE(review): entities whose start/end are missing are discarded by the
    // redactor — confirm the installed transformers.js version populates
    // offsets for token-classification output.
    return out.map((e) => ({
      entity_group: e.entity_group ?? e.entity ?? "MISC",
      score: typeof e.score === "number" ? e.score : 0,
      start: e.start,
      end: e.end,
      word: e.word
    }));
  };
}
|
|
135
|
+
|
|
136
|
+
// src/index.ts
|
|
137
|
+
/**
 * Create a free-text NER redactor.
 *
 * Uses `options.infer` when one is supplied; otherwise falls back to the
 * default transformers.js inferer built from the same options. The options are
 * then forwarded unchanged to `buildRedactor`.
 *
 * @param {object} [options] - Redactor options (`model`, `threshold`, `labels`,
 *   `infer`, `uuid`).
 * @returns {{ detect: Function, redact: Function, warmup: Function }}
 */
function createNerRedactor(options = {}) {
  const { infer: customInfer } = options;
  const inferer = customInfer ?? createTransformersInfer(options);
  return buildRedactor(inferer, options);
}
|
|
141
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
142
|
+
0 && (module.exports = {
|
|
143
|
+
NER_LABELS,
|
|
144
|
+
buildRedactor,
|
|
145
|
+
createNerRedactor,
|
|
146
|
+
createTransformersInfer
|
|
147
|
+
});
|
|
148
|
+
//# sourceMappingURL=index.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/index.ts","../src/redactor.ts","../src/types.ts","../src/infer.ts"],"sourcesContent":["import { buildRedactor } from './redactor.js';\nimport { createTransformersInfer } from './infer.js';\nimport type { NerOptions, NerRedactor } from './types.js';\n\n/**\n * Create a free-text NER redactor for names, organisations, and locations.\n *\n * ```ts\n * import { Sether, redactSync, basicDetectors } from '@raeven-co/sether';\n * import { createNerRedactor } from '@raeven-co/sether-ner';\n *\n * const sether = new Sether();\n * const ner = createNerRedactor(); // uses Xenova/bert-base-NER, lazy-loaded\n *\n * // Outbound: NER first (names/orgs/locations), then structured PII — same vault.\n * const { redacted } = await ner.redact(prompt, { vault: sether.vault });\n * const safe = redactSync(redacted, { detectors: basicDetectors, vault: sether.vault });\n * // ...send `safe` to the LLM. sether.restore() then swaps BOTH token sets back,\n * // because NER tokens use the same `<TYPE_uuid>` format.\n * ```\n *\n * Pass `{ infer }` to use GLiNER or any other model. NER runs on the full\n * outbound text — not the streaming response.\n */\nexport function createNerRedactor(options: NerOptions = {}): NerRedactor {\n const infer = options.infer ?? 
createTransformersInfer(options);\n return buildRedactor(infer, options);\n}\n\nexport { buildRedactor } from './redactor.js';\nexport { createTransformersInfer } from './infer.js';\nexport { NER_LABELS } from './types.js';\nexport type {\n NerLabel,\n NerMatch,\n NerOptions,\n NerRedactor,\n InferFn,\n RawEntity,\n VaultLike,\n RedactResult,\n} from './types.js';\n","import { randomUUID } from 'node:crypto';\nimport {\n NER_LABELS,\n type InferFn,\n type NerLabel,\n type NerMatch,\n type NerOptions,\n type NerRedactor,\n type RawEntity,\n type RedactResult,\n type VaultLike,\n} from './types.js';\n\n// Map model label groups (and B-/I- prefixes) to Sether's three NER types.\nconst LABEL_MAP: Record<string, NerLabel> = {\n PER: 'NAME',\n PERSON: 'NAME',\n NAME: 'NAME',\n ORG: 'ORG',\n ORGANIZATION: 'ORG',\n ORGANISATION: 'ORG',\n LOC: 'LOCATION',\n LOCATION: 'LOCATION',\n GPE: 'LOCATION',\n};\n\nfunction mapLabel(group: string): NerLabel | null {\n return LABEL_MAP[group.toUpperCase().replace(/^[BI]-/, '')] ?? null;\n}\n\n/**\n * Build a redactor from an injected inferer. This is the pure, model-free core —\n * fully unit-testable with a mock `infer`. The default model wiring lives in\n * index.ts so this module carries no heavy dependency.\n */\nexport function buildRedactor(infer: InferFn, options: NerOptions = {}): NerRedactor {\n const threshold = options.threshold ?? 0.6;\n const enabled = new Set<NerLabel>(options.labels ?? NER_LABELS);\n const uuid = options.uuid ?? 
randomUUID;\n\n async function detect(text: string): Promise<NerMatch[]> {\n if (!text.trim()) return [];\n const raw = await infer(text);\n return resolveOverlaps(toMatches(raw, text, threshold, enabled));\n }\n\n async function redact(text: string, opts: { vault: VaultLike }): Promise<RedactResult> {\n const matches = await detect(text);\n if (matches.length === 0) return { redacted: text, matches };\n\n let out = '';\n let pos = 0;\n for (const m of matches) {\n out += text.slice(pos, m.start);\n const token = `<${m.type}_${uuid()}>`;\n opts.vault.set(token, m.value);\n out += token;\n pos = m.end;\n }\n out += text.slice(pos);\n return { redacted: out, matches };\n }\n\n async function warmup(): Promise<void> {\n await infer('Warm up the model.');\n }\n\n return { detect, redact, warmup };\n}\n\nfunction toMatches(raw: RawEntity[], text: string, threshold: number, enabled: Set<NerLabel>): NerMatch[] {\n const out: NerMatch[] = [];\n for (const e of raw) {\n if (e.score < threshold) continue;\n if (e.start == null || e.end == null || e.end <= e.start) continue;\n const type = mapLabel(e.entity_group);\n if (!type || !enabled.has(type)) continue;\n const value = text.slice(e.start, e.end);\n if (!value.trim()) continue;\n out.push({ type, value, start: e.start, end: e.end, score: e.score });\n }\n return out;\n}\n\n// Keep the higher-score (then longer) match when spans overlap.\nfunction resolveOverlaps(matches: NerMatch[]): NerMatch[] {\n matches.sort((a, b) => a.start - b.start || b.score - a.score || b.end - b.start - (a.end - a.start));\n const out: NerMatch[] = [];\n for (const m of matches) {\n if (!out.some((o) => m.start < o.end && o.start < m.end)) out.push(m);\n }\n return out.sort((a, b) => a.start - b.start);\n}\n","export type NerLabel = 'NAME' | 'ORG' | 'LOCATION';\n\nexport const NER_LABELS: readonly NerLabel[] = ['NAME', 'ORG', 'LOCATION'];\n\n/**\n * A raw entity as produced by a token-classification pipeline\n * (transformers.js 
`aggregation_strategy: 'simple'`) or any compatible inferer.\n * Offsets are absolute char positions into the input; entities without offsets\n * are skipped (we never trust the reconstructed `word`, which can carry subword\n * artifacts — we always slice the original text by [start, end)).\n */\nexport interface RawEntity {\n entity_group: string; // e.g. 'PER' | 'ORG' | 'LOC' | 'person' | 'organization'\n score: number; // 0..1\n start?: number;\n end?: number;\n word?: string;\n}\n\nexport type InferFn = (text: string) => Promise<RawEntity[]>;\n\nexport interface NerMatch {\n type: NerLabel;\n value: string;\n start: number;\n end: number;\n score: number;\n}\n\n/** Minimal vault surface — structurally compatible with @raeven-co/sether's Vault. */\nexport interface VaultLike {\n set(token: string, value: string): void;\n}\n\nexport interface NerOptions {\n /** Model id for the default transformers.js inferer. Default: 'Xenova/bert-base-NER'. */\n model?: string;\n /** Minimum confidence to keep a match. Default: 0.6. */\n threshold?: number;\n /** Which entity types to redact. Default: NAME, ORG, LOCATION. */\n labels?: readonly NerLabel[];\n /** Inject a custom inferer (GLiNER, a remote service, or a test mock). */\n infer?: InferFn;\n /** Token id generator. Default: node:crypto randomUUID. */\n uuid?: () => string;\n}\n\nexport interface RedactResult {\n redacted: string;\n matches: NerMatch[];\n}\n\nexport interface NerRedactor {\n /** Detect names / organisations / locations in the text. */\n detect(text: string): Promise<NerMatch[]>;\n /**\n * Redact entities into `<TYPE_uuid>` tokens (the same format @raeven-co/sether\n * uses), storing originals in the vault so the core `restore()` swaps them back.\n * Runs on the full OUTBOUND text — not the response byte-stream.\n */\n redact(text: string, opts: { vault: VaultLike }): Promise<RedactResult>;\n /** Preload the model so the first real call isn't slow. 
*/\n warmup(): Promise<void>;\n}\n","import type { InferFn, NerOptions, RawEntity } from './types.js';\n\ntype Pipe = (text: string, opts: { aggregation_strategy: string }) => Promise<unknown>;\n\ninterface RawPipelineEntity {\n entity_group?: string;\n entity?: string;\n score?: number;\n start?: number;\n end?: number;\n word?: string;\n}\n\n// Hide the specifier from the bundler and the type-checker so\n// @huggingface/transformers stays a TRULY optional peer dependency — it's only\n// needed at runtime if the default inferer actually runs.\nconst dynamicImport = (specifier: string): Promise<unknown> =>\n (Function('s', 'return import(s)') as (s: string) => Promise<unknown>)(specifier);\n\n/**\n * The default inferer: a lazy, memoized transformers.js token-classification\n * pipeline. The ~30 MB+ model is only fetched on first inference, so importing\n * this module stays cheap. Swap in GLiNER or a remote service by passing your\n * own `infer` to createNerRedactor().\n */\nexport function createTransformersInfer(options: NerOptions = {}): InferFn {\n const model = options.model ?? 'Xenova/bert-base-NER';\n let pipePromise: Promise<Pipe> | null = null;\n\n async function getPipe(): Promise<Pipe> {\n if (!pipePromise) {\n pipePromise = (async () => {\n let mod: { pipeline: (task: string, model: string) => Promise<Pipe> };\n try {\n mod = (await dynamicImport('@huggingface/transformers')) as typeof mod;\n } catch {\n throw new Error(\n \"@raeven-co/sether-ner: install the optional peer '@huggingface/transformers' to use the default model, or pass your own { infer } to createNerRedactor().\",\n );\n }\n return mod.pipeline('token-classification', model);\n })();\n }\n return pipePromise;\n }\n\n return async (text: string): Promise<RawEntity[]> => {\n const pipe = await getPipe();\n const out = (await pipe(text, { aggregation_strategy: 'simple' })) as RawPipelineEntity[];\n return out.map((e) => ({\n entity_group: e.entity_group ?? e.entity ?? 
'MISC',\n score: typeof e.score === 'number' ? e.score : 0,\n start: e.start,\n end: e.end,\n word: e.word,\n }));\n };\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,yBAA2B;;;ACEpB,IAAM,aAAkC,CAAC,QAAQ,OAAO,UAAU;;;ADYzE,IAAM,YAAsC;AAAA,EAC1C,KAAK;AAAA,EACL,QAAQ;AAAA,EACR,MAAM;AAAA,EACN,KAAK;AAAA,EACL,cAAc;AAAA,EACd,cAAc;AAAA,EACd,KAAK;AAAA,EACL,UAAU;AAAA,EACV,KAAK;AACP;AAEA,SAAS,SAAS,OAAgC;AAChD,SAAO,UAAU,MAAM,YAAY,EAAE,QAAQ,UAAU,EAAE,CAAC,KAAK;AACjE;AAOO,SAAS,cAAc,OAAgB,UAAsB,CAAC,GAAgB;AACnF,QAAM,YAAY,QAAQ,aAAa;AACvC,QAAM,UAAU,IAAI,IAAc,QAAQ,UAAU,UAAU;AAC9D,QAAM,OAAO,QAAQ,QAAQ;AAE7B,iBAAe,OAAO,MAAmC;AACvD,QAAI,CAAC,KAAK,KAAK,EAAG,QAAO,CAAC;AAC1B,UAAM,MAAM,MAAM,MAAM,IAAI;AAC5B,WAAO,gBAAgB,UAAU,KAAK,MAAM,WAAW,OAAO,CAAC;AAAA,EACjE;AAEA,iBAAe,OAAO,MAAc,MAAmD;AACrF,UAAM,UAAU,MAAM,OAAO,IAAI;AACjC,QAAI,QAAQ,WAAW,EAAG,QAAO,EAAE,UAAU,MAAM,QAAQ;AAE3D,QAAI,MAAM;AACV,QAAI,MAAM;AACV,eAAW,KAAK,SAAS;AACvB,aAAO,KAAK,MAAM,KAAK,EAAE,KAAK;AAC9B,YAAM,QAAQ,IAAI,EAAE,IAAI,IAAI,KAAK,CAAC;AAClC,WAAK,MAAM,IAAI,OAAO,EAAE,KAAK;AAC7B,aAAO;AACP,YAAM,EAAE;AAAA,IACV;AACA,WAAO,KAAK,MAAM,GAAG;AACrB,WAAO,EAAE,UAAU,KAAK,QAAQ;AAAA,EAClC;AAEA,iBAAe,SAAwB;AACrC,UAAM,MAAM,oBAAoB;AAAA,EAClC;AAEA,SAAO,EAAE,QAAQ,QAAQ,OAAO;AAClC;AAEA,SAAS,UAAU,KAAkB,MAAc,WAAmB,SAAoC;AACxG,QAAM,MAAkB,CAAC;AACzB,aAAW,KAAK,KAAK;AACnB,QAAI,EAAE,QAAQ,UAAW;AACzB,QAAI,EAAE,SAAS,QAAQ,EAAE,OAAO,QAAQ,EAAE,OAAO,EAAE,MAAO;AAC1D,UAAM,OAAO,SAAS,EAAE,YAAY;AACpC,QAAI,CAAC,QAAQ,CAAC,QAAQ,IAAI,IAAI,EAAG;AACjC,UAAM,QAAQ,KAAK,MAAM,EAAE,OAAO,EAAE,GAAG;AACvC,QAAI,CAAC,MAAM,KAAK,EAAG;AACnB,QAAI,KAAK,EAAE,MAAM,OAAO,OAAO,EAAE,OAAO,KAAK,EAAE,KAAK,OAAO,EAAE,MAAM,CAAC;AAAA,EACtE;AACA,SAAO;AACT;AAGA,SAAS,gBAAgB,SAAiC;AACxD,UAAQ,KAAK,CAAC,GAAG,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM;AACpG,QAAM,MAAkB,CAAC;AACzB,aAAW,KAAK,SAAS;AACvB,QAAI,CAAC,IAAI,KAAK,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,GAAG,EAAG,KAAI,KAAK,CAAC;AAAA,EACtE;AACA,SAAO,IAAI,KAAK,CAAC,GAAG,MAAM,EAAE
,QAAQ,EAAE,KAAK;AAC7C;;;AE5EA,IAAM,gBAAgB,CAAC,cACpB,SAAS,KAAK,kBAAkB,EAAsC,SAAS;AAQ3E,SAAS,wBAAwB,UAAsB,CAAC,GAAY;AACzE,QAAM,QAAQ,QAAQ,SAAS;AAC/B,MAAI,cAAoC;AAExC,iBAAe,UAAyB;AACtC,QAAI,CAAC,aAAa;AAChB,qBAAe,YAAY;AACzB,YAAI;AACJ,YAAI;AACF,gBAAO,MAAM,cAAc,2BAA2B;AAAA,QACxD,QAAQ;AACN,gBAAM,IAAI;AAAA,YACR;AAAA,UACF;AAAA,QACF;AACA,eAAO,IAAI,SAAS,wBAAwB,KAAK;AAAA,MACnD,GAAG;AAAA,IACL;AACA,WAAO;AAAA,EACT;AAEA,SAAO,OAAO,SAAuC;AACnD,UAAM,OAAO,MAAM,QAAQ;AAC3B,UAAM,MAAO,MAAM,KAAK,MAAM,EAAE,sBAAsB,SAAS,CAAC;AAChE,WAAO,IAAI,IAAI,CAAC,OAAO;AAAA,MACrB,cAAc,EAAE,gBAAgB,EAAE,UAAU;AAAA,MAC5C,OAAO,OAAO,EAAE,UAAU,WAAW,EAAE,QAAQ;AAAA,MAC/C,OAAO,EAAE;AAAA,MACT,KAAK,EAAE;AAAA,MACP,MAAM,EAAE;AAAA,IACV,EAAE;AAAA,EACJ;AACF;;;AHjCO,SAAS,kBAAkB,UAAsB,CAAC,GAAgB;AACvE,QAAM,QAAQ,QAAQ,SAAS,wBAAwB,OAAO;AAC9D,SAAO,cAAc,OAAO,OAAO;AACrC;","names":[]}
|
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
type NerLabel = 'NAME' | 'ORG' | 'LOCATION';
|
|
2
|
+
declare const NER_LABELS: readonly NerLabel[];
|
|
3
|
+
/**
|
|
4
|
+
* A raw entity as produced by a token-classification pipeline
|
|
5
|
+
* (transformers.js `aggregation_strategy: 'simple'`) or any compatible inferer.
|
|
6
|
+
* Offsets are absolute char positions into the input; entities without offsets
|
|
7
|
+
* are skipped (we never trust the reconstructed `word`, which can carry subword
|
|
8
|
+
* artifacts — we always slice the original text by [start, end)).
|
|
9
|
+
*/
|
|
10
|
+
interface RawEntity {
|
|
11
|
+
entity_group: string;
|
|
12
|
+
score: number;
|
|
13
|
+
start?: number;
|
|
14
|
+
end?: number;
|
|
15
|
+
word?: string;
|
|
16
|
+
}
|
|
17
|
+
type InferFn = (text: string) => Promise<RawEntity[]>;
|
|
18
|
+
interface NerMatch {
|
|
19
|
+
type: NerLabel;
|
|
20
|
+
value: string;
|
|
21
|
+
start: number;
|
|
22
|
+
end: number;
|
|
23
|
+
score: number;
|
|
24
|
+
}
|
|
25
|
+
/** Minimal vault surface — structurally compatible with @raeven-co/sether's Vault. */
|
|
26
|
+
interface VaultLike {
|
|
27
|
+
set(token: string, value: string): void;
|
|
28
|
+
}
|
|
29
|
+
interface NerOptions {
|
|
30
|
+
/** Model id for the default transformers.js inferer. Default: 'Xenova/bert-base-NER'. */
|
|
31
|
+
model?: string;
|
|
32
|
+
/** Minimum confidence to keep a match. Default: 0.6. */
|
|
33
|
+
threshold?: number;
|
|
34
|
+
/** Which entity types to redact. Default: NAME, ORG, LOCATION. */
|
|
35
|
+
labels?: readonly NerLabel[];
|
|
36
|
+
/** Inject a custom inferer (GLiNER, a remote service, or a test mock). */
|
|
37
|
+
infer?: InferFn;
|
|
38
|
+
/** Token id generator. Default: node:crypto randomUUID. */
|
|
39
|
+
uuid?: () => string;
|
|
40
|
+
}
|
|
41
|
+
interface RedactResult {
|
|
42
|
+
redacted: string;
|
|
43
|
+
matches: NerMatch[];
|
|
44
|
+
}
|
|
45
|
+
interface NerRedactor {
|
|
46
|
+
/** Detect names / organisations / locations in the text. */
|
|
47
|
+
detect(text: string): Promise<NerMatch[]>;
|
|
48
|
+
/**
|
|
49
|
+
* Redact entities into `<TYPE_uuid>` tokens (the same format @raeven-co/sether
|
|
50
|
+
* uses), storing originals in the vault so the core `restore()` swaps them back.
|
|
51
|
+
* Runs on the full OUTBOUND text — not the response byte-stream.
|
|
52
|
+
*/
|
|
53
|
+
redact(text: string, opts: {
|
|
54
|
+
vault: VaultLike;
|
|
55
|
+
}): Promise<RedactResult>;
|
|
56
|
+
/** Preload the model so the first real call isn't slow. */
|
|
57
|
+
warmup(): Promise<void>;
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
/**
|
|
61
|
+
* Build a redactor from an injected inferer. This is the pure, model-free core —
|
|
62
|
+
* fully unit-testable with a mock `infer`. The default model wiring lives in
|
|
63
|
+
* index.ts so this module carries no heavy dependency.
|
|
64
|
+
*/
|
|
65
|
+
declare function buildRedactor(infer: InferFn, options?: NerOptions): NerRedactor;
|
|
66
|
+
|
|
67
|
+
/**
|
|
68
|
+
* The default inferer: a lazy, memoized transformers.js token-classification
|
|
69
|
+
* pipeline. The ~30 MB+ model is only fetched on first inference, so importing
|
|
70
|
+
* this module stays cheap. Swap in GLiNER or a remote service by passing your
|
|
71
|
+
* own `infer` to createNerRedactor().
|
|
72
|
+
*/
|
|
73
|
+
declare function createTransformersInfer(options?: NerOptions): InferFn;
|
|
74
|
+
|
|
75
|
+
/**
|
|
76
|
+
* Create a free-text NER redactor for names, organisations, and locations.
|
|
77
|
+
*
|
|
78
|
+
* ```ts
|
|
79
|
+
* import { Sether, redactSync, basicDetectors } from '@raeven-co/sether';
|
|
80
|
+
* import { createNerRedactor } from '@raeven-co/sether-ner';
|
|
81
|
+
*
|
|
82
|
+
* const sether = new Sether();
|
|
83
|
+
* const ner = createNerRedactor(); // uses Xenova/bert-base-NER, lazy-loaded
|
|
84
|
+
*
|
|
85
|
+
* // Outbound: NER first (names/orgs/locations), then structured PII — same vault.
|
|
86
|
+
* const { redacted } = await ner.redact(prompt, { vault: sether.vault });
|
|
87
|
+
* const safe = redactSync(redacted, { detectors: basicDetectors, vault: sether.vault });
|
|
88
|
+
* // ...send `safe` to the LLM. sether.restore() then swaps BOTH token sets back,
|
|
89
|
+
* // because NER tokens use the same `<TYPE_uuid>` format.
|
|
90
|
+
* ```
|
|
91
|
+
*
|
|
92
|
+
* Pass `{ infer }` to use GLiNER or any other model. NER runs on the full
|
|
93
|
+
* outbound text — not the streaming response.
|
|
94
|
+
*/
|
|
95
|
+
declare function createNerRedactor(options?: NerOptions): NerRedactor;
|
|
96
|
+
|
|
97
|
+
export { type InferFn, NER_LABELS, type NerLabel, type NerMatch, type NerOptions, type NerRedactor, type RawEntity, type RedactResult, type VaultLike, buildRedactor, createNerRedactor, createTransformersInfer };
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
type NerLabel = 'NAME' | 'ORG' | 'LOCATION';
|
|
2
|
+
declare const NER_LABELS: readonly NerLabel[];
|
|
3
|
+
/**
|
|
4
|
+
* A raw entity as produced by a token-classification pipeline
|
|
5
|
+
* (transformers.js `aggregation_strategy: 'simple'`) or any compatible inferer.
|
|
6
|
+
* Offsets are absolute char positions into the input; entities without offsets
|
|
7
|
+
* are skipped (we never trust the reconstructed `word`, which can carry subword
|
|
8
|
+
* artifacts — we always slice the original text by [start, end)).
|
|
9
|
+
*/
|
|
10
|
+
interface RawEntity {
|
|
11
|
+
entity_group: string;
|
|
12
|
+
score: number;
|
|
13
|
+
start?: number;
|
|
14
|
+
end?: number;
|
|
15
|
+
word?: string;
|
|
16
|
+
}
|
|
17
|
+
type InferFn = (text: string) => Promise<RawEntity[]>;
|
|
18
|
+
interface NerMatch {
|
|
19
|
+
type: NerLabel;
|
|
20
|
+
value: string;
|
|
21
|
+
start: number;
|
|
22
|
+
end: number;
|
|
23
|
+
score: number;
|
|
24
|
+
}
|
|
25
|
+
/** Minimal vault surface — structurally compatible with @raeven-co/sether's Vault. */
|
|
26
|
+
interface VaultLike {
|
|
27
|
+
set(token: string, value: string): void;
|
|
28
|
+
}
|
|
29
|
+
interface NerOptions {
|
|
30
|
+
/** Model id for the default transformers.js inferer. Default: 'Xenova/bert-base-NER'. */
|
|
31
|
+
model?: string;
|
|
32
|
+
/** Minimum confidence to keep a match. Default: 0.6. */
|
|
33
|
+
threshold?: number;
|
|
34
|
+
/** Which entity types to redact. Default: NAME, ORG, LOCATION. */
|
|
35
|
+
labels?: readonly NerLabel[];
|
|
36
|
+
/** Inject a custom inferer (GLiNER, a remote service, or a test mock). */
|
|
37
|
+
infer?: InferFn;
|
|
38
|
+
/** Token id generator. Default: node:crypto randomUUID. */
|
|
39
|
+
uuid?: () => string;
|
|
40
|
+
}
|
|
41
|
+
interface RedactResult {
|
|
42
|
+
redacted: string;
|
|
43
|
+
matches: NerMatch[];
|
|
44
|
+
}
|
|
45
|
+
interface NerRedactor {
|
|
46
|
+
/** Detect names / organisations / locations in the text. */
|
|
47
|
+
detect(text: string): Promise<NerMatch[]>;
|
|
48
|
+
/**
|
|
49
|
+
* Redact entities into `<TYPE_uuid>` tokens (the same format @raeven-co/sether
|
|
50
|
+
* uses), storing originals in the vault so the core `restore()` swaps them back.
|
|
51
|
+
* Runs on the full OUTBOUND text — not the response byte-stream.
|
|
52
|
+
*/
|
|
53
|
+
redact(text: string, opts: {
|
|
54
|
+
vault: VaultLike;
|
|
55
|
+
}): Promise<RedactResult>;
|
|
56
|
+
/** Preload the model so the first real call isn't slow. */
|
|
57
|
+
warmup(): Promise<void>;
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
/**
|
|
61
|
+
* Build a redactor from an injected inferer. This is the pure, model-free core —
|
|
62
|
+
* fully unit-testable with a mock `infer`. The default model wiring lives in
|
|
63
|
+
* index.ts so this module carries no heavy dependency.
|
|
64
|
+
*/
|
|
65
|
+
declare function buildRedactor(infer: InferFn, options?: NerOptions): NerRedactor;
|
|
66
|
+
|
|
67
|
+
/**
|
|
68
|
+
* The default inferer: a lazy, memoized transformers.js token-classification
|
|
69
|
+
* pipeline. The ~30 MB+ model is only fetched on first inference, so importing
|
|
70
|
+
* this module stays cheap. Swap in GLiNER or a remote service by passing your
|
|
71
|
+
* own `infer` to createNerRedactor().
|
|
72
|
+
*/
|
|
73
|
+
declare function createTransformersInfer(options?: NerOptions): InferFn;
|
|
74
|
+
|
|
75
|
+
/**
|
|
76
|
+
* Create a free-text NER redactor for names, organisations, and locations.
|
|
77
|
+
*
|
|
78
|
+
* ```ts
|
|
79
|
+
* import { Sether, redactSync, basicDetectors } from '@raeven-co/sether';
|
|
80
|
+
* import { createNerRedactor } from '@raeven-co/sether-ner';
|
|
81
|
+
*
|
|
82
|
+
* const sether = new Sether();
|
|
83
|
+
* const ner = createNerRedactor(); // uses Xenova/bert-base-NER, lazy-loaded
|
|
84
|
+
*
|
|
85
|
+
* // Outbound: NER first (names/orgs/locations), then structured PII — same vault.
|
|
86
|
+
* const { redacted } = await ner.redact(prompt, { vault: sether.vault });
|
|
87
|
+
* const safe = redactSync(redacted, { detectors: basicDetectors, vault: sether.vault });
|
|
88
|
+
* // ...send `safe` to the LLM. sether.restore() then swaps BOTH token sets back,
|
|
89
|
+
* // because NER tokens use the same `<TYPE_uuid>` format.
|
|
90
|
+
* ```
|
|
91
|
+
*
|
|
92
|
+
* Pass `{ infer }` to use GLiNER or any other model. NER runs on the full
|
|
93
|
+
* outbound text — not the streaming response.
|
|
94
|
+
*/
|
|
95
|
+
declare function createNerRedactor(options?: NerOptions): NerRedactor;
|
|
96
|
+
|
|
97
|
+
export { type InferFn, NER_LABELS, type NerLabel, type NerMatch, type NerOptions, type NerRedactor, type RawEntity, type RedactResult, type VaultLike, buildRedactor, createNerRedactor, createTransformersInfer };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
// src/redactor.ts
|
|
2
|
+
import { randomUUID } from "crypto";
|
|
3
|
+
|
|
4
|
+
// src/types.ts
|
|
5
|
+
// The entity types this package emits. buildRedactor() enables all of them
// when the caller does not pass `options.labels`.
var NER_LABELS = ["NAME", "ORG", "LOCATION"];
|
|
6
|
+
|
|
7
|
+
// src/redactor.ts
|
|
8
|
+
/**
 * Lookup from a model's entity-group name (upper-cased, tagging prefix removed)
 * to one of the three label types this package emits.
 */
const LABEL_MAP = {
  PER: "NAME",
  PERSON: "NAME",
  NAME: "NAME",
  ORG: "ORG",
  ORGANIZATION: "ORG",
  ORGANISATION: "ORG",
  LOC: "LOCATION",
  LOCATION: "LOCATION",
  GPE: "LOCATION",
};

// Per-token tagging prefixes: BIO (`B-`, `I-`) plus the BILOU / IOBES variants
// (`L-`, `U-`, `E-`, `S-`) that some models emit instead.
const TAG_PREFIX = /^[BILUES]-/;

/**
 * Normalise a raw entity-group string to a package label.
 *
 * The lookup is case-insensitive, ignores surrounding whitespace and strips a
 * single leading tagging prefix, so "B-PER", "person" and "U-PER" all map to
 * "NAME".
 *
 * @param {string} group - Entity group as reported by the inferer.
 * @returns {"NAME" | "ORG" | "LOCATION" | null} The mapped label, or `null` when
 *   the group is unknown or is not a string (a malformed entity is treated as
 *   unmapped rather than crashing the whole detection pass).
 */
function mapLabel(group) {
  if (typeof group !== "string") return null;
  const key = group.trim().toUpperCase().replace(TAG_PREFIX, "");
  // Own-property check so inherited Object.prototype members can never be
  // returned as if they were labels.
  return Object.hasOwn(LABEL_MAP, key) ? LABEL_MAP[key] : null;
}
|
|
22
|
+
/**
 * Assemble a redactor around an injected inference function.
 *
 * @param {(text: string) => Promise<object[]>} infer - Returns raw entities for a text.
 * @param {object} [options]
 * @param {number} [options.threshold] - Minimum score to keep a match; 0.6 when absent.
 * @param {readonly string[]} [options.labels] - Label types to keep; all of NER_LABELS when absent.
 * @param {() => string} [options.uuid] - Token id generator; `randomUUID` when absent.
 * @returns {{ detect: Function, redact: Function, warmup: Function }}
 */
function buildRedactor(infer, options = {}) {
  const minScore = options.threshold ?? 0.6;
  const activeLabels = new Set(options.labels ?? NER_LABELS);
  const nextId = options.uuid ?? randomUUID;

  // Whitespace-only input short-circuits without calling the inferer.
  const detect = async (text) => {
    if (text.trim().length === 0) return [];
    const entities = await infer(text);
    const candidates = toMatches(entities, text, minScore, activeLabels);
    return resolveOverlaps(candidates);
  };

  // Replaces every detected span with a `<TYPE_id>` token and records the
  // original value in `opts.vault`. The vault is only touched when at least
  // one match was found.
  const redact = async (text, opts) => {
    const matches = await detect(text);
    if (matches.length === 0) return { redacted: text, matches };

    const pieces = [];
    let cursor = 0;
    for (const { type, value, start, end } of matches) {
      pieces.push(text.slice(cursor, start));
      const token = `<${type}_${nextId()}>`;
      opts.vault.set(token, value);
      pieces.push(token);
      cursor = end;
    }
    pieces.push(text.slice(cursor));
    return { redacted: pieces.join(""), matches };
  };

  // Runs one throwaway inference so the inferer can do its first-call setup early.
  const warmup = async () => {
    await infer("Warm up the model.");
  };

  return { detect, redact, warmup };
}
|
|
51
|
+
/**
 * Convert raw inferer entities into validated matches against `text`.
 *
 * An entity is dropped when any of the following holds:
 *  - its score is not a number that is >= `threshold` (this includes NaN and a
 *    missing score, which a plain `score < threshold` test would let through);
 *  - its offsets are missing, non-integer, negative, or describe an empty span
 *    once `end` is clamped to the text length;
 *  - its entity group is not a string, is unknown, or maps to a disabled label;
 *  - the covered text is only whitespace.
 *
 * The value is always sliced from the original text rather than taken from the
 * entity's `word` field.
 *
 * @param {object[]} raw - Entities with `entity_group`, `score`, `start`, `end`.
 * @param {string} text - The text the offsets refer to.
 * @param {number} threshold - Minimum score to keep.
 * @param {Set<string>} enabled - Label types to keep.
 * @returns {{ type: string, value: string, start: number, end: number, score: number }[]}
 */
function toMatches(raw, text, threshold, enabled) {
  const out = [];
  for (const e of raw) {
    // Written as a negated `>=` so that NaN / undefined scores fail the check.
    if (!(e.score >= threshold)) continue;
    if (!Number.isInteger(e.start) || !Number.isInteger(e.end)) continue;

    const start = e.start;
    // Clamp so the reported `end` always agrees with the sliced `value`.
    const end = Math.min(e.end, text.length);
    if (start < 0 || end <= start) continue;

    if (typeof e.entity_group !== "string") continue;
    const type = mapLabel(e.entity_group);
    if (!type || !enabled.has(type)) continue;

    const value = text.slice(start, end);
    if (!value.trim()) continue;

    out.push({ type, value, start, end, score: e.score });
  }
  return out;
}
|
|
64
|
+
/**
 * Reduce a list of matches to a non-overlapping set.
 *
 * When two spans overlap, the one with the higher score is kept; ties go to the
 * longer span, then to the one that starts first. Candidates are therefore
 * ranked by score before the greedy pass. Ranking by start position first
 * would let an early low-confidence span shadow a later high-confidence one.
 *
 * The input array is not modified.
 *
 * @param {{ start: number, end: number, score: number }[]} matches
 * @returns {object[]} The surviving matches, ordered by ascending `start`.
 */
function resolveOverlaps(matches) {
  const spanLength = (m) => m.end - m.start;
  const ranked = [...matches].sort(
    (a, b) => b.score - a.score || spanLength(b) - spanLength(a) || a.start - b.start,
  );

  const kept = [];
  for (const candidate of ranked) {
    // Half-open intervals [start, end): touching spans do not overlap.
    const clashes = kept.some((k) => candidate.start < k.end && k.start < candidate.end);
    if (!clashes) kept.push(candidate);
  }
  return kept.sort((a, b) => a.start - b.start);
}
|
|
72
|
+
|
|
73
|
+
// src/infer.ts
|
|
74
|
+
// Built with the Function constructor on purpose: the specifier stays invisible
// to bundlers, so the optional peer is only resolved at runtime if the default
// inferer actually runs.
// NOTE(review): this is eval-like; it will presumably be rejected in
// environments that forbid dynamic code evaluation — confirm for your targets.
const dynamicImport = (specifier) => Function("s", "return import(s)")(specifier);

/**
 * Create the default inferer: a lazily loaded, memoized token-classification
 * pipeline from `@huggingface/transformers`.
 *
 * Nothing is imported until the returned function is first called. A failed
 * load is not cached, so a later call retries instead of replaying the same
 * rejection forever.
 *
 * @param {object} [options]
 * @param {string} [options.model] - Model id; "Xenova/bert-base-NER" when absent.
 * @returns {(text: string) => Promise<object[]>} Inferer yielding entities with
 *   `entity_group`, `score`, `start`, `end` and `word`.
 * @throws {Error} (from the returned function) when the optional peer cannot be
 *   imported; the underlying failure is available as `error.cause`.
 */
function createTransformersInfer(options = {}) {
  const model = options.model ?? "Xenova/bert-base-NER";
  let pipePromise = null;

  // Import the optional peer and build the pipeline.
  async function loadPipe() {
    let mod;
    try {
      mod = await dynamicImport("@huggingface/transformers");
    } catch (err) {
      // Keep the original failure attached: the import can fail for reasons
      // other than "package not installed".
      throw new Error(
        "@raeven-co/sether-ner: install the optional peer '@huggingface/transformers' to use the default model, or pass your own { infer } to createNerRedactor().",
        { cause: err },
      );
    }
    return mod.pipeline("token-classification", model);
  }

  // Memoize the in-flight / successful load, but forget a rejected one.
  function getPipe() {
    if (!pipePromise) {
      pipePromise = loadPipe().catch((err) => {
        pipePromise = null;
        throw err;
      });
    }
    return pipePromise;
  }

  return async (text) => {
    const pipe = await getPipe();
    // NOTE(review): toMatches() drops entities that lack numeric start/end.
    // Confirm the installed transformers.js version populates offsets and
    // honours `aggregation_strategy`; otherwise the default path redacts nothing.
    const out = await pipe(text, { aggregation_strategy: "simple" });
    return out.map((e) => ({
      entity_group: e.entity_group ?? e.entity ?? "MISC",
      score: typeof e.score === "number" ? e.score : 0,
      start: e.start,
      end: e.end,
      word: e.word,
    }));
  };
}
|
|
106
|
+
|
|
107
|
+
// src/index.ts
|
|
108
|
+
/**
 * Public entry point: build a redactor from `options`.
 *
 * Uses `options.infer` when the caller supplies one; otherwise wires in the
 * default transformers-based inferer created from the same options.
 *
 * @param {object} [options] - Passed through to both the inferer factory and buildRedactor().
 * @returns {{ detect: Function, redact: Function, warmup: Function }}
 */
function createNerRedactor(options = {}) {
  const { infer: customInfer } = options;
  return buildRedactor(customInfer ?? createTransformersInfer(options), options);
}
|
|
112
|
+
export {
|
|
113
|
+
NER_LABELS,
|
|
114
|
+
buildRedactor,
|
|
115
|
+
createNerRedactor,
|
|
116
|
+
createTransformersInfer
|
|
117
|
+
};
|
|
118
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/redactor.ts","../src/types.ts","../src/infer.ts","../src/index.ts"],"sourcesContent":["import { randomUUID } from 'node:crypto';\nimport {\n NER_LABELS,\n type InferFn,\n type NerLabel,\n type NerMatch,\n type NerOptions,\n type NerRedactor,\n type RawEntity,\n type RedactResult,\n type VaultLike,\n} from './types.js';\n\n// Map model label groups (and B-/I- prefixes) to Sether's three NER types.\nconst LABEL_MAP: Record<string, NerLabel> = {\n PER: 'NAME',\n PERSON: 'NAME',\n NAME: 'NAME',\n ORG: 'ORG',\n ORGANIZATION: 'ORG',\n ORGANISATION: 'ORG',\n LOC: 'LOCATION',\n LOCATION: 'LOCATION',\n GPE: 'LOCATION',\n};\n\nfunction mapLabel(group: string): NerLabel | null {\n return LABEL_MAP[group.toUpperCase().replace(/^[BI]-/, '')] ?? null;\n}\n\n/**\n * Build a redactor from an injected inferer. This is the pure, model-free core —\n * fully unit-testable with a mock `infer`. The default model wiring lives in\n * index.ts so this module carries no heavy dependency.\n */\nexport function buildRedactor(infer: InferFn, options: NerOptions = {}): NerRedactor {\n const threshold = options.threshold ?? 0.6;\n const enabled = new Set<NerLabel>(options.labels ?? NER_LABELS);\n const uuid = options.uuid ?? 
randomUUID;\n\n async function detect(text: string): Promise<NerMatch[]> {\n if (!text.trim()) return [];\n const raw = await infer(text);\n return resolveOverlaps(toMatches(raw, text, threshold, enabled));\n }\n\n async function redact(text: string, opts: { vault: VaultLike }): Promise<RedactResult> {\n const matches = await detect(text);\n if (matches.length === 0) return { redacted: text, matches };\n\n let out = '';\n let pos = 0;\n for (const m of matches) {\n out += text.slice(pos, m.start);\n const token = `<${m.type}_${uuid()}>`;\n opts.vault.set(token, m.value);\n out += token;\n pos = m.end;\n }\n out += text.slice(pos);\n return { redacted: out, matches };\n }\n\n async function warmup(): Promise<void> {\n await infer('Warm up the model.');\n }\n\n return { detect, redact, warmup };\n}\n\nfunction toMatches(raw: RawEntity[], text: string, threshold: number, enabled: Set<NerLabel>): NerMatch[] {\n const out: NerMatch[] = [];\n for (const e of raw) {\n if (e.score < threshold) continue;\n if (e.start == null || e.end == null || e.end <= e.start) continue;\n const type = mapLabel(e.entity_group);\n if (!type || !enabled.has(type)) continue;\n const value = text.slice(e.start, e.end);\n if (!value.trim()) continue;\n out.push({ type, value, start: e.start, end: e.end, score: e.score });\n }\n return out;\n}\n\n// Keep the higher-score (then longer) match when spans overlap.\nfunction resolveOverlaps(matches: NerMatch[]): NerMatch[] {\n matches.sort((a, b) => a.start - b.start || b.score - a.score || b.end - b.start - (a.end - a.start));\n const out: NerMatch[] = [];\n for (const m of matches) {\n if (!out.some((o) => m.start < o.end && o.start < m.end)) out.push(m);\n }\n return out.sort((a, b) => a.start - b.start);\n}\n","export type NerLabel = 'NAME' | 'ORG' | 'LOCATION';\n\nexport const NER_LABELS: readonly NerLabel[] = ['NAME', 'ORG', 'LOCATION'];\n\n/**\n * A raw entity as produced by a token-classification pipeline\n * (transformers.js 
`aggregation_strategy: 'simple'`) or any compatible inferer.\n * Offsets are absolute char positions into the input; entities without offsets\n * are skipped (we never trust the reconstructed `word`, which can carry subword\n * artifacts — we always slice the original text by [start, end)).\n */\nexport interface RawEntity {\n entity_group: string; // e.g. 'PER' | 'ORG' | 'LOC' | 'person' | 'organization'\n score: number; // 0..1\n start?: number;\n end?: number;\n word?: string;\n}\n\nexport type InferFn = (text: string) => Promise<RawEntity[]>;\n\nexport interface NerMatch {\n type: NerLabel;\n value: string;\n start: number;\n end: number;\n score: number;\n}\n\n/** Minimal vault surface — structurally compatible with @raeven-co/sether's Vault. */\nexport interface VaultLike {\n set(token: string, value: string): void;\n}\n\nexport interface NerOptions {\n /** Model id for the default transformers.js inferer. Default: 'Xenova/bert-base-NER'. */\n model?: string;\n /** Minimum confidence to keep a match. Default: 0.6. */\n threshold?: number;\n /** Which entity types to redact. Default: NAME, ORG, LOCATION. */\n labels?: readonly NerLabel[];\n /** Inject a custom inferer (GLiNER, a remote service, or a test mock). */\n infer?: InferFn;\n /** Token id generator. Default: node:crypto randomUUID. */\n uuid?: () => string;\n}\n\nexport interface RedactResult {\n redacted: string;\n matches: NerMatch[];\n}\n\nexport interface NerRedactor {\n /** Detect names / organisations / locations in the text. */\n detect(text: string): Promise<NerMatch[]>;\n /**\n * Redact entities into `<TYPE_uuid>` tokens (the same format @raeven-co/sether\n * uses), storing originals in the vault so the core `restore()` swaps them back.\n * Runs on the full OUTBOUND text — not the response byte-stream.\n */\n redact(text: string, opts: { vault: VaultLike }): Promise<RedactResult>;\n /** Preload the model so the first real call isn't slow. 
*/\n warmup(): Promise<void>;\n}\n","import type { InferFn, NerOptions, RawEntity } from './types.js';\n\ntype Pipe = (text: string, opts: { aggregation_strategy: string }) => Promise<unknown>;\n\ninterface RawPipelineEntity {\n entity_group?: string;\n entity?: string;\n score?: number;\n start?: number;\n end?: number;\n word?: string;\n}\n\n// Hide the specifier from the bundler and the type-checker so\n// @huggingface/transformers stays a TRULY optional peer dependency — it's only\n// needed at runtime if the default inferer actually runs.\nconst dynamicImport = (specifier: string): Promise<unknown> =>\n (Function('s', 'return import(s)') as (s: string) => Promise<unknown>)(specifier);\n\n/**\n * The default inferer: a lazy, memoized transformers.js token-classification\n * pipeline. The ~30 MB+ model is only fetched on first inference, so importing\n * this module stays cheap. Swap in GLiNER or a remote service by passing your\n * own `infer` to createNerRedactor().\n */\nexport function createTransformersInfer(options: NerOptions = {}): InferFn {\n const model = options.model ?? 'Xenova/bert-base-NER';\n let pipePromise: Promise<Pipe> | null = null;\n\n async function getPipe(): Promise<Pipe> {\n if (!pipePromise) {\n pipePromise = (async () => {\n let mod: { pipeline: (task: string, model: string) => Promise<Pipe> };\n try {\n mod = (await dynamicImport('@huggingface/transformers')) as typeof mod;\n } catch {\n throw new Error(\n \"@raeven-co/sether-ner: install the optional peer '@huggingface/transformers' to use the default model, or pass your own { infer } to createNerRedactor().\",\n );\n }\n return mod.pipeline('token-classification', model);\n })();\n }\n return pipePromise;\n }\n\n return async (text: string): Promise<RawEntity[]> => {\n const pipe = await getPipe();\n const out = (await pipe(text, { aggregation_strategy: 'simple' })) as RawPipelineEntity[];\n return out.map((e) => ({\n entity_group: e.entity_group ?? e.entity ?? 
'MISC',\n score: typeof e.score === 'number' ? e.score : 0,\n start: e.start,\n end: e.end,\n word: e.word,\n }));\n };\n}\n","import { buildRedactor } from './redactor.js';\nimport { createTransformersInfer } from './infer.js';\nimport type { NerOptions, NerRedactor } from './types.js';\n\n/**\n * Create a free-text NER redactor for names, organisations, and locations.\n *\n * ```ts\n * import { Sether, redactSync, basicDetectors } from '@raeven-co/sether';\n * import { createNerRedactor } from '@raeven-co/sether-ner';\n *\n * const sether = new Sether();\n * const ner = createNerRedactor(); // uses Xenova/bert-base-NER, lazy-loaded\n *\n * // Outbound: NER first (names/orgs/locations), then structured PII — same vault.\n * const { redacted } = await ner.redact(prompt, { vault: sether.vault });\n * const safe = redactSync(redacted, { detectors: basicDetectors, vault: sether.vault });\n * // ...send `safe` to the LLM. sether.restore() then swaps BOTH token sets back,\n * // because NER tokens use the same `<TYPE_uuid>` format.\n * ```\n *\n * Pass `{ infer }` to use GLiNER or any other model. NER runs on the full\n * outbound text — not the streaming response.\n */\nexport function createNerRedactor(options: NerOptions = {}): NerRedactor {\n const infer = options.infer ?? 
createTransformersInfer(options);\n return buildRedactor(infer, options);\n}\n\nexport { buildRedactor } from './redactor.js';\nexport { createTransformersInfer } from './infer.js';\nexport { NER_LABELS } from './types.js';\nexport type {\n NerLabel,\n NerMatch,\n NerOptions,\n NerRedactor,\n InferFn,\n RawEntity,\n VaultLike,\n RedactResult,\n} from './types.js';\n"],"mappings":";AAAA,SAAS,kBAAkB;;;ACEpB,IAAM,aAAkC,CAAC,QAAQ,OAAO,UAAU;;;ADYzE,IAAM,YAAsC;AAAA,EAC1C,KAAK;AAAA,EACL,QAAQ;AAAA,EACR,MAAM;AAAA,EACN,KAAK;AAAA,EACL,cAAc;AAAA,EACd,cAAc;AAAA,EACd,KAAK;AAAA,EACL,UAAU;AAAA,EACV,KAAK;AACP;AAEA,SAAS,SAAS,OAAgC;AAChD,SAAO,UAAU,MAAM,YAAY,EAAE,QAAQ,UAAU,EAAE,CAAC,KAAK;AACjE;AAOO,SAAS,cAAc,OAAgB,UAAsB,CAAC,GAAgB;AACnF,QAAM,YAAY,QAAQ,aAAa;AACvC,QAAM,UAAU,IAAI,IAAc,QAAQ,UAAU,UAAU;AAC9D,QAAM,OAAO,QAAQ,QAAQ;AAE7B,iBAAe,OAAO,MAAmC;AACvD,QAAI,CAAC,KAAK,KAAK,EAAG,QAAO,CAAC;AAC1B,UAAM,MAAM,MAAM,MAAM,IAAI;AAC5B,WAAO,gBAAgB,UAAU,KAAK,MAAM,WAAW,OAAO,CAAC;AAAA,EACjE;AAEA,iBAAe,OAAO,MAAc,MAAmD;AACrF,UAAM,UAAU,MAAM,OAAO,IAAI;AACjC,QAAI,QAAQ,WAAW,EAAG,QAAO,EAAE,UAAU,MAAM,QAAQ;AAE3D,QAAI,MAAM;AACV,QAAI,MAAM;AACV,eAAW,KAAK,SAAS;AACvB,aAAO,KAAK,MAAM,KAAK,EAAE,KAAK;AAC9B,YAAM,QAAQ,IAAI,EAAE,IAAI,IAAI,KAAK,CAAC;AAClC,WAAK,MAAM,IAAI,OAAO,EAAE,KAAK;AAC7B,aAAO;AACP,YAAM,EAAE;AAAA,IACV;AACA,WAAO,KAAK,MAAM,GAAG;AACrB,WAAO,EAAE,UAAU,KAAK,QAAQ;AAAA,EAClC;AAEA,iBAAe,SAAwB;AACrC,UAAM,MAAM,oBAAoB;AAAA,EAClC;AAEA,SAAO,EAAE,QAAQ,QAAQ,OAAO;AAClC;AAEA,SAAS,UAAU,KAAkB,MAAc,WAAmB,SAAoC;AACxG,QAAM,MAAkB,CAAC;AACzB,aAAW,KAAK,KAAK;AACnB,QAAI,EAAE,QAAQ,UAAW;AACzB,QAAI,EAAE,SAAS,QAAQ,EAAE,OAAO,QAAQ,EAAE,OAAO,EAAE,MAAO;AAC1D,UAAM,OAAO,SAAS,EAAE,YAAY;AACpC,QAAI,CAAC,QAAQ,CAAC,QAAQ,IAAI,IAAI,EAAG;AACjC,UAAM,QAAQ,KAAK,MAAM,EAAE,OAAO,EAAE,GAAG;AACvC,QAAI,CAAC,MAAM,KAAK,EAAG;AACnB,QAAI,KAAK,EAAE,MAAM,OAAO,OAAO,EAAE,OAAO,KAAK,EAAE,KAAK,OAAO,EAAE,MAAM,CAAC;AAAA,EACtE;AACA,SAAO;AACT;AAGA,SAAS,gBAAgB,SAAiC;AACxD,UAAQ,KAAK,CAAC,GAAG,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM;A
ACpG,QAAM,MAAkB,CAAC;AACzB,aAAW,KAAK,SAAS;AACvB,QAAI,CAAC,IAAI,KAAK,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,GAAG,EAAG,KAAI,KAAK,CAAC;AAAA,EACtE;AACA,SAAO,IAAI,KAAK,CAAC,GAAG,MAAM,EAAE,QAAQ,EAAE,KAAK;AAC7C;;;AE5EA,IAAM,gBAAgB,CAAC,cACpB,SAAS,KAAK,kBAAkB,EAAsC,SAAS;AAQ3E,SAAS,wBAAwB,UAAsB,CAAC,GAAY;AACzE,QAAM,QAAQ,QAAQ,SAAS;AAC/B,MAAI,cAAoC;AAExC,iBAAe,UAAyB;AACtC,QAAI,CAAC,aAAa;AAChB,qBAAe,YAAY;AACzB,YAAI;AACJ,YAAI;AACF,gBAAO,MAAM,cAAc,2BAA2B;AAAA,QACxD,QAAQ;AACN,gBAAM,IAAI;AAAA,YACR;AAAA,UACF;AAAA,QACF;AACA,eAAO,IAAI,SAAS,wBAAwB,KAAK;AAAA,MACnD,GAAG;AAAA,IACL;AACA,WAAO;AAAA,EACT;AAEA,SAAO,OAAO,SAAuC;AACnD,UAAM,OAAO,MAAM,QAAQ;AAC3B,UAAM,MAAO,MAAM,KAAK,MAAM,EAAE,sBAAsB,SAAS,CAAC;AAChE,WAAO,IAAI,IAAI,CAAC,OAAO;AAAA,MACrB,cAAc,EAAE,gBAAgB,EAAE,UAAU;AAAA,MAC5C,OAAO,OAAO,EAAE,UAAU,WAAW,EAAE,QAAQ;AAAA,MAC/C,OAAO,EAAE;AAAA,MACT,KAAK,EAAE;AAAA,MACP,MAAM,EAAE;AAAA,IACV,EAAE;AAAA,EACJ;AACF;;;ACjCO,SAAS,kBAAkB,UAAsB,CAAC,GAAgB;AACvE,QAAM,QAAQ,QAAQ,SAAS,wBAAwB,OAAO;AAC9D,SAAO,cAAc,OAAO,OAAO;AACrC;","names":[]}
|
package/package.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@raeven-co/sether-ner",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Free-text NER redaction (names, organisations, locations) for Sether. Lazy-loaded ONNX model via transformers.js. Tokens restore through @raeven-co/sether's vault.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/index.cjs",
|
|
7
|
+
"module": "./dist/index.js",
|
|
8
|
+
"types": "./dist/index.d.ts",
|
|
9
|
+
"exports": {
|
|
10
|
+
".": {
|
|
11
|
+
"types": "./dist/index.d.ts",
|
|
12
|
+
"import": "./dist/index.js",
|
|
13
|
+
"require": "./dist/index.cjs"
|
|
14
|
+
}
|
|
15
|
+
},
|
|
16
|
+
"files": ["dist", "README.md"],
|
|
17
|
+
"scripts": {
|
|
18
|
+
"build": "tsup",
|
|
19
|
+
"typecheck": "tsc --noEmit",
|
|
20
|
+
"test": "node test/redactor.test.mjs"
|
|
21
|
+
},
|
|
22
|
+
"keywords": ["pii", "ner", "redaction", "sether", "llm", "privacy", "onnx", "transformers", "gdpr"],
|
|
23
|
+
"author": "Godfrey Lebo <emorylebo@gmail.com> (Raeven, Inc.)",
|
|
24
|
+
"license": "MIT",
|
|
25
|
+
"repository": {
|
|
26
|
+
"type": "git",
|
|
27
|
+
"url": "git+https://github.com/raeven-co/sether-ner.git"
|
|
28
|
+
},
|
|
29
|
+
"peerDependencies": {
|
|
30
|
+
"@huggingface/transformers": ">=3.0.0"
|
|
31
|
+
},
|
|
32
|
+
"peerDependenciesMeta": {
|
|
33
|
+
"@huggingface/transformers": {
|
|
34
|
+
"optional": true
|
|
35
|
+
}
|
|
36
|
+
},
|
|
37
|
+
"devDependencies": {
|
|
38
|
+
"@types/node": "^22.10.0",
|
|
39
|
+
"esbuild": "^0.24.0",
|
|
40
|
+
"tsup": "^8.3.5",
|
|
41
|
+
"typescript": "^5.7.0"
|
|
42
|
+
},
|
|
43
|
+
"engines": {
|
|
44
|
+
"node": ">=18"
|
|
45
|
+
}
|
|
46
|
+
}
|