@floatingsidewal/bulkhead-core 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +142 -0
- package/dist/cascade/bert-worker.js +84 -0
- package/dist/cascade/bert-worker.js.map +1 -0
- package/dist/cascade/index.d.mts +1 -0
- package/dist/cascade/index.d.ts +1 -0
- package/dist/cascade/index.js +386 -0
- package/dist/cascade/index.js.map +1 -0
- package/dist/cascade/index.mjs +11 -0
- package/dist/cascade/index.mjs.map +1 -0
- package/dist/chunk-4KUXRYNS.mjs +358 -0
- package/dist/chunk-4KUXRYNS.mjs.map +1 -0
- package/dist/index-BNiM_sPB.d.mts +237 -0
- package/dist/index-BNiM_sPB.d.ts +237 -0
- package/dist/index.d.mts +265 -0
- package/dist/index.d.ts +265 -0
- package/dist/index.js +3470 -0
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +3082 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +70 -0
package/README.md
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
# @floatingsidewal/bulkhead-core
|
|
2
|
+
|
|
3
|
+
Cascading content protection engine — detects and redacts PII, secrets, prompt injection, and system prompt leakage in text before it reaches LLMs.
|
|
4
|
+
|
|
5
|
+
Part of the [Bulkhead](https://github.com/floatingsidewal/bulkhead) project.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
Install from npm:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
npm install @floatingsidewal/bulkhead-core
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Quick Start
|
|
20
|
+
|
|
21
|
+
```typescript
|
|
22
|
+
import { createEngine } from "@floatingsidewal/bulkhead-core";
|
|
23
|
+
|
|
24
|
+
const engine = createEngine();
|
|
25
|
+
|
|
26
|
+
// Fast regex-only scan (sub-millisecond)
|
|
27
|
+
const result = await engine.analyze("My SSN is 123-45-6789 and key is AKIAIOSFODNN7EXAMPLE");
|
|
28
|
+
|
|
29
|
+
console.log(result.passed); // false
|
|
30
|
+
console.log(result.detections); // [{ entityType: "US_SSN", ... }, { entityType: "AWS_ACCESS_KEY", ... }]
|
|
31
|
+
|
|
32
|
+
// Scan and redact
|
|
33
|
+
const redacted = await engine.scan("Call me at 555-867-5309");
|
|
34
|
+
console.log(redacted.redactedText); // "Call me at [REDACTED-US_PHONE]"
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## What It Detects
|
|
38
|
+
|
|
39
|
+
| Category | Coverage |
|
|
40
|
+
|----------|----------|
|
|
41
|
+
| **PII** | 45+ entity types across 20+ countries (SSN, credit cards, IBAN, phone, email, medical IDs, national IDs) |
|
|
42
|
+
| **Secrets** | 154 patterns across 13 categories (AWS, Azure, GCP, GitHub, Slack, Stripe, database credentials, private keys) |
|
|
43
|
+
| **Prompt Injection** | 16+ patterns (role-play attacks, DAN mode, instruction override) |
|
|
44
|
+
| **System Prompt Leakage** | 7+ patterns (prompt extraction, "repeat everything above") |
|
|
45
|
+
|
|
46
|
+
All structured patterns include checksum validation where applicable (Luhn, IBAN mod-97, Verhoeff).
|
|
47
|
+
|
|
48
|
+
## Configuration
|
|
49
|
+
|
|
50
|
+
```typescript
|
|
51
|
+
import { createEngine, type BulkheadConfig } from "@floatingsidewal/bulkhead-core";
|
|
52
|
+
|
|
53
|
+
const engine = createEngine({
|
|
54
|
+
enabled: true,
|
|
55
|
+
debounceMs: 500,
|
|
56
|
+
guards: {
|
|
57
|
+
pii: { enabled: true },
|
|
58
|
+
secret: { enabled: true },
|
|
59
|
+
injection: { enabled: true },
|
|
60
|
+
contentSafety: { enabled: false },
|
|
61
|
+
},
|
|
62
|
+
cascade: {
|
|
63
|
+
modelEnabled: false, // Enable BERT layer (see below)
|
|
64
|
+
escalationThreshold: 0.75,
|
|
65
|
+
contextSentences: 3,
|
|
66
|
+
modelId: "Xenova/bert-base-NER",
|
|
67
|
+
},
|
|
68
|
+
});
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Custom Guard Composition
|
|
72
|
+
|
|
73
|
+
For fine-grained control, compose guards directly:
|
|
74
|
+
|
|
75
|
+
```typescript
|
|
76
|
+
import { GuardrailsEngine, PiiGuard, SecretGuard } from "@floatingsidewal/bulkhead-core";
|
|
77
|
+
|
|
78
|
+
const engine = new GuardrailsEngine();
|
|
79
|
+
engine.addGuard(new PiiGuard());
|
|
80
|
+
engine.addGuard(new SecretGuard());
|
|
81
|
+
// Skip injection/leakage guards if not needed
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## BERT Layer (Optional)
|
|
85
|
+
|
|
86
|
+
The regex layer catches structured patterns. For contextual entities like names, locations, and organizations, enable the BERT layer:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
npm install @huggingface/transformers
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
```typescript
|
|
93
|
+
const engine = createEngine({
|
|
94
|
+
// ...
|
|
95
|
+
cascade: {
|
|
96
|
+
modelEnabled: true, // Enables BERT NER model
|
|
97
|
+
// ...
|
|
98
|
+
},
|
|
99
|
+
});
|
|
100
|
+
|
|
101
|
+
// Use deepScan for full cascade (regex + BERT + optional LLM)
|
|
102
|
+
const result = await engine.deepScan("Send the report to John Smith at Acme Corp");
|
|
103
|
+
|
|
104
|
+
// Or modelScan for regex + BERT only (no LLM)
|
|
105
|
+
const result = await engine.modelScan(text);
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
The BERT model (~29 MB) downloads on first inference and runs in a worker thread. No GPU required.
|
|
109
|
+
|
|
110
|
+
## API
|
|
111
|
+
|
|
112
|
+
### `createEngine(config?)`
|
|
113
|
+
|
|
114
|
+
Creates a configured engine from a `BulkheadConfig`. Returns a `GuardrailsEngine`.
|
|
115
|
+
|
|
116
|
+
### `engine.analyze(text)`
|
|
117
|
+
|
|
118
|
+
Layer 1 only (regex). Sub-millisecond. Returns `{ passed, detections, stats }`.
|
|
119
|
+
|
|
120
|
+
### `engine.scan(text)`
|
|
121
|
+
|
|
122
|
+
Layer 1 scan with redaction. Returns `{ passed, detections, redactedText, stats }`.
|
|
123
|
+
|
|
124
|
+
### `engine.modelScan(text)`
|
|
125
|
+
|
|
126
|
+
Regex + BERT (Layers 1-2). Requires `cascade.modelEnabled: true`.
|
|
127
|
+
|
|
128
|
+
### `engine.deepScan(text)`
|
|
129
|
+
|
|
130
|
+
Full cascade (Layers 1-3). Requires cascade configuration.
|
|
131
|
+
|
|
132
|
+
### `engine.dispose()`
|
|
133
|
+
|
|
134
|
+
Cleanup (terminates BERT worker thread if running).
|
|
135
|
+
|
|
136
|
+
## Documentation
|
|
137
|
+
|
|
138
|
+
See the [full documentation](https://github.com/floatingsidewal/bulkhead/tree/develop/docs) for architecture details, deployment guides, and API reference.
|
|
139
|
+
|
|
140
|
+
## License
|
|
141
|
+
|
|
142
|
+
MIT
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __create = Object.create;
|
|
3
|
+
var __defProp = Object.defineProperty;
|
|
4
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
5
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
7
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
8
|
+
var __copyProps = (to, from, except, desc) => {
|
|
9
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
10
|
+
for (let key of __getOwnPropNames(from))
|
|
11
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
12
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
13
|
+
}
|
|
14
|
+
return to;
|
|
15
|
+
};
|
|
16
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
17
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
18
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
19
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
20
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
21
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
22
|
+
mod
|
|
23
|
+
));
|
|
24
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
25
|
+
|
|
26
|
+
// src/cascade/bert-worker.ts
|
|
27
|
+
var bert_worker_exports = {};
|
|
28
|
+
module.exports = __toCommonJS(bert_worker_exports);
|
|
29
|
+
var import_node_worker_threads = require("worker_threads");
|
|
30
|
+
var pipeline = null;
|
|
31
|
+
var currentModelId = null;
|
|
32
|
+
/**
 * Load (or reuse) the token-classification pipeline for `modelId`.
 * The transformers.js dependency is imported lazily so its cost is only
 * paid when the BERT layer is actually exercised; a pipeline already
 * loaded for the same model id is reused as-is.
 */
async function loadModel(modelId) {
  const alreadyLoaded = pipeline !== null && currentModelId === modelId;
  if (alreadyLoaded) {
    return;
  }
  const { pipeline: createPipeline } = await import("@huggingface/transformers");
  const options = {
    dtype: "q8",
    // Use ONNX runtime for Node.js
    device: "cpu"
  };
  pipeline = await createPipeline("token-classification", modelId, options);
  currentModelId = modelId;
}
|
|
42
|
+
/**
 * Run NER over `text` with the requested model and normalize the raw
 * transformers.js output into the flat token shape the main thread expects.
 * Missing offsets fall back to 0 / word-length-derived values.
 */
async function analyze(text, modelId) {
  await loadModel(modelId);
  const raw = await pipeline(text, {
    aggregation_strategy: "simple"
  });
  const toToken = (r) => {
    const start = r.start ?? 0;
    const fallbackEnd = r.start != null ? r.start + (r.word?.length ?? 0) : 0;
    return {
      entity: r.entity_group ?? r.entity,
      score: r.score,
      word: r.word ?? r.text ?? "",
      start,
      end: r.end ?? fallbackEnd
    };
  };
  return raw.map(toToken);
}
|
|
55
|
+
// Worker message handler — only attached when actually running as a worker
// thread (parentPort is null when this module is loaded on the main thread).
if (import_node_worker_threads.parentPort) {
  import_node_worker_threads.parentPort.on("message", async (msg) => {
    if (msg.type === "dispose") {
      // Drop the pipeline so it can be garbage collected, then acknowledge.
      pipeline = null;
      currentModelId = null;
      import_node_worker_threads.parentPort.postMessage({ type: "ready", id: msg.id });
      return;
    }
    if (msg.type === "analyze" && msg.text) {
      try {
        const tokens = await analyze(
          msg.text,
          // NOTE(review): this fallback model id differs from the
          // "Xenova/bert-base-NER" default in BertLayer, which always sends
          // an explicit modelId — confirm the divergence is intentional.
          msg.modelId ?? "gravitee-io/bert-small-pii-detection"
        );
        import_node_worker_threads.parentPort.postMessage({
          type: "result",
          id: msg.id,
          tokens
        });
      } catch (err) {
        // Serialize the failure back to the main thread as a plain string;
        // Error objects do not cross the worker boundary intact.
        import_node_worker_threads.parentPort.postMessage({
          type: "error",
          id: msg.id,
          error: err instanceof Error ? err.message : String(err)
        });
      }
    }
  });
}
//# sourceMappingURL=bert-worker.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/cascade/bert-worker.ts"],"sourcesContent":["/**\n * BERT model worker thread for Layer 2 of the cascading classifier.\n * Runs in a separate thread to avoid blocking the VS Code extension host.\n *\n * Uses @huggingface/transformers to load and run a token-classification model.\n * The model loads lazily on first request and stays loaded (singleton).\n */\n\nimport { parentPort } from \"node:worker_threads\";\n\n/** Message types between main thread and worker */\nexport interface WorkerRequest {\n type: \"analyze\" | \"dispose\";\n id: string;\n text?: string;\n modelId?: string;\n}\n\nexport interface BertToken {\n entity: string; // e.g., \"B-PERSON\", \"I-LOCATION\"\n score: number;\n word: string;\n start: number;\n end: number;\n}\n\nexport interface WorkerResponse {\n type: \"result\" | \"error\" | \"ready\";\n id: string;\n tokens?: BertToken[];\n error?: string;\n}\n\nlet pipeline: any = null;\nlet currentModelId: string | null = null;\n\nasync function loadModel(modelId: string): Promise<void> {\n if (pipeline && currentModelId === modelId) return;\n\n // Dynamic import to avoid loading transformers.js at module level\n const { pipeline: createPipeline } = await import(\n \"@huggingface/transformers\"\n );\n\n pipeline = await createPipeline(\"token-classification\", modelId, {\n dtype: \"q8\",\n // Use ONNX runtime for Node.js\n device: \"cpu\",\n });\n currentModelId = modelId;\n}\n\nasync function analyze(\n text: string,\n modelId: string\n): Promise<BertToken[]> {\n await loadModel(modelId);\n\n const results = await pipeline(text, {\n aggregation_strategy: \"simple\",\n });\n\n return (results as any[]).map((r) => ({\n entity: r.entity_group ?? r.entity,\n score: r.score,\n word: r.word ?? r.text ?? \"\",\n start: r.start ?? 0,\n end: r.end ?? (r.start != null ? r.start + (r.word?.length ?? 
0) : 0),\n }));\n}\n\n// Worker message handler\nif (parentPort) {\n parentPort.on(\"message\", async (msg: WorkerRequest) => {\n if (msg.type === \"dispose\") {\n pipeline = null;\n currentModelId = null;\n parentPort!.postMessage({ type: \"ready\", id: msg.id } as WorkerResponse);\n return;\n }\n\n if (msg.type === \"analyze\" && msg.text) {\n try {\n const tokens = await analyze(\n msg.text,\n msg.modelId ?? \"gravitee-io/bert-small-pii-detection\"\n );\n parentPort!.postMessage({\n type: \"result\",\n id: msg.id,\n tokens,\n } as WorkerResponse);\n } catch (err) {\n parentPort!.postMessage({\n type: \"error\",\n id: msg.id,\n error: err instanceof Error ? err.message : String(err),\n } as WorkerResponse);\n }\n }\n });\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAQA,iCAA2B;AAyB3B,IAAI,WAAgB;AACpB,IAAI,iBAAgC;AAEpC,eAAe,UAAU,SAAgC;AACvD,MAAI,YAAY,mBAAmB,QAAS;AAG5C,QAAM,EAAE,UAAU,eAAe,IAAI,MAAM,OACzC,2BACF;AAEA,aAAW,MAAM,eAAe,wBAAwB,SAAS;AAAA,IAC/D,OAAO;AAAA;AAAA,IAEP,QAAQ;AAAA,EACV,CAAC;AACD,mBAAiB;AACnB;AAEA,eAAe,QACb,MACA,SACsB;AACtB,QAAM,UAAU,OAAO;AAEvB,QAAM,UAAU,MAAM,SAAS,MAAM;AAAA,IACnC,sBAAsB;AAAA,EACxB,CAAC;AAED,SAAQ,QAAkB,IAAI,CAAC,OAAO;AAAA,IACpC,QAAQ,EAAE,gBAAgB,EAAE;AAAA,IAC5B,OAAO,EAAE;AAAA,IACT,MAAM,EAAE,QAAQ,EAAE,QAAQ;AAAA,IAC1B,OAAO,EAAE,SAAS;AAAA,IAClB,KAAK,EAAE,QAAQ,EAAE,SAAS,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,KAAK;AAAA,EACrE,EAAE;AACJ;AAGA,IAAI,uCAAY;AACd,wCAAW,GAAG,WAAW,OAAO,QAAuB;AACrD,QAAI,IAAI,SAAS,WAAW;AAC1B,iBAAW;AACX,uBAAiB;AACjB,4CAAY,YAAY,EAAE,MAAM,SAAS,IAAI,IAAI,GAAG,CAAmB;AACvE;AAAA,IACF;AAEA,QAAI,IAAI,SAAS,aAAa,IAAI,MAAM;AACtC,UAAI;AACF,cAAM,SAAS,MAAM;AAAA,UACnB,IAAI;AAAA,UACJ,IAAI,WAAW;AAAA,QACjB;AACA,8CAAY,YAAY;AAAA,UACtB,MAAM;AAAA,UACN,IAAI,IAAI;AAAA,UACR;AAAA,QACF,CAAmB;AAAA,MACrB,SAAS,KAAK;AACZ,8CAAY,YAAY;AAAA,UACtB,MAAM;AAAA,UACN,IAAI,IAAI;AAAA,UACR,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAAA,QACxD,CAAmB;AAAA,MACrB;AAAA,IACF;AAAA,EACF,CAAC;AACH;","names":[]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { k as BertLayer, B as BertLayerConfig, c as CascadeClassifier, C as CascadeConfig, l as LlmLayer, L as LlmLayerConfig, h as LlmProvider } from '../index-BNiM_sPB.mjs';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { k as BertLayer, B as BertLayerConfig, c as CascadeClassifier, C as CascadeConfig, l as LlmLayer, L as LlmLayerConfig, h as LlmProvider } from '../index-BNiM_sPB.js';
|
|
@@ -0,0 +1,386 @@
|
|
|
1
|
+
"use strict";
// --- esbuild CommonJS interop helpers (machine-generated prelude) ---
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Defines every entry of `all` on `target` as an enumerable lazy getter.
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};
// Copies enumerable own properties from `from` onto `to` as lazy getters,
// skipping `except` and anything already present on `to`.
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);

// src/cascade/index.ts — public entry point for the cascade subpackage.
// Exports are registered as getters so the class bindings below are live.
var cascade_exports = {};
__export(cascade_exports, {
  BertLayer: () => BertLayer,
  CascadeClassifier: () => CascadeClassifier,
  LlmLayer: () => LlmLayer
});
module.exports = __toCommonJS(cascade_exports);
|
|
28
|
+
|
|
29
|
+
// src/cascade/bert-layer.ts
var import_node_worker_threads = require("worker_threads");
var import_node_path = require("path");
var import_node_fs = require("fs");
var DEFAULT_MODEL_ID = "Xenova/bert-base-NER";
/**
 * Layer 2 of the cascade: runs a BERT token-classification model inside a
 * dedicated worker thread so inference never blocks the host process.
 * The worker is created lazily on first use and kept alive until dispose().
 */
var BertLayer = class {
  worker = null;
  // In-flight requests keyed by message id -> { resolve, reject }.
  pendingRequests = /* @__PURE__ */ new Map();
  // Monotonically increasing id used to correlate worker responses.
  requestId = 0;
  config;
  /** Whether the BERT model has been loaded and first inference completed */
  _loaded = false;
  get loaded() {
    return this._loaded;
  }
  constructor(config) {
    this.config = {
      escalationThreshold: 0.75,
      ...config
    };
  }
  /** Resolve the worker path — supports both compiled .js and source .ts */
  resolveWorkerPath() {
    // NOTE(review): assumes the bundle sits next to a "cascade/bert-worker.js";
    // verify this path holds for every layout that embeds this file.
    const compiledPath = (0, import_node_path.resolve)(__dirname, "cascade", "bert-worker.js");
    if ((0, import_node_fs.existsSync)(compiledPath)) return compiledPath;
    const tsPath = (0, import_node_path.resolve)(__dirname, "bert-worker.ts");
    if ((0, import_node_fs.existsSync)(tsPath)) return tsPath;
    // Fall back to the compiled path even if missing so the Worker
    // constructor surfaces a descriptive error.
    return compiledPath;
  }
  /** Ensure the worker thread is running */
  ensureWorker() {
    if (!this.worker) {
      const workerPath = this.resolveWorkerPath();
      const isTs = workerPath.endsWith(".ts");
      // A .ts worker (dev mode) needs the tsx loader to transpile on the fly.
      this.worker = isTs ? new import_node_worker_threads.Worker(workerPath, {
        execArgv: ["--require", "tsx/cjs"]
      }) : new import_node_worker_threads.Worker(workerPath);
      this.worker.on("message", (msg) => {
        const pending = this.pendingRequests.get(msg.id);
        // Unknown ids (e.g. after dispose) are silently ignored.
        if (!pending) return;
        this.pendingRequests.delete(msg.id);
        if (msg.type === "error") {
          pending.reject(new Error(msg.error ?? "Unknown worker error"));
        } else {
          pending.resolve(msg.tokens ?? []);
        }
      });
      // A worker-level error fails every in-flight request.
      this.worker.on("error", (err) => {
        for (const [id, pending] of this.pendingRequests) {
          pending.reject(err);
          this.pendingRequests.delete(id);
        }
      });
    }
    return this.worker;
  }
  /** Send text to the BERT worker and get raw token results */
  async analyzeRaw(text) {
    const worker = this.ensureWorker();
    const id = String(++this.requestId);
    return new Promise((resolve2, reject) => {
      this.pendingRequests.set(id, { resolve: resolve2, reject });
      worker.postMessage({
        type: "analyze",
        id,
        text,
        modelId: this.config.modelId ?? DEFAULT_MODEL_ID
      });
    });
  }
  /**
   * Analyze text and return Detection objects with escalation disposition.
   * Tokens above the escalation threshold are "confirmed",
   * tokens below are "escalate" (need LLM review).
   */
  async analyze(text) {
    const tokens = await this.analyzeRaw(text);
    this._loaded = true;
    return tokens.map((token) => {
      // Strip the BIO prefix ("B-PERSON" -> "PERSON").
      const entityType = token.entity.replace(/^[BI]-/, "");
      const isConfirmed = token.score >= this.config.escalationThreshold;
      const confidence = token.score >= 0.9 ? "high" : token.score >= 0.7 ? "medium" : "low";
      return {
        entityType,
        start: token.start,
        end: token.end,
        text: token.word,
        confidence,
        score: token.score,
        guardName: "cascade-bert",
        source: "bert",
        // Up to 150 chars either side of the span, for downstream review/LLM use.
        context: text.slice(
          Math.max(0, token.start - 150),
          Math.min(text.length, token.end + 150)
        ),
        disposition: isConfirmed ? "confirmed" : "escalate"
      };
    });
  }
  /** Terminate the worker thread */
  async dispose() {
    if (this.worker) {
      await this.worker.terminate();
      this.worker = null;
      this.pendingRequests.clear();
    }
  }
};
|
|
137
|
+
|
|
138
|
+
// src/cascade/llm-layer.ts
/**
 * Layer 3 of the cascade: asks a caller-supplied LLM provider to confirm
 * or dismiss detections that BERT could not decide on. Without a provider
 * the layer is a pass-through.
 */
var LlmLayer = class {
  config;
  constructor(config) {
    this.config = {
      contextSentences: 3,
      ...config
    };
  }
  /** Set the LLM provider (can be swapped at runtime) */
  setProvider(provider) {
    this.config.provider = provider;
  }
  /**
   * Disambiguate escalated detections using an LLM.
   * Each span is judged independently; a provider failure leaves that
   * detection unchanged (fail-open per span) rather than aborting the batch.
   * @param escalated Detections with disposition "escalate"
   * @param fullText The full document text
   * @param confirmed Already-confirmed detections (passed as context to help disambiguation)
   */
  async disambiguate(escalated, fullText, confirmed) {
    if (!this.config.provider) {
      return escalated;
    }
    const results = [];
    for (const detection of escalated) {
      const prompt = this.buildPrompt(detection, fullText, confirmed);
      try {
        const response = await this.config.provider(prompt);
        const parsed = this.parseResponse(response);
        if (parsed && parsed.type !== "NONE") {
          // LLM confirmed the span as PII: adopt its type and confidence.
          results.push({
            ...detection,
            entityType: parsed.type,
            score: parsed.confidence,
            confidence: parsed.confidence >= 0.9 ? "high" : parsed.confidence >= 0.7 ? "medium" : "low",
            source: "llm",
            disposition: "confirmed"
          });
        } else {
          // "NONE" or unparseable verdict: mark the span dismissed.
          results.push({
            ...detection,
            source: "llm",
            disposition: "dismissed"
          });
        }
      } catch {
        // Provider error: keep the original escalated detection untouched.
        results.push(detection);
      }
    }
    return results;
  }
  /** Build a focused disambiguation prompt */
  buildPrompt(detection, fullText, confirmed) {
    const contextWindow = this.extractSentenceContext(
      fullText,
      detection.start,
      detection.end
    );
    // At most 10 confirmed entities are offered as cross-document context.
    const confirmedList = confirmed.filter((d) => d.disposition === "confirmed").map((d) => `${d.text} (${d.entityType})`).slice(0, 10);
    return `You are a PII detection system. Determine if the highlighted span is personally identifiable information.

Context: "${contextWindow}"
Span: "${detection.text}"
BERT suggested: ${detection.entityType} (confidence: ${detection.score.toFixed(2)})
${confirmedList.length > 0 ? `Other confirmed entities in document: [${confirmedList.join(", ")}]` : ""}

Is this span PII? If yes, what type? If it's ambiguous (e.g., "Jordan" could be a person or country), use the context to decide.

Respond with ONLY a JSON object: { "type": "PERSON"|"LOCATION"|"ORGANIZATION"|"NONE", "confidence": 0.0-1.0 }`;
  }
  /** Extract ±N sentences around a span */
  extractSentenceContext(text, start, end) {
    const n = this.config.contextSentences;
    // Sentence boundaries: position 0, every ".!?"+whitespace run, and EOF.
    const sentenceBreaks = [0];
    const sentenceRegex = /[.!?]+\s+/g;
    let match;
    while ((match = sentenceRegex.exec(text)) !== null) {
      sentenceBreaks.push(match.index + match[0].length);
    }
    sentenceBreaks.push(text.length);
    // Locate the sentence containing the span's start offset.
    let spanSentenceIdx = 0;
    for (let i = 0; i < sentenceBreaks.length - 1; i++) {
      if (sentenceBreaks[i] <= start && start < sentenceBreaks[i + 1]) {
        spanSentenceIdx = i;
        break;
      }
    }
    const contextStart = sentenceBreaks[Math.max(0, spanSentenceIdx - n)];
    const contextEnd = sentenceBreaks[Math.min(sentenceBreaks.length - 1, spanSentenceIdx + n + 1)];
    return text.slice(contextStart, contextEnd).trim();
  }
  /** Parse the LLM response JSON */
  parseResponse(response) {
    try {
      // Grab the first flat {...} object, tolerating surrounding prose.
      const jsonMatch = response.match(/\{[^}]+\}/);
      if (!jsonMatch) return null;
      const parsed = JSON.parse(jsonMatch[0]);
      if (typeof parsed.type === "string" && typeof parsed.confidence === "number") {
        return parsed;
      }
      return null;
    } catch {
      return null;
    }
  }
};
|
|
244
|
+
|
|
245
|
+
// src/cascade/cascade.ts
var DEFAULT_CASCADE_CONFIG = {
  escalationThreshold: 0.75,
  contextSentences: 3,
  bertEnabled: true,
  llmEnabled: false,
  modelId: "Xenova/bert-base-NER"
};
/**
 * Orchestrates the three cascade layers: regex guards (Layer 1), the BERT
 * worker (Layer 2), and optional LLM disambiguation (Layer 3). Each layer
 * only runs when enabled in config; detections carry a `source` for
 * provenance and a `disposition` (confirmed / escalate / dismissed).
 */
var CascadeClassifier = class {
  config;
  // Created lazily on first BERT scan; null until then and after dispose().
  bertLayer = null;
  llmLayer;
  // Layer 1 guards, run in registration order.
  regexGuards = [];
  constructor(config) {
    this.config = { ...DEFAULT_CASCADE_CONFIG, ...config };
    this.llmLayer = new LlmLayer({
      contextSentences: this.config.contextSentences,
      provider: this.config.llmProvider
    });
  }
  /** Whether the cascade is ready to serve (BERT model loaded if enabled) */
  get ready() {
    if (!this.config.bertEnabled) return true;
    // No layer yet means nothing is pending; readiness flips once the
    // first inference completes.
    if (!this.bertLayer) return true;
    return this.bertLayer.loaded;
  }
  /** Register regex-based guards (Layer 1) */
  addRegexGuard(guard) {
    this.regexGuards.push(guard);
    return this;
  }
  /** Set the LLM provider for Layer 3 */
  setLlmProvider(provider) {
    this.config.llmProvider = provider;
    this.llmLayer.setProvider(provider);
  }
  /**
   * Run the full cascade: Regex → BERT → LLM
   * Returns a unified GuardResult with all detections carrying provenance.
   */
  async deepScan(text) {
    const regexDetections = await this.runRegexLayer(text);
    if (!this.config.bertEnabled) {
      return this.buildCascadeResult(text, regexDetections);
    }
    const bertDetections = await this.runBertLayer(text);
    // Regex hits win over overlapping BERT hits.
    const mergedBert = this.deduplicateAgainstRegex(
      bertDetections,
      regexDetections
    );
    const allDetections = [...regexDetections, ...mergedBert];
    const escalated = allDetections.filter((d) => d.disposition === "escalate");
    // Layer 3 only runs when enabled, configured, and actually needed.
    if (!this.config.llmEnabled || escalated.length === 0 || !this.config.llmProvider) {
      return this.buildCascadeResult(text, allDetections);
    }
    const confirmed = allDetections.filter((d) => d.disposition === "confirmed");
    const resolved = await this.llmLayer.disambiguate(
      escalated,
      text,
      confirmed
    );
    // Replace the escalated subset with the LLM-resolved versions.
    const finalDetections = [
      ...allDetections.filter((d) => d.disposition !== "escalate"),
      ...resolved
    ];
    return this.buildCascadeResult(text, finalDetections);
  }
  /** Run Layer 1 only (for fast auto-scan path) */
  async regexScan(text) {
    const detections = await this.runRegexLayer(text);
    return this.buildCascadeResult(text, detections);
  }
  /** Run Layers 1 + 2 only (no LLM, for "Scan File" command) */
  async modelScan(text) {
    const regexDetections = await this.runRegexLayer(text);
    if (!this.config.bertEnabled) {
      return this.buildCascadeResult(text, regexDetections);
    }
    const bertDetections = await this.runBertLayer(text);
    const mergedBert = this.deduplicateAgainstRegex(
      bertDetections,
      regexDetections
    );
    return this.buildCascadeResult(text, [...regexDetections, ...mergedBert]);
  }
  // --- Private methods ---
  // Runs every registered guard sequentially and concatenates detections.
  async runRegexLayer(text) {
    const allDetections = [];
    for (const guard of this.regexGuards) {
      const result = await guard.analyze(text);
      allDetections.push(...result.detections);
    }
    return allDetections;
  }
  // Lazily constructs the BertLayer on first use, then delegates.
  async runBertLayer(text) {
    if (!this.bertLayer) {
      this.bertLayer = new BertLayer({
        modelId: this.config.modelId,
        escalationThreshold: this.config.escalationThreshold
      });
    }
    return this.bertLayer.analyze(text);
  }
  /** Remove BERT detections that overlap with regex detections */
  deduplicateAgainstRegex(bertDetections, regexDetections) {
    return bertDetections.filter((bert) => {
      // Standard half-open interval overlap test on character offsets.
      return !regexDetections.some(
        (regex) => bert.start < regex.end && bert.end > regex.start
      );
    });
  }
  // Folds detections into the unified result shape. Dismissed detections
  // are kept in `detections` for auditing but excluded from pass/score.
  buildCascadeResult(text, detections) {
    const activeDetections = detections.filter(
      (d) => d.disposition !== "dismissed"
    );
    const passed = activeDetections.length === 0;
    // Overall score is the single highest-scoring active detection.
    const score = activeDetections.length > 0 ? Math.max(...activeDetections.map((d) => d.score)) : 0;
    const sources = [...new Set(detections.map((d) => d.source))];
    const types = [...new Set(activeDetections.map((d) => d.entityType))];
    return {
      passed,
      reason: passed ? "No issues detected" : `Detected via ${sources.join("+")}: ${types.join(", ")}`,
      guardName: "cascade",
      score,
      detections
    };
  }
  /** Clean up resources */
  async dispose() {
    if (this.bertLayer) {
      await this.bertLayer.dispose();
      this.bertLayer = null;
    }
  }
};
// Annotate the CommonJS export names for ESM import in node:
0 && (module.exports = {
  BertLayer,
  CascadeClassifier,
  LlmLayer
});
//# sourceMappingURL=index.js.map
|