@datafog/fogclaw 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/harness-docs.yml +30 -0
- package/AGENTS.md +28 -0
- package/LICENSE +21 -0
- package/README.md +208 -0
- package/dist/config.d.ts +4 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +30 -0
- package/dist/config.js.map +1 -0
- package/dist/engines/gliner.d.ts +14 -0
- package/dist/engines/gliner.d.ts.map +1 -0
- package/dist/engines/gliner.js +75 -0
- package/dist/engines/gliner.js.map +1 -0
- package/dist/engines/regex.d.ts +5 -0
- package/dist/engines/regex.d.ts.map +1 -0
- package/dist/engines/regex.js +54 -0
- package/dist/engines/regex.js.map +1 -0
- package/dist/index.d.ts +19 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +157 -0
- package/dist/index.js.map +1 -0
- package/dist/redactor.d.ts +3 -0
- package/dist/redactor.d.ts.map +1 -0
- package/dist/redactor.js +37 -0
- package/dist/redactor.js.map +1 -0
- package/dist/scanner.d.ts +11 -0
- package/dist/scanner.d.ts.map +1 -0
- package/dist/scanner.js +77 -0
- package/dist/scanner.js.map +1 -0
- package/dist/types.d.ts +31 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +18 -0
- package/dist/types.js.map +1 -0
- package/docs/DATA.md +28 -0
- package/docs/DESIGN.md +17 -0
- package/docs/DOMAIN_DOCS.md +30 -0
- package/docs/FRONTEND.md +24 -0
- package/docs/OBSERVABILITY.md +25 -0
- package/docs/PLANS.md +171 -0
- package/docs/PRODUCT_SENSE.md +20 -0
- package/docs/RELIABILITY.md +60 -0
- package/docs/SECURITY.md +50 -0
- package/docs/design-docs/core-beliefs.md +17 -0
- package/docs/design-docs/index.md +8 -0
- package/docs/generated/README.md +36 -0
- package/docs/generated/memory.md +1 -0
- package/docs/plans/2026-02-16-fogclaw-design.md +172 -0
- package/docs/plans/2026-02-16-fogclaw-implementation.md +1606 -0
- package/docs/plans/README.md +15 -0
- package/docs/plans/active/2026-02-16-feat-openclaw-official-submission-plan.md +386 -0
- package/docs/plans/active/2026-02-17-feat-release-fogclaw-via-datafog-package-plan.md +318 -0
- package/docs/plans/active/2026-02-17-feat-submit-fogclaw-to-openclaw-plan.md +244 -0
- package/docs/plans/tech-debt-tracker.md +42 -0
- package/docs/plugins/fogclaw.md +95 -0
- package/docs/runbooks/address-review-findings.md +30 -0
- package/docs/runbooks/ci-failures.md +46 -0
- package/docs/runbooks/code-review.md +34 -0
- package/docs/runbooks/merge-change.md +28 -0
- package/docs/runbooks/pull-request.md +45 -0
- package/docs/runbooks/record-evidence.md +43 -0
- package/docs/runbooks/reproduce-bug.md +42 -0
- package/docs/runbooks/respond-to-feedback.md +42 -0
- package/docs/runbooks/review-findings.md +31 -0
- package/docs/runbooks/submit-openclaw-plugin.md +68 -0
- package/docs/runbooks/update-agents-md.md +59 -0
- package/docs/runbooks/update-domain-docs.md +42 -0
- package/docs/runbooks/validate-current-state.md +41 -0
- package/docs/runbooks/verify-release.md +69 -0
- package/docs/specs/2026-02-16-feat-openclaw-official-submission-spec.md +115 -0
- package/docs/specs/2026-02-17-feat-submit-fogclaw-to-openclaw.md +125 -0
- package/docs/specs/README.md +5 -0
- package/docs/specs/index.md +8 -0
- package/docs/spikes/README.md +8 -0
- package/fogclaw.config.example.json +15 -0
- package/openclaw.plugin.json +45 -0
- package/package.json +37 -0
- package/scripts/ci/he-docs-config.json +123 -0
- package/scripts/ci/he-docs-drift.sh +112 -0
- package/scripts/ci/he-docs-lint.sh +234 -0
- package/scripts/ci/he-plans-lint.sh +354 -0
- package/scripts/ci/he-runbooks-lint.sh +445 -0
- package/scripts/ci/he-specs-lint.sh +258 -0
- package/scripts/ci/he-spikes-lint.sh +249 -0
- package/scripts/runbooks/select-runbooks.sh +154 -0
- package/src/config.ts +46 -0
- package/src/engines/gliner.ts +88 -0
- package/src/engines/regex.ts +71 -0
- package/src/index.ts +223 -0
- package/src/redactor.ts +51 -0
- package/src/scanner.ts +90 -0
- package/src/types.ts +52 -0
- package/tests/config.test.ts +104 -0
- package/tests/gliner.test.ts +184 -0
- package/tests/plugin-smoke.test.ts +114 -0
- package/tests/redactor.test.ts +320 -0
- package/tests/regex.test.ts +345 -0
- package/tests/scanner.test.ts +199 -0
- package/tsconfig.json +20 -0
package/src/index.ts
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
import { Scanner } from "./scanner.js";
|
|
2
|
+
import { redact } from "./redactor.js";
|
|
3
|
+
import { loadConfig } from "./config.js";
|
|
4
|
+
import type { GuardrailAction } from "./types.js";
|
|
5
|
+
|
|
6
|
+
export { Scanner } from "./scanner.js";
|
|
7
|
+
export { redact } from "./redactor.js";
|
|
8
|
+
export { loadConfig, DEFAULT_CONFIG } from "./config.js";
|
|
9
|
+
export type {
|
|
10
|
+
Entity,
|
|
11
|
+
FogClawConfig,
|
|
12
|
+
ScanResult,
|
|
13
|
+
RedactResult,
|
|
14
|
+
RedactStrategy,
|
|
15
|
+
GuardrailAction,
|
|
16
|
+
} from "./types.js";
|
|
17
|
+
|
|
18
|
+
/**
 * OpenClaw plugin definition for FogClaw.
 *
 * `register(api)` wires three things into the host:
 * - a `before_agent_start` hook that scans the incoming prompt and returns
 *   guardrail context when entities are found,
 * - the `fogclaw_scan` tool for on-demand entity detection,
 * - the `fogclaw_redact` tool for on-demand redaction.
 *
 * Nothing is registered when the loaded config has `enabled: false`.
 */
const fogclaw = {
  id: "fogclaw",
  name: "FogClaw",

  /**
   * Load configuration from the host, start the scanner, and register the
   * hook and both tools.
   *
   * @param api Host plugin API. Only `pluginConfig`, `getConfig`, `logger`,
   *   `on` and `registerTool` are used here; its full shape is not visible
   *   from this file, hence `any`.
   */
  register(api: any) {
    const config = loadConfig(api.pluginConfig ?? api.getConfig?.() ?? {});

    if (!config.enabled) {
      api.logger?.info("[fogclaw] Plugin disabled via config");
      return;
    }

    const scanner = new Scanner(config);

    // Kick off GLiNER loading without awaiting it: the regex engine can
    // answer scans straight away, GLiNER joins once its model is ready.
    scanner.initialize().catch((err: unknown) => {
      api.logger?.warn(`[fogclaw] GLiNER background init failed: ${String(err)}`);
    });

    // Comma-separated list of the distinct labels, in first-seen order.
    const distinctLabels = (found: ReadonlyArray<{ label: string }>): string =>
      [...new Set(found.map((e) => e.label))].join(", ");

    // Wrap a payload as a single pretty-printed JSON text content item.
    const asToolResult = (payload: unknown) => ({
      content: [
        {
          type: "text",
          text: JSON.stringify(payload, null, 2),
        },
      ],
    });

    // --- HOOK: Guardrail on incoming messages ---
    // Returns `{ prependContext }` when at least one notice was produced,
    // otherwise nothing.
    // NOTE(review): presumably the host prepends this text to the agent
    // context — confirm against the OpenClaw plugin API.
    api.on("before_agent_start", async (event: any) => {
      const message = event.prompt ?? "";
      if (!message) return;

      const { entities } = await scanner.scan(message);
      if (entities.length === 0) return;

      // Bucket each entity by its per-label action, falling back to the
      // global guardrail mode when the label has no override.
      const blocked: typeof entities = [];
      const warned: typeof entities = [];
      const toRedact: typeof entities = [];

      for (const entity of entities) {
        const action: GuardrailAction =
          config.entityActions[entity.label] ?? config.guardrail_mode;
        switch (action) {
          case "block":
            blocked.push(entity);
            break;
          case "warn":
            warned.push(entity);
            break;
          case "redact":
            toRedact.push(entity);
            break;
        }
      }

      const contextParts: string[] = [];

      // "block" — strong instruction to refuse
      if (blocked.length > 0) {
        const types = distinctLabels(blocked);
        contextParts.push(
          `[FOGCLAW GUARDRAIL — BLOCKED] The user's message contains sensitive information (${types}). ` +
            `Do NOT process or repeat this information. Ask the user to rephrase without sensitive data.`,
        );
      }

      // "warn" — advisory notice only
      if (warned.length > 0) {
        const types = distinctLabels(warned);
        contextParts.push(
          `[FOGCLAW NOTICE] PII detected in user message: ${types}. Handle with care.`,
        );
      }

      // "redact" — include a copy of the message with only the
      // redact-bucket entities replaced
      if (toRedact.length > 0) {
        const { redacted_text } = redact(message, toRedact, config.redactStrategy);
        contextParts.push(
          `[FOGCLAW REDACTED] The following is the user's message with PII redacted:\n${redacted_text}`,
        );
      }

      if (contextParts.length === 0) return;
      return { prependContext: contextParts.join("\n\n") };
    });

    // --- TOOL: On-demand scan ---
    api.registerTool(
      {
        name: "fogclaw_scan",
        id: "fogclaw_scan",
        description:
          "Scan text for PII and custom entities. Returns detected entities with types, positions, and confidence scores.",
        schema: {
          type: "object",
          properties: {
            text: {
              type: "string",
              description: "Text to scan for entities",
            },
            custom_labels: {
              type: "array",
              items: { type: "string" },
              description:
                "Additional entity labels for zero-shot detection (e.g., ['competitor name', 'project codename'])",
            },
          },
          required: ["text"],
        },
        handler: async ({
          text,
          custom_labels,
        }: {
          text: string;
          custom_labels?: string[];
        }) => {
          const { entities } = await scanner.scan(text, custom_labels);
          const count = entities.length;
          const summary =
            count > 0
              ? `Found ${count} entities: ${distinctLabels(entities)}`
              : "No entities detected";
          return asToolResult({ entities, count, summary });
        },
      }
    );

    // --- TOOL: On-demand redact ---
    api.registerTool(
      {
        name: "fogclaw_redact",
        id: "fogclaw_redact",
        description:
          "Scan and redact PII/custom entities from text. Returns sanitized text with entities replaced.",
        schema: {
          type: "object",
          properties: {
            text: {
              type: "string",
              description: "Text to scan and redact",
            },
            strategy: {
              type: "string",
              description:
                'Redaction strategy: "token" ([EMAIL_1]), "mask" (****), or "hash" ([EMAIL_a1b2c3...])',
              enum: ["token", "mask", "hash"],
            },
            custom_labels: {
              type: "array",
              items: { type: "string" },
              description: "Additional entity labels for zero-shot detection",
            },
          },
          required: ["text"],
        },
        handler: async ({
          text,
          strategy,
          custom_labels,
        }: {
          text: string;
          strategy?: "token" | "mask" | "hash";
          custom_labels?: string[];
        }) => {
          const { entities } = await scanner.scan(text, custom_labels);
          // A per-call strategy wins over the configured default.
          const { redacted_text, mapping } = redact(
            text,
            entities,
            strategy ?? config.redactStrategy,
          );
          return asToolResult({
            redacted_text,
            entities_found: entities.length,
            mapping,
          });
        },
      }
    );

    api.logger?.info(
      `[fogclaw] Plugin registered — guardrail: ${config.guardrail_mode}, model: ${config.model}, custom entities: ${config.custom_entities.length}`,
    );
  },
};
|
|
222
|
+
|
|
223
|
+
export default fogclaw;
|
package/src/redactor.ts
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import type { Entity, RedactResult, RedactStrategy } from "./types.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Replace every entity span in `text` with a placeholder.
 *
 * Spans are processed from the highest `start` to the lowest so that each
 * splice leaves the offsets of the spans still to be processed untouched.
 * As a consequence, with the `"token"` strategy the per-label counters are
 * handed out right-to-left (the last occurrence in the text gets `_1`).
 *
 * @param text Text the entity offsets refer to.
 * @param entities Spans to replace; the array itself is not reordered.
 * @param strategy Placeholder style, `"token"` by default.
 * @returns The rewritten text, a placeholder → original-text mapping (a
 *   later identical placeholder overwrites an earlier entry), and the
 *   entities that were passed in (a fresh empty array when there were none).
 */
export function redact(
  text: string,
  entities: Entity[],
  strategy: RedactStrategy = "token",
): RedactResult {
  if (entities.length === 0) {
    return { redacted_text: text, mapping: {}, entities: [] };
  }

  const counters: Record<string, number> = {};
  const mapping: Record<string, string> = {};

  // Work on a copy so the caller's ordering is preserved.
  const rightToLeft = entities.slice().sort((left, right) => right.start - left.start);

  const redacted_text = rightToLeft.reduce((working, entity) => {
    const placeholder = makeReplacement(entity, strategy, counters);
    mapping[placeholder] = entity.text;
    return working.slice(0, entity.start) + placeholder + working.slice(entity.end);
  }, text);

  return { redacted_text, mapping, entities };
}
|
|
29
|
+
|
|
30
|
+
/**
 * Build the placeholder that stands in for one detected entity.
 *
 * - `"token"`: `[LABEL_n]`, where `n` is a per-label counter stored in
 *   `counters` (incremented on every call).
 * - `"mask"`: one `*` per UTF-16 code unit of the entity text, at least one.
 * - `"hash"`: `[LABEL_h]`, where `h` is the first 12 hex digits of the
 *   SHA-256 digest of the entity text.
 *
 * @param entity Entity being replaced; only `label` and `text` are read.
 * @param strategy Placeholder style.
 * @param counters Per-label running counts, mutated by the `"token"` strategy.
 * @returns The placeholder string.
 * @throws {Error} If `strategy` is not one of the known values. The type
 *   rules this out at compile time, but the value can originate from
 *   unvalidated runtime input; without this guard the function would return
 *   `undefined` and the caller would splice the text "undefined" into the
 *   output.
 */
function makeReplacement(
  entity: Entity,
  strategy: RedactStrategy,
  counters: Record<string, number>,
): string {
  switch (strategy) {
    case "token": {
      counters[entity.label] = (counters[entity.label] ?? 0) + 1;
      return `[${entity.label}_${counters[entity.label]}]`;
    }
    case "mask": {
      return "*".repeat(Math.max(entity.text.length, 1));
    }
    case "hash": {
      const digest = createHash("sha256")
        .update(entity.text)
        .digest("hex")
        .slice(0, 12);
      return `[${entity.label}_${digest}]`;
    }
    default: {
      // Exhaustiveness check: fails to compile if a new strategy is added
      // to the union without a matching case above.
      const unknownStrategy: never = strategy;
      throw new Error(
        `Invalid redactStrategy "${String(unknownStrategy)}". Must be one of: token, mask, hash`,
      );
    }
  }
}
|
package/src/scanner.ts
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import type { Entity, FogClawConfig, ScanResult } from "./types.js";
|
|
2
|
+
import { RegexEngine } from "./engines/regex.js";
|
|
3
|
+
import { GlinerEngine } from "./engines/gliner.js";
|
|
4
|
+
|
|
5
|
+
/**
 * Detects entities in text by combining a synchronous regex engine with an
 * optional GLiNER engine.
 *
 * The regex engine is usable as soon as the scanner is constructed. GLiNER
 * only takes part in scans after `initialize()` has completed successfully;
 * until then, or if it fails, scans are regex-only.
 */
export class Scanner {
  private regexEngine: RegexEngine;
  private glinerEngine: GlinerEngine;
  private glinerAvailable = false;
  // Kept for reference; not read again inside this class.
  private config: FogClawConfig;

  /**
   * @param config Supplies the GLiNER model name, confidence threshold and
   *   any custom entity labels.
   */
  constructor(config: FogClawConfig) {
    const { model, confidence_threshold, custom_entities } = config;

    this.config = config;
    this.regexEngine = new RegexEngine();
    this.glinerEngine = new GlinerEngine(model, confidence_threshold);

    if (custom_entities.length > 0) {
      this.glinerEngine.setCustomLabels(custom_entities);
    }
  }

  /**
   * Try to bring up the GLiNER engine.
   *
   * Never rejects because of a GLiNER failure: the error is logged with
   * `console.warn` and the scanner stays in regex-only mode.
   */
  async initialize(): Promise<void> {
    let ready = false;
    try {
      await this.glinerEngine.initialize();
      ready = true;
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(
        `[fogclaw] GLiNER failed to initialize, falling back to regex-only mode: ${reason}`,
      );
    }
    this.glinerAvailable = ready;
  }

  /**
   * Scan `text` with every available engine and return the merged,
   * de-overlapped entities together with the input text.
   *
   * @param text Text to scan; an empty value yields no entities.
   * @param extraLabels Extra labels forwarded to GLiNER for this call only.
   *   They have no effect while GLiNER is unavailable.
   */
  async scan(text: string, extraLabels?: string[]): Promise<ScanResult> {
    if (!text) return { entities: [], text };

    // Regex pass: always runs.
    const regexHits = this.regexEngine.scan(text);

    // GLiNER pass: only once the model is available; a failure here is
    // logged and the scan degrades to the regex results.
    let glinerHits: Entity[] = [];
    if (this.glinerAvailable) {
      try {
        glinerHits = await this.glinerEngine.scan(text, extraLabels);
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.warn(`[fogclaw] GLiNER scan failed, using regex results only: ${reason}`);
      }
    }

    // Regex hits go first in the combined list handed to the deduplicator.
    const entities = deduplicateEntities([...regexHits, ...glinerHits]);
    return { entities, text };
  }
}
|
|
57
|
+
|
|
58
|
+
/**
 * Remove overlapping entity spans.
 *
 * Entities are visited in order of `start` (ties: higher confidence first).
 * When an entity overlaps the most recently kept one, the one with the
 * higher confidence is kept. On equal confidence a regex entity is preferred
 * over a non-regex one; if neither or both are regex, the earlier one stays.
 *
 * Two spans overlap when the later one starts strictly before the earlier
 * one ends; spans that merely touch are both kept.
 *
 * @param entities Candidate entities in any order. Not mutated.
 * @returns Non-overlapping entities sorted by `start`. Inputs of length 0 or
 *   1 are returned as the same array instance.
 */
function deduplicateEntities(entities: Entity[]): Entity[] {
  if (entities.length <= 1) return entities;

  // Sort a copy by start position, then by confidence descending.
  const sorted = [...entities].sort((a, b) =>
    a.start !== b.start ? a.start - b.start : b.confidence - a.confidence,
  );

  const result: Entity[] = [sorted[0]];

  for (let i = 1; i < sorted.length; i++) {
    const current = sorted[i];
    const last = result[result.length - 1];

    if (current.start >= last.end) {
      // No overlap with the last kept span.
      result.push(current);
      continue;
    }

    const higherConfidence = current.confidence > last.confidence;
    // Tie-break: sort order alone would keep whichever span starts first,
    // so the regex preference has to be applied explicitly.
    const regexWinsTie =
      current.confidence === last.confidence &&
      current.source === "regex" &&
      last.source !== "regex";

    if (higherConfidence || regexWinsTie) {
      result[result.length - 1] = current;
    }
    // Otherwise keep what is already in result.
  }

  return result;
}
|
package/src/types.ts
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/** One detected span of sensitive or custom-labelled text. */
export interface Entity {
  /** Detected value as reported by the engine; `redact` records it as the original for its placeholder. */
  text: string;
  /** Entity type label, e.g. `PERSON` or `EMAIL`; used to look up per-label actions. */
  label: string;
  /** Offset in the scanned text where the span begins (inclusive). */
  start: number;
  /** Offset in the scanned text where the span ends (exclusive — `redact` keeps `slice(end)`). */
  end: number;
  /** Detection score; the higher score wins when spans overlap. */
  confidence: number;
  /** Engine that produced this entity. */
  source: "regex" | "gliner";
}

/** How a detected value is rewritten: numbered token, asterisk mask, or truncated hash. */
export type RedactStrategy = "token" | "mask" | "hash";

/** What the guardrail does with a detected entity. */
export type GuardrailAction = "redact" | "block" | "warn";

/** Fully resolved plugin configuration. */
export interface FogClawConfig {
  /** When false, the plugin registers no hook and no tools. */
  enabled: boolean;
  /** Action applied to any label that has no entry in `entityActions`. */
  guardrail_mode: GuardrailAction;
  /** Default redaction strategy when a call does not specify one. */
  redactStrategy: RedactStrategy;
  /** Model identifier handed to the GLiNER engine. */
  model: string;
  /** Threshold handed to the GLiNER engine. */
  confidence_threshold: number;
  /** Extra labels registered with the GLiNER engine when non-empty. */
  custom_entities: string[];
  /** Per-label action overrides, keyed by entity label. */
  entityActions: Record<string, GuardrailAction>;
}

/** Outcome of a scan: the entities found and the text that was scanned. */
export interface ScanResult {
  entities: Entity[];
  text: string;
}

/** Outcome of a redaction. */
export interface RedactResult {
  /** Input text with each entity span replaced by its placeholder. */
  redacted_text: string;
  /** Placeholder → original text. Identical placeholders share one entry (last write wins). */
  mapping: Record<string, string>;
  /** Entities that were redacted. */
  entities: Entity[];
}
|
|
34
|
+
|
|
35
|
+
/**
 * Aliases for entity type names, keyed by the upper-cased alias and mapping
 * to the canonical label used throughout the package.
 */
export const CANONICAL_TYPE_MAP: Record<string, string> = {
  DOB: "DATE",
  ZIP: "ZIP_CODE",
  PER: "PERSON",
  ORG: "ORGANIZATION",
  GPE: "LOCATION",
  LOC: "LOCATION",
  FAC: "ADDRESS",
  PHONE_NUMBER: "PHONE",
  SOCIAL_SECURITY_NUMBER: "SSN",
  CREDIT_CARD_NUMBER: "CREDIT_CARD",
  DATE_OF_BIRTH: "DATE",
};

/**
 * Normalise an entity type name to its canonical label.
 *
 * The name is trimmed and upper-cased, then looked up in
 * `CANONICAL_TYPE_MAP`; names without an alias entry are returned in their
 * trimmed, upper-cased form.
 *
 * @param entityType Raw type name in any letter case.
 * @returns The canonical label.
 */
export function canonicalType(entityType: string): string {
  const key = entityType.trim().toUpperCase();
  const alias = CANONICAL_TYPE_MAP[key];
  return alias ?? key;
}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { loadConfig, DEFAULT_CONFIG } from "../src/config.js";
|
|
3
|
+
|
|
4
|
+
// Unit tests for loadConfig: default merging plus validation of each
// constrained field (guardrail_mode, redactStrategy, confidence_threshold,
// entityActions).
describe("loadConfig", () => {
  it("returns defaults when no overrides are provided", () => {
    const config = loadConfig({});
    expect(config).toEqual(DEFAULT_CONFIG);
  });

  it("merges partial overrides with defaults", () => {
    const config = loadConfig({ guardrail_mode: "block", confidence_threshold: 0.8 });

    expect(config.guardrail_mode).toBe("block");
    expect(config.confidence_threshold).toBe(0.8);
    // Unset defaults are preserved
    expect(config.enabled).toBe(true);
    expect(config.redactStrategy).toBe("token");
    expect(config.model).toBe("onnx-community/gliner_large-v2.1");
    expect(config.custom_entities).toEqual([]);
    expect(config.entityActions).toEqual({});
  });

  it("accepts all valid guardrail_mode values", () => {
    expect(() => loadConfig({ guardrail_mode: "redact" })).not.toThrow();
    expect(() => loadConfig({ guardrail_mode: "block" })).not.toThrow();
    expect(() => loadConfig({ guardrail_mode: "warn" })).not.toThrow();
  });

  it("rejects invalid guardrail_mode", () => {
    // `as never` bypasses the compile-time union so the runtime check is exercised.
    expect(() =>
      loadConfig({ guardrail_mode: "invalid" as never }),
    ).toThrowError(
      'Invalid guardrail_mode "invalid". Must be one of: redact, block, warn',
    );
  });

  it("accepts all valid redactStrategy values", () => {
    expect(() => loadConfig({ redactStrategy: "token" })).not.toThrow();
    expect(() => loadConfig({ redactStrategy: "mask" })).not.toThrow();
    expect(() => loadConfig({ redactStrategy: "hash" })).not.toThrow();
  });

  it("rejects invalid redactStrategy", () => {
    expect(() =>
      loadConfig({ redactStrategy: "plaintext" as never }),
    ).toThrowError(
      'Invalid redactStrategy "plaintext". Must be one of: token, mask, hash',
    );
  });

  it("accepts confidence_threshold at boundaries (0 and 1)", () => {
    // Both ends of the range are inclusive.
    expect(() => loadConfig({ confidence_threshold: 0 })).not.toThrow();
    expect(() => loadConfig({ confidence_threshold: 1 })).not.toThrow();
    expect(() => loadConfig({ confidence_threshold: 0.5 })).not.toThrow();
  });

  it("rejects confidence_threshold below 0", () => {
    expect(() =>
      loadConfig({ confidence_threshold: -0.1 }),
    ).toThrowError("confidence_threshold must be between 0 and 1, got -0.1");
  });

  it("rejects confidence_threshold above 1", () => {
    expect(() =>
      loadConfig({ confidence_threshold: 1.5 }),
    ).toThrowError("confidence_threshold must be between 0 and 1, got 1.5");
  });

  it("accepts valid entityActions values", () => {
    const config = loadConfig({
      entityActions: { PERSON: "redact", EMAIL: "block", SSN: "warn" },
    });
    expect(config.entityActions).toEqual({
      PERSON: "redact",
      EMAIL: "block",
      SSN: "warn",
    });
  });

  it("rejects invalid entityActions values", () => {
    // The error names both the offending action and the entity type it was set for.
    expect(() =>
      loadConfig({
        entityActions: { EMAIL: "delete" as never },
      }),
    ).toThrowError(
      'Invalid action "delete" for entity type "EMAIL". Must be one of: redact, block, warn',
    );
  });

  it("preserves custom_entities from overrides", () => {
    const config = loadConfig({ custom_entities: ["EMPLOYEE_ID", "PROJECT_CODE"] });
    expect(config.custom_entities).toEqual(["EMPLOYEE_ID", "PROJECT_CODE"]);
  });

  it("preserves model from overrides", () => {
    const config = loadConfig({ model: "custom/my-model" });
    expect(config.model).toBe("custom/my-model");
  });

  it("allows disabling via enabled: false", () => {
    const config = loadConfig({ enabled: false });
    expect(config.enabled).toBe(false);
  });
});
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach } from "vitest";
|
|
2
|
+
|
|
3
|
+
// Mock the gliner npm package so we don't need the actual 1.4GB model.
// The stand-in "detects" three fixed phrases by substring search and only
// reports a phrase when its lowercase label was requested.
vi.mock("gliner", () => {
  class MockGliner {
    // Stored only to mirror the constructor signature; never read by the mock.
    private config: any;

    constructor(config: any) {
      this.config = config;
    }

    async initialize(): Promise<void> {
      // No-op in mock
    }

    /**
     * Fake inference: returns at most one hit per known phrase (first
     * occurrence only), each with a fixed score. `options` is accepted but
     * ignored, so no threshold filtering happens here.
     */
    async inference(
      text: string,
      labels: string[],
      options: { threshold: number },
    ): Promise<Array<{ text: string; label: string; score: number; start: number; end: number }>> {
      const results: Array<{ text: string; label: string; score: number; start: number; end: number }> = [];

      // Simulate entity detection for "John Smith"
      const johnIndex = text.indexOf("John Smith");
      if (johnIndex !== -1 && labels.includes("person")) {
        results.push({
          text: "John Smith",
          label: "person",
          score: 0.95,
          start: johnIndex,
          end: johnIndex + "John Smith".length,
        });
      }

      // Simulate entity detection for "Acme Corp"
      const acmeIndex = text.indexOf("Acme Corp");
      if (acmeIndex !== -1 && labels.includes("organization")) {
        results.push({
          text: "Acme Corp",
          label: "organization",
          score: 0.88,
          start: acmeIndex,
          end: acmeIndex + "Acme Corp".length,
        });
      }

      // Simulate entity detection for "New York"
      const nyIndex = text.indexOf("New York");
      if (nyIndex !== -1 && labels.includes("location")) {
        results.push({
          text: "New York",
          label: "location",
          score: 0.91,
          start: nyIndex,
          end: nyIndex + "New York".length,
        });
      }

      return results;
    }
  }

  // Exposed under the name the real package exports.
  return { Gliner: MockGliner };
});
|
|
65
|
+
|
|
66
|
+
import { GlinerEngine } from "../src/engines/gliner.js";
|
|
67
|
+
|
|
68
|
+
// Unit tests for GlinerEngine, run against the mocked "gliner" package
// declared at the top of this file.
describe("GlinerEngine", () => {
  let engine: GlinerEngine;

  // Every test gets a fresh, already-initialized engine.
  beforeEach(async () => {
    engine = new GlinerEngine("onnx-community/gliner_small-v2.5", 0.5);
    await engine.initialize();
  });

  it("detects person entities with canonical PERSON label", async () => {
    const entities = await engine.scan("My name is John Smith and I live here.");

    expect(entities).toHaveLength(1);
    expect(entities[0].text).toBe("John Smith");
    expect(entities[0].label).toBe("PERSON");
  });

  it("detects organization entities with canonical ORGANIZATION label", async () => {
    const entities = await engine.scan("I work at Acme Corp downtown.");

    expect(entities).toHaveLength(1);
    expect(entities[0].text).toBe("Acme Corp");
    expect(entities[0].label).toBe("ORGANIZATION");
  });

  it("detects multiple entity types in the same text", async () => {
    const entities = await engine.scan(
      "John Smith works at Acme Corp in New York.",
    );

    expect(entities).toHaveLength(3);

    const labels = entities.map((e) => e.label);
    expect(labels).toContain("PERSON");
    expect(labels).toContain("ORGANIZATION");
    expect(labels).toContain("LOCATION");
  });

  it("returns empty array for text with no entities", async () => {
    const entities = await engine.scan("Hello world, this is a test.");

    expect(entities).toEqual([]);
  });

  it("returns empty array for empty string input", async () => {
    const entities = await engine.scan("");

    expect(entities).toEqual([]);
  });

  it("allows setting custom labels without crashing", async () => {
    expect(() => engine.setCustomLabels(["product", "event"])).not.toThrow();

    // Scan still works after setting custom labels
    const entities = await engine.scan("John Smith attended the event.");
    expect(entities).toHaveLength(1);
    expect(entities[0].label).toBe("PERSON");
  });

  it("applies canonical type mapping (lowercase person -> PERSON)", async () => {
    // The mock returns lowercase "person" as label; canonicalType should map it to "PERSON"
    const entities = await engine.scan("John Smith is here.");

    expect(entities[0].label).toBe("PERSON");
    // Verify it's not lowercase
    expect(entities[0].label).not.toBe("person");
  });

  it("sets source to gliner for all detected entities", async () => {
    const entities = await engine.scan(
      "John Smith works at Acme Corp in New York.",
    );

    for (const entity of entities) {
      expect(entity.source).toBe("gliner");
    }
  });

  it("confidence comes from model score", async () => {
    const entities = await engine.scan(
      "John Smith works at Acme Corp in New York.",
    );

    const person = entities.find((e) => e.label === "PERSON");
    const org = entities.find((e) => e.label === "ORGANIZATION");
    const loc = entities.find((e) => e.label === "LOCATION");

    // These match the scores set in our mock
    expect(person?.confidence).toBe(0.95);
    expect(org?.confidence).toBe(0.88);
    expect(loc?.confidence).toBe(0.91);
  });

  it("throws if scan is called before initialize", async () => {
    // Deliberately bypasses the beforeEach engine to test the uninitialized path.
    const uninitializedEngine = new GlinerEngine("some-model", 0.5);

    await expect(uninitializedEngine.scan("test")).rejects.toThrow(
      "GLiNER engine not initialized. Call initialize() first.",
    );
  });

  it("reports isInitialized correctly", async () => {
    const freshEngine = new GlinerEngine("some-model", 0.5);
    expect(freshEngine.isInitialized).toBe(false);

    await freshEngine.initialize();
    expect(freshEngine.isInitialized).toBe(true);
  });

  it("includes correct start and end offsets", async () => {
    const text = "Contact John Smith for details.";
    const entities = await engine.scan(text);

    expect(entities).toHaveLength(1);
    expect(entities[0].start).toBe(8); // "Contact " is 8 chars
    expect(entities[0].end).toBe(18); // 8 + "John Smith".length = 18
  });
});
|