@nekzus/liop 1.2.0 → 2.0.0-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +88 -61
- package/dist/bridge/stream.js +14 -6
- package/dist/client/index.js +7 -7
- package/dist/crypto/verifier.d.ts +1 -1
- package/dist/crypto/verifier.js +2 -1
- package/dist/gateway/router.d.ts +7 -0
- package/dist/gateway/router.js +21 -3
- package/dist/sandbox/guardian.js +27 -4
- package/dist/sandbox/wasi.js +25 -0
- package/dist/security/zk.d.ts +1 -1
- package/dist/security/zk.js +11 -1
- package/dist/server/index.d.ts +23 -1
- package/dist/server/index.js +140 -30
- package/dist/server/ner-scanner.d.ts +29 -0
- package/dist/server/ner-scanner.js +141 -0
- package/dist/server/pii.d.ts +27 -1
- package/dist/server/pii.js +167 -5
- package/dist/workers/logic-execution.js +4 -2
- package/dist/workers/zk-verifier.d.ts +2 -0
- package/dist/workers/zk-verifier.js +15 -1
- package/package.json +4 -3
package/dist/server/index.js
CHANGED
|
@@ -10,8 +10,9 @@ import { zodToJsonSchema } from "zod-to-json-schema";
|
|
|
10
10
|
import { MeshNode } from "../mesh/node.js";
|
|
11
11
|
import { LiopRpcServer } from "../rpc/server.js";
|
|
12
12
|
import { log } from "../utils/logger.js";
|
|
13
|
+
import { NerScanner } from "./ner-scanner.js";
|
|
13
14
|
import { PII_PATTERNS, PII_PRESETS, PiiScanner } from "./pii.js";
|
|
14
|
-
export { PII_PATTERNS, PII_PRESETS, PiiScanner };
|
|
15
|
+
export { NerScanner, PII_PATTERNS, PII_PRESETS, PiiScanner };
|
|
15
16
|
/**
|
|
16
17
|
* When enabled, `payload` tools that are not LIOP v1 envelopes are passed through to the
|
|
17
18
|
* registered handler unchanged (no worker extraction). Default off for strict protocol tests.
|
|
@@ -29,6 +30,10 @@ export class LiopServer {
|
|
|
29
30
|
CACHE_TTL_MS = 24 * 60 * 60 * 1000; // 24 hours
|
|
30
31
|
THROTTLE_THRESHOLD = 5;
|
|
31
32
|
THROTTLE_COOLDOWN_MS = 60 * 1000; // 60 seconds
|
|
33
|
+
// [OWASP-A01] Sliding window rate limiter — prevents micro-query exfiltration
|
|
34
|
+
toolCallWindows = new Map();
|
|
35
|
+
toolCallMaxPerWindow;
|
|
36
|
+
toolCallWindowMs;
|
|
32
37
|
tools = new Map();
|
|
33
38
|
resources = new Map();
|
|
34
39
|
prompts = new Map();
|
|
@@ -67,8 +72,10 @@ export class LiopServer {
|
|
|
67
72
|
const compact = logic.replace(/\s+/g, " ");
|
|
68
73
|
if (policy.enforceAggregationFirst) {
|
|
69
74
|
const rowExtractionPatterns = [
|
|
70
|
-
|
|
71
|
-
|
|
75
|
+
// Block raw record dumps but allow safe aggregation chains
|
|
76
|
+
// (.reduce, .length, .filter().length, .every, .some)
|
|
77
|
+
/return\s+env\.records(?!\s*\.\s*(?:reduce|length|filter|every|some|find)\b)/i,
|
|
78
|
+
/return\s*\{[\s\S]*\b(accounts|patients|rows|records)\s*:\s*env\.records(?!\s*\.\s*(?:reduce|length|filter)\b)/i,
|
|
72
79
|
];
|
|
73
80
|
if (rowExtractionPatterns.some((p) => p.test(compact))) {
|
|
74
81
|
return "Preflight policy rejected: potential row-level export pattern detected.";
|
|
@@ -84,15 +91,29 @@ export class LiopServer {
|
|
|
84
91
|
return null;
|
|
85
92
|
const parsed = this.parseUnknownJson(output);
|
|
86
93
|
if (policy.outputSchema) {
|
|
87
|
-
|
|
94
|
+
// SEC-HARDENING: Force strict mode on ZodObject schemas to prevent
|
|
95
|
+
// key aliasing bypasses via .passthrough(). However, respect schemas
|
|
96
|
+
// that explicitly use .catchall() — calling .strict() would override
|
|
97
|
+
// the catchall with ZodNever, destroying the developer's intent.
|
|
98
|
+
const effectiveSchema = (() => {
|
|
99
|
+
if (!(policy.outputSchema instanceof z.ZodObject)) {
|
|
100
|
+
return policy.outputSchema;
|
|
101
|
+
}
|
|
102
|
+
const obj = policy.outputSchema;
|
|
103
|
+
// If schema has an explicit catchall (not ZodNever), respect it
|
|
104
|
+
if (!(obj._def.catchall instanceof z.ZodNever)) {
|
|
105
|
+
return obj;
|
|
106
|
+
}
|
|
107
|
+
// Otherwise force strict to block unrecognized keys by default
|
|
108
|
+
return obj.strict();
|
|
109
|
+
})();
|
|
110
|
+
const schemaResult = effectiveSchema.safeParse(parsed);
|
|
88
111
|
if (!schemaResult.success) {
|
|
89
|
-
//
|
|
90
|
-
|
|
91
|
-
? parsed.slice(0, 200)
|
|
92
|
-
: JSON.stringify(parsed).slice(0, 200);
|
|
112
|
+
// SEC-CRITICAL: Never expose rejected data in error messages.
|
|
113
|
+
// Only report the structural violation (unrecognized keys, type mismatches).
|
|
93
114
|
return `[LIOP] Output schema violation for ${toolName}: ${schemaResult.error.issues
|
|
94
115
|
.map((i) => `${i.path.join(".") || "<root>"} ${i.message}`)
|
|
95
|
-
.join("; ")}.
|
|
116
|
+
.join("; ")}. HINT: Your output must conform to the declared schema. Use 'env.records' to access the dataset and return only allowed fields.`;
|
|
96
117
|
}
|
|
97
118
|
}
|
|
98
119
|
if (policy.enforceAggregationFirst &&
|
|
@@ -143,6 +164,14 @@ export class LiopServer {
|
|
|
143
164
|
return this.unwrapForAggregationPolicyScan(joined);
|
|
144
165
|
}
|
|
145
166
|
violatesAggregationFirstPolicy(input, policyObj) {
|
|
167
|
+
const maxRows = typeof policyObj === "object" &&
|
|
168
|
+
typeof policyObj.maxOutputRows === "number"
|
|
169
|
+
? policyObj.maxOutputRows
|
|
170
|
+
: 10;
|
|
171
|
+
const allowPrimitives = typeof policyObj === "object" &&
|
|
172
|
+
typeof policyObj.allowPrimitiveArrays === "boolean"
|
|
173
|
+
? policyObj.allowPrimitiveArrays
|
|
174
|
+
: true;
|
|
146
175
|
if (typeof input === "string") {
|
|
147
176
|
const trimmed = input.trim();
|
|
148
177
|
if ((trimmed.startsWith("{") && trimmed.endsWith("}")) ||
|
|
@@ -157,14 +186,6 @@ export class LiopServer {
|
|
|
157
186
|
return false;
|
|
158
187
|
}
|
|
159
188
|
if (Array.isArray(input)) {
|
|
160
|
-
const maxRows = typeof policyObj === "object" &&
|
|
161
|
-
typeof policyObj.maxOutputRows === "number"
|
|
162
|
-
? policyObj.maxOutputRows
|
|
163
|
-
: 10;
|
|
164
|
-
const allowPrimitives = typeof policyObj === "object" &&
|
|
165
|
-
typeof policyObj.allowPrimitiveArrays === "boolean"
|
|
166
|
-
? policyObj.allowPrimitiveArrays
|
|
167
|
-
: true;
|
|
168
189
|
if (input.length > 0 &&
|
|
169
190
|
input.every((item) => typeof item === "object" && item !== null)) {
|
|
170
191
|
// Treat tabular row export as non-aggregated leakage risk if above threshold.
|
|
@@ -182,6 +203,11 @@ export class LiopServer {
|
|
|
182
203
|
return input.some((item) => this.violatesAggregationFirstPolicy(item, policyObj));
|
|
183
204
|
}
|
|
184
205
|
if (input && typeof input === "object") {
|
|
206
|
+
const keys = Object.keys(input);
|
|
207
|
+
// Treat flat dictionary with too many keys as non-aggregated leakage risk (Dynamic Key Bypass).
|
|
208
|
+
if (keys.length > maxRows) {
|
|
209
|
+
return true;
|
|
210
|
+
}
|
|
185
211
|
return Object.values(input).some((value) => this.violatesAggregationFirstPolicy(value, policyObj));
|
|
186
212
|
}
|
|
187
213
|
return false;
|
|
@@ -189,6 +215,9 @@ export class LiopServer {
|
|
|
189
215
|
constructor(serverInfo, config) {
|
|
190
216
|
this.serverInfo = serverInfo;
|
|
191
217
|
this.config = config;
|
|
218
|
+
const nerScanner = this.config?.security?.enableNerScanning
|
|
219
|
+
? new NerScanner()
|
|
220
|
+
: null;
|
|
192
221
|
this.piiScanner = new PiiScanner(this.config?.security?.piiPatterns ?? PII_PRESETS.GLOBAL_STRICT, this.config?.security?.forbiddenKeys ?? [
|
|
193
222
|
"id",
|
|
194
223
|
"name",
|
|
@@ -210,7 +239,15 @@ export class LiopServer {
|
|
|
210
239
|
"token",
|
|
211
240
|
"secret",
|
|
212
241
|
"privateKey",
|
|
213
|
-
]);
|
|
242
|
+
], nerScanner);
|
|
243
|
+
// [OWASP-A01] Rate limit: config > env > default (30 calls/min)
|
|
244
|
+
const rlConfig = this.config?.security?.rateLimit;
|
|
245
|
+
this.toolCallWindowMs =
|
|
246
|
+
rlConfig?.windowMs ??
|
|
247
|
+
Number.parseInt(process.env.LIOP_RATE_LIMIT_WINDOW_MS ?? "60000", 10);
|
|
248
|
+
this.toolCallMaxPerWindow =
|
|
249
|
+
rlConfig?.maxPerWindow ??
|
|
250
|
+
Number.parseInt(process.env.LIOP_RATE_LIMIT_MAX ?? "30", 10);
|
|
214
251
|
// Initialize Zero-Blocking Worker Pool for Heavy Cryptography & Sandboxing
|
|
215
252
|
const isTS = import.meta.url.endsWith(".ts");
|
|
216
253
|
const workerExt = isTS ? ".ts" : ".js";
|
|
@@ -239,6 +276,12 @@ export class LiopServer {
|
|
|
239
276
|
maxQueue: "auto",
|
|
240
277
|
taskQueue: new FixedQueue(),
|
|
241
278
|
execArgv,
|
|
279
|
+
// [DoS Defense] Enforce hard memory ceiling per worker thread.
|
|
280
|
+
// Workers exceeding this limit are terminated by Node.js runtime.
|
|
281
|
+
resourceLimits: {
|
|
282
|
+
maxOldGenerationSizeMb: this.config?.workerPool?.maxHeapMb ??
|
|
283
|
+
Number.parseInt(process.env.LIOP_WORKER_MAX_HEAP_MB ?? "64", 10),
|
|
284
|
+
},
|
|
242
285
|
});
|
|
243
286
|
// [Token Economy] Auto-register LIOP protocol spec as a single Resource.
|
|
244
287
|
// This centralizes the envelope documentation that was previously
|
|
@@ -568,6 +611,37 @@ Protocol Adherence is mandatory for successful execution.`,
|
|
|
568
611
|
this.logicCache.clear();
|
|
569
612
|
log.info("[LIOP-SDK] AST Security Cache cleared by Admin.");
|
|
570
613
|
}
|
|
614
|
+
/**
|
|
615
|
+
* Sliding window rate limiter for tool call frequency.
|
|
616
|
+
* Prevents micro-query exfiltration attacks where an attacker
|
|
617
|
+
* makes hundreds of individually-legitimate calls to reconstruct
|
|
618
|
+
* the full dataset field by field. (OWASP A01)
|
|
619
|
+
*/
|
|
620
|
+
checkToolCallRateLimit(toolName) {
|
|
621
|
+
const now = Date.now();
|
|
622
|
+
const windowMs = this.toolCallWindowMs;
|
|
623
|
+
const maxPerWindow = this.toolCallMaxPerWindow;
|
|
624
|
+
const window = this.toolCallWindows.get(toolName) || [];
|
|
625
|
+
// Evict expired timestamps outside the sliding window
|
|
626
|
+
const active = window.filter((t) => now - t < windowMs);
|
|
627
|
+
if (active.length >= maxPerWindow) {
|
|
628
|
+
const retryAfterSec = Math.ceil((active[0] + windowMs - now) / 1000);
|
|
629
|
+
return {
|
|
630
|
+
content: [
|
|
631
|
+
{
|
|
632
|
+
type: "text",
|
|
633
|
+
text: `LIOP_RATE_LIMITED: Too many calls to ${toolName}. ` +
|
|
634
|
+
`Max ${maxPerWindow} per ${windowMs / 1000}s window. ` +
|
|
635
|
+
`Retry after ${retryAfterSec}s.`,
|
|
636
|
+
},
|
|
637
|
+
],
|
|
638
|
+
isError: true,
|
|
639
|
+
};
|
|
640
|
+
}
|
|
641
|
+
active.push(now);
|
|
642
|
+
this.toolCallWindows.set(toolName, active);
|
|
643
|
+
return null;
|
|
644
|
+
}
|
|
571
645
|
/**
|
|
572
646
|
* Emulates calling a tool (used locally or via LIOPMcpBridge)
|
|
573
647
|
*/
|
|
@@ -576,6 +650,10 @@ Protocol Adherence is mandatory for successful execution.`,
|
|
|
576
650
|
if (!entry) {
|
|
577
651
|
throw new Error(`Tool not found: ${request.name}`);
|
|
578
652
|
}
|
|
653
|
+
// [OWASP-A01] Rate limiting: prevent micro-query exfiltration
|
|
654
|
+
const rateLimitResult = this.checkToolCallRateLimit(request.name);
|
|
655
|
+
if (rateLimitResult)
|
|
656
|
+
return rateLimitResult;
|
|
579
657
|
try {
|
|
580
658
|
// Validate inputs natively with Zod before execution
|
|
581
659
|
const parsedArgs = entry.schema.parse(request.arguments || {});
|
|
@@ -813,10 +891,11 @@ Protocol Adherence is mandatory for successful execution.`,
|
|
|
813
891
|
]);
|
|
814
892
|
const aggregationViolation = this.violatesAggregationFirstPolicy(this.unwrapForAggregationPolicyScan(finalOutput));
|
|
815
893
|
if (violation || aggregationViolation) {
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
log.info(`[LIOP-RPC] Secure egress blocked in gRPC stream: ${
|
|
819
|
-
response.semantic_evidence =
|
|
894
|
+
// SEC-CRITICAL: Log details server-side, never expose to caller
|
|
895
|
+
const internalReason = violation || "Aggregation-First Policy Violation";
|
|
896
|
+
log.info(`[LIOP-RPC] Secure egress blocked in gRPC stream: ${internalReason}`);
|
|
897
|
+
response.semantic_evidence =
|
|
898
|
+
"[LIOP] Egress Security Violation. Output blocked due to policy enforcement.";
|
|
820
899
|
response.is_error = true;
|
|
821
900
|
}
|
|
822
901
|
call.write(response, () => {
|
|
@@ -825,10 +904,16 @@ Protocol Adherence is mandatory for successful execution.`,
|
|
|
825
904
|
}
|
|
826
905
|
catch (error) {
|
|
827
906
|
const e = error;
|
|
828
|
-
|
|
907
|
+
const isDev = process.env.NODE_ENV === "development" ||
|
|
908
|
+
process.env.NODE_ENV === "test";
|
|
909
|
+
const detail = e.message || String(error);
|
|
910
|
+
log.error(`[LIOP-RPC] Execution Error: ${detail}`);
|
|
911
|
+
const errorMessage = isDev
|
|
912
|
+
? `Execution Error: ${detail}`
|
|
913
|
+
: "[LIOP] Execution Failed. The injected logic violated runtime constraints or encountered a fatal error.";
|
|
829
914
|
// Send error response before closing, avoiding "stream closed without results"
|
|
830
915
|
const errorResponse = {
|
|
831
|
-
semantic_evidence:
|
|
916
|
+
semantic_evidence: errorMessage,
|
|
832
917
|
cryptographic_proof: Buffer.from(""),
|
|
833
918
|
zk_receipt: Buffer.from(""),
|
|
834
919
|
is_error: true,
|
|
@@ -881,9 +966,20 @@ Protocol Adherence is mandatory for successful execution.`,
|
|
|
881
966
|
: undefined;
|
|
882
967
|
const policyViolation = this.validateOutputPolicy(toolName || "unknown_tool", workerResponse.output, toolPolicy);
|
|
883
968
|
if (policyViolation) {
|
|
969
|
+
// SEC-CRITICAL: Log details server-side, never expose to caller in Production
|
|
884
970
|
log.info(`[LIOP-SDK] Output policy blocked for ${toolName || "unknown_tool"}: ${policyViolation}`);
|
|
971
|
+
const isDev = process.env.NODE_ENV === "development" ||
|
|
972
|
+
process.env.NODE_ENV === "test";
|
|
973
|
+
const errorMessage = isDev
|
|
974
|
+
? policyViolation
|
|
975
|
+
: "[LIOP] Egress Security Violation. Output blocked due to policy enforcement. HINT: Return only aggregated, non-PII results using .reduce() to produce a flat {key:value} object with allowed schema fields.";
|
|
885
976
|
return {
|
|
886
|
-
content: [
|
|
977
|
+
content: [
|
|
978
|
+
{
|
|
979
|
+
type: "text",
|
|
980
|
+
text: errorMessage,
|
|
981
|
+
},
|
|
982
|
+
],
|
|
887
983
|
isError: true,
|
|
888
984
|
};
|
|
889
985
|
}
|
|
@@ -891,14 +987,21 @@ Protocol Adherence is mandatory for successful execution.`,
|
|
|
891
987
|
const violation = this.piiScanner.scan(content);
|
|
892
988
|
const aggregationViolation = this.violatesAggregationFirstPolicy(workerResponse.output);
|
|
893
989
|
if (violation || aggregationViolation) {
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
990
|
+
// SEC-CRITICAL: Log the specific violation reason server-side only.
|
|
991
|
+
// Never expose detection details (entity names, matched values) to the caller in Production.
|
|
992
|
+
const internalReason = violation ||
|
|
993
|
+
"Aggregation-First Policy Violation: Output blocked due to dynamic flat-key policy enforcement.";
|
|
994
|
+
log.info(`[LIOP-SDK] Secure egress blocked in local execution: ${internalReason}`);
|
|
995
|
+
const isDev = process.env.NODE_ENV === "development" ||
|
|
996
|
+
process.env.NODE_ENV === "test";
|
|
997
|
+
const errorMessage = isDev
|
|
998
|
+
? `[LIOP] Egress Security Violation: ${internalReason}`
|
|
999
|
+
: "[LIOP] Egress Security Violation. Output blocked due to policy enforcement. HINT: Return only aggregated, non-PII results using .reduce() to produce a flat {key:value} object with allowed schema fields.";
|
|
897
1000
|
return {
|
|
898
1001
|
content: [
|
|
899
1002
|
{
|
|
900
1003
|
type: "text",
|
|
901
|
-
text:
|
|
1004
|
+
text: errorMessage,
|
|
902
1005
|
},
|
|
903
1006
|
],
|
|
904
1007
|
isError: true,
|
|
@@ -908,11 +1011,18 @@ Protocol Adherence is mandatory for successful execution.`,
|
|
|
908
1011
|
}
|
|
909
1012
|
catch (error) {
|
|
910
1013
|
const e = error;
|
|
1014
|
+
const isDev = process.env.NODE_ENV === "development" ||
|
|
1015
|
+
process.env.NODE_ENV === "test";
|
|
1016
|
+
const detail = e.message || String(error);
|
|
1017
|
+
log.error(`[LIOP-SDK] WorkerPool Execution Fault: ${detail}`);
|
|
1018
|
+
const errorMessage = isDev
|
|
1019
|
+
? `WorkerPoolError: ${detail}`
|
|
1020
|
+
: "[LIOP] Execution Failed. The injected logic violated runtime constraints or encountered a fatal error.";
|
|
911
1021
|
return {
|
|
912
1022
|
content: [
|
|
913
1023
|
{
|
|
914
1024
|
type: "text",
|
|
915
|
-
text:
|
|
1025
|
+
text: errorMessage,
|
|
916
1026
|
},
|
|
917
1027
|
],
|
|
918
1028
|
isError: true,
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/** Single named entity detected by the NER scanner. */
export interface NerEntity {
    /** Category the entity was reported under. */
    type: "person" | "place" | "organization";
    /** Matched text, trimmed of surrounding whitespace. */
    text: string;
}
/** Result of an NER scan operation. */
export interface NerScanResult {
    /** True when at least one entity was collected. */
    detected: boolean;
    /** Entities collected by the scan; empty when nothing was detected. */
    entities: NerEntity[];
}
/**
 * Scans text content for named entities that may represent PII.
 * Uses `compromise/three` for person, place, and organization detection.
 *
 * Designed for egress filtering — optimized for recall over precision
 * to ensure sensitive data does not leak through aliased output keys.
 */
export declare class NerScanner {
    /**
     * Scans a single string value for named entities.
     * Strings shorter than four characters, or made up only of digits,
     * whitespace and punctuation, are reported as clean without analysis.
     */
    scan(text: string): NerScanResult;
    /**
     * Recursively scans all string values within an object/array
     * (object keys are not scanned).
     * Returns early as soon as a person entity is found; place and
     * organization hits are accumulated while the traversal continues.
     *
     * @param seen Objects already visited, used to skip cyclic references.
     */
    scanDeep(input: unknown, seen?: WeakSet<object>): NerScanResult;
}
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LIOP NER Content Scanner (The Shield V3 — Named Entity Recognition Layer)
|
|
3
|
+
*
|
|
4
|
+
* Lightweight NER scanner using `compromise` NLP for detecting
|
|
5
|
+
* person names, places, and organizations in free-text output values.
|
|
6
|
+
*
|
|
7
|
+
* This layer operates AFTER the regex-based PII scanner and
|
|
8
|
+
* catches entities that lack a deterministic format pattern
|
|
9
|
+
* (e.g., "Evelyn Reed" cannot be detected by regex).
|
|
10
|
+
*
|
|
11
|
+
* Architecture: opt-in per-server via `enableNerScanning: true`.
|
|
12
|
+
* Performance: ~10ms for typical SDK output sizes (< 10KB).
|
|
13
|
+
*
|
|
14
|
+
* @see https://github.com/spencermountain/compromise
|
|
15
|
+
*/
|
|
16
|
+
import nlp from "compromise/three";
|
|
17
|
+
/**
 * Medical/pharmaceutical vocabulary safelist.
 *
 * Maps lowercase terms to a custom tag name: drug names to "Medication" and
 * generic clinical terms to "Condition". The map is handed to
 * `nlp.addWords()` so these words are not misclassified as person or
 * organization names. Extend the lists as new false positives arise.
 */
const MEDICATION_TERMS = [
    "aspirin",
    "lisinopril",
    "metformin",
    "amlodipine",
    "atorvastatin",
    "omeprazole",
    "losartan",
    "simvastatin",
    "levothyroxine",
    "ibuprofen",
    "acetaminophen",
    "amoxicillin",
    "ciprofloxacin",
    "prednisone",
    "warfarin",
    "insulin",
    "hydrochlorothiazide",
    "gabapentin",
    "albuterol",
    "pantoprazole",
];
// Generic clinical terms
const CONDITION_TERMS = [
    "hypertension",
    "diabetes",
    "bronchitis",
    "pneumonia",
    "asthma",
];
const MEDICAL_VOCABULARY = Object.fromEntries([
    ...MEDICATION_TERMS.map((term) => [term, "Medication"]),
    ...CONDITION_TERMS.map((term) => [term, "Condition"]),
]);
|
|
51
|
+
// Register medical vocabulary BEFORE any scan operations.
|
|
52
|
+
// compromise's addWords() overrides the default classification,
|
|
53
|
+
// preventing these terms from being tagged as #Person or #Organization.
|
|
54
|
+
nlp.addWords(MEDICAL_VOCABULARY);
|
|
55
|
+
// Minimum string length to attempt NER analysis.
|
|
56
|
+
// Shorter strings are unlikely to contain meaningful named entities.
|
|
57
|
+
const MIN_TEXT_LENGTH = 4;
|
|
58
|
+
// Pattern to identify strings that are purely numeric/symbolic (skip NER)
|
|
59
|
+
const NON_TEXT_PATTERN = /^[\d\s.,:;!?()[\]{}<>@#$%^&*+=|\\/"'`~_-]+$/;
|
|
60
|
+
/**
|
|
61
|
+
* Scans text content for named entities that may represent PII.
|
|
62
|
+
* Uses `compromise/three` for person, place, and organization detection.
|
|
63
|
+
*
|
|
64
|
+
* Designed for egress filtering — optimized for recall over precision
|
|
65
|
+
* to ensure sensitive data does not leak through aliased output keys.
|
|
66
|
+
*/
|
|
67
|
+
export class NerScanner {
|
|
68
|
+
/**
|
|
69
|
+
* Scans a single string value for named entities.
|
|
70
|
+
* Returns detected entities if the text contains recognizable PII.
|
|
71
|
+
*/
|
|
72
|
+
scan(text) {
|
|
73
|
+
if (text.length < MIN_TEXT_LENGTH || NON_TEXT_PATTERN.test(text)) {
|
|
74
|
+
return { detected: false, entities: [] };
|
|
75
|
+
}
|
|
76
|
+
const doc = nlp(text);
|
|
77
|
+
const entities = [];
|
|
78
|
+
const people = doc.people().out("array");
|
|
79
|
+
for (const person of people) {
|
|
80
|
+
const trimmed = person.trim();
|
|
81
|
+
if (trimmed.length >= MIN_TEXT_LENGTH) {
|
|
82
|
+
entities.push({ type: "person", text: trimmed });
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
const places = doc.places().out("array");
|
|
86
|
+
for (const place of places) {
|
|
87
|
+
const trimmed = place.trim();
|
|
88
|
+
if (trimmed.length >= MIN_TEXT_LENGTH) {
|
|
89
|
+
entities.push({ type: "place", text: trimmed });
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
const orgs = doc.organizations().out("array");
|
|
93
|
+
for (const org of orgs) {
|
|
94
|
+
const trimmed = org.trim();
|
|
95
|
+
if (trimmed.length >= MIN_TEXT_LENGTH) {
|
|
96
|
+
entities.push({ type: "organization", text: trimmed });
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
return {
|
|
100
|
+
detected: entities.length > 0,
|
|
101
|
+
entities,
|
|
102
|
+
};
|
|
103
|
+
}
|
|
104
|
+
/**
|
|
105
|
+
* Recursively scans all string values within an object/array.
|
|
106
|
+
* Stops at the first detection for performance (fail-fast).
|
|
107
|
+
*/
|
|
108
|
+
scanDeep(input, seen = new WeakSet()) {
|
|
109
|
+
if (input === null || input === undefined) {
|
|
110
|
+
return { detected: false, entities: [] };
|
|
111
|
+
}
|
|
112
|
+
if (typeof input === "string") {
|
|
113
|
+
return this.scan(input);
|
|
114
|
+
}
|
|
115
|
+
if (typeof input === "object") {
|
|
116
|
+
if (seen.has(input)) {
|
|
117
|
+
return { detected: false, entities: [] };
|
|
118
|
+
}
|
|
119
|
+
seen.add(input);
|
|
120
|
+
const values = Array.isArray(input)
|
|
121
|
+
? input
|
|
122
|
+
: Object.values(input);
|
|
123
|
+
const allEntities = [];
|
|
124
|
+
for (const value of values) {
|
|
125
|
+
const result = this.scanDeep(value, seen);
|
|
126
|
+
if (result.detected) {
|
|
127
|
+
allEntities.push(...result.entities);
|
|
128
|
+
// Fail-fast: return immediately on first person detection
|
|
129
|
+
if (result.entities.some((e) => e.type === "person")) {
|
|
130
|
+
return { detected: true, entities: allEntities };
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
return {
|
|
135
|
+
detected: allEntities.length > 0,
|
|
136
|
+
entities: allEntities,
|
|
137
|
+
};
|
|
138
|
+
}
|
|
139
|
+
return { detected: false, entities: [] };
|
|
140
|
+
}
|
|
141
|
+
}
|
package/dist/server/pii.d.ts
CHANGED
|
@@ -30,11 +30,37 @@ export declare const PII_PRESETS: {
|
|
|
30
30
|
export declare class PiiScanner {
    private patterns;
    private forbiddenKeysSet;
    private nerScanner;
    /**
     * Keys that contain a forbidden substring but are NOT PII.
     * Guards fuzzy matching against false positives (e.g., "grid" contains "id").
     */
    private static readonly KEY_SAFELIST;
    /**
     * Boundary-aware patterns for short forbidden tokens (< 4 chars).
     * Regex boundary detection is used for these to avoid false positives.
     */
    private shortTokenBoundaryPatterns;
    /**
     * Long forbidden tokens (>= 4 chars), matched by substring containment.
     */
    private longForbiddenTokens;
    constructor(patterns?: PiiRule[], forbiddenKeys?: string[], nerScanner?: import("./ner-scanner.js").NerScanner | null);
    /**
     * Scans any input (string, object, array) for PII violations.
     * Returns the name of the pattern/rule that triggered, or null if safe.
     *
     * Fail-fast detection pipeline:
     *   1. Exact key match (O(1) Set lookup)
     *   2. Fuzzy key match (boundary detection for short tokens, substring for long)
     *   3. Regex/algorithmic pattern match on string values
     *   4. NER content scan on string values (if enabled)
     */
    scan(input: unknown, seen?: WeakSet<object>): string | null;
    /**
     * Applies the fuzzy matching rules to a single key:
     * boundary-aware regex for short tokens, substring containment for long ones.
     */
    private checkKeyFuzzy;
    private checkString;
}
|