shroud-privacy 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +190 -0
- package/NOTICE +7 -0
- package/README.md +369 -0
- package/dist/audit.d.ts +46 -0
- package/dist/audit.js +127 -0
- package/dist/canary.d.ts +31 -0
- package/dist/canary.js +73 -0
- package/dist/config.d.ts +27 -0
- package/dist/config.js +123 -0
- package/dist/detectors/base.d.ts +8 -0
- package/dist/detectors/base.js +2 -0
- package/dist/detectors/code.d.ts +25 -0
- package/dist/detectors/code.js +144 -0
- package/dist/detectors/context.d.ts +31 -0
- package/dist/detectors/context.js +357 -0
- package/dist/detectors/patterns.d.ts +15 -0
- package/dist/detectors/patterns.js +58 -0
- package/dist/detectors/regex.d.ts +28 -0
- package/dist/detectors/regex.js +955 -0
- package/dist/generators/base.d.ts +6 -0
- package/dist/generators/base.js +2 -0
- package/dist/generators/codes.d.ts +20 -0
- package/dist/generators/codes.js +231 -0
- package/dist/generators/names.d.ts +29 -0
- package/dist/generators/names.js +194 -0
- package/dist/generators/network.d.ts +86 -0
- package/dist/generators/network.js +477 -0
- package/dist/hooks.d.ts +27 -0
- package/dist/hooks.js +457 -0
- package/dist/index.d.ts +12 -0
- package/dist/index.js +58 -0
- package/dist/mapping.d.ts +33 -0
- package/dist/mapping.js +72 -0
- package/dist/obfuscator.d.ts +78 -0
- package/dist/obfuscator.js +603 -0
- package/dist/redaction.d.ts +26 -0
- package/dist/redaction.js +76 -0
- package/dist/store.d.ts +40 -0
- package/dist/store.js +79 -0
- package/dist/types.d.ts +101 -0
- package/dist/types.js +35 -0
- package/ncg_adapter.py +530 -0
- package/openclaw.plugin.json +72 -0
- package/package.json +56 -0
- package/shroud_bridge.mjs +225 -0
package/dist/audit.js
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tamper-evident audit log for PII detection events (in-memory only).
|
|
3
|
+
*
|
|
4
|
+
* Logs what was detected (category, count, timestamp) WITHOUT storing real values.
|
|
5
|
+
* Uses HMAC chaining for tamper evidence -- each log entry includes a hash of
|
|
6
|
+
* the previous entry, so any modification/deletion is detectable.
|
|
7
|
+
*/
|
|
8
|
+
import { createHash, createHmac, randomBytes } from "node:crypto";
|
|
9
|
+
export class AuditLogger {
|
|
10
|
+
_secret;
|
|
11
|
+
_sessionId;
|
|
12
|
+
_maxEntries;
|
|
13
|
+
_lastHash;
|
|
14
|
+
_entries;
|
|
15
|
+
_stats;
|
|
16
|
+
constructor(secretKey, maxEntries = 200) {
|
|
17
|
+
this._secret = Buffer.from(secretKey, "utf-8");
|
|
18
|
+
this._sessionId = createHash("sha256")
|
|
19
|
+
.update(`${secretKey}:${Date.now()}`)
|
|
20
|
+
.digest("hex")
|
|
21
|
+
.slice(0, 12);
|
|
22
|
+
this._maxEntries = maxEntries;
|
|
23
|
+
this._lastHash = "0".repeat(64); // Genesis hash
|
|
24
|
+
this._entries = [];
|
|
25
|
+
this._stats = {
|
|
26
|
+
totalObfuscationEvents: 0,
|
|
27
|
+
totalDeobfuscationEvents: 0,
|
|
28
|
+
totalEntities: 0,
|
|
29
|
+
totalReplacements: 0,
|
|
30
|
+
byCategory: {},
|
|
31
|
+
};
|
|
32
|
+
}
|
|
33
|
+
/** Generate a unique request ID. */
|
|
34
|
+
static generateRequestId() {
|
|
35
|
+
return randomBytes(8).toString("hex");
|
|
36
|
+
}
|
|
37
|
+
/** Log an obfuscation event (no real values stored). */
|
|
38
|
+
logObfuscation(entities, textLength, requestId, processingTimeMs) {
|
|
39
|
+
if (entities.length === 0)
|
|
40
|
+
return;
|
|
41
|
+
// Aggregate by category
|
|
42
|
+
const categories = {};
|
|
43
|
+
for (const entity of entities) {
|
|
44
|
+
const cat = entity.category;
|
|
45
|
+
categories[cat] = (categories[cat] ?? 0) + 1;
|
|
46
|
+
}
|
|
47
|
+
this._writeEntry("obfuscation", categories, entities.length, textLength, requestId, processingTimeMs);
|
|
48
|
+
// Update running stats
|
|
49
|
+
this._stats.totalObfuscationEvents += 1;
|
|
50
|
+
this._stats.totalEntities += entities.length;
|
|
51
|
+
for (const [cat, count] of Object.entries(categories)) {
|
|
52
|
+
this._stats.byCategory[cat] = (this._stats.byCategory[cat] ?? 0) + count;
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
/** Log a deobfuscation event. */
|
|
56
|
+
logDeobfuscation(replacementsMade, requestId, processingTimeMs) {
|
|
57
|
+
if (replacementsMade <= 0)
|
|
58
|
+
return;
|
|
59
|
+
this._writeEntry("deobfuscation", {}, replacementsMade, 0, requestId, processingTimeMs);
|
|
60
|
+
// Update running stats
|
|
61
|
+
this._stats.totalDeobfuscationEvents += 1;
|
|
62
|
+
this._stats.totalReplacements += replacementsMade;
|
|
63
|
+
}
|
|
64
|
+
_writeEntry(eventType, categories, totalEntities, textLength, requestId, processingTimeMs) {
|
|
65
|
+
const ts = Date.now();
|
|
66
|
+
const tsIso = new Date(ts).toISOString();
|
|
67
|
+
// Compute chain hash
|
|
68
|
+
const sortedCategories = JSON.stringify(categories, Object.keys(categories).sort());
|
|
69
|
+
const payload = `${this._lastHash}:${ts}:${eventType}:${sortedCategories}`;
|
|
70
|
+
const chainHash = createHmac("sha256", this._secret)
|
|
71
|
+
.update(payload)
|
|
72
|
+
.digest("hex");
|
|
73
|
+
const entry = {
|
|
74
|
+
timestamp: ts,
|
|
75
|
+
timestampIso: tsIso,
|
|
76
|
+
eventType,
|
|
77
|
+
sessionId: this._sessionId,
|
|
78
|
+
requestId: requestId ?? AuditLogger.generateRequestId(),
|
|
79
|
+
categories,
|
|
80
|
+
totalEntities,
|
|
81
|
+
textLength,
|
|
82
|
+
processingTimeMs: Math.round((processingTimeMs ?? 0) * 100) / 100,
|
|
83
|
+
chainHash,
|
|
84
|
+
};
|
|
85
|
+
this._lastHash = chainHash;
|
|
86
|
+
// Ring buffer: drop oldest if at capacity
|
|
87
|
+
if (this._entries.length >= this._maxEntries) {
|
|
88
|
+
this._entries.shift();
|
|
89
|
+
}
|
|
90
|
+
this._entries.push(entry);
|
|
91
|
+
}
|
|
92
|
+
/** Return aggregate statistics (safe to expose). */
|
|
93
|
+
getStats() {
|
|
94
|
+
return {
|
|
95
|
+
sessionId: this._sessionId,
|
|
96
|
+
totalEvents: this._stats.totalObfuscationEvents +
|
|
97
|
+
this._stats.totalDeobfuscationEvents,
|
|
98
|
+
totalObfuscationEvents: this._stats.totalObfuscationEvents,
|
|
99
|
+
totalDeobfuscationEvents: this._stats.totalDeobfuscationEvents,
|
|
100
|
+
totalEntitiesScrubbed: this._stats.totalEntities,
|
|
101
|
+
totalReplacementsRestored: this._stats.totalReplacements,
|
|
102
|
+
byCategory: { ...this._stats.byCategory },
|
|
103
|
+
};
|
|
104
|
+
}
|
|
105
|
+
/**
|
|
106
|
+
* Verify the integrity of the audit log chain.
|
|
107
|
+
* Returns { valid, entriesChecked }.
|
|
108
|
+
*/
|
|
109
|
+
verifyChain() {
|
|
110
|
+
let prevHash = "0".repeat(64);
|
|
111
|
+
let count = 0;
|
|
112
|
+
for (const entry of this._entries) {
|
|
113
|
+
count += 1;
|
|
114
|
+
// Recompute expected hash
|
|
115
|
+
const sortedCategories = JSON.stringify(entry.categories, Object.keys(entry.categories).sort());
|
|
116
|
+
const payload = `${prevHash}:${entry.timestamp}:${entry.eventType}:${sortedCategories}`;
|
|
117
|
+
const expected = createHmac("sha256", this._secret)
|
|
118
|
+
.update(payload)
|
|
119
|
+
.digest("hex");
|
|
120
|
+
if (entry.chainHash !== expected) {
|
|
121
|
+
return { valid: false, entriesChecked: count };
|
|
122
|
+
}
|
|
123
|
+
prevHash = entry.chainHash;
|
|
124
|
+
}
|
|
125
|
+
return { valid: true, entriesChecked: count };
|
|
126
|
+
}
|
|
127
|
+
}
|
package/dist/canary.d.ts
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Canary token injection for detecting LLM data leakage.
|
|
3
|
+
*
|
|
4
|
+
* Injects unique, trackable tokens into obfuscated prompts. These tokens
|
|
5
|
+
* serve no semantic purpose but can be monitored for leakage -- if a canary
|
|
6
|
+
* appears in another user's output or in a training data audit, it proves
|
|
7
|
+
* your data was exposed.
|
|
8
|
+
*/
|
|
9
|
+
/** Record kept for every canary token that has been injected. */
export interface CanaryToken {
    /** The injected marker: the configured prefix, a dash, then 16 hex characters. */
    token: string;
    /** Session id of the injector at the time of injection. */
    sessionId: string;
    /** Injection time as returned by `Date.now()` (epoch milliseconds). */
    timestamp: number;
    /** 1-based position of the message within the injector's current session. */
    messageIndex: number;
}
|
|
15
|
+
/** Injects canary tokens into text and remembers them for later leak checks. */
export declare class CanaryInjector {
    private readonly _prefix;
    private readonly _secret;
    private _sessionId;
    private _messageCounter;
    private _tokens;
    /**
     * @param prefix    Text placed in front of every generated token.
     * @param secretKey Secret mixed into the session id and token hashes.
     */
    constructor(prefix: string, secretKey: string);
    /** Identifier of the current session. */
    get sessionId(): string;
    /** Append a fresh canary token to `text` and return the modified text. */
    inject(text: string): string;
    /** List every canary token injected during the current session. */
    getTokens(): CanaryToken[];
    /** Return the known canary tokens that occur in `text`. */
    checkLeak(text: string): CanaryToken[];
    /** Start a new session. */
    reset(): void;
}
|
package/dist/canary.js
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Canary token injection for detecting LLM data leakage.
|
|
3
|
+
*
|
|
4
|
+
* Injects unique, trackable tokens into obfuscated prompts. These tokens
|
|
5
|
+
* serve no semantic purpose but can be monitored for leakage -- if a canary
|
|
6
|
+
* appears in another user's output or in a training data audit, it proves
|
|
7
|
+
* your data was exposed.
|
|
8
|
+
*/
|
|
9
|
+
import { createHash } from "node:crypto";
|
|
10
|
+
export class CanaryInjector {
    _prefix;
    _secret;
    _sessionId;
    _messageCounter;
    _tokens;
    /**
     * @param prefix    Text placed in front of every generated token.
     * @param secretKey Secret mixed into the session id and the token hashes.
     */
    constructor(prefix, secretKey) {
        this._prefix = prefix;
        this._secret = secretKey;
        this._sessionId = CanaryInjector._freshSessionId(secretKey);
        this._messageCounter = 0;
        this._tokens = [];
    }
    /** First 12 hex characters of SHA-256 over `<secret>:<current epoch ms>`. */
    static _freshSessionId(secret) {
        const digest = createHash("sha256");
        digest.update(`${secret}:${Date.now()}`);
        return digest.digest("hex").slice(0, 12);
    }
    /** Identifier of the current session. */
    get sessionId() {
        return this._sessionId;
    }
    /**
     * Append a canary token to `text` as a trailing HTML-style comment.
     * The token is recorded so checkLeak() can recognise it later.
     */
    inject(text) {
        const messageIndex = ++this._messageCounter;
        const timestamp = Date.now();
        // Token material: session id, per-session message number and the clock.
        const seed = `${this._sessionId}:${messageIndex}:${timestamp}`;
        const hasher = createHash("sha256");
        hasher.update(this._secret + seed);
        const token = `${this._prefix}-${hasher.digest("hex").slice(0, 16)}`;
        this._tokens.push({
            token,
            sessionId: this._sessionId,
            timestamp,
            messageIndex,
        });
        return `${text}\n<!-- ${token} -->`;
    }
    /** Copy of the list of canary tokens injected in this session. */
    getTokens() {
        return this._tokens.slice();
    }
    /** Return the recorded canaries whose token text occurs in `text`. */
    checkLeak(text) {
        return this._tokens.filter((canary) => text.includes(canary.token));
    }
    /** Begin a new session: new session id, counter back to zero, recorded tokens discarded. */
    reset() {
        this._sessionId = CanaryInjector._freshSessionId(this._secret);
        this._messageCounter = 0;
        this._tokens = [];
    }
}
|
package/dist/config.d.ts
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Configuration resolver for the Shroud plugin.
|
|
3
|
+
*
|
|
4
|
+
* Merges plugin config with environment variables and provides defaults.
|
|
5
|
+
*/
|
|
6
|
+
import { ShroudConfig } from "./types.js";
|
|
7
|
+
/**
 * Build a complete ShroudConfig from an optional plugin config object
 * plus environment variables, filling every missing field with a default.
 *
 * Precedence, highest first: environment variables, pluginConfig, defaults.
 */
export declare function resolveConfig(pluginConfig?: unknown): ShroudConfig;
|
|
14
|
+
/** How serious a configuration validation finding is. */
export type ConfigSeverity = "error" | "warning" | "info";
|
|
16
|
+
/** One finding reported by config validation. */
export interface ConfigIssue {
    /** Seriousness of the finding. */
    severity: ConfigSeverity;
    /** Name of the config field the finding is about. */
    field: string;
    /** Human-readable description of the finding. */
    message: string;
}
|
|
22
|
+
/**
 * Check a resolved ShroudConfig and report actionable issues.
 *
 * Never throws; the caller decides what to do with errors versus warnings.
 */
export declare function validateConfig(config: ShroudConfig): ConfigIssue[];
|
package/dist/config.js
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Configuration resolver for the Shroud plugin.
|
|
3
|
+
*
|
|
4
|
+
* Merges plugin config with environment variables and provides defaults.
|
|
5
|
+
*/
|
|
6
|
+
import { randomBytes } from "node:crypto";
|
|
7
|
+
/**
 * Resolve a fully populated ShroudConfig from optional plugin config
 * and environment variables.
 *
 * Priority: env vars > pluginConfig > defaults.
 *
 * Environment variables read: SHROUD_SECRET_KEY and SHROUD_PERSISTENT_SALT.
 * A variable that is set but empty is treated as not set, so it cannot silently
 * discard a value supplied through pluginConfig.
 *
 * If no secret key is available from either source, a random 64-hex-character key
 * is generated. A key shorter than 16 characters only triggers a console warning.
 *
 * @param pluginConfig Untrusted plugin configuration; anything that is not a
 *                     non-null object is ignored. Fields of the wrong type fall
 *                     back to their defaults.
 * @returns The resolved configuration object.
 */
export function resolveConfig(pluginConfig) {
    const raw = pluginConfig != null && typeof pluginConfig === "object"
        ? pluginConfig
        : {};
    // Env var overrides. `|| undefined` maps the empty string to "not set" so the
    // `??` fallbacks below can reach the plugin config.
    const envSecretKey = process.env.SHROUD_SECRET_KEY || undefined;
    const envSalt = process.env.SHROUD_PERSISTENT_SALT || undefined;
    let secretKey = envSecretKey ??
        (typeof raw.secretKey === "string" ? raw.secretKey : "");
    // Auto-generate if missing
    if (!secretKey) {
        secretKey = randomBytes(32).toString("hex");
    }
    // Warn if too short (but don't throw -- let the plugin still load)
    if (secretKey.length < 16) {
        console.warn("[shroud] WARNING: secretKey is shorter than 16 characters. " +
            "This weakens mapping security. Set SHROUD_SECRET_KEY or pass a longer key.");
    }
    const persistentSalt = envSalt ??
        (typeof raw.persistentSalt === "string" ? raw.persistentSalt : "");
    // Validate redactionLevel: anything outside the known levels becomes "full"
    const redactionRaw = raw.redactionLevel;
    const validLevels = ["full", "masked", "stats"];
    const redactionLevel = typeof redactionRaw === "string" && validLevels.includes(redactionRaw)
        ? redactionRaw
        : "full";
    const config = {
        secretKey,
        persistentSalt,
        minConfidence: typeof raw.minConfidence === "number" ? raw.minConfidence : 0.0,
        allowlist: Array.isArray(raw.allowlist)
            ? raw.allowlist
            : [],
        denylist: Array.isArray(raw.denylist)
            ? raw.denylist
            : [],
        canaryEnabled: typeof raw.canaryEnabled === "boolean" ? raw.canaryEnabled : false,
        canaryPrefix: typeof raw.canaryPrefix === "string"
            ? raw.canaryPrefix
            : "SHROUD-CANARY",
        auditEnabled: typeof raw.auditEnabled === "boolean" ? raw.auditEnabled : false,
        logMappings: typeof raw.logMappings === "boolean" ? raw.logMappings : false,
        customPatterns: Array.isArray(raw.customPatterns)
            ? raw.customPatterns
            : [],
        // Verbose audit logging
        verboseLogging: typeof raw.verboseLogging === "boolean" ? raw.verboseLogging : false,
        auditLogFormat: raw.auditLogFormat === "json" ? "json" : "human",
        auditIncludeProofHashes: typeof raw.auditIncludeProofHashes === "boolean"
            ? raw.auditIncludeProofHashes
            : false,
        auditHashSalt: typeof raw.auditHashSalt === "string" ? raw.auditHashSalt : "",
        auditHashTruncate: typeof raw.auditHashTruncate === "number" ? raw.auditHashTruncate : 12,
        auditMaxFakesSample: typeof raw.auditMaxFakesSample === "number"
            ? raw.auditMaxFakesSample
            : 0,
        detectorOverrides: raw.detectorOverrides != null && typeof raw.detectorOverrides === "object"
            ? raw.detectorOverrides
            : {},
        // Tool chain depth
        maxToolDepth: typeof raw.maxToolDepth === "number" ? raw.maxToolDepth : 10,
        // Redaction level
        redactionLevel,
        // Dry-run mode
        dryRun: typeof raw.dryRun === "boolean" ? raw.dryRun : false,
        // LRU store eviction (0 = unlimited)
        maxStoreMappings: typeof raw.maxStoreMappings === "number" ? raw.maxStoreMappings : 0,
    };
    return config;
}
|
|
83
|
+
/**
 * Validate a resolved ShroudConfig and return actionable issues.
 *
 * Does NOT throw — callers decide how to handle warnings vs errors. Malformed
 * `customPatterns` entries (for example `null`) are reported as issues instead of
 * causing an exception.
 *
 * @param config A resolved config, normally the result of resolveConfig().
 * @returns Array of `{ severity, field, message }` objects; empty when nothing was found.
 */
export function validateConfig(config) {
    const issues = [];
    // Secret key checks
    if (config.secretKey.length < 16) {
        issues.push({ severity: "error", field: "secretKey", message: "secretKey is shorter than 16 chars — mappings are weak. Set SHROUD_SECRET_KEY." });
    }
    else if (config.secretKey.length < 32) {
        issues.push({ severity: "warning", field: "secretKey", message: "secretKey is shorter than 32 chars — consider a longer key for production." });
    }
    // minConfidence range. NaN compares false against every bound, so it needs its own test.
    if (Number.isNaN(config.minConfidence) || config.minConfidence < 0 || config.minConfidence > 1) {
        issues.push({ severity: "error", field: "minConfidence", message: `minConfidence=${config.minConfidence} is outside [0,1]. Set to a value between 0 and 1.` });
    }
    // maxStoreMappings negative (or NaN)
    if (Number.isNaN(config.maxStoreMappings) || config.maxStoreMappings < 0) {
        issues.push({ severity: "error", field: "maxStoreMappings", message: "maxStoreMappings must be >= 0 (0 = unlimited)." });
    }
    // dryRun informational
    if (config.dryRun) {
        issues.push({ severity: "info", field: "dryRun", message: "Dry-run mode is active — entities are detected but text is NOT obfuscated." });
    }
    // Custom patterns: entries must carry a usable pattern that compiles
    for (const [index, cp] of config.customPatterns.entries()) {
        const pattern = cp?.pattern;
        // `new RegExp(undefined)` yields the match-everything empty regex, so a missing
        // pattern has to be rejected explicitly rather than left to the try/catch below.
        if (typeof pattern !== "string" && !(pattern instanceof RegExp)) {
            const label = typeof cp?.name === "string" ? `"${cp.name}"` : `#${index}`;
            issues.push({ severity: "error", field: "customPatterns", message: `Custom pattern ${label} has no usable "pattern" value.` });
            continue;
        }
        try {
            new RegExp(pattern);
        }
        catch {
            issues.push({ severity: "error", field: "customPatterns", message: `Custom pattern "${cp.name}" has invalid regex: ${cp.pattern}` });
        }
    }
    // Detector overrides referencing unknown rules (info-level since we can't check at config time)
    const overrideCount = Object.keys(config.detectorOverrides).length;
    if (overrideCount > 0) {
        issues.push({ severity: "info", field: "detectorOverrides", message: `${overrideCount} detector override(s) configured.` });
    }
    return issues;
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Code-aware detector that finds sensitive data inside string literals and comments.
|
|
3
|
+
*
|
|
4
|
+
* Scans source code for string literals and comments across common languages,
|
|
5
|
+
* then runs the standard regex detector on the extracted text to find PII
|
|
6
|
+
* that would otherwise be missed by scanning raw code.
|
|
7
|
+
*/
|
|
8
|
+
import { DetectedEntity } from "../types.js";
|
|
9
|
+
import { BaseDetector } from "./base.js";
|
|
10
|
+
import { RegexDetector } from "./regex.js";
|
|
11
|
+
/**
 * Finds sensitive data that sits inside source-code string literals and comments.
 *
 * String literals and comments are pulled out of the code, PII detection is run
 * on that extracted text, and the resulting entity positions are translated
 * back to offsets in the original source.
 */
export declare class CodeDetector implements BaseDetector {
    readonly name = "code";
    private _inner;
    /** @param inner Detector applied to each extracted span; optional. */
    constructor(inner?: RegexDetector);
    detect(text: string): DetectedEntity[];
    /** Pull string literals and comments out of code. */
    private _extractSpans;
}
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Code-aware detector that finds sensitive data inside string literals and comments.
|
|
3
|
+
*
|
|
4
|
+
* Scans source code for string literals and comments across common languages,
|
|
5
|
+
* then runs the standard regex detector on the extracted text to find PII
|
|
6
|
+
* that would otherwise be missed by scanning raw code.
|
|
7
|
+
*/
|
|
8
|
+
import { RegexDetector } from "./regex.js";
|
|
9
|
+
/**
 * Regexes that pick string literals and comments out of source text without
 * knowing the language. Order matters to consumers that give earlier patterns
 * priority: triple-quoted strings come before the single-delimiter forms.
 */
const SPAN_PATTERNS = [
    // --- string literals ---
    /"""[\s\S]*?"""/g, // triple double-quoted (Python and similar)
    /'''[\s\S]*?'''/g, // triple single-quoted
    /"(?:[^"\\]|\\.)*"/g, // "..." with backslash escapes
    /'(?:[^'\\]|\\.)*'/g, // '...' with backslash escapes
    /`(?:[^`\\]|\\.)*`/g, // `...` (JS, Go, ...)
    // --- comments ---
    /\/\/[^\n]*/g, // C-style line comment
    /#[^\n]*/g, // hash line comment (Python, Ruby, shell)
    /\/\*[\s\S]*?\*\//g, // block comment
];
|
|
26
|
+
/**
 * Matches spans that are only code constructs and carry no data, so they can be skipped:
 * optional leading whitespace / quote / comment characters, then a code keyword,
 * then text containing no "@" up to the end.
 */
const CODE_NOISE = new RegExp([
    "^[\\s\"'`#/\\*]*",
    "(?:import |from |require\\(|use |include |",
    "package |module |class |def |func |fn |",
    "return |const |let |var |type |interface )",
    "[^@]*$",
].join(""));
|
|
32
|
+
/** Substrings whose presence counts as evidence that a text is source code. */
const CODE_INDICATORS = [
    "def ", "class ", "function ", "import ", "from ", "require(",
    "const ", "let ", "var ", "func ", "fn ", "pub ", "private ",
    "return ", "if (", "for (", "while (", "package ", "module ",
    "#!/", "# -*- coding", "use strict", "pragma ",
    "SELECT ", "INSERT ", "CREATE TABLE",
];
/**
 * Heuristic check for whether `text` looks like source code.
 *
 * Texts with fewer than three lines never qualify. Otherwise one point is scored
 * per distinct indicator substring found, one if any line ends in `{`, `}` or `;`,
 * and one if any line starts with a def/class/func/fn definition. Two or more
 * points means "code".
 */
function looksLikeCode(text) {
    if (text.split("\n").length < 3) {
        return false;
    }
    const keywordHits = CODE_INDICATORS.filter((needle) => text.includes(needle)).length;
    const lineEndsLikeCode = /[{};]\s*$/m.test(text) ? 1 : 0;
    const hasDefinition = /^\s*(def|class|func|fn)\s+\w+/m.test(text) ? 1 : 0;
    return keywordHits + lineEndsLikeCode + hasDefinition >= 2;
}
|
|
60
|
+
/**
 * Finds sensitive data that sits inside source-code string literals and comments.
 *
 * String literals and comments are pulled out of the code, the wrapped detector is
 * run on each extracted span, and the resulting entity positions are translated
 * back to offsets in the original text.
 */
export class CodeDetector {
    name = "code";
    _inner;
    /** @param inner Detector to run on each span; a new RegexDetector is used when omitted. */
    constructor(inner) {
        this._inner = inner ?? new RegexDetector();
    }
    /**
     * Detect entities inside the strings and comments of `text`.
     *
     * Returns an empty list when the text does not look like code. Results are
     * sorted by start offset, carry 90% of the inner detector's confidence and
     * have their detector name prefixed with "code:".
     */
    detect(text) {
        if (!looksLikeCode(text)) {
            return [];
        }
        const found = [];
        const reported = new Set();
        for (const { text: spanText, start: spanOffset } of this._extractSpans(text)) {
            // Spans that are just code constructs carry nothing worth scanning.
            if (CODE_NOISE.test(spanText)) {
                continue;
            }
            for (const hit of this._inner.detect(spanText)) {
                // Shift span-relative offsets into coordinates of the full text.
                const start = spanOffset + hit.start;
                const end = spanOffset + hit.end;
                const key = `${start}:${end}`;
                if (reported.has(key)) {
                    continue;
                }
                reported.add(key);
                found.push({
                    value: hit.value,
                    start,
                    end,
                    category: hit.category,
                    confidence: hit.confidence * 0.9, // discounted because the value sits inside code
                    detector: `code:${hit.detector}`,
                });
            }
        }
        found.sort((a, b) => a.start - b.start);
        return found;
    }
    /**
     * Collect string-literal and comment spans from `text`.
     *
     * Patterns are tried in SPAN_PATTERNS order; a match is dropped if any of its
     * characters already belongs to an accepted span. Each span is
     * `{ text, start, end, kind }` with kind "comment" or "string", and the
     * result is sorted by start offset.
     */
    _extractSpans(text) {
        const collected = [];
        const claimed = new Set(); // character offsets already owned by a span
        for (const pattern of SPAN_PATTERNS) {
            pattern.lastIndex = 0;
            for (const found of text.matchAll(pattern)) {
                const literal = found[0];
                const start = found.index;
                const end = start + literal.length;
                let clash = false;
                for (let pos = start; pos < end && !clash; pos++) {
                    clash = claimed.has(pos);
                }
                if (clash) {
                    continue;
                }
                for (let pos = start; pos < end; pos++) {
                    claimed.add(pos);
                }
                const isComment = literal.startsWith("/") || literal.startsWith("#");
                collected.push({ text: literal, start, end, kind: isComment ? "comment" : "string" });
            }
        }
        collected.sort((a, b) => a.start - b.start);
        return collected;
    }
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Context-aware detection enhancements.
|
|
3
|
+
*
|
|
4
|
+
* Wraps another detector and applies post-detection intelligence:
|
|
5
|
+
* 1. Context-aware confidence boosting (config keyword density)
|
|
6
|
+
* 3. Proximity-based PII clustering (nearby entities boost each other)
|
|
7
|
+
* 4. Config-block hostname extraction (hostname X -> detect bare X)
|
|
8
|
+
* 9. Learned entity propagation (cross-invocation memory)
|
|
9
|
+
* 10. Confidence decay by frequency (common words lose confidence)
|
|
10
|
+
*/
|
|
11
|
+
import { DetectedEntity } from "../types.js";
|
|
12
|
+
import { BaseDetector } from "./base.js";
|
|
13
|
+
/** Detector that wraps another detector and post-processes its results. */
export declare class ContextDetector implements BaseDetector {
    readonly name = "context";
    private _inner;
    /** Feature 9: entities learned during earlier invocations. */
    private _learnedEntities;
    /** @param inner The detector being wrapped. */
    constructor(inner: BaseDetector);
    detect(text: string): DetectedEntity[];
    /** Forget all learned entities (invoked from Obfuscator.reset()). */
    reset(): void;
    /** Number of learned entities. */
    get learnedCount(): number;
    private _boostFromContext;
    private _splitBlocks;
    private _boostByProximity;
    private _extractAndPropagateHostnames;
    private _injectLearnedEntities;
    private _learnEntities;
    private _decayCommonWords;
}
|