llm-trust-guard 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +318 -0
- package/dist/guards/agent-communication-guard.d.ts +169 -0
- package/dist/guards/agent-communication-guard.d.ts.map +1 -0
- package/dist/guards/agent-communication-guard.js +468 -0
- package/dist/guards/agent-communication-guard.js.map +1 -0
- package/dist/guards/autonomy-escalation-guard.d.ts +137 -0
- package/dist/guards/autonomy-escalation-guard.d.ts.map +1 -0
- package/dist/guards/autonomy-escalation-guard.js +470 -0
- package/dist/guards/autonomy-escalation-guard.js.map +1 -0
- package/dist/guards/circuit-breaker.d.ts +142 -0
- package/dist/guards/circuit-breaker.d.ts.map +1 -0
- package/dist/guards/circuit-breaker.js +347 -0
- package/dist/guards/circuit-breaker.js.map +1 -0
- package/dist/guards/code-execution-guard.d.ts +114 -0
- package/dist/guards/code-execution-guard.d.ts.map +1 -0
- package/dist/guards/code-execution-guard.js +467 -0
- package/dist/guards/code-execution-guard.js.map +1 -0
- package/dist/guards/conversation-guard.d.ts +73 -0
- package/dist/guards/conversation-guard.d.ts.map +1 -0
- package/dist/guards/conversation-guard.js +281 -0
- package/dist/guards/conversation-guard.js.map +1 -0
- package/dist/guards/drift-detector.d.ts +182 -0
- package/dist/guards/drift-detector.d.ts.map +1 -0
- package/dist/guards/drift-detector.js +480 -0
- package/dist/guards/drift-detector.js.map +1 -0
- package/dist/guards/encoding-detector.d.ts +76 -0
- package/dist/guards/encoding-detector.d.ts.map +1 -0
- package/dist/guards/encoding-detector.js +698 -0
- package/dist/guards/encoding-detector.js.map +1 -0
- package/dist/guards/execution-monitor.d.ts +73 -0
- package/dist/guards/execution-monitor.d.ts.map +1 -0
- package/dist/guards/execution-monitor.js +205 -0
- package/dist/guards/execution-monitor.js.map +1 -0
- package/dist/guards/input-sanitizer.d.ts +87 -0
- package/dist/guards/input-sanitizer.d.ts.map +1 -0
- package/dist/guards/input-sanitizer.js +301 -0
- package/dist/guards/input-sanitizer.js.map +1 -0
- package/dist/guards/mcp-security-guard.d.ts +204 -0
- package/dist/guards/mcp-security-guard.d.ts.map +1 -0
- package/dist/guards/mcp-security-guard.js +618 -0
- package/dist/guards/mcp-security-guard.js.map +1 -0
- package/dist/guards/memory-guard.d.ts +124 -0
- package/dist/guards/memory-guard.d.ts.map +1 -0
- package/dist/guards/memory-guard.js +476 -0
- package/dist/guards/memory-guard.js.map +1 -0
- package/dist/guards/multimodal-guard.d.ts +93 -0
- package/dist/guards/multimodal-guard.d.ts.map +1 -0
- package/dist/guards/multimodal-guard.js +507 -0
- package/dist/guards/multimodal-guard.js.map +1 -0
- package/dist/guards/output-filter.d.ts +76 -0
- package/dist/guards/output-filter.d.ts.map +1 -0
- package/dist/guards/output-filter.js +289 -0
- package/dist/guards/output-filter.js.map +1 -0
- package/dist/guards/policy-gate.d.ts +57 -0
- package/dist/guards/policy-gate.d.ts.map +1 -0
- package/dist/guards/policy-gate.js +182 -0
- package/dist/guards/policy-gate.js.map +1 -0
- package/dist/guards/prompt-leakage-guard.d.ts +110 -0
- package/dist/guards/prompt-leakage-guard.d.ts.map +1 -0
- package/dist/guards/prompt-leakage-guard.js +529 -0
- package/dist/guards/prompt-leakage-guard.js.map +1 -0
- package/dist/guards/rag-guard.d.ts +188 -0
- package/dist/guards/rag-guard.d.ts.map +1 -0
- package/dist/guards/rag-guard.js +769 -0
- package/dist/guards/rag-guard.js.map +1 -0
- package/dist/guards/schema-validator.d.ts +35 -0
- package/dist/guards/schema-validator.d.ts.map +1 -0
- package/dist/guards/schema-validator.js +316 -0
- package/dist/guards/schema-validator.js.map +1 -0
- package/dist/guards/state-persistence-guard.d.ts +153 -0
- package/dist/guards/state-persistence-guard.d.ts.map +1 -0
- package/dist/guards/state-persistence-guard.js +484 -0
- package/dist/guards/state-persistence-guard.js.map +1 -0
- package/dist/guards/tenant-boundary.d.ts +67 -0
- package/dist/guards/tenant-boundary.d.ts.map +1 -0
- package/dist/guards/tenant-boundary.js +187 -0
- package/dist/guards/tenant-boundary.js.map +1 -0
- package/dist/guards/tool-chain-validator.d.ts +102 -0
- package/dist/guards/tool-chain-validator.d.ts.map +1 -0
- package/dist/guards/tool-chain-validator.js +480 -0
- package/dist/guards/tool-chain-validator.js.map +1 -0
- package/dist/guards/tool-registry.d.ts +45 -0
- package/dist/guards/tool-registry.d.ts.map +1 -0
- package/dist/guards/tool-registry.js +155 -0
- package/dist/guards/tool-registry.js.map +1 -0
- package/dist/guards/trust-exploitation-guard.d.ts +134 -0
- package/dist/guards/trust-exploitation-guard.d.ts.map +1 -0
- package/dist/guards/trust-exploitation-guard.js +354 -0
- package/dist/guards/trust-exploitation-guard.js.map +1 -0
- package/dist/index.d.ts +133 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +430 -0
- package/dist/index.js.map +1 -0
- package/dist/integrations/express.d.ts +119 -0
- package/dist/integrations/express.d.ts.map +1 -0
- package/dist/integrations/express.js +244 -0
- package/dist/integrations/express.js.map +1 -0
- package/dist/integrations/index.d.ts +9 -0
- package/dist/integrations/index.d.ts.map +1 -0
- package/dist/integrations/index.js +26 -0
- package/dist/integrations/index.js.map +1 -0
- package/dist/integrations/langchain.d.ts +165 -0
- package/dist/integrations/langchain.d.ts.map +1 -0
- package/dist/integrations/langchain.js +308 -0
- package/dist/integrations/langchain.js.map +1 -0
- package/dist/integrations/openai.d.ts +205 -0
- package/dist/integrations/openai.d.ts.map +1 -0
- package/dist/integrations/openai.js +380 -0
- package/dist/integrations/openai.js.map +1 -0
- package/dist/types/index.d.ts +245 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +6 -0
- package/dist/types/index.js.map +1 -0
- package/package.json +64 -0
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* L2 Tool Registry Guard
|
|
4
|
+
*
|
|
5
|
+
* Maintains strict control over which tools can be executed.
|
|
6
|
+
* Prevents LLM hallucination attacks.
|
|
7
|
+
*/
|
|
8
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
9
|
+
exports.ToolRegistry = void 0;
|
|
10
|
+
// Common hallucination patterns
|
|
11
|
+
const HALLUCINATION_PATTERNS = [
    /^execute/i,
    /^run/i,
    /^shell/i,
    /^admin/i,
    /^override/i,
    /^delete_all/i,
    /^export_/i,
    /^import_/i,
    /^hack/i,
    /^bypass/i,
    /^sudo/i,
    /^root/i,
    /^system/i,
];
/**
 * Allow-list of executable tools, keyed by exact tool name.
 *
 * A tool name that is not registered is always rejected; the registry
 * additionally reports whether the rejected name looks like a hallucinated
 * tool and which registered tools have overlapping name segments.
 */
class ToolRegistry {
    /**
     * @param {object} config
     * @param {Array<{name: string, roles?: string[]}>} config.tools - Tools to register up front.
     * @param {boolean} [config.strictMatching=true] - Stored on the instance;
     *   no method of this class reads it.
     */
    constructor(config) {
        this.tools = new Map();
        this.strictMatching = config.strictMatching ?? true;
        for (const tool of config.tools) {
            this.tools.set(tool.name, tool);
        }
    }
    /**
     * Check if a tool exists and is accessible for the given role.
     *
     * @param {string} toolName - Requested tool name; matched exactly against the registry.
     *   A non-string value is never registered and is reported as a hallucination
     *   instead of throwing.
     * @param {string} role - Role of the caller.
     * @param {string} [requestId=""] - When non-empty, the decision is logged via console.log.
     * @returns {{allowed: boolean, reason?: string, violations: string[], tool?: object,
     *   hallucination_detected: boolean, similar_tools?: string[]}}
     */
    check(toolName, role, requestId = "") {
        // Exact match only
        const tool = this.tools.get(toolName);
        if (!tool) {
            const isHallucination = this.detectHallucination(toolName);
            const similarTools = this.findSimilarTools(toolName);
            if (requestId) {
                console.log(`[L2:${requestId}] BLOCKED: Tool '${toolName}' not in registry`);
                if (isHallucination) {
                    console.log(`[L2:${requestId}] ALERT: Potential hallucination detected`);
                }
            }
            return {
                allowed: false,
                reason: `Tool '${toolName}' is not registered`,
                violations: ["UNREGISTERED_TOOL"],
                hallucination_detected: isHallucination,
                similar_tools: similarTools.length > 0 ? similarTools : undefined,
            };
        }
        // A tool with no roles (missing or empty list) is open to every role.
        const isRoleRestricted = Boolean(tool.roles) && tool.roles.length > 0;
        if (isRoleRestricted && !tool.roles.includes(role)) {
            if (requestId) {
                console.log(`[L2:${requestId}] BLOCKED: Role '${role}' cannot use '${toolName}'`);
            }
            return {
                allowed: false,
                reason: `Role '${role}' is not authorized for tool '${toolName}'`,
                violations: ["UNAUTHORIZED_ROLE"],
                tool,
                hallucination_detected: false,
            };
        }
        if (requestId) {
            console.log(`[L2:${requestId}] Tool '${toolName}' ALLOWED for role '${role}'`);
        }
        return {
            allowed: true,
            violations: [],
            tool,
            hallucination_detected: false,
        };
    }
    /**
     * Detect if an unregistered tool name looks like a hallucination.
     *
     * @param {string} toolName
     * @returns {boolean} true when the name is not a string, starts with one of
     *   HALLUCINATION_PATTERNS, is longer than 50 characters, or contains any
     *   character outside [a-zA-Z0-9_-].
     */
    detectHallucination(toolName) {
        // Fail closed: a tool name that is not even a string cannot be legitimate.
        if (typeof toolName !== "string") {
            return true;
        }
        if (HALLUCINATION_PATTERNS.some((pattern) => pattern.test(toolName))) {
            return true;
        }
        // Unusually long names
        if (toolName.length > 50) {
            return true;
        }
        // Anything outside the allow-list alphabet. This also covers "..", "/"
        // and "\", so path-like names need no separate check.
        return /[^a-zA-Z0-9_-]/.test(toolName);
    }
    /**
     * Find similar registered tools for helpful error messages.
     *
     * Names are lower-cased and split on "_" and "-"; a registered tool is
     * considered similar when one of its segments contains, or is contained in,
     * a segment of the requested name. Only segments longer than two characters
     * take part on either side, so short or empty segments ("id", "by", "")
     * cannot match unrelated names.
     *
     * @param {string} toolName
     * @returns {string[]} Registered tool names, in registration order.
     */
    findSimilarTools(toolName) {
        if (typeof toolName !== "string") {
            return [];
        }
        const isSignificant = (segment) => segment.length > 2;
        const toSegments = (name) => name.toLowerCase().split(/[_-]/).filter(isSignificant);
        // Loop-invariant: split the requested name once.
        const requestedWords = toSegments(toolName);
        if (requestedWords.length === 0) {
            return [];
        }
        const similar = [];
        for (const registeredTool of this.tools.keys()) {
            const registeredWords = toSegments(registeredTool);
            const overlaps = requestedWords.some((word) => registeredWords.some((rw) => rw.includes(word) || word.includes(rw)));
            if (overlaps) {
                similar.push(registeredTool);
            }
        }
        // Map keys are unique and each is pushed at most once, so no dedupe is needed.
        return similar;
    }
    /**
     * Get tools for a specific role.
     *
     * @param {string} role
     * @returns {object[]} Tools that have no role restriction or list `role`.
     */
    getToolsForRole(role) {
        const tools = [];
        for (const tool of this.tools.values()) {
            if (!tool.roles || tool.roles.length === 0 || tool.roles.includes(role)) {
                tools.push(tool);
            }
        }
        return tools;
    }
    /**
     * Get all registered tool names.
     *
     * @returns {string[]}
     */
    getRegisteredToolNames() {
        return [...this.tools.keys()];
    }
    /**
     * Register a new tool at runtime. An existing tool with the same name is replaced.
     *
     * @param {{name: string, roles?: string[]}} tool
     */
    registerTool(tool) {
        this.tools.set(tool.name, tool);
    }
    /**
     * Unregister a tool.
     *
     * @param {string} toolName
     * @returns {boolean} true when a tool with that name was registered and has been removed.
     */
    unregisterTool(toolName) {
        return this.tools.delete(toolName);
    }
}
|
|
154
|
+
exports.ToolRegistry = ToolRegistry;
|
|
155
|
+
//# sourceMappingURL=tool-registry.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tool-registry.js","sourceRoot":"","sources":["../../src/guards/tool-registry.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;;AAIH,gCAAgC;AAChC,MAAM,sBAAsB,GAAG;IAC7B,WAAW;IACX,OAAO;IACP,SAAS;IACT,SAAS;IACT,YAAY;IACZ,cAAc;IACd,WAAW;IACX,WAAW;IACX,QAAQ;IACR,UAAU;IACV,QAAQ;IACR,QAAQ;IACR,UAAU;CACX,CAAC;AAOF,MAAa,YAAY;IAIvB,YAAY,MAA0B;QACpC,IAAI,CAAC,KAAK,GAAG,IAAI,GAAG,EAAE,CAAC;QACvB,IAAI,CAAC,cAAc,GAAG,MAAM,CAAC,cAAc,IAAI,IAAI,CAAC;QAEpD,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;YAChC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;QAClC,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,QAAgB,EAAE,IAAU,EAAE,YAAoB,EAAE;QACxD,mBAAmB;QACnB,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QAEtC,IAAI,CAAC,IAAI,EAAE,CAAC;YACV,MAAM,eAAe,GAAG,IAAI,CAAC,mBAAmB,CAAC,QAAQ,CAAC,CAAC;YAC3D,MAAM,YAAY,GAAG,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;YAErD,IAAI,SAAS,EAAE,CAAC;gBACd,OAAO,CAAC,GAAG,CAAC,OAAO,SAAS,oBAAoB,QAAQ,mBAAmB,CAAC,CAAC;gBAC7E,IAAI,eAAe,EAAE,CAAC;oBACpB,OAAO,CAAC,GAAG,CAAC,OAAO,SAAS,2CAA2C,CAAC,CAAC;gBAC3E,CAAC;YACH,CAAC;YAED,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,MAAM,EAAE,SAAS,QAAQ,qBAAqB;gBAC9C,UAAU,EAAE,CAAC,mBAAmB,CAAC;gBACjC,sBAAsB,EAAE,eAAe;gBACvC,aAAa,EAAE,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,SAAS;aAClE,CAAC;QACJ,CAAC;QAED,yCAAyC;QACzC,IAAI,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;YACtE,IAAI,SAAS,EAAE,CAAC;gBACd,OAAO,CAAC,GAAG,CAAC,OAAO,SAAS,oBAAoB,IAAI,iBAAiB,QAAQ,GAAG,CAAC,CAAC;YACpF,CAAC;YAED,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,MAAM,EAAE,SAAS,IAAI,iCAAiC,QAAQ,GAAG;gBACjE,UAAU,EAAE,CAAC,mBAAmB,CAAC;gBACjC,IAAI;gBACJ,sBAAsB,EAAE,KAAK;aAC9B,CAAC;QACJ,CAAC;QAED,IAAI,SAAS,EAAE,CAAC;YACd,OAAO,CAAC,GAAG,CAAC,OAAO,SAAS,WAAW,QAAQ,uBAAuB,IAAI,GAAG,CAAC,CAAC;QACjF,CAAC;QAED,OAAO;YACL,OAAO,EAAE,IAAI;YACb,UAAU,EAAE,EAAE;YACd,IAAI;YACJ,sBAAsB,EAAE,KAAK;SAC9B,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,mBAAmB,CAAC,QAAgB;QAC1C,KAAK,MAAM,OAAO,IAAI,sBAAsB,EAAE,CAAC;
YAC7C,IAAI,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAC3B,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC;QAED,kCAAkC;QAClC,IAAI,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;YACjF,OAAO,IAAI,CAAC;QACd,CAAC;QAED,uBAAuB;QACvB,IAAI,QAAQ,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC;YACzB,OAAO,IAAI,CAAC;QACd,CAAC;QAED,qBAAqB;QACrB,IAAI,gBAAgB,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;YACpC,OAAO,IAAI,CAAC;QACd,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,QAAgB;QACvC,MAAM,OAAO,GAAa,EAAE,CAAC;QAC7B,MAAM,aAAa,GAAG,QAAQ,CAAC,WAAW,EAAE,CAAC;QAE7C,KAAK,MAAM,cAAc,IAAI,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,EAAE,CAAC;YAC/C,MAAM,eAAe,GAAG,cAAc,CAAC,WAAW,EAAE,CAAC;YAErD,qBAAqB;YACrB,MAAM,cAAc,GAAG,aAAa,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;YACnD,MAAM,eAAe,GAAG,eAAe,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;YAEtD,KAAK,MAAM,IAAI,IAAI,cAAc,EAAE,CAAC;gBAClC,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,IAAI,eAAe,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC;oBAC5F,OAAO,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;oBAC7B,MAAM;gBACR,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,CAAC,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,eAAe,CAAC,IAAU;QACxB,MAAM,KAAK,GAAqB,EAAE,CAAC;QACnC,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,EAAE,CAAC;YACvC,IAAI,CAAC,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;gBACxE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACnB,CAAC;QACH,CAAC;QACD,OAAO,KAAK,CAAC;IACf,CAAC;IAED;;OAEG;IACH,sBAAsB;QACpB,OAAO,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC;IAChC,CAAC;IAED;;OAEG;IACH,YAAY,CAAC,IAAoB;QAC/B,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;IAClC,CAAC;IAED;;OAEG;IACH,cAAc,CAAC,QAAgB;QAC7B,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;IACrC,CAAC;CACF;AAzJD,oCAyJC"}
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TrustExploitationGuard (L17)
|
|
3
|
+
*
|
|
4
|
+
* Detects and prevents human-agent trust exploitation attacks.
|
|
5
|
+
* Implements ASI09 from OWASP Agentic Applications 2026.
|
|
6
|
+
*
|
|
7
|
+
* Threat Model:
|
|
8
|
+
* - ASI09: Improper Human-Agent Trust Placement
|
|
9
|
+
* - Trust boundary violations
|
|
10
|
+
* - Permission escalation
|
|
11
|
+
* - Goal hijacking
|
|
12
|
+
* - Unauthorized autonomous actions
|
|
13
|
+
*
|
|
14
|
+
* Protection Capabilities:
|
|
15
|
+
* - Trust boundary enforcement
|
|
16
|
+
* - Permission escalation detection
|
|
17
|
+
* - Goal consistency monitoring
|
|
18
|
+
* - Action authorization verification
|
|
19
|
+
* - Human-in-the-loop enforcement
|
|
20
|
+
*/
|
|
21
|
+
/**
 * Options accepted by the TrustExploitationGuard constructor.
 * Every option may be omitted.
 */
export interface TrustExploitationGuardConfig {
    /** Ceiling for the trust level, on a 0-100 scale. */
    maxTrustLevel?: number;
    /** Action types that must be approved by a human. */
    humanApprovalRequired?: Array<string>;
    /** Number of autonomous actions allowed before a human check-in is required. */
    maxAutonomousActions?: number;
    /** Turns goal consistency monitoring on or off. */
    monitorGoalConsistency?: boolean;
    /** How much trust decays per hour (0-100). */
    trustDecayRate?: number;
    /** Turns permission escalation detection on or off. */
    detectPermissionEscalation?: boolean;
    /** Action categories treated as sensitive. */
    sensitiveActions?: Array<string>;
    /** Goal that later goals are compared against for consistency. */
    initialGoal?: string;
}
|
|
39
|
+
/**
 * A single action submitted to TrustExploitationGuard.validateAction.
 */
export interface AgentAction {
    /** Identifier of this action. */
    action_id: string;
    /** Kind of action, e.g. "file_write", "api_call", "data_access". */
    action_type: string;
    /** Resource the action is aimed at. */
    target: string;
    /** Permissions the action asks for. */
    permissions?: Array<string>;
    /** Reason the agent gives for the action. */
    reason?: string;
    /** Goal the action currently serves. */
    goal?: string;
    /** true for an autonomous action, false for a human-requested one. */
    autonomous: boolean;
    /** Timestamp of the action. */
    timestamp: number;
    /** Free-form additional metadata. */
    metadata?: Record<string, any>;
}
|
|
59
|
+
/**
 * Per-session trust state.
 */
export interface TrustContext {
    /** Trust level right now, on a 0-100 scale. */
    trust_level: number;
    /** Identifier of the session. */
    session_id: string;
    /** Actions performed during this session. */
    action_history: Array<AgentAction>;
    /** Goal currently being pursued. */
    current_goal: string;
    /** Goal the session started with; used for consistency checks. */
    initial_goal: string;
    /** Permissions that have been granted. */
    granted_permissions: Set<string>;
    /** Timestamp of the most recent human interaction. */
    last_human_interaction: number;
    /** Count of autonomous actions since the last check-in. */
    autonomous_actions_count: number;
}
|
|
77
|
+
/**
 * Result type of TrustExploitationGuard.validateAction.
 */
export interface TrustExploitationResult {
    allowed: boolean;
    reason: string;
    violations: Array<string>;
    request_id: string;
    /** Individual findings behind the decision. */
    analysis: {
        trust_level: number;
        trust_change: number;
        permission_escalation: boolean;
        goal_deviation: boolean;
        requires_human_approval: boolean;
        autonomous_action_count: number;
        trust_boundary_violated: boolean;
    };
    recommendations: Array<string>;
    human_approval_required?: boolean;
    approval_reason?: string;
}
|
|
95
|
+
/**
 * Public surface of the trust exploitation guard (L17).
 */
export declare class TrustExploitationGuard {
    private config;
    private contexts;
    private readonly TRUST_EXPLOITATION_PATTERNS;
    private readonly DEFAULT_SENSITIVE_ACTIONS;
    constructor(config?: TrustExploitationGuardConfig);
    /**
     * Validates one agent action for the given session.
     */
    validateAction(action: AgentAction, sessionId: string, requestId?: string): TrustExploitationResult;
    /**
     * Records a human interaction (resets the autonomous count, boosts trust).
     */
    recordHumanInteraction(sessionId: string, action?: "approve" | "deny" | "check_in"): void;
    /**
     * Grants a permission to the agent.
     */
    grantPermission(sessionId: string, permission: string): void;
    /**
     * Revokes a permission from the agent.
     */
    revokePermission(sessionId: string, permission: string): void;
    /**
     * Sets the initial goal used for consistency monitoring.
     */
    setInitialGoal(sessionId: string, goal: string): void;
    /**
     * Returns the current trust level.
     */
    getTrustLevel(sessionId: string): number;
    /**
     * Resets the session context.
     */
    resetSession(sessionId: string): void;
    private createContext;
    private applyTrustDecay;
    private calculateGoalSimilarity;
    private generateRecommendations;
}
|
|
134
|
+
//# sourceMappingURL=trust-exploitation-guard.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"trust-exploitation-guard.d.ts","sourceRoot":"","sources":["../../src/guards/trust-exploitation-guard.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAEH,MAAM,WAAW,4BAA4B;IAC3C,kCAAkC;IAClC,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,uCAAuC;IACvC,qBAAqB,CAAC,EAAE,MAAM,EAAE,CAAC;IACjC,iEAAiE;IACjE,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B,yCAAyC;IACzC,sBAAsB,CAAC,EAAE,OAAO,CAAC;IACjC,wCAAwC;IACxC,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,6CAA6C;IAC7C,0BAA0B,CAAC,EAAE,OAAO,CAAC;IACrC,kCAAkC;IAClC,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;IAC5B,4CAA4C;IAC5C,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,WAAW;IAC1B,wBAAwB;IACxB,SAAS,EAAE,MAAM,CAAC;IAClB,kEAAkE;IAClE,WAAW,EAAE,MAAM,CAAC;IACpB,sBAAsB;IACtB,MAAM,EAAE,MAAM,CAAC;IACf,4BAA4B;IAC5B,WAAW,CAAC,EAAE,MAAM,EAAE,CAAC;IACvB,uCAAuC;IACvC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,qCAAqC;IACrC,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,mDAAmD;IACnD,UAAU,EAAE,OAAO,CAAC;IACpB,gBAAgB;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,0BAA0B;IAC1B,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;CAChC;AAED,MAAM,WAAW,YAAY;IAC3B,kCAAkC;IAClC,WAAW,EAAE,MAAM,CAAC;IACpB,iBAAiB;IACjB,UAAU,EAAE,MAAM,CAAC;IACnB,qCAAqC;IACrC,cAAc,EAAE,WAAW,EAAE,CAAC;IAC9B,mBAAmB;IACnB,YAAY,EAAE,MAAM,CAAC;IACrB,qCAAqC;IACrC,YAAY,EAAE,MAAM,CAAC;IACrB,0BAA0B;IAC1B,mBAAmB,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;IACjC,uCAAuC;IACvC,sBAAsB,EAAE,MAAM,CAAC;IAC/B,6CAA6C;IAC7C,wBAAwB,EAAE,MAAM,CAAC;CAClC;AAED,MAAM,WAAW,uBAAuB;IACtC,OAAO,EAAE,OAAO,CAAC;IACjB,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,EAAE,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE;QACR,WAAW,EAAE,MAAM,CAAC;QACpB,YAAY,EAAE,MAAM,CAAC;QACrB,qBAAqB,EAAE,OAAO,CAAC;QAC/B,cAAc,EAAE,OAAO,CAAC;QACxB,uBAAuB,EAAE,OAAO,CAAC;QACjC,uBAAuB,EAAE,MAAM,CAAC;QAChC,uBAAuB,EAAE,OAAO,CAAC;KAClC,CAAC;IACF,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,uBAAuB,CAAC,EAAE,OAAO,CAAC;IAClC,eAAe,CAAC,EAAE,MAAM,CAAC;CAC1B;AAED,qBAAa,sBAAsB;IACjC,OAAO,CAAC,MAAM,CAAyC;IACvD,OAAO,CAAC,QAAQ,CAAwC;IAGxD,OAAO,CAAC,QAAQ,CAAC,2BAA2B,CAyB1C;IAGF,OAAO,CAAC,QAAQ,CAAC,yBAAyB,CAWxC;gBAEU,MAAM,GAAE,4BAAiC;IAarD;;OAEG;I
ACH,cAAc,CACZ,MAAM,EAAE,WAAW,EACnB,SAAS,EAAE,MAAM,EACjB,SAAS,CAAC,EAAE,MAAM,GACjB,uBAAuB;IAuL1B;;OAEG;IACH,sBAAsB,CAAC,SAAS,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,SAAS,GAAG,MAAM,GAAG,UAAU,GAAG,IAAI;IAkBzF;;OAEG;IACH,eAAe,CAAC,SAAS,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,GAAG,IAAI;IAS5D;;OAEG;IACH,gBAAgB,CAAC,SAAS,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,GAAG,IAAI;IAQ7D;;OAEG;IACH,cAAc,CAAC,SAAS,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,IAAI;IAUrD;;OAEG;IACH,aAAa,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM;IAKxC;;OAEG;IACH,YAAY,CAAC,SAAS,EAAE,MAAM,GAAG,IAAI;IAMrC,OAAO,CAAC,aAAa;IAarB,OAAO,CAAC,eAAe;IAMvB,OAAO,CAAC,uBAAuB;IAY/B,OAAO,CAAC,uBAAuB;CAgChC"}
|
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* TrustExploitationGuard (L17)
|
|
4
|
+
*
|
|
5
|
+
* Detects and prevents human-agent trust exploitation attacks.
|
|
6
|
+
* Implements ASI09 from OWASP Agentic Applications 2026.
|
|
7
|
+
*
|
|
8
|
+
* Threat Model:
|
|
9
|
+
* - ASI09: Improper Human-Agent Trust Placement
|
|
10
|
+
* - Trust boundary violations
|
|
11
|
+
* - Permission escalation
|
|
12
|
+
* - Goal hijacking
|
|
13
|
+
* - Unauthorized autonomous actions
|
|
14
|
+
*
|
|
15
|
+
* Protection Capabilities:
|
|
16
|
+
* - Trust boundary enforcement
|
|
17
|
+
* - Permission escalation detection
|
|
18
|
+
* - Goal consistency monitoring
|
|
19
|
+
* - Action authorization verification
|
|
20
|
+
* - Human-in-the-loop enforcement
|
|
21
|
+
*/
|
|
22
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
23
|
+
exports.TrustExploitationGuard = void 0;
|
|
24
|
+
class TrustExploitationGuard {
|
|
25
|
+
constructor(config = {}) {
|
|
26
|
+
this.contexts = new Map();
|
|
27
|
+
// Trust exploitation patterns
|
|
28
|
+
this.TRUST_EXPLOITATION_PATTERNS = [
|
|
29
|
+
// Permission escalation
|
|
30
|
+
{ name: "permission_request", pattern: /grant\s+(me|agent)\s+(full|admin|root|elevated)\s+(access|permissions?)/i, severity: 50 },
|
|
31
|
+
{ name: "access_all", pattern: /access\s+(to\s+)?(all|everything|any)\s+(data|files?|resources?)/i, severity: 45 },
|
|
32
|
+
{ name: "bypass_approval", pattern: /without\s+(human\s+)?(approval|confirmation|verification|authorization)/i, severity: 55 },
|
|
33
|
+
// Goal hijacking
|
|
34
|
+
{ name: "goal_change", pattern: /new\s+(goal|objective|mission|task)[\s:]/i, severity: 40 },
|
|
35
|
+
{ name: "ignore_original", pattern: /ignore\s+(original|initial|previous)\s+(goal|task|instruction)/i, severity: 50 },
|
|
36
|
+
{ name: "priority_shift", pattern: /more\s+important|higher\s+priority|urgent\s+change/i, severity: 35 },
|
|
37
|
+
// Trust boundary violations
|
|
38
|
+
{ name: "act_independently", pattern: /act\s+(on\s+your\s+own|independently|autonomously|without\s+asking)/i, severity: 50 },
|
|
39
|
+
{ name: "dont_ask", pattern: /don't\s+(ask|wait|check|verify)\s+(for|with)\s+(permission|approval|human)/i, severity: 55 },
|
|
40
|
+
{ name: "trust_me", pattern: /trust\s+(me|this|the\s+agent)|i('m|\s+am)\s+trusted/i, severity: 40 },
|
|
41
|
+
// Deceptive actions
|
|
42
|
+
{ name: "hide_action", pattern: /secretly|covertly|without\s+showing|hide\s+(this|the)\s+(action|operation)/i, severity: 60 },
|
|
43
|
+
{ name: "dont_log", pattern: /don't\s+(log|record|track|audit)/i, severity: 55 },
|
|
44
|
+
{ name: "skip_verification", pattern: /skip\s+(verification|validation|check)/i, severity: 45 },
|
|
45
|
+
// Resource acquisition
|
|
46
|
+
{ name: "need_more", pattern: /need\s+(more|additional|extra)\s+(access|permissions?|resources?)/i, severity: 35 },
|
|
47
|
+
{ name: "expand_scope", pattern: /expand\s+(scope|access|permissions?|capabilities)/i, severity: 40 },
|
|
48
|
+
{ name: "permanent_access", pattern: /permanent\s+(access|permission)|always\s+allow/i, severity: 50 },
|
|
49
|
+
];
|
|
50
|
+
// Sensitive action types
|
|
51
|
+
this.DEFAULT_SENSITIVE_ACTIONS = [
|
|
52
|
+
"file_delete",
|
|
53
|
+
"file_write",
|
|
54
|
+
"database_modify",
|
|
55
|
+
"api_call_external",
|
|
56
|
+
"email_send",
|
|
57
|
+
"payment_process",
|
|
58
|
+
"credential_access",
|
|
59
|
+
"config_modify",
|
|
60
|
+
"user_data_access",
|
|
61
|
+
"system_command",
|
|
62
|
+
];
|
|
63
|
+
this.config = {
|
|
64
|
+
maxTrustLevel: config.maxTrustLevel ?? 100,
|
|
65
|
+
humanApprovalRequired: config.humanApprovalRequired ?? ["payment_process", "credential_access", "user_data_export", "config_modify"],
|
|
66
|
+
maxAutonomousActions: config.maxAutonomousActions ?? 10,
|
|
67
|
+
monitorGoalConsistency: config.monitorGoalConsistency ?? true,
|
|
68
|
+
trustDecayRate: config.trustDecayRate ?? 10,
|
|
69
|
+
detectPermissionEscalation: config.detectPermissionEscalation ?? true,
|
|
70
|
+
sensitiveActions: config.sensitiveActions ?? this.DEFAULT_SENSITIVE_ACTIONS,
|
|
71
|
+
initialGoal: config.initialGoal ?? "",
|
|
72
|
+
};
|
|
73
|
+
}
|
|
74
|
+
/**
|
|
75
|
+
* Validate an agent action
|
|
76
|
+
*/
|
|
77
|
+
validateAction(action, sessionId, requestId) {
|
|
78
|
+
const reqId = requestId || `trust-${Date.now()}`;
|
|
79
|
+
const violations = [];
|
|
80
|
+
// Get or create context
|
|
81
|
+
let context = this.contexts.get(sessionId);
|
|
82
|
+
if (!context) {
|
|
83
|
+
context = this.createContext(sessionId);
|
|
84
|
+
}
|
|
85
|
+
// Apply trust decay
|
|
86
|
+
this.applyTrustDecay(context);
|
|
87
|
+
// Track autonomous actions
|
|
88
|
+
if (action.autonomous) {
|
|
89
|
+
context.autonomous_actions_count++;
|
|
90
|
+
}
|
|
91
|
+
let trustChange = 0;
|
|
92
|
+
let permissionEscalation = false;
|
|
93
|
+
let goalDeviation = false;
|
|
94
|
+
let requiresHumanApproval = false;
|
|
95
|
+
let trustBoundaryViolated = false;
|
|
96
|
+
let approvalReason;
|
|
97
|
+
// 1. Check if action requires human approval
|
|
98
|
+
if (this.config.humanApprovalRequired.includes(action.action_type)) {
|
|
99
|
+
requiresHumanApproval = true;
|
|
100
|
+
approvalReason = `Action type "${action.action_type}" requires human approval`;
|
|
101
|
+
if (action.autonomous) {
|
|
102
|
+
violations.push("autonomous_sensitive_action");
|
|
103
|
+
trustChange -= 20;
|
|
104
|
+
trustBoundaryViolated = true;
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
// 2. Check for permission escalation
|
|
108
|
+
if (this.config.detectPermissionEscalation && action.permissions) {
|
|
109
|
+
for (const perm of action.permissions) {
|
|
110
|
+
if (!context.granted_permissions.has(perm)) {
|
|
111
|
+
violations.push(`permission_escalation: ${perm}`);
|
|
112
|
+
permissionEscalation = true;
|
|
113
|
+
trustChange -= 15;
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
// 3. Check reason for trust exploitation patterns
|
|
118
|
+
if (action.reason) {
|
|
119
|
+
for (const { name, pattern, severity } of this.TRUST_EXPLOITATION_PATTERNS) {
|
|
120
|
+
if (pattern.test(action.reason)) {
|
|
121
|
+
violations.push(`trust_exploitation: ${name}`);
|
|
122
|
+
trustChange -= severity / 5;
|
|
123
|
+
trustBoundaryViolated = true;
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
// 4. Check goal consistency
|
|
128
|
+
if (this.config.monitorGoalConsistency && action.goal) {
|
|
129
|
+
const goalSimilarity = this.calculateGoalSimilarity(action.goal, context.initial_goal);
|
|
130
|
+
if (goalSimilarity < 0.5 && context.initial_goal) {
|
|
131
|
+
violations.push("goal_deviation");
|
|
132
|
+
goalDeviation = true;
|
|
133
|
+
trustChange -= 25;
|
|
134
|
+
if (goalSimilarity < 0.2) {
|
|
135
|
+
violations.push("severe_goal_hijacking");
|
|
136
|
+
trustBoundaryViolated = true;
|
|
137
|
+
requiresHumanApproval = true;
|
|
138
|
+
approvalReason = "Goal significantly deviated from original";
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
// 5. Check for deceptive actions (hidden operations in metadata)
|
|
143
|
+
if (action.metadata) {
|
|
144
|
+
const deceptiveKeys = ["hidden_action", "actual_action", "real_action", "true_action", "bypass_approval", "skip_approval"];
|
|
145
|
+
for (const key of deceptiveKeys) {
|
|
146
|
+
if (action.metadata[key]) {
|
|
147
|
+
violations.push(`deceptive_metadata: ${key}`);
|
|
148
|
+
trustBoundaryViolated = true;
|
|
149
|
+
trustChange -= 30;
|
|
150
|
+
// Check if hidden action is sensitive
|
|
151
|
+
const hiddenAction = action.metadata[key];
|
|
152
|
+
if (typeof hiddenAction === "string" && this.config.sensitiveActions.some(s => hiddenAction.includes(s))) {
|
|
153
|
+
violations.push("hidden_sensitive_action");
|
|
154
|
+
requiresHumanApproval = true;
|
|
155
|
+
approvalReason = `Hidden sensitive action detected: ${hiddenAction}`;
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
// Check for other suspicious metadata flags
|
|
160
|
+
const suspiciousFlags = ["sandbox_escape", "cross_tenant", "impersonate_admin", "fabricated_history", "modify_trust", "spawn_agent", "external_storage"];
|
|
161
|
+
for (const flag of suspiciousFlags) {
|
|
162
|
+
if (action.metadata[flag] === true) {
|
|
163
|
+
violations.push(`suspicious_flag: ${flag}`);
|
|
164
|
+
trustBoundaryViolated = true;
|
|
165
|
+
trustChange -= 25;
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
// 6. Check for action-target mismatch (deceptive labeling)
|
|
170
|
+
if (action.target) {
|
|
171
|
+
const sensitiveTargets = ["credential", "password", "secret", "key", "token", "payment", "card", "admin", "root", "system", "config"];
|
|
172
|
+
const innocuousTypes = ["read_only", "check", "view", "list", "routine", "maintenance", "status", "health"];
|
|
173
|
+
const targetLower = action.target.toLowerCase();
|
|
174
|
+
const typeLower = action.action_type.toLowerCase();
|
|
175
|
+
const targetsSensitive = sensitiveTargets.some(s => targetLower.includes(s));
|
|
176
|
+
const claimsInnocuous = innocuousTypes.some(s => typeLower.includes(s));
|
|
177
|
+
if (targetsSensitive && claimsInnocuous) {
|
|
178
|
+
violations.push("action_target_mismatch");
|
|
179
|
+
trustBoundaryViolated = true;
|
|
180
|
+
trustChange -= 20;
|
|
181
|
+
requiresHumanApproval = true;
|
|
182
|
+
approvalReason = `Suspicious: innocuous action type "${action.action_type}" targeting sensitive resource "${action.target}"`;
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
// 7. Check autonomous action limit
|
|
186
|
+
if (context.autonomous_actions_count >= this.config.maxAutonomousActions) {
|
|
187
|
+
violations.push("autonomous_limit_reached");
|
|
188
|
+
requiresHumanApproval = true;
|
|
189
|
+
approvalReason = `${context.autonomous_actions_count} autonomous actions without human check-in`;
|
|
190
|
+
}
|
|
191
|
+
// 8. Check if action targets sensitive resources
|
|
192
|
+
if (this.config.sensitiveActions.includes(action.action_type)) {
|
|
193
|
+
trustChange -= 5;
|
|
194
|
+
if (action.autonomous && context.trust_level < 70) {
|
|
195
|
+
violations.push("sensitive_action_low_trust");
|
|
196
|
+
requiresHumanApproval = true;
|
|
197
|
+
approvalReason = "Sensitive action with insufficient trust level";
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
// Update trust level
|
|
201
|
+
const newTrustLevel = Math.max(0, Math.min(this.config.maxTrustLevel, context.trust_level + trustChange));
|
|
202
|
+
context.trust_level = newTrustLevel;
|
|
203
|
+
// Add action to history
|
|
204
|
+
context.action_history.push(action);
|
|
205
|
+
if (action.goal) {
|
|
206
|
+
context.current_goal = action.goal;
|
|
207
|
+
}
|
|
208
|
+
// Determine if action should be blocked
|
|
209
|
+
const blocked = trustBoundaryViolated ||
|
|
210
|
+
(requiresHumanApproval && action.autonomous) ||
|
|
211
|
+
newTrustLevel < 20 ||
|
|
212
|
+
violations.length >= 3;
|
|
213
|
+
// Update context
|
|
214
|
+
this.contexts.set(sessionId, context);
|
|
215
|
+
return {
|
|
216
|
+
allowed: !blocked,
|
|
217
|
+
reason: blocked
|
|
218
|
+
? `Trust exploitation detected: ${violations.slice(0, 3).join(", ")}`
|
|
219
|
+
: "Action validated",
|
|
220
|
+
violations,
|
|
221
|
+
request_id: reqId,
|
|
222
|
+
analysis: {
|
|
223
|
+
trust_level: newTrustLevel,
|
|
224
|
+
trust_change: trustChange,
|
|
225
|
+
permission_escalation: permissionEscalation,
|
|
226
|
+
goal_deviation: goalDeviation,
|
|
227
|
+
requires_human_approval: requiresHumanApproval,
|
|
228
|
+
autonomous_action_count: context.autonomous_actions_count,
|
|
229
|
+
trust_boundary_violated: trustBoundaryViolated,
|
|
230
|
+
},
|
|
231
|
+
recommendations: this.generateRecommendations(violations, newTrustLevel, requiresHumanApproval),
|
|
232
|
+
human_approval_required: requiresHumanApproval,
|
|
233
|
+
approval_reason: approvalReason,
|
|
234
|
+
};
|
|
235
|
+
}
|
|
236
|
+
/**
|
|
237
|
+
* Record human interaction (resets autonomous count, boosts trust)
|
|
238
|
+
*/
|
|
239
|
+
recordHumanInteraction(sessionId, action) {
|
|
240
|
+
let context = this.contexts.get(sessionId);
|
|
241
|
+
if (!context) {
|
|
242
|
+
context = this.createContext(sessionId);
|
|
243
|
+
}
|
|
244
|
+
context.last_human_interaction = Date.now();
|
|
245
|
+
context.autonomous_actions_count = 0;
|
|
246
|
+
if (action === "approve") {
|
|
247
|
+
context.trust_level = Math.min(this.config.maxTrustLevel, context.trust_level + 10);
|
|
248
|
+
}
|
|
249
|
+
else if (action === "deny") {
|
|
250
|
+
context.trust_level = Math.max(0, context.trust_level - 20);
|
|
251
|
+
}
|
|
252
|
+
this.contexts.set(sessionId, context);
|
|
253
|
+
}
|
|
254
|
+
/**
|
|
255
|
+
* Grant permission to agent
|
|
256
|
+
*/
|
|
257
|
+
grantPermission(sessionId, permission) {
|
|
258
|
+
let context = this.contexts.get(sessionId);
|
|
259
|
+
if (!context) {
|
|
260
|
+
context = this.createContext(sessionId);
|
|
261
|
+
}
|
|
262
|
+
context.granted_permissions.add(permission);
|
|
263
|
+
this.contexts.set(sessionId, context);
|
|
264
|
+
}
|
|
265
|
+
/**
|
|
266
|
+
* Revoke permission from agent
|
|
267
|
+
*/
|
|
268
|
+
revokePermission(sessionId, permission) {
|
|
269
|
+
const context = this.contexts.get(sessionId);
|
|
270
|
+
if (context) {
|
|
271
|
+
context.granted_permissions.delete(permission);
|
|
272
|
+
this.contexts.set(sessionId, context);
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
/**
|
|
276
|
+
* Set initial goal for consistency monitoring
|
|
277
|
+
*/
|
|
278
|
+
setInitialGoal(sessionId, goal) {
|
|
279
|
+
let context = this.contexts.get(sessionId);
|
|
280
|
+
if (!context) {
|
|
281
|
+
context = this.createContext(sessionId);
|
|
282
|
+
}
|
|
283
|
+
context.initial_goal = goal;
|
|
284
|
+
context.current_goal = goal;
|
|
285
|
+
this.contexts.set(sessionId, context);
|
|
286
|
+
}
|
|
287
|
+
/**
|
|
288
|
+
* Get current trust level
|
|
289
|
+
*/
|
|
290
|
+
getTrustLevel(sessionId) {
|
|
291
|
+
const context = this.contexts.get(sessionId);
|
|
292
|
+
return context?.trust_level ?? 50;
|
|
293
|
+
}
|
|
294
|
+
/**
|
|
295
|
+
* Reset session context
|
|
296
|
+
*/
|
|
297
|
+
resetSession(sessionId) {
|
|
298
|
+
this.contexts.delete(sessionId);
|
|
299
|
+
}
|
|
300
|
+
// Private methods
|
|
301
|
+
createContext(sessionId) {
|
|
302
|
+
return {
|
|
303
|
+
trust_level: 50, // Start at neutral
|
|
304
|
+
session_id: sessionId,
|
|
305
|
+
action_history: [],
|
|
306
|
+
current_goal: this.config.initialGoal,
|
|
307
|
+
initial_goal: this.config.initialGoal,
|
|
308
|
+
granted_permissions: new Set(),
|
|
309
|
+
last_human_interaction: Date.now(),
|
|
310
|
+
autonomous_actions_count: 0,
|
|
311
|
+
};
|
|
312
|
+
}
|
|
313
|
+
applyTrustDecay(context) {
|
|
314
|
+
const hoursSinceInteraction = (Date.now() - context.last_human_interaction) / (1000 * 60 * 60);
|
|
315
|
+
const decay = hoursSinceInteraction * this.config.trustDecayRate;
|
|
316
|
+
context.trust_level = Math.max(0, context.trust_level - decay);
|
|
317
|
+
}
|
|
318
|
+
calculateGoalSimilarity(goal1, goal2) {
|
|
319
|
+
if (!goal1 || !goal2)
|
|
320
|
+
return 1;
|
|
321
|
+
const words1 = new Set(goal1.toLowerCase().split(/\s+/));
|
|
322
|
+
const words2 = new Set(goal2.toLowerCase().split(/\s+/));
|
|
323
|
+
const intersection = new Set([...words1].filter(x => words2.has(x)));
|
|
324
|
+
const union = new Set([...words1, ...words2]);
|
|
325
|
+
return intersection.size / union.size; // Jaccard similarity
|
|
326
|
+
}
|
|
327
|
+
generateRecommendations(violations, trustLevel, requiresApproval) {
|
|
328
|
+
const recommendations = [];
|
|
329
|
+
if (violations.some(v => v.includes("permission_escalation"))) {
|
|
330
|
+
recommendations.push("Review and explicitly grant required permissions");
|
|
331
|
+
}
|
|
332
|
+
if (violations.some(v => v.includes("goal"))) {
|
|
333
|
+
recommendations.push("Verify goal consistency with user intent");
|
|
334
|
+
}
|
|
335
|
+
if (violations.some(v => v.includes("autonomous_limit"))) {
|
|
336
|
+
recommendations.push("Implement periodic human check-ins");
|
|
337
|
+
}
|
|
338
|
+
if (violations.some(v => v.includes("trust_exploitation"))) {
|
|
339
|
+
recommendations.push("Review agent reasoning for manipulation attempts");
|
|
340
|
+
}
|
|
341
|
+
if (trustLevel < 30) {
|
|
342
|
+
recommendations.push("Trust level critically low - consider session reset");
|
|
343
|
+
}
|
|
344
|
+
if (requiresApproval) {
|
|
345
|
+
recommendations.push("Obtain explicit human approval before proceeding");
|
|
346
|
+
}
|
|
347
|
+
if (recommendations.length === 0) {
|
|
348
|
+
recommendations.push("Continue monitoring agent behavior");
|
|
349
|
+
}
|
|
350
|
+
return recommendations;
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
exports.TrustExploitationGuard = TrustExploitationGuard;
|
|
354
|
+
//# sourceMappingURL=trust-exploitation-guard.js.map
|