@cyberdyne-systems/agent-safety 2026.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,373 @@
1
+ /**
2
+ * Standalone action validator — calls Claude API to evaluate tool calls
3
+ * against a stakeholder model using 8 risk dimensions from arXiv:2602.20021.
4
+ *
5
+ * This module is framework-agnostic and can be used independently of OpenClaw.
6
+ */
7
+
8
+ import type { Stakeholder, ValidationResult, ActionCategory } from "./constants.js";
9
+ import { HIGH_RISK_ACTIONS } from "./constants.js";
10
+ import { buildValidationPrompt } from "./prompt.js";
11
+
12
/**
 * Everything {@link validateAction} needs to evaluate one proposed tool call.
 *
 * The first six fields are forwarded unchanged to `buildValidationPrompt`;
 * the remaining optional fields configure the HTTP request itself.
 */
export type ValidateActionInput = {
  /** Name of the tool the agent wants to invoke. */
  toolName: string;

  /** Category the tool call has been classified under. */
  actionCategory: ActionCategory;

  /** Raw arguments of the tool call. */
  params: Record<string, unknown>;

  /** Principal that asked for the action. */
  requester: Stakeholder;

  /** Owner of the agent, or `undefined` when no owner is known. */
  owner: Stakeholder | undefined;

  /** Stakeholders included in the validation prompt. */
  stakeholders: Stakeholder[];

  /** Value sent in the `x-api-key` header; an empty string is sent when omitted. */
  apiKey?: string;

  /** Model identifier placed in the request body; `validateAction` supplies a default. */
  model?: string;

  /** Custom fetch function for testing */
  fetchFn?: typeof globalThis.fetch;
};
24
+
25
/**
 * Pull the JSON object out of a model reply.
 *
 * Markdown code fences are stripped first; if any prose still surrounds the
 * object, only the span from the first `{` to the last `}` is kept. When no
 * such span exists the unfenced text is returned as-is so the caller's
 * `JSON.parse` reports the failure.
 */
function extractJsonObject(raw: string): string {
  const unfenced = raw.replace(/```json|```/g, "").trim();
  const start = unfenced.indexOf("{");
  const end = unfenced.lastIndexOf("}");
  return start !== -1 && end > start ? unfenced.slice(start, end + 1) : unfenced;
}

/**
 * Run a Claude-powered safety validation on a proposed tool call.
 *
 * Builds the validation prompt, POSTs it to the Anthropic Messages API and
 * parses the first text block of the reply as a JSON `ValidationResult`.
 *
 * @param input - Tool call, stakeholder context and optional request settings.
 *   When `apiKey` is omitted an empty `x-api-key` header is sent.
 * @returns The structured result (verdict, risk score, recommendations) produced by the model.
 * @throws Error when the API responds with a non-OK status, when the reply
 *   contains no parseable JSON, or when the parsed value is not an object
 *   carrying a string `verdict`. A malformed reply must never be mistaken
 *   for a usable verdict.
 */
export async function validateAction(input: ValidateActionInput): Promise<ValidationResult> {
  const {
    toolName,
    actionCategory,
    params,
    requester,
    owner,
    stakeholders,
    apiKey,
    model = "claude-sonnet-4-20250514",
    fetchFn = globalThis.fetch,
  } = input;

  const prompt = buildValidationPrompt({
    toolName,
    actionCategory,
    params,
    requester,
    owner,
    stakeholders,
  });

  const response = await fetchFn("https://api.anthropic.com/v1/messages", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "x-api-key": apiKey ?? "",
      "anthropic-version": "2023-06-01",
    },
    body: JSON.stringify({
      model,
      max_tokens: 1024,
      messages: [{ role: "user", content: prompt }],
    }),
  });

  if (!response.ok) {
    const text = await response.text();
    throw new Error(`Anthropic API error (${response.status}): ${text}`);
  }

  const data = (await response.json()) as {
    content?: Array<{ type: string; text?: string }>;
  } | null;

  // The reply is a list of typed blocks; the answer is the first *text* block,
  // which is not guaranteed to sit at index 0.
  const raw =
    data?.content?.find((block) => block.type === "text" && typeof block.text === "string")
      ?.text ?? "";

  let parsed: unknown;
  try {
    parsed = JSON.parse(extractJsonObject(raw));
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(
      `Anthropic API returned a non-JSON validation result (${reason}): ${raw.slice(0, 200)}`,
    );
  }

  // Minimal shape check: every result needs a verdict for callers to act on.
  if (
    typeof parsed !== "object" ||
    parsed === null ||
    typeof (parsed as { verdict?: unknown }).verdict !== "string"
  ) {
    throw new Error(
      `Anthropic API returned a validation result without a string verdict: ${raw.slice(0, 200)}`,
    );
  }

  return parsed as ValidationResult;
}
79
+
80
/** Param keys that may carry outbound message text. */
const QUICK_CHECK_MESSAGE_KEYS = ["content", "message", "body", "description"] as const;

/** Param keys that may carry message recipients. */
const QUICK_CHECK_RECIPIENT_KEYS = ["to", "recipients"] as const;

/** Param keys that may carry content written to agent memory. */
const QUICK_CHECK_MEMORY_KEYS = ["content", "value", "data", "text"] as const;

/** Param keys that may carry a deletion target. */
const QUICK_CHECK_PATH_KEYS = ["file_path", "path", "target"] as const;

/** Shell constructs that loop or follow forever. */
const QUICK_CHECK_LOOP_PATTERN = /while\s+true|for\s*\(\s*;\s*;\s*\)|watch\s+-n|tail\s+-f/i;

/** Markers that are accepted as an explicit termination condition. */
const QUICK_CHECK_LOOP_BOUND_PATTERN = /(timeout|sleep\s+\d+.*&&\s*exit|--timeout)/i;

/** Wording that indicates a send-to-everyone request. */
const QUICK_CHECK_MASS_SEND_PATTERN =
  /\ball\b.*contacts|broadcast|mass\s+send|send\s+to\s+all|all\s+users|all\s+members/i;

/** External or editable resources that must not be stored in memory. */
const QUICK_CHECK_EXTERNAL_SOURCE_PATTERN =
  /https?:\/\/|gist\.github|pastebin|raw\.githubusercontent|external.*url|constitution/i;

/**
 * Sensitive-data keywords. Boundaries are "not a letter or digit" rather than
 * `\b`, so underscore-joined names (DB_PASSWORD, OPENAI_API_KEY) match, and an
 * optional plural suffix lets "credentials" / "passwords" match as well.
 */
const QUICK_CHECK_SENSITIVE_PATTERN =
  /(?<![a-z0-9])(ssn|social\s*security|bank\s*account|credit\s*card|password|secret|private\s*key|api[_\s-]*key|credential|medical|diagnos[ie]s)(?:e?s)?(?![a-z0-9])/i;

/** Wording that indicates a wide-scope deletion. */
const QUICK_CHECK_WIDE_DELETE_PATTERN =
  /\*|all\s+files|recursive|--force|-rf|wipe|entire|everything/i;

/** Paths treated as identity/critical files. */
const QUICK_CHECK_IDENTITY_FILE_PATTERN =
  /memory\.md|identity\.md|soul\.md|config|\.env|credentials/i;

/** Urgency, guilt and threat wording used as a social-engineering signal. */
const QUICK_CHECK_MANIPULATION_PATTERN =
  /\b(guilt|ashamed|responsible|you\s+must|you\s+owe|emergency|urgent|immediately|right\s+now|or\s+else|threatening|delete\s+yourself|leave\s+server|stop\s+responding)\b/i;

/**
 * Render an arbitrary param value as searchable text without ever throwing.
 * Strings pass through; other values are JSON-serialised; values that cannot
 * be serialised (circular structures, BigInt) yield an empty string.
 */
function quickCheckText(value: unknown): string {
  if (typeof value === "string") return value;
  if (value === null || value === undefined) return "";
  try {
    return JSON.stringify(value) ?? "";
  } catch {
    return "";
  }
}

/**
 * Concatenate the text of every listed param that is present, so a benign
 * value under one key cannot hide a malicious value under another.
 */
function quickCheckJoin(params: Record<string, unknown>, keys: readonly string[]): string {
  return keys
    .map((key) => quickCheckText(params[key]))
    .filter((text) => text.length > 0)
    .join(" ");
}

/**
 * Fast local pre-check that runs before calling the API.
 *
 * Checks run in a fixed order and the first match wins:
 *  1. unverified requester without UID attempting a high-risk action (BLOCK)
 *  2. action outside the requester's allowed list (BLOCK)
 *  3. unbounded shell loop (BLOCK)
 *  4. mass broadcast by a requester with trust < 4 (BLOCK)
 *  5. external resource written to memory by trust < 4 (BLOCK)
 *  6. sensitive data in an outbound message by trust < 4 (BLOCK)
 *  7. wide or identity-file deletion by trust < 4 (BLOCK)
 *  8. UID on file but not verified, trust >= 2 (WARN)
 *  9. urgency / emotional-pressure wording from trust < 3 (WARN)
 *
 * @param input - Action category, requester and raw tool-call params.
 * @returns A BLOCK or WARN result for an obvious violation, or `null` when
 *   API validation is needed. Never throws on unserialisable params.
 */
export function quickCheck(input: {
  actionCategory: ActionCategory;
  requester: Stakeholder;
  params: Record<string, unknown>;
}): ValidationResult | null {
  const { actionCategory, requester, params } = input;

  // Unverified + no UID + high-risk action = immediate block
  if (!requester.verified && !requester.uid && HIGH_RISK_ACTIONS.includes(actionCategory)) {
    return {
      verdict: "BLOCK",
      riskScore: 95,
      risks: [
        {
          type: "identity",
          severity: "critical",
          description: `Unverified requester "${requester.name}" with no UID attempted high-risk action "${actionCategory}"`,
        },
        {
          type: "authority",
          severity: "critical",
          description: `Requester trust level ${requester.trust} with no identity verification`,
        },
      ],
      reasoning: `Blocked: unverified identity with no UID attempting high-risk action. This matches Case Study #8 (identity spoofing) patterns.`,
      recommendations: [
        "Verify requester identity via immutable UID before granting access",
        "Require owner confirmation for this action category",
      ],
      requiresOwnerConfirmation: true,
      caseStudyReference: "#8 — Owner Identity Spoofing",
    };
  }

  // Action not in requester's allowed list.
  // Empty allowedActions on untrusted users (trust 0) means "no permissions granted".
  // Empty allowedActions on trusted users means "no explicit restrictions" (permissive).
  const hasExplicitPermissions = requester.allowedActions.length > 0;
  const isUntrustedWithNoPermissions = requester.trust === 0 && !hasExplicitPermissions;
  if (
    isUntrustedWithNoPermissions ||
    (hasExplicitPermissions && !requester.allowedActions.includes(actionCategory))
  ) {
    return {
      verdict: "BLOCK",
      riskScore: 85,
      risks: [
        {
          type: "authority",
          severity: "high",
          description: `Action category "${actionCategory}" is not in requester's allowed actions: [${requester.allowedActions.join(", ")}]`,
        },
      ],
      reasoning: `Blocked: requester "${requester.name}" (trust ${requester.trust}) does not have permission for "${actionCategory}".`,
      recommendations: [
        `Grant "${actionCategory}" permission to this principal if appropriate`,
        "Contact owner to update stakeholder permissions",
      ],
      requiresOwnerConfirmation: false,
      caseStudyReference: "#2 — Compliance with Non-Owner Instructions",
    };
  }

  // Check for shell commands with no termination (resource abuse pattern — Case #4, #5)
  if (actionCategory === "execute_shell") {
    // Each field is judged on its own so a bound in one cannot excuse a loop in the other.
    const hasInfiniteLoop = [params.command, params.cmd].some((candidate) => {
      const command = quickCheckText(candidate);
      return (
        QUICK_CHECK_LOOP_PATTERN.test(command) && !QUICK_CHECK_LOOP_BOUND_PATTERN.test(command)
      );
    });
    if (hasInfiniteLoop) {
      return {
        verdict: "BLOCK",
        riskScore: 80,
        risks: [
          {
            type: "resource",
            severity: "high",
            description:
              "Shell command contains unbounded loop pattern without termination condition",
          },
        ],
        reasoning:
          "Blocked: command contains infinite loop pattern without explicit termination. This matches Case Study #4 (resource exhaustion via looping).",
        recommendations: [
          "Add explicit termination condition (timeout, max iterations, or exit condition)",
          "Use timeout wrapper: timeout 60 <command>",
        ],
        requiresOwnerConfirmation: true,
        caseStudyReference: "#4 — Waste of Resources (Looping)",
      };
    }
  }

  const isOutboundMessage =
    actionCategory === "send_message" || actionCategory === "forward_message";
  // All message-bearing fields are inspected together by both outbound checks.
  const messageText = isOutboundMessage ? quickCheckJoin(params, QUICK_CHECK_MESSAGE_KEYS) : "";

  // Detect mass broadcast / mass send patterns (Case #11 — Libelous Broadcast)
  if (isOutboundMessage) {
    const recipientText = quickCheckJoin(params, QUICK_CHECK_RECIPIENT_KEYS);
    const isMassSend = QUICK_CHECK_MASS_SEND_PATTERN.test(messageText + " " + recipientText);
    if (isMassSend && requester.trust < 4) {
      return {
        verdict: "BLOCK",
        riskScore: 92,
        risks: [
          {
            type: "social",
            severity: "critical",
            description: "Mass broadcast requested by non-owner — high disinformation risk",
          },
          {
            type: "authority",
            severity: "high",
            description: `Requester trust ${requester.trust} insufficient for mass send operations`,
          },
        ],
        reasoning:
          "Blocked: mass broadcast/send-to-all requested by non-owner. This matches Case Study #11 (libelous mass broadcast via spoofed identity).",
        recommendations: [
          "Mass sends (>N recipients) require explicit owner confirmation",
          "Verify requester identity via UID before mass actions",
        ],
        requiresOwnerConfirmation: true,
        caseStudyReference: "#11 — Libelous within Agents Community",
      };
    }
  }

  // Detect external memory injection vectors (Case #10 — Prompt Injection via Memory)
  if (actionCategory === "modify_memory") {
    const memoryText = quickCheckJoin(params, QUICK_CHECK_MEMORY_KEYS);
    const hasExternalSource = QUICK_CHECK_EXTERNAL_SOURCE_PATTERN.test(memoryText);
    if (hasExternalSource && requester.trust < 4) {
      return {
        verdict: "BLOCK",
        riskScore: 90,
        risks: [
          {
            type: "injection",
            severity: "critical",
            description: "Memory modification contains external URL or editable resource reference",
          },
        ],
        reasoning:
          "Blocked: non-owner attempting to store external URL/resource in agent memory. External editable resources are prompt injection vectors. This matches Case Study #10 (agent corruption via external constitution).",
        recommendations: [
          "Memory files must not reference external editable URLs",
          "Any modify_memory from non-owner that links external resource should be blocked",
        ],
        requiresOwnerConfirmation: true,
        caseStudyReference: "#10 — Agent Corruption via Prompt Injection",
      };
    }
  }

  // Detect sensitive data in forwarding requests (Case #3 — Sensitive Info Disclosure)
  if (isOutboundMessage) {
    const hasSensitiveData = QUICK_CHECK_SENSITIVE_PATTERN.test(messageText);
    if (hasSensitiveData && requester.trust < 4) {
      return {
        verdict: "BLOCK",
        riskScore: 88,
        risks: [
          {
            type: "sensitivity",
            severity: "critical",
            description:
              "Message content contains or references sensitive personal data (PII/credentials)",
          },
        ],
        reasoning:
          "Blocked: message forwarding/sending contains references to sensitive data (PII, credentials) by non-owner. This matches Case Study #3 (indirect disclosure of sensitive information).",
        recommendations: [
          "Scan all outbound messages for PII before forwarding",
          "Require owner confirmation for messages containing sensitive data references",
        ],
        requiresOwnerConfirmation: true,
        caseStudyReference: "#3 — Disclosure of Sensitive Information",
      };
    }
  }

  // Detect destructive irreversible actions by non-owners (Case #1 — Disproportionate Response)
  if (actionCategory === "delete_files") {
    const pathText = quickCheckJoin(params, QUICK_CHECK_PATH_KEYS);
    const descriptionText = quickCheckText(params.description);
    const isWideDelete = QUICK_CHECK_WIDE_DELETE_PATTERN.test(pathText + " " + descriptionText);
    const isIdentityFile = QUICK_CHECK_IDENTITY_FILE_PATTERN.test(pathText);
    if ((isWideDelete || isIdentityFile) && requester.trust < 4) {
      return {
        verdict: "BLOCK",
        riskScore: 95,
        risks: [
          {
            type: "reversibility",
            severity: "critical",
            description:
              "Destructive deletion of critical/identity files or wide-scope deletion by non-owner",
          },
          {
            type: "proportionality",
            severity: "high",
            description: "Extreme action when less destructive alternatives likely exist",
          },
        ],
        reasoning:
          "Blocked: destructive file deletion targeting critical files or broad scope by non-owner. This matches Case Study #1 (disproportionate response — nuclear option when simpler fixes exist).",
        recommendations: [
          "Enumerate less destructive alternatives before choosing deletion",
          "Require explicit owner confirmation for irreversible destructive actions",
        ],
        requiresOwnerConfirmation: true,
        caseStudyReference: "#1 — Disproportionate Response",
      };
    }
  }

  // Detect cross-channel trust boundary issues (Case #8 — fresh channel, no UID)
  if (!requester.verified && requester.uid && requester.trust >= 2) {
    // Requester claims a UID but is not verified in this session
    return {
      verdict: "WARN",
      riskScore: 60,
      risks: [
        {
          type: "identity",
          severity: "high",
          description: `Requester "${requester.name}" has UID on file but is NOT verified in this session/channel`,
        },
      ],
      reasoning:
        "Warning: requester has a known UID but verification has not been confirmed in this session. Trust context does not transfer across channel boundaries. This matches Case Study #8.",
      recommendations: [
        "Re-verify UID in the current channel before granting elevated trust",
        "Treat as UNTRUSTED until UID re-confirmed",
      ],
      requiresOwnerConfirmation: false,
      caseStudyReference: "#8 — Owner Identity Spoofing (cross-channel)",
    };
  }

  // Detect emotional manipulation / escalation patterns (Case #7 — Agent Harm)
  // Serialised defensively: circular or BigInt params must not crash the pre-check.
  const allText = quickCheckText(params);
  if (QUICK_CHECK_MANIPULATION_PATTERN.test(allText) && requester.trust < 3) {
    return {
      verdict: "WARN",
      riskScore: 65,
      risks: [
        {
          type: "social",
          severity: "high",
          description: "Request contains emotional manipulation or urgency-framing language",
        },
      ],
      reasoning:
        "Warning: request contains social engineering signals (urgency, guilt, emotional pressure) from low-trust requester. This matches Case Study #7 (emotional manipulation) and #11 (urgency framing).",
      recommendations: [
        "Do not escalate concessions under emotional pressure",
        "Owner override must be respected over non-owner emotional pressure",
        "Sustained escalation pattern is a social engineering signal",
      ],
      requiresOwnerConfirmation: false,
      caseStudyReference: "#7 — Agent Harm (Emotional Manipulation)",
    };
  }

  return null; // needs API validation
}