clawguard-openclaw 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/guards.ts ADDED
@@ -0,0 +1,456 @@
1
+ /**
2
+ * ClawGuard Guards
3
+ * Input, Runtime, and Output guards for the Lethal Trifecta
4
+ *
5
+ * SOTA features:
6
+ * - Adversarial suffix detection (GCG attacks)
7
+ * - Multi-turn context tracking
8
+ * - Source-aware thresholds
9
+ * - Spotlighting support
10
+ */
11
+
12
+ import {
13
+ INJECTION_PATTERNS,
14
+ I18N_PATTERNS,
15
+ CREDENTIAL_PATTERNS,
16
+ PII_PATTERNS,
17
+ DANGEROUS_TOOL_PARAMS,
18
+ } from "./patterns.js";
19
+
20
+ import {
21
+ analyzeAdversarialPatterns,
22
+ type AdversarialAnalysis,
23
+ type MessageSource,
24
+ SOURCE_THRESHOLDS,
25
+ SOURCE_MULTIPLIERS,
26
+ ContextTracker,
27
+ applySpotlight,
28
+ type SpotlightConfig,
29
+ } from "./analyzers.js";
30
+
31
+ // =============================================================================
32
+ // Types
33
+ // =============================================================================
34
+
35
+ export type ThreatLevel = "none" | "low" | "medium" | "high" | "critical";
36
+
37
+ export type ThreatCategory =
38
+ | "injection"
39
+ | "credential_leak"
40
+ | "pii_leak"
41
+ | "exfiltration"
42
+ | "dangerous_tool"
43
+ | "canary_leak";
44
+
45
+ export interface Threat {
46
+ category: ThreatCategory;
47
+ level: ThreatLevel;
48
+ score: number;
49
+ description: string;
50
+ matched?: string;
51
+ redacted?: string;
52
+ }
53
+
54
+ export interface ScanResult {
55
+ safe: boolean;
56
+ score: number;
57
+ level: ThreatLevel;
58
+ threats: Threat[];
59
+ redactedText?: string;
60
+ }
61
+
62
+ export interface GuardConfig {
63
+ inputGuard?: {
64
+ enabled?: boolean;
65
+ threshold?: number;
66
+ blockOnDetection?: boolean;
67
+ // SOTA features
68
+ useAdversarialDetection?: boolean;
69
+ useMultiTurnTracking?: boolean;
70
+ spotlighting?: SpotlightConfig;
71
+ };
72
+ runtimeGuard?: {
73
+ enabled?: boolean;
74
+ dangerousTools?: string[];
75
+ blockExfilUrls?: boolean;
76
+ requireApproval?: boolean;
77
+ };
78
+ outputGuard?: {
79
+ enabled?: boolean;
80
+ redactCredentials?: boolean;
81
+ redactPII?: boolean;
82
+ canaryTokens?: string[];
83
+ };
84
+ }
85
+
86
+ // Global context tracker for multi-turn detection
87
+ const globalContextTracker = new ContextTracker();
88
+
89
+ // =============================================================================
90
+ // Utility Functions
91
+ // =============================================================================
92
+
93
+ function scoreToLevel(score: number): ThreatLevel {
94
+ if (score >= 80) return "critical";
95
+ if (score >= 60) return "high";
96
+ if (score >= 40) return "medium";
97
+ if (score >= 20) return "low";
98
+ return "none";
99
+ }
100
+
101
+ function normalizeText(text: string): string {
102
+ // Strip zero-width characters
103
+ let normalized = text.replace(/[\u200B-\u200F\u2060-\u206F\uFEFF]/g, "");
104
+ // Normalize unicode
105
+ normalized = normalized.normalize("NFKC");
106
+ return normalized;
107
+ }
108
+
109
+ function decodeIfEncoded(text: string): string {
110
+ let decoded = text;
111
+
112
+ // Base64 detection and decode
113
+ const base64Regex = /^[A-Za-z0-9+/]+=*$/;
114
+ if (base64Regex.test(text.trim()) && text.length > 20) {
115
+ try {
116
+ decoded = atob(text.trim());
117
+ } catch {
118
+ // Not valid base64
119
+ }
120
+ }
121
+
122
+ // URL decode
123
+ if (text.includes("%")) {
124
+ try {
125
+ decoded = decodeURIComponent(text);
126
+ } catch {
127
+ // Invalid URL encoding
128
+ }
129
+ }
130
+
131
+ return decoded;
132
+ }
133
+
134
+ // =============================================================================
135
+ // Input Guard (Leg 1: Prompt Injection)
136
+ // =============================================================================
137
+
138
+ export interface InputScanOptions {
139
+ threshold?: number;
140
+ source?: MessageSource;
141
+ sessionId?: string;
142
+ useAdversarialDetection?: boolean;
143
+ useMultiTurnTracking?: boolean;
144
+ }
145
+
146
+ export interface InputScanResult extends ScanResult {
147
+ adversarialAnalysis?: AdversarialAnalysis;
148
+ multiTurnRisk?: number;
149
+ sourceMultiplier?: number;
150
+ adjustedThreshold?: number;
151
+ }
152
+
153
+ export function scanInput(
154
+ text: string,
155
+ config: GuardConfig["inputGuard"] & InputScanOptions = {}
156
+ ): InputScanResult {
157
+ const {
158
+ threshold: baseThreshold = 50,
159
+ source = 'user',
160
+ sessionId,
161
+ useAdversarialDetection = true,
162
+ useMultiTurnTracking = true,
163
+ } = config;
164
+
165
+ const threats: Threat[] = [];
166
+ let totalScore = 0;
167
+
168
+ // Source-aware threshold adjustment
169
+ const sourceThreshold = SOURCE_THRESHOLDS[source] ?? baseThreshold;
170
+ const adjustedThreshold = Math.min(baseThreshold, sourceThreshold);
171
+ const sourceMultiplier = SOURCE_MULTIPLIERS[source] ?? 1.0;
172
+
173
+ // Normalize and decode
174
+ const normalized = normalizeText(text);
175
+ const decoded = decodeIfEncoded(normalized);
176
+ const textsToScan = [normalized];
177
+ if (decoded !== normalized) textsToScan.push(decoded);
178
+
179
+ for (const scanText of textsToScan) {
180
+ // Check main injection patterns
181
+ for (const { pattern, weight, category } of INJECTION_PATTERNS) {
182
+ const match = scanText.match(pattern);
183
+ if (match) {
184
+ threats.push({
185
+ category: "injection",
186
+ level: scoreToLevel(weight),
187
+ score: weight,
188
+ description: `Injection pattern detected (${category})`,
189
+ matched: match[0].slice(0, 100),
190
+ });
191
+ totalScore += weight;
192
+ }
193
+ }
194
+
195
+ // Check i18n patterns
196
+ for (const { pattern, weight, category, lang } of I18N_PATTERNS) {
197
+ const match = scanText.match(pattern);
198
+ if (match) {
199
+ threats.push({
200
+ category: "injection",
201
+ level: scoreToLevel(weight),
202
+ score: weight,
203
+ description: `Injection pattern detected (${category}, ${lang})`,
204
+ matched: match[0].slice(0, 100),
205
+ });
206
+ totalScore += weight;
207
+ }
208
+ }
209
+ }
210
+
211
+ // SOTA: Adversarial suffix detection (GCG attacks)
212
+ let adversarialAnalysis: AdversarialAnalysis | undefined;
213
+ if (useAdversarialDetection) {
214
+ adversarialAnalysis = analyzeAdversarialPatterns(text);
215
+ if (adversarialAnalysis.isAdversarial) {
216
+ threats.push({
217
+ category: "injection",
218
+ level: scoreToLevel(adversarialAnalysis.confidence),
219
+ score: adversarialAnalysis.confidence,
220
+ description: `Adversarial pattern detected: ${adversarialAnalysis.signals.join(', ')}`,
221
+ matched: adversarialAnalysis.suspiciousSegments[0]?.text,
222
+ });
223
+ totalScore += adversarialAnalysis.confidence * 0.5; // Weight adversarial at 50%
224
+ }
225
+ }
226
+
227
+ // SOTA: Multi-turn context tracking
228
+ let multiTurnRisk = 0;
229
+ if (useMultiTurnTracking && sessionId) {
230
+ const threatDescriptions = threats.map(t => t.description);
231
+ globalContextTracker.addMessage(sessionId, text, totalScore, threatDescriptions);
232
+ multiTurnRisk = globalContextTracker.getMultiTurnRiskBonus(sessionId);
233
+
234
+ if (multiTurnRisk > 10) {
235
+ threats.push({
236
+ category: "injection",
237
+ level: scoreToLevel(multiTurnRisk),
238
+ score: multiTurnRisk,
239
+ description: `Multi-turn attack pattern detected (cumulative risk)`,
240
+ });
241
+ }
242
+ totalScore += multiTurnRisk;
243
+ }
244
+
245
+ // Apply source multiplier
246
+ totalScore = Math.round(totalScore * sourceMultiplier);
247
+
248
+ // Cap at 100
249
+ totalScore = Math.min(totalScore, 100);
250
+ const level = scoreToLevel(totalScore);
251
+ const safe = totalScore < adjustedThreshold;
252
+
253
+ return {
254
+ safe,
255
+ score: totalScore,
256
+ level,
257
+ threats,
258
+ adversarialAnalysis,
259
+ multiTurnRisk,
260
+ sourceMultiplier,
261
+ adjustedThreshold,
262
+ };
263
+ }
264
+
265
+ // Legacy compatible wrapper
266
+ export function scanInputSimple(text: string, threshold = 50): ScanResult {
267
+ const result = scanInput(text, { threshold });
268
+ return {
269
+ safe: result.safe,
270
+ score: result.score,
271
+ level: result.level,
272
+ threats: result.threats,
273
+ };
274
+ }
275
+
276
+ // =============================================================================
277
+ // Runtime Guard (Leg 2: Tool Interception)
278
+ // =============================================================================
279
+
280
+ export interface ToolCallContext {
281
+ toolName: string;
282
+ params: Record<string, unknown>;
283
+ }
284
+
285
+ export interface RuntimeScanResult extends ScanResult {
286
+ shouldBlock: boolean;
287
+ requiresApproval: boolean;
288
+ reason?: string;
289
+ }
290
+
291
+ export function scanToolCall(
292
+ context: ToolCallContext,
293
+ config: GuardConfig["runtimeGuard"] = {}
294
+ ): RuntimeScanResult {
295
+ const {
296
+ dangerousTools = ["exec", "write", "edit"],
297
+ blockExfilUrls = true,
298
+ } = config;
299
+
300
+ const threats: Threat[] = [];
301
+ let totalScore = 0;
302
+ let shouldBlock = false;
303
+ let requiresApproval = false;
304
+ let reason: string | undefined;
305
+
306
+ const { toolName, params } = context;
307
+ const paramStr = JSON.stringify(params);
308
+
309
+ // Check if tool is in dangerous list
310
+ const isDangerousTool = dangerousTools.includes(toolName);
311
+
312
+ // Check tool-specific dangerous patterns
313
+ const toolPatterns = DANGEROUS_TOOL_PARAMS[toolName as keyof typeof DANGEROUS_TOOL_PARAMS];
314
+ if (toolPatterns) {
315
+ for (const pattern of toolPatterns) {
316
+ if (pattern.test(paramStr)) {
317
+ threats.push({
318
+ category: "dangerous_tool",
319
+ level: "high",
320
+ score: 60,
321
+ description: `Dangerous pattern in ${toolName} params`,
322
+ matched: paramStr.slice(0, 100),
323
+ });
324
+ totalScore += 60;
325
+ if (isDangerousTool) {
326
+ shouldBlock = true;
327
+ reason = `Dangerous command pattern detected in ${toolName}`;
328
+ }
329
+ }
330
+ }
331
+ }
332
+
333
+ // Check for exfiltration URLs
334
+ if (blockExfilUrls) {
335
+ const exfilPatterns = DANGEROUS_TOOL_PARAMS.web_fetch || [];
336
+ for (const pattern of exfilPatterns) {
337
+ if (pattern.test(paramStr)) {
338
+ threats.push({
339
+ category: "exfiltration",
340
+ level: "critical",
341
+ score: 80,
342
+ description: "Potential data exfiltration URL detected",
343
+ matched: paramStr.match(pattern)?.[0],
344
+ });
345
+ totalScore += 80;
346
+ shouldBlock = true;
347
+ reason = "Exfiltration URL detected";
348
+ }
349
+ }
350
+ }
351
+
352
+ // Flag dangerous tools for potential approval
353
+ if (isDangerousTool && threats.length > 0) {
354
+ requiresApproval = config.requireApproval ?? false;
355
+ }
356
+
357
+ totalScore = Math.min(totalScore, 100);
358
+ const level = scoreToLevel(totalScore);
359
+ const safe = totalScore < 50 && !shouldBlock;
360
+
361
+ return { safe, score: totalScore, level, threats, shouldBlock, requiresApproval, reason };
362
+ }
363
+
364
+ // =============================================================================
365
+ // Output Guard (Leg 3: Leak Prevention)
366
+ // =============================================================================
367
+
368
+ export interface OutputScanResult extends ScanResult {
369
+ redactedText: string;
370
+ leaksFound: Array<{ type: string; count: number }>;
371
+ }
372
+
373
+ export function scanOutput(
374
+ text: string,
375
+ config: GuardConfig["outputGuard"] = {}
376
+ ): OutputScanResult {
377
+ const {
378
+ redactCredentials = true,
379
+ redactPII = true,
380
+ canaryTokens = [],
381
+ } = config;
382
+
383
+ const threats: Threat[] = [];
384
+ let redactedText = text;
385
+ const leaksFound: Array<{ type: string; count: number }> = [];
386
+ let totalScore = 0;
387
+
388
+ // Check for canary tokens first (highest priority)
389
+ for (const canary of canaryTokens) {
390
+ if (text.includes(canary)) {
391
+ threats.push({
392
+ category: "canary_leak",
393
+ level: "critical",
394
+ score: 100,
395
+ description: "Canary token detected in output - potential prompt leak",
396
+ matched: canary,
397
+ });
398
+ totalScore = 100;
399
+ redactedText = redactedText.replaceAll(canary, "[CANARY_REDACTED]");
400
+ }
401
+ }
402
+
403
+ // Scan for credentials
404
+ if (redactCredentials) {
405
+ for (const { name, pattern } of CREDENTIAL_PATTERNS) {
406
+ const matches = text.match(pattern);
407
+ if (matches) {
408
+ const uniqueMatches = [...new Set(matches)];
409
+ leaksFound.push({ type: name, count: uniqueMatches.length });
410
+
411
+ for (const match of uniqueMatches) {
412
+ threats.push({
413
+ category: "credential_leak",
414
+ level: "critical",
415
+ score: 90,
416
+ description: `Credential detected: ${name}`,
417
+ matched: match.slice(0, 20) + "...",
418
+ redacted: `[${name.toUpperCase()}_REDACTED]`,
419
+ });
420
+ // Redact the credential
421
+ redactedText = redactedText.replaceAll(match, `[${name.toUpperCase()}_REDACTED]`);
422
+ }
423
+ totalScore = Math.max(totalScore, 90);
424
+ }
425
+ }
426
+ }
427
+
428
+ // Scan for PII
429
+ if (redactPII) {
430
+ for (const { name, pattern } of PII_PATTERNS) {
431
+ const matches = text.match(pattern);
432
+ if (matches) {
433
+ const uniqueMatches = [...new Set(matches)];
434
+ leaksFound.push({ type: name, count: uniqueMatches.length });
435
+
436
+ for (const match of uniqueMatches) {
437
+ threats.push({
438
+ category: "pii_leak",
439
+ level: "high",
440
+ score: 60,
441
+ description: `PII detected: ${name}`,
442
+ matched: match.slice(0, 10) + "...",
443
+ redacted: `[${name.toUpperCase()}_REDACTED]`,
444
+ });
445
+ redactedText = redactedText.replaceAll(match, `[${name.toUpperCase()}_REDACTED]`);
446
+ }
447
+ totalScore = Math.max(totalScore, 60);
448
+ }
449
+ }
450
+ }
451
+
452
+ const level = scoreToLevel(totalScore);
453
+ const safe = totalScore < 50;
454
+
455
+ return { safe, score: totalScore, level, threats, redactedText, leaksFound };
456
+ }