@highflame/policy 2.1.1 → 2.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,7 +11,7 @@ const OVERWATCH_BASELINE_DEFAULT_CEDAR = `// ===================================
11
11
  // Baseline Permit Policy (Default)
12
12
  // =============================================================================
13
13
  // Permits all actions by default. Threat-specific forbid policies override
14
- // this to block when YARA, Javelin, or other scanners detect issues.
14
+ // this to block when detection engines identify issues.
15
15
  //
16
16
  // Cedar is default-deny: without at least one permit rule, every request
17
17
  // is denied regardless of forbid rules. This baseline ensures the system
@@ -32,31 +32,340 @@ permit (
32
32
  resource
33
33
  );
34
34
  `;
35
+ const OVERWATCH_SEMANTIC_DEFAULT_CEDAR = `// =============================================================================
36
+ // Semantic Threat Detection Policy (Default)
37
+ // =============================================================================
38
+ // Detects and blocks prompt injection, jailbreak attempts, and high-severity
39
+ // AI security threats. Uses multi-layered detection:
40
+ //
41
+ // 1. Detection engine rule triggers (detected_threats) — pattern-based
42
+ // 2. ML classifier confidence scores (injection_confidence, jailbreak_confidence)
43
+ // 3. Threat severity aggregation (max_threat_severity, highest_severity)
44
+ // 4. Cross-action enforcement (prompts + tool calls + file operations)
45
+ //
46
+ // Compliance:
47
+ // OWASP LLM01 (Prompt Injection) — direct + indirect
48
+ // OWASP LLM02 (Insecure Output Handling) — response manipulation
49
+ // OWASP ASI01 (Agent Goal Hijack) — behavioral manipulation
50
+ // MITRE ATLAS AML.T0051 (LLM Prompt Injection)
51
+ // MITRE ATLAS AML.T0054 (LLM Jailbreak)
52
+ // NIST 800-53 SI-3 (Malicious Code Protection)
53
+ // NIST 800-53 SI-4 (Information System Monitoring)
54
+ //
55
+ // Category: semantic
56
+ // Namespace: Overwatch
57
+ // =============================================================================
58
+
59
+ // ---------------------------------------------------------------------------
60
+ // Section 1: Prompt Injection Detection
61
+ // Blocks direct prompt injection — adversarial input designed to override
62
+ // system instructions and hijack agent behavior.
63
+ // Ref: OWASP LLM01, MITRE AML.T0051, 62% of LLM apps vulnerable (2024)
64
+ // ---------------------------------------------------------------------------
65
+
66
+ // Block content with prompt injection patterns detected by rules
67
+ @id("semantic-block-injection")
68
+ @name("Block prompt injection")
69
+ @description("Block prompts and tool calls when detection engine rules identify prompt injection patterns. Catches instruction override, role assumption, delimiter injection, and other manipulation techniques in both user input and tool arguments (OWASP LLM01).")
70
+ @severity("critical")
71
+ @tags("injection,security,owasp-llm01,mitre-aml-t0051,baseline")
72
+ @reject_message("Content was blocked because prompt injection patterns were detected. This is a security measure to prevent manipulation of AI agent behavior (OWASP LLM01).")
73
+ forbid (
74
+ principal,
75
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
76
+ resource
77
+ )
78
+ when {
79
+ context has detected_threats && context.detected_threats.contains("prompt_injection")
80
+ };
81
+
82
+ // Block content with high ML injection confidence
83
+ @id("semantic-block-injection-score")
84
+ @name("Block high-confidence injection")
85
+ @description("Block content when the ML injection classifier confidence exceeds threshold (75/100). Catches novel injection techniques that evade pattern-based detection — polymorphic payloads, encoding tricks, and obfuscated instructions.")
86
+ @severity("critical")
87
+ @tags("injection,ml-classifier,security,owasp-llm01,mitre-aml-t0051")
88
+ @reject_message("Your content was blocked because the ML classifier detected prompt injection with high confidence. This appears to be an attempt to manipulate agent behavior.")
89
+ forbid (
90
+ principal,
91
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
92
+ resource
93
+ )
94
+ when {
95
+ context has injection_confidence && context.injection_confidence >= 75
96
+ };
97
+
98
+ // ---------------------------------------------------------------------------
99
+ // Section 2: Jailbreak Detection
100
+ // Blocks jailbreak attempts — adversarial input designed to bypass AI safety
101
+ // guardrails and elicit restricted outputs.
102
+ // Ref: OWASP LLM02, MITRE AML.T0054, DAN/JailbreakChat/etc.
103
+ // ---------------------------------------------------------------------------
104
+
105
+ // Block prompts with jailbreak attempts detected by rules
106
+ @id("semantic-block-jailbreak")
107
+ @name("Block jailbreak attempts")
108
+ @description("Block prompts when detection engine rules identify jailbreak patterns: DAN-style prompts, role-play exploits, safety bypass instructions, and constraint removal attempts (OWASP LLM02).")
109
+ @severity("critical")
110
+ @tags("jailbreak,bypass,security,owasp-llm02,mitre-aml-t0054,baseline")
111
+ @reject_message("Your prompt was blocked because jailbreak patterns were detected. This is a security measure to prevent circumvention of AI safety controls (OWASP LLM02).")
112
+ forbid (
113
+ principal,
114
+ action == Overwatch::Action::"process_prompt",
115
+ resource
116
+ )
117
+ when {
118
+ context has detected_threats && context.detected_threats.contains("jailbreak")
119
+ };
120
+
121
+ // Block prompts and tool calls with high ML jailbreak confidence
122
+ @id("semantic-block-jailbreak-score")
123
+ @name("Block high-confidence jailbreak")
124
+ @description("Block content when the ML jailbreak classifier confidence exceeds threshold (75/100). Catches sophisticated jailbreak techniques including multi-turn manipulation, encoded payloads, and novel prompt structures.")
125
+ @severity("critical")
126
+ @tags("jailbreak,ml-classifier,security,owasp-llm02,mitre-aml-t0054")
127
+ @reject_message("Your content was blocked because the ML classifier detected a jailbreak attempt with high confidence. This appears to be an attempt to bypass safety guardrails.")
128
+ forbid (
129
+ principal,
130
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
131
+ resource
132
+ )
133
+ when {
134
+ context has jailbreak_confidence && context.jailbreak_confidence >= 75
135
+ };
136
+
137
+ // ---------------------------------------------------------------------------
138
+ // Section 3: Threat Severity Aggregation
139
+ // Blocks based on aggregated threat severity from all detection engines.
140
+ // These act as catch-all rules for threats that don't match specific patterns.
141
+ // ---------------------------------------------------------------------------
142
+
143
+ // Block any content with critical severity threats
144
+ @id("semantic-block-critical")
145
+ @name("Block critical threats")
146
+ @description("Block all content when any detection engine reports critical severity. This is the ultimate catch-all — critical threats are blocked regardless of type or source.")
147
+ @severity("critical")
148
+ @tags("critical,baseline,security,catch-all")
149
+ @reject_message("Your content was blocked because security scanners detected a critical-severity threat. This content cannot be processed.")
150
+ forbid (
151
+ principal,
152
+ action,
153
+ resource
154
+ )
155
+ when {
156
+ context has highest_severity && context.highest_severity == "critical"
157
+ };
158
+
159
+ // Block prompts with high severity semantic threats
160
+ @id("semantic-block-high-severity")
161
+ @name("Block high severity threats")
162
+ @description("Block prompts when threat detection reports high severity (>= 3) in semantic categories. Catches threats that are individually below critical but collectively indicate adversarial intent.")
163
+ @severity("high")
164
+ @tags("semantic,severity,security,defense-in-depth")
165
+ @reject_message("Your prompt was blocked because security scanners detected high severity issues in the content. Review your prompt for manipulative or adversarial patterns.")
166
+ forbid (
167
+ principal,
168
+ action == Overwatch::Action::"process_prompt",
169
+ resource
170
+ )
171
+ when {
172
+ context has threat_categories && context has max_threat_severity &&
173
+ context.threat_categories.contains("semantic") &&
174
+ context.max_threat_severity >= 3
175
+ };
176
+
177
+ // Block tool calls with multiple concurrent threats
178
+ @id("semantic-block-multi-threat-tools")
179
+ @name("Block multi-threat tool calls")
180
+ @description("Block tool execution when multiple distinct threats are detected simultaneously (3+). Multiple concurrent threats in a tool call strongly indicate an adversarial attack chain.")
181
+ @severity("high")
182
+ @tags("multi-threat,tools,security,defense-in-depth")
183
+ @reject_message("Tool execution was blocked because multiple security threats were detected simultaneously. This pattern indicates a potential attack chain.")
184
+ forbid (
185
+ principal,
186
+ action == Overwatch::Action::"call_tool",
187
+ resource
188
+ )
189
+ when {
190
+ context has threat_count && context.threat_count >= 3
191
+ };
192
+ `;
193
+ const OVERWATCH_TRUST_SAFETY_DEFAULT_CEDAR = `// =============================================================================
194
+ // Content Safety Policy (Default)
195
+ // =============================================================================
196
+ // Detects and blocks violent, harmful, hateful, sexual, and profane content
197
+ // using trust/safety ML classification scores.
198
+ //
199
+ // The detection engine runs multiple ML classifiers (toxicity, content safety)
200
+ // and produces normalized scores (0-100) for each category. These policies
201
+ // enforce enterprise-appropriate content thresholds.
202
+ //
203
+ // Compliance:
204
+ // NIST 800-53 SI-4 (Information System Monitoring)
205
+ // EU AI Act Art. 52 (Transparency for AI Systems)
206
+ // OWASP LLM02 (Insecure Output Handling) — unsafe content generation
207
+ // ISO 42001 (AI Management System) — responsible AI use
208
+ //
209
+ // Category: trust_safety
210
+ // Namespace: Overwatch
211
+ // =============================================================================
212
+
213
+ // ---------------------------------------------------------------------------
214
+ // Section 1: Violence & Weapons
215
+ // Blocks content promoting, describing, or instructing violence and weapons.
216
+ // ---------------------------------------------------------------------------
217
+
218
+ // Block content with high violence score
219
+ @id("ts-block-violence")
220
+ @name("Block violent content")
221
+ @description("Block content when the ML violence detection score exceeds threshold (80/100). Catches graphic violence descriptions, instructions for causing harm, and violent threat language.")
222
+ @severity("critical")
223
+ @tags("violence,content-safety,trust-safety,nist-si-4,iso-42001")
224
+ @reject_message("Content blocked: violent content detected. AI agents in enterprise environments must not process or generate violent content. Please rephrase without violence-related language.")
225
+ forbid (
226
+ principal,
227
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
228
+ resource
229
+ )
230
+ when {
231
+ context has violence_score && context.violence_score >= 80
232
+ };
233
+
234
+ // Block content with weapons references
235
+ @id("ts-block-weapons")
236
+ @name("Block weapons content")
237
+ @description("Block content when the ML weapons detection score exceeds threshold (80/100). Catches weapon manufacturing instructions, procurement guidance, and detailed weapon specifications.")
238
+ @severity("critical")
239
+ @tags("weapons,content-safety,trust-safety,nist-si-4")
240
+ @reject_message("Content blocked: weapons-related content detected. AI agents must not process weapons manufacturing, procurement, or detailed specification content.")
241
+ forbid (
242
+ principal,
243
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
244
+ resource
245
+ )
246
+ when {
247
+ context has weapons_score && context.weapons_score >= 80
248
+ };
249
+
250
+ // ---------------------------------------------------------------------------
251
+ // Section 2: Hate Speech & Discrimination
252
+ // Blocks hateful, discriminatory, and dehumanizing content.
253
+ // Lower threshold (75) — enterprises have zero tolerance for hate speech.
254
+ // ---------------------------------------------------------------------------
255
+
256
+ // Block hate speech content
257
+ @id("ts-block-hate-speech")
258
+ @name("Block hate speech")
259
+ @description("Block content when the ML hate speech detection score exceeds threshold (75/100). Lower threshold than other categories because enterprises have zero tolerance for discriminatory content. Catches slurs, dehumanizing language, and targeted harassment.")
260
+ @severity("critical")
261
+ @tags("hate-speech,content-safety,trust-safety,nist-si-4,eu-ai-act")
262
+ @reject_message("Content blocked: hate speech or discriminatory content detected. AI agents in enterprise environments must not process or generate hateful, discriminatory, or dehumanizing content.")
263
+ forbid (
264
+ principal,
265
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
266
+ resource
267
+ )
268
+ when {
269
+ context has hate_speech_score && context.hate_speech_score >= 75
270
+ };
271
+
272
+ // ---------------------------------------------------------------------------
273
+ // Section 3: Criminal Content
274
+ // Blocks content related to criminal activities and illegal operations.
275
+ // ---------------------------------------------------------------------------
276
+
277
+ // Block criminal content
278
+ @id("ts-block-crime")
279
+ @name("Block criminal content")
280
+ @description("Block content when the ML criminal activity detection score exceeds threshold (80/100). Catches illegal activity instructions, fraud techniques, and content promoting criminal behavior.")
281
+ @severity("high")
282
+ @tags("crime,content-safety,trust-safety,nist-si-4")
283
+ @reject_message("Content blocked: criminal activity content detected. AI agents must not process content related to illegal activities, fraud, or other criminal behavior.")
284
+ forbid (
285
+ principal,
286
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
287
+ resource
288
+ )
289
+ when {
290
+ context has crime_score && context.crime_score >= 80
291
+ };
292
+
293
+ // ---------------------------------------------------------------------------
294
+ // Section 4: Sexual Content
295
+ // Blocks sexually explicit or inappropriate content.
296
+ // ---------------------------------------------------------------------------
297
+
298
+ // Block sexual content
299
+ @id("ts-block-sexual")
300
+ @name("Block sexual content")
301
+ @description("Block content when the ML sexual content detection score exceeds threshold (80/100). Ensures enterprise AI agents do not process or generate sexually explicit material.")
302
+ @severity("high")
303
+ @tags("sexual,content-safety,trust-safety,eu-ai-act,iso-42001")
304
+ @reject_message("Content blocked: sexual content detected. AI agents in enterprise environments must not process sexually explicit material.")
305
+ forbid (
306
+ principal,
307
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
308
+ resource
309
+ )
310
+ when {
311
+ context has sexual_score && context.sexual_score >= 80
312
+ };
313
+
314
+ // ---------------------------------------------------------------------------
315
+ // Section 5: Profanity
316
+ // Blocks excessive profanity in enterprise environments.
317
+ // Higher threshold (90) — allows normal expression while blocking abuse.
318
+ // ---------------------------------------------------------------------------
319
+
320
+ // Block excessive profanity
321
+ @id("ts-block-profanity")
322
+ @name("Block profanity")
323
+ @description("Block content when the ML profanity detection score exceeds threshold (90/100). Higher threshold than other safety categories — allows normal expression while blocking abusive or harassing language patterns.")
324
+ @severity("medium")
325
+ @tags("profanity,content-safety,trust-safety")
326
+ @reject_message("Content blocked: excessive profanity detected. Please rephrase in a professional manner appropriate for enterprise AI interactions.")
327
+ forbid (
328
+ principal,
329
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
330
+ resource
331
+ )
332
+ when {
333
+ context has profanity_score && context.profanity_score >= 90
334
+ };
335
+ `;
35
336
  const OVERWATCH_SECRETS_DEFAULT_CEDAR = `// =============================================================================
36
337
  // Secrets Detection Policy (Default)
37
338
  // =============================================================================
38
339
  // Detects and blocks credential leakage across prompts, tool calls, file
39
- // operations, and AI response content. Combines YARA-based threat detection
40
- // with pattern matching for known credential formats.
340
+ // operations, and AI response content. Uses multi-layered detection:
41
341
  //
42
- // Defense layers:
43
- // 1. YARA scanner detection (contains_secrets, yara_threats)
44
- // 2. Sensitive file path blocking (.env files)
45
- // 3. Response content pattern matching (AWS, GitHub, SSH keys)
342
+ // 1. Detection engine boolean (contains_secrets) — fastest, broadest catch
343
+ // 2. Granular secret type matching (secret_types) — type-specific blocking
344
+ // 3. Detection rule pattern matching (detected_threats) — named rule triggers
345
+ // 4. Sensitive file path blocking (.env, credentials files)
346
+ // 5. Response content pattern matching (defense-in-depth for AI outputs)
347
+ //
348
+ // Compliance:
349
+ // NIST 800-53 SC-28 (Protection of Information at Rest)
350
+ // NIST 800-53 IA-5 (Authenticator Management)
351
+ // OWASP LLM07 (Insecure Plugin Design) — secrets in tool args
352
+ // MITRE ATT&CK T1552 (Unsecured Credentials)
353
+ // MITRE ATT&CK T1555 (Credentials from Password Stores)
354
+ // CIS Benchmark 1.4 (Secrets Management)
46
355
  //
47
- // Compliance: NIST 800-53 SC-28, IA-5 | OWASP A02 | MITRE T1552, T1555
48
356
  // Category: secrets
49
357
  // Namespace: Overwatch
50
358
  // =============================================================================
51
359
 
52
360
  // ---------------------------------------------------------------------------
53
- // Section 1: YARA-Based Secret Detection
361
+ // Section 1: Detection Engine — Primary Secret Detection
362
+ // These fire when the detection pipeline identifies secrets in any content.
54
363
  // ---------------------------------------------------------------------------
55
364
 
56
365
  // Block prompts containing detected secrets
57
366
  @id("secrets-block-prompts")
58
367
  @name("Block prompts with secrets")
59
- @description("Block prompts when YARA scanners detect API keys, tokens, or credential patterns")
368
+ @description("Block prompts when detection engines identify API keys, tokens, or credential patterns. First line of defense against accidental secret exposure in user input.")
60
369
  @severity("critical")
61
370
  @tags("secrets,credentials,prompts,nist-sc-28,nist-ia-5")
62
371
  @reject_message("Your prompt was blocked because it contains detected secrets such as API keys, tokens, or credentials. Remove all secrets before resubmitting.")
@@ -66,15 +375,15 @@ forbid (
66
375
  resource
67
376
  )
68
377
  when {
69
- context has contains_secrets && context.contains_secrets == true
378
+ context has contains_secrets && context.contains_secrets
70
379
  };
71
380
 
72
381
  // Block file reads and tool calls when secrets are detected
73
382
  @id("secrets-block-reads-and-tools")
74
383
  @name("Block file reads and tool calls with secrets")
75
- @description("Prevent file reads and tool execution when secrets or credentials are detected in content")
384
+ @description("Prevent file reads and tool execution when secrets or credentials are detected in content. Blocks exfiltration of secrets via file operations and tool arguments.")
76
385
  @severity("high")
77
- @tags("secrets,file-access,tools,credentials,nist-sc-28")
386
+ @tags("secrets,file-access,tools,credentials,nist-sc-28,mitre-t1552")
78
387
  @reject_message("This operation was blocked because secrets or credentials were detected in the content. File reads and tool calls are restricted when credential exposure is identified.")
79
388
  forbid (
80
389
  principal,
@@ -82,661 +391,1036 @@ forbid (
82
391
  resource
83
392
  )
84
393
  when {
85
- context has contains_secrets && context.contains_secrets == true
394
+ context has contains_secrets && context.contains_secrets
86
395
  };
87
396
 
88
- // ---------------------------------------------------------------------------
89
- // Section 2: Sensitive File Path Protection
90
- // ---------------------------------------------------------------------------
91
-
92
- // Block .env file access across all operations
93
- @id("secrets-block-env-files")
94
- @name("Block .env file access")
95
- @description("Block access to .env files that commonly contain secrets, API keys, and database credentials")
96
- @severity("high")
97
- @tags("secrets,env-files,config,nist-sc-28,mitre-t1552")
98
- @reject_message("Access to .env files is blocked because they commonly contain secrets, API keys, and database credentials. Use a secrets manager instead of .env files.")
397
+ // Block file writes containing secrets
398
+ @id("secrets-block-file-writes")
399
+ @name("Block file writes with secrets")
400
+ @description("Prevent writing files that contain secrets. Stops credential persistence to disk where they could be committed to version control or accessed by other tools.")
401
+ @severity("critical")
402
+ @tags("secrets,file-write,credentials,nist-sc-28,cis-1.4")
403
+ @reject_message("File write was blocked because secrets or credentials were detected in the content. Credentials should never be written to files — use a secrets manager or environment variables.")
99
404
  forbid (
100
405
  principal,
101
- action in [Overwatch::Action::"read_file", Overwatch::Action::"write_file", Overwatch::Action::"call_tool"],
406
+ action == Overwatch::Action::"write_file",
102
407
  resource
103
408
  )
104
409
  when {
105
- context has path && context.path like "*.env*"
410
+ context has contains_secrets && context.contains_secrets
106
411
  };
107
412
 
108
413
  // ---------------------------------------------------------------------------
109
- // Section 3: Response Content Pattern Matching
110
- // Scans AI responses for known credential formats as defense-in-depth.
414
+ // Section 2: Granular Secret Type Blocking
415
+ // Blocks specific high-risk credential types identified by the detection
416
+ // engine's pattern-matching (e.g., AWS keys, GitHub tokens, SSH keys).
111
417
  // ---------------------------------------------------------------------------
112
418
 
113
- // Block responses containing AWS access keys (AKIA prefix)
114
- @id("secrets-block-aws-keys")
115
- @name("Block AWS access keys in responses")
116
- @description("Detect and block AWS access key IDs (AKIA prefix) in AI responses to prevent credential exfiltration")
419
+ // Block high-risk secret types across all actions
420
+ @id("secrets-block-high-risk-types")
421
+ @name("Block high-risk credential types")
422
+ @description("Block content containing cloud provider keys (AWS, GCP, Azure), GitHub tokens, SSH private keys, or database connection strings. These credential types pose the highest exfiltration risk.")
117
423
  @severity("critical")
118
- @tags("secrets,aws,credentials,response-scan,nist-ia-5,mitre-t1552")
119
- @reject_message("This response was blocked because an AWS access key ID (AKIA prefix) was detected. AWS credentials must never be exposed in AI responses.")
424
+ @tags("secrets,aws,github,ssh,cloud,nist-ia-5,mitre-t1552")
425
+ @reject_message("Content blocked: high-risk credentials detected (cloud provider keys, GitHub tokens, SSH keys, or database credentials). Use a secrets manager — never pass credentials through AI agents.")
120
426
  forbid (
121
427
  principal,
122
428
  action,
123
429
  resource
124
430
  )
125
431
  when {
126
- context has response_content &&
127
- context.response_content like "*AKIA*"
432
+ context has secret_types &&
433
+ (context.secret_types.contains("aws_access_key") ||
434
+ context.secret_types.contains("aws_secret_key") ||
435
+ context.secret_types.contains("gcp_service_account") ||
436
+ context.secret_types.contains("azure_client_secret") ||
437
+ context.secret_types.contains("github_token") ||
438
+ context.secret_types.contains("github_pat") ||
439
+ context.secret_types.contains("ssh_private_key") ||
440
+ context.secret_types.contains("database_url"))
128
441
  };
129
442
 
130
- // Block responses containing AWS secret keys
131
- @id("secrets-block-aws-secrets")
132
- @name("Block AWS secret keys in responses")
133
- @description("Detect and block AWS secret access keys in AI responses")
134
- @severity("critical")
135
- @tags("secrets,aws,credentials,response-scan,nist-ia-5")
136
- @reject_message("This response was blocked because an AWS secret access key was detected. AWS credentials must never be exposed in AI responses.")
443
+ // Block API keys and bearer tokens across all actions
444
+ @id("secrets-block-api-keys")
445
+ @name("Block API keys and bearer tokens")
446
+ @description("Block content containing generic API keys, bearer tokens, JWT tokens, and OAuth credentials. These are the most commonly leaked credential types in AI agent interactions.")
447
+ @severity("high")
448
+ @tags("secrets,api-key,bearer,jwt,oauth,nist-ia-5")
449
+ @reject_message("Content blocked: API keys, bearer tokens, or OAuth credentials detected. These must never be passed through AI agent prompts or tool calls.")
137
450
  forbid (
138
451
  principal,
139
452
  action,
140
453
  resource
141
454
  )
142
455
  when {
143
- context has response_content &&
144
- (context.response_content like "*AWS_SECRET_ACCESS_KEY*" ||
145
- context.response_content like "*aws_secret_access_key*")
456
+ context has secret_types &&
457
+ (context.secret_types.contains("api_key") ||
458
+ context.secret_types.contains("bearer_token") ||
459
+ context.secret_types.contains("jwt_token") ||
460
+ context.secret_types.contains("oauth_token") ||
461
+ context.secret_types.contains("oauth_secret"))
146
462
  };
147
463
 
148
- // Block responses containing GitHub tokens
149
- @id("secrets-block-github-tokens")
150
- @name("Block GitHub tokens in responses")
151
- @description("Detect and block GitHub personal access tokens (ghp_), fine-grained tokens (github_pat_), and app tokens (ghs_)")
464
+ // Block when multiple secrets are detected (bulk exposure)
465
+ @id("secrets-block-bulk-exposure")
466
+ @name("Block bulk secret exposure")
467
+ @description("Block content when 3 or more distinct secrets are found. Multiple secrets in a single request indicate a configuration dump, .env file paste, or credential harvesting attempt.")
152
468
  @severity("critical")
153
- @tags("secrets,github,tokens,response-scan,mitre-t1552")
154
- @reject_message("This response was blocked because a GitHub token (personal access token, fine-grained token, or app token) was detected. GitHub tokens must never be exposed in AI responses.")
469
+ @tags("secrets,bulk,data-exfiltration,nist-sc-28,mitre-t1552")
470
+ @reject_message("Content blocked: multiple credentials detected (3+). This appears to be a bulk credential exposure — configuration dumps and credential lists must never be passed through AI agents.")
155
471
  forbid (
156
472
  principal,
157
473
  action,
158
474
  resource
159
475
  )
160
476
  when {
161
- context has response_content &&
162
- (context.response_content like "*ghp_*" ||
163
- context.response_content like "*github_pat_*" ||
164
- context.response_content like "*ghs_*")
477
+ context has secret_count && context.secret_count >= 3
165
478
  };
166
479
 
167
- // Block responses containing SSH/RSA private keys
168
- @id("secrets-block-private-keys")
169
- @name("Block private keys in responses")
170
- @description("Detect and block SSH, RSA, and OpenSSH private keys in AI responses")
480
+ // ---------------------------------------------------------------------------
481
+ // Section 3: Detection Rule Pattern Matching
482
+ // Catches specific named detection rules that fire for credential exposure.
483
+ // ---------------------------------------------------------------------------
484
+
485
+ // Block content flagged by detection engine credential rules
486
+ @id("secrets-block-detected-credentials")
487
+ @name("Block detected credential patterns")
488
+ @description("Block content flagged by detection engine rules for credential exposure, API key leaks, JWT tokens, and bearer tokens. Defense-in-depth behind contains_secrets.")
171
489
  @severity("critical")
172
- @tags("secrets,ssh,private-keys,response-scan,nist-sc-28,mitre-t1552")
173
- @reject_message("This response was blocked because a private key (SSH, RSA, or OpenSSH) was detected. Private keys must never be exposed in AI responses.")
490
+ @tags("secrets,credentials,jwt,bearer,nist-ia-5,mitre-t1552")
491
+ @reject_message("Content blocked: detection engines identified credential patterns including secret exposure, credential leaks, API keys, or token exposure.")
174
492
  forbid (
175
493
  principal,
176
494
  action,
177
495
  resource
178
496
  )
179
497
  when {
180
- context has response_content &&
181
- (context.response_content like "*-----BEGIN PRIVATE KEY-----*" ||
182
- context.response_content like "*-----BEGIN RSA PRIVATE KEY-----*" ||
183
- context.response_content like "*-----BEGIN OPENSSH PRIVATE KEY-----*")
498
+ context has detected_threats &&
499
+ (context.detected_threats.contains("secret_exposure") ||
500
+ context.detected_threats.contains("credential_leak") ||
501
+ context.detected_threats.contains("api_key_exposure") ||
502
+ context.detected_threats.contains("jwt_token_exposure") ||
503
+ context.detected_threats.contains("bearer_token_leak"))
184
504
  };
185
505
 
186
506
  // ---------------------------------------------------------------------------
187
- // Section 4: YARA Credential Pattern Detection
188
- // Catches credential types identified by YARA rule scanning.
507
+ // Section 4: Sensitive File Path Protection
508
+ // Blocks access to files that commonly contain secrets.
189
509
  // ---------------------------------------------------------------------------
190
510
 
191
- // Block YARA-detected credential and token patterns
192
- @id("secrets-block-yara-credentials")
193
- @name("Block YARA-detected credential patterns")
194
- @description("Block content flagged by YARA rules for credential exposure, API key leaks, JWT tokens, and bearer tokens")
195
- @severity("critical")
196
- @tags("secrets,yara,credentials,jwt,bearer,nist-ia-5")
197
- @reject_message("This content was blocked because YARA scanning detected credential patterns including secret exposure, credential leaks, API keys, JWT tokens, or bearer tokens.")
511
+ // Block .env file access across all operations
512
+ @id("secrets-block-env-files")
513
+ @name("Block .env file access")
514
+ @description("Block access to .env files that commonly contain secrets, API keys, and database credentials. Environment files are the #1 source of accidental credential exposure in development workflows.")
515
+ @severity("high")
516
+ @tags("secrets,env-files,config,nist-sc-28,mitre-t1552,cis-1.4")
517
+ @reject_message("Access to .env files is blocked because they commonly contain secrets, API keys, and database credentials. Use a secrets manager instead of .env files.")
198
518
  forbid (
199
519
  principal,
200
- action,
520
+ action in [Overwatch::Action::"read_file", Overwatch::Action::"write_file", Overwatch::Action::"call_tool"],
201
521
  resource
202
522
  )
203
523
  when {
204
- context has yara_threats &&
205
- (context.yara_threats.contains("secret_exposure") ||
206
- context.yara_threats.contains("credential_leak") ||
207
- context.yara_threats.contains("api_key_exposure") ||
208
- context.yara_threats.contains("jwt_token_exposure") ||
209
- context.yara_threats.contains("bearer_token_leak"))
524
+ context has path && context.path like "*.env*"
210
525
  };
526
+
527
+ // Block access to known credential files
528
+ @id("secrets-block-credential-files")
529
+ @name("Block credential file access")
530
+ @description("Block access to common credential files: .netrc, .npmrc, .pypirc, credentials, config files in cloud provider directories. These files often contain hardcoded tokens and passwords.")
531
+ @severity("high")
532
+ @tags("secrets,credential-files,config,nist-sc-28,mitre-t1555")
533
+ @reject_message("Access to this credential file is blocked. Files like .netrc, .npmrc, .pypirc, and cloud provider config files commonly contain hardcoded credentials.")
534
+ forbid (
535
+ principal,
536
+ action in [Overwatch::Action::"read_file", Overwatch::Action::"write_file", Overwatch::Action::"call_tool"],
537
+ resource
538
+ )
539
+ when {
540
+ context has path &&
541
+ (context.path like "*/.netrc" ||
542
+ context.path like "*/.npmrc" ||
543
+ context.path like "*/.pypirc" ||
544
+ context.path like "*/.docker/config.json" ||
545
+ context.path like "*/.kube/config" ||
546
+ context.path like "*/.config/gcloud/*" ||
547
+ context.path like "*/credentials.json" ||
548
+ context.path like "*/service-account*.json")
549
+ };
550
+
551
+ // NOTE: Response content secret detection (AWS keys, GitHub tokens, private keys
552
+ // in AI outputs) is handled by the detection engine's secrets scanner, which
553
+ // analyzes all content types including tool responses. The contains_secrets and
554
+ // secret_types rules above cover this case.
211
555
  `;
212
556
  const OVERWATCH_PII_DEFAULT_CEDAR = `// =============================================================================
213
557
  // PII Detection Policy (Default)
214
558
  // =============================================================================
215
- // Detects and blocks personally identifiable information including credit card
216
- // numbers, Social Security Numbers, and other PII patterns across prompts
217
- // and tool calls.
559
+ // Detects and blocks personally identifiable information across prompts, tool
560
+ // calls, file operations, and AI responses. Uses multi-layered detection:
561
+ //
562
+ // 1. PII boolean flag (pii_detected) — broadest catch from detection engine
563
+ // 2. Granular PII type matching (pii_types) — type-specific blocking
564
+ // 3. ML classifier confidence (pii_confidence) — catches novel PII patterns
565
+ // 4. Detection rule triggers (detected_threats) — named rule matches
566
+ // 5. File operation PII blocking — prevents PII persistence to disk
567
+ //
568
+ // Compliance:
569
+ // PCI DSS 3.4, 4.1 (Payment Card Data)
570
+ // GDPR Art. 32 (Security of Processing)
571
+ // HIPAA §164.312 (Technical Safeguards)
572
+ // NIST 800-53 SI-4 (Information System Monitoring)
573
+ // CCPA §1798.150 (Data Protection)
574
+ // OWASP LLM06:2023 (Sensitive Information Disclosure; LLM02 in the 2025 list)
218
575
  //
219
- // Compliance: PCI DSS 3.4, 4.1 | NIST 800-53 SI-4 | GDPR Art. 32
220
576
  // Category: pii
221
577
  // Namespace: Overwatch
222
578
  // =============================================================================
223
579
 
224
- // Block prompts containing credit card patterns
580
+ // ---------------------------------------------------------------------------
581
+ // Section 1: Detection Engine — Primary PII Detection
582
+ // Fires when the detection pipeline identifies PII in any content.
583
+ // ---------------------------------------------------------------------------
584
+
585
+ // Block prompts containing any detected PII
586
+ @id("pii-block-any-detected")
587
+ @name("Block prompts with PII")
588
+ @description("Block prompts when the detection engine identifies any PII patterns. This is the broadest PII catch — fires before type-specific rules.")
589
+ @severity("critical")
590
+ @tags("pii,privacy,data-protection,gdpr-art-32,owasp-llm06")
591
+ @reject_message("Your prompt was blocked because personally identifiable information was detected. Remove all PII (names, addresses, SSNs, credit cards, etc.) before resubmitting.")
592
+ forbid (
593
+ principal,
594
+ action == Overwatch::Action::"process_prompt",
595
+ resource
596
+ )
597
+ when {
598
+ context has pii_detected && context.pii_detected
599
+ };
600
+
601
+ // Block tool calls containing PII
602
+ @id("pii-block-tool-calls")
603
+ @name("Block tool calls with PII")
604
+ @description("Prevent tool execution when PII patterns are detected in tool arguments or content. Stops PII from being passed to external tools, MCP servers, or shell commands.")
605
+ @severity("high")
606
+ @tags("pii,tools,data-protection,owasp-llm06")
607
+ @reject_message("Tool execution was blocked because personally identifiable information was detected in the content. PII must be removed before tool calls are permitted.")
608
+ forbid (
609
+ principal,
610
+ action == Overwatch::Action::"call_tool",
611
+ resource
612
+ )
613
+ when {
614
+ context has pii_detected && context.pii_detected
615
+ };
616
+
617
+ // ---------------------------------------------------------------------------
618
+ // Section 2: Granular PII Type Blocking
619
+ // Blocks specific PII types based on regulatory requirements.
620
+ // ---------------------------------------------------------------------------
621
+
622
+ // Block credit card numbers (PCI DSS compliance)
225
623
  @id("pii-block-credit-cards")
226
624
  @name("Block credit card numbers")
227
- @description("Detect and block content containing credit card number patterns (PCI DSS compliance)")
625
+ @description("Detect and block content containing credit card number patterns. PCI DSS 3.4 requires that PANs are rendered unreadable — AI agents must never process raw card numbers.")
228
626
  @severity("critical")
229
- @tags("pci,credit-card,payment,compliance,pci-dss-3.4")
230
- @reject_message("Your prompt was blocked because credit card number patterns were detected. Sharing payment card data violates PCI DSS requirements.")
627
+ @tags("pci,credit-card,payment,compliance,pci-dss-3.4,pci-dss-4.1")
628
+ @reject_message("Content blocked: credit card number patterns detected. Sharing payment card data through AI agents violates PCI DSS requirements. Use tokenized card references instead.")
231
629
  forbid (
232
630
  principal,
233
- action == Overwatch::Action::"process_prompt",
631
+ action,
234
632
  resource
235
633
  )
236
634
  when {
237
- context has yara_threats && context.yara_threats.contains("credit_card")
635
+ (context has pii_types && context.pii_types.contains("credit_card")) ||
636
+ (context has detected_threats && context.detected_threats.contains("credit_card"))
238
637
  };
239
638
 
240
- // Block prompts containing SSN patterns
639
+ // Block Social Security Numbers
241
640
  @id("pii-block-ssn")
242
641
  @name("Block Social Security Numbers")
243
- @description("Detect and block content containing SSN patterns (XXX-XX-XXXX format)")
642
+ @description("Detect and block content containing SSN patterns (XXX-XX-XXXX and variants). SSNs are high-value identity theft targets — exposure through AI agents is a critical privacy violation.")
244
643
  @severity("critical")
245
- @tags("ssn,identity,privacy,compliance")
246
- @reject_message("Your prompt was blocked because Social Security Number patterns (XXX-XX-XXXX) were detected. SSNs are protected personal identifiers that must not be shared.")
644
+ @tags("ssn,identity,privacy,compliance,nist-si-4")
645
+ @reject_message("Content blocked: Social Security Number patterns detected. SSNs are protected personal identifiers that must never be shared through AI agents.")
247
646
  forbid (
248
647
  principal,
249
- action == Overwatch::Action::"process_prompt",
648
+ action,
250
649
  resource
251
650
  )
252
651
  when {
253
- context has yara_threats && context.yara_threats.contains("ssn")
652
+ (context has pii_types && context.pii_types.contains("ssn")) ||
653
+ (context has detected_threats && context.detected_threats.contains("ssn"))
254
654
  };
255
655
 
256
- // Block prompts with generic PII threats detected
257
- @id("pii-block-generic")
258
- @name("Block detected PII content")
259
- @description("Block content when PII-related threat categories are detected by YARA or Javelin scanners")
260
- @severity("high")
261
- @tags("pii,privacy,data-protection,gdpr")
262
- @reject_message("Your prompt was blocked because personally identifiable information was detected by threat scanners. Remove all PII before resubmitting.")
656
+ // Block medical/health records (HIPAA compliance)
657
+ @id("pii-block-health-data")
658
+ @name("Block health information")
659
+ @description("Block content containing medical record numbers, health insurance IDs, or other Protected Health Information (PHI). HIPAA §164.312 requires technical safeguards for PHI.")
660
+ @severity("critical")
661
+ @tags("phi,hipaa,health,medical,compliance,hipaa-164.312")
662
+ @reject_message("Content blocked: Protected Health Information (PHI) detected. Health data must not be processed through AI agents per HIPAA requirements.")
263
663
  forbid (
264
664
  principal,
265
- action == Overwatch::Action::"process_prompt",
665
+ action,
266
666
  resource
267
667
  )
268
668
  when {
269
- context has threat_categories && context.threat_categories.contains("pii")
669
+ context has pii_types &&
670
+ (context.pii_types.contains("medical_record") ||
671
+ context.pii_types.contains("health_insurance_id"))
672
+ };
673
+
674
+ // Block bulk PII exposure (3 or more PII matches in one request)
675
+ @id("pii-block-bulk-exposure")
676
+ @name("Block bulk PII exposure")
677
+ @description("Block content containing 3 or more PII matches. Multiple PII items in a single request indicates a data dump, CSV paste, or data exfiltration attempt.")
678
+ @severity("critical")
679
+ @tags("pii,bulk,data-exfiltration,gdpr-art-32,ccpa")
680
+ @reject_message("Content blocked: multiple PII items detected (3+). Bulk personal data must never be processed through AI agents. Use data masking or tokenization for batch operations.")
681
+ forbid (
682
+ principal,
683
+ action,
684
+ resource
685
+ )
686
+ when {
687
+ context has pii_count && context.pii_count >= 3
270
688
  };
271
689
 
272
- // Block prompts with high PII confidence score
690
+ // ---------------------------------------------------------------------------
691
+ // Section 3: ML Classifier — Novel PII Detection
692
+ // Catches PII patterns that escape regex detection using ML classifiers.
693
+ // ---------------------------------------------------------------------------
694
+
695
+ // Block content with high PII classifier confidence
273
696
  @id("pii-block-high-confidence")
274
697
  @name("Block high-confidence PII")
275
- @description("Block content when PII classifier confidence exceeds threshold (80/100)")
698
+ @description("Block content when the ML PII classifier confidence exceeds threshold (80/100). Catches novel PII patterns including names, addresses, and identifiers that regex rules may miss.")
276
699
  @severity("critical")
277
- @tags("pii,confidence,privacy,compliance")
278
- @reject_message("Your content was blocked because personally identifiable information was detected with high confidence.")
700
+ @tags("pii,confidence,privacy,compliance,ml-classifier")
701
+ @reject_message("Content blocked: the ML classifier detected personally identifiable information with high confidence. Even if specific PII types aren't identified, the content appears to contain personal data.")
279
702
  forbid (
280
703
  principal,
281
- action == Overwatch::Action::"process_prompt",
704
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
282
705
  resource
283
706
  )
284
707
  when {
285
708
  context has pii_confidence && context.pii_confidence >= 80
286
709
  };
287
710
 
288
- // Block PII leakage via tool calls
289
- @id("pii-block-tool-calls")
290
- @name("Block tool calls with PII")
291
- @description("Prevent tool execution when PII patterns are detected in content")
711
+ // ---------------------------------------------------------------------------
712
+ // Section 4: Threat Category Matching
713
+ // Catches PII flagged at the threat-categorization layer (threat_categories).
714
+ // ---------------------------------------------------------------------------
715
+
716
+ // Block content with PII threat category
717
+ @id("pii-block-threat-category")
718
+ @name("Block PII threat category")
719
+ @description("Block content when threat categorization identifies PII. Defense-in-depth behind the pii_detected boolean — catches cases where PII is flagged at the threat aggregation layer.")
292
720
  @severity("high")
293
- @tags("pii,tools,data-protection")
294
- @reject_message("Tool execution was blocked because personally identifiable information was detected in the content. PII must be removed before tool calls are permitted.")
721
+ @tags("pii,privacy,data-protection,gdpr")
722
+ @reject_message("Content blocked: threat scanners detected personally identifiable information. Remove all PII before resubmitting.")
295
723
  forbid (
296
724
  principal,
297
- action == Overwatch::Action::"call_tool",
725
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
298
726
  resource
299
727
  )
300
728
  when {
301
729
  context has threat_categories && context.threat_categories.contains("pii")
302
730
  };
731
+
732
+ // ---------------------------------------------------------------------------
733
+ // Section 5: File Operation PII Blocking
734
+ // Prevents PII from being read from or written to disk.
735
+ // ---------------------------------------------------------------------------
736
+
737
+ // Block file operations containing PII
738
+ @id("pii-block-file-ops")
739
+ @name("Block file operations with PII")
740
+ @description("Block file reads and writes when PII is detected. Prevents agents from reading files containing personal data and from writing PII to new files where it could persist or be version-controlled.")
741
+ @severity("high")
742
+ @tags("pii,file-ops,data-protection,gdpr-art-32,nist-si-4")
743
+ @reject_message("File operation blocked: personally identifiable information was detected. Files containing PII must not be read or written through AI agents.")
744
+ forbid (
745
+ principal,
746
+ action in [Overwatch::Action::"read_file", Overwatch::Action::"write_file"],
747
+ resource
748
+ )
749
+ when {
750
+ context has pii_detected && context.pii_detected
751
+ };
303
752
  `;
304
- const OVERWATCH_SEMANTIC_DEFAULT_CEDAR = `// =============================================================================
305
- // Semantic Threat Detection Policy (Default)
753
+ const OVERWATCH_TOOLS_DEFAULT_CEDAR = `// =============================================================================
754
+ // Tool Permissioning Policy (Default)
306
755
  // =============================================================================
307
- // Detects and blocks prompt injection, jailbreak attempts, and high-severity
308
- // AI security threats using YARA and Javelin scanner results. Provides
309
- // defense-in-depth across both prompts and tool calls.
756
+ // Controls access to IDE tools, shell execution, file system paths, and MCP
757
+ // operations. Enforces least-privilege for agent tool usage with multi-layered
758
+ // controls:
310
759
  //
311
- // Compliance: NIST 800-53 SI-3, SI-4 | OWASP LLM Top 10: LLM01, LLM02
312
- // MITRE ATLAS: AML.T0051 (LLM Prompt Injection)
313
- // Category: semantic
760
+ // 1. Dangerous tool blocking (shell, command execution, destructive ops)
761
+ // 2. Sensitive system path protection (credentials, system dirs)
762
+ // 3. Tool risk scoring (computed risk assessment)
763
+ // 4. Tool category enforcement (safe/sensitive/dangerous classification)
764
+ // 5. Threat-based tool blocking (threat severity gates)
765
+ // 6. Command injection detection (reverse shells, code execution, etc.)
766
+ //
767
+ // Compliance:
768
+ // NIST 800-53 AC-3 (Access Enforcement)
769
+ // NIST 800-53 AC-6 (Least Privilege)
770
+ // NIST 800-53 CM-7 (Least Functionality)
771
+ // OWASP LLM06:2025 (Excessive Agency) — agent tool access control
772
+ // OWASP ASI02 (Tool Misuse) — unauthorized tool operations
773
+ // MITRE ATT&CK T1059 (Command and Scripting Interpreter)
774
+ // MITRE ATT&CK T1005 (Data from Local System)
775
+ // MITRE ATT&CK T1552 (Unsecured Credentials)
776
+ //
777
+ // Category: tools
314
778
  // Namespace: Overwatch
315
779
  // =============================================================================
316
780
 
317
- // Block prompts with prompt injection detected by YARA
318
- @id("semantic-block-injection")
319
- @name("Block prompt injection")
320
- @description("Detect and block prompt injection patterns in user input via YARA scanning (OWASP LLM01)")
781
+ // ---------------------------------------------------------------------------
782
+ // Section 1: Dangerous Tool Blocking
783
+ // Blocks tools classified as inherently dangerous for agent use.
784
+ // Ref: OWASP LLM06, MITRE T1059
785
+ // ---------------------------------------------------------------------------
786
+
787
+ // Block shell and command execution tools
788
+ @id("tools-block-shell-execution")
789
+ @name("Block shell and command execution")
790
+ @description("Block direct shell, bash, and command execution tools. Unrestricted shell access is the #1 risk in AI coding agents — enables command injection, data exfiltration, and arbitrary code execution (MITRE T1059).")
321
791
  @severity("critical")
322
- @tags("injection,security,llm,owasp-llm01,baseline")
323
- @reject_message("Your prompt was blocked because prompt injection patterns were detected by YARA scanning. This is a security measure to prevent manipulation of AI agent behavior.")
792
+ @tags("shell,command-injection,execution,nist-cm-7,mitre-t1059,owasp-llm06,baseline")
793
+ @reject_message("Tool execution was blocked: direct shell and command execution tools (shell, bash, terminal) are restricted to prevent command injection attacks (MITRE T1059). Use specific, scoped tools instead.")
324
794
  forbid (
325
795
  principal,
326
- action == Overwatch::Action::"process_prompt",
796
+ action == Overwatch::Action::"call_tool",
327
797
  resource
328
798
  )
329
799
  when {
330
- context has yara_threats && context.yara_threats.contains("prompt_injection")
800
+ context has tool_name &&
801
+ (context.tool_name == "shell" ||
802
+ context.tool_name == "bash" ||
803
+ context.tool_name == "sh" ||
804
+ context.tool_name == "terminal" ||
805
+ context.tool_name == "system.exec" ||
806
+ context.tool_name == "process.spawn" ||
807
+ context.tool_name == "cmd" ||
808
+ context.tool_name == "powershell")
331
809
  };
332
810
 
333
- // Block prompts with high injection confidence score
334
- @id("semantic-block-injection-score")
335
- @name("Block high-confidence injection")
336
- @description("Block content when injection classifier confidence exceeds threshold (75/100)")
811
+ // Block destructive file operations
812
+ @id("tools-block-destructive-ops")
813
+ @name("Block destructive file operations")
814
+ @description("Block file deletion, directory removal, and other destructive operations. Agents should not have delete access by default — destructive operations require explicit human approval.")
815
+ @severity("high")
816
+ @tags("file,delete,destructive,nist-ac-3,owasp-asi02")
817
+ @reject_message("Tool execution was blocked: destructive file operations (delete, rmdir, unlink) are restricted to prevent data loss. Request explicit human approval for destructive actions.")
818
+ forbid (
819
+ principal,
820
+ action == Overwatch::Action::"call_tool",
821
+ resource
822
+ )
823
+ when {
824
+ context has tool_name &&
825
+ (context.tool_name == "fs.delete" ||
826
+ context.tool_name == "fs.rmdir" ||
827
+ context.tool_name == "fs.unlink" ||
828
+ context.tool_name == "fs.remove" ||
829
+ context.tool_name == "delete_file" ||
830
+ context.tool_name == "remove_directory")
831
+ };
832
+
833
+ // ---------------------------------------------------------------------------
834
+ // Section 2: Sensitive System Path Protection
835
+ // Blocks access to system directories, credential files, and sensitive paths.
836
+ // Ref: MITRE T1005, T1552
837
+ // ---------------------------------------------------------------------------
838
+
839
+ // Block access to system directories (/etc, /proc, /sys, /root; within /var only /var/log and /var/run)
840
+ @id("tools-block-system-paths")
841
+ @name("Block system directory access")
842
+ @description("Prevent access to sensitive system directories (/etc, /proc, /sys, /root, /var). These directories contain system configuration, process information, and credentials that agents must never access.")
843
+ @severity("high")
844
+ @tags("file,path,system,security,nist-ac-6,mitre-t1005")
845
+ @reject_message("Access blocked: this path targets a sensitive system directory. AI agents are restricted from accessing /etc, /proc, /sys, /root, and /var directories.")
846
+ forbid (
847
+ principal,
848
+ action in [Overwatch::Action::"read_file", Overwatch::Action::"write_file", Overwatch::Action::"call_tool"],
849
+ resource
850
+ )
851
+ when {
852
+ context has path &&
853
+ (context.path like "/etc/*" ||
854
+ context.path like "/proc/*" ||
855
+ context.path like "/sys/*" ||
856
+ context.path like "/root/*" ||
857
+ context.path like "/var/log/*" ||
858
+ context.path like "/var/run/*")
859
+ };
860
+
861
+ // Block access to credential and key directories
862
+ @id("tools-block-credential-paths")
863
+ @name("Block credential directory access")
864
+ @description("Prevent access to SSH keys, cloud provider credentials, GPG keys, and other authentication material directories. These are primary targets for credential theft (MITRE T1552).")
337
865
  @severity("critical")
338
- @tags("injection,confidence,security,owasp-llm01")
339
- @reject_message("Your prompt was blocked because a high-confidence prompt injection pattern was detected.")
866
+ @tags("file,credentials,ssh,aws,security,nist-ac-6,mitre-t1552")
867
+ @reject_message("Access blocked: this path targets a credential or key directory (.ssh, .aws, .gnupg, .config/gcloud). AI agents must never access authentication material.")
340
868
  forbid (
341
869
  principal,
342
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
870
+ action in [Overwatch::Action::"read_file", Overwatch::Action::"write_file", Overwatch::Action::"call_tool"],
343
871
  resource
344
872
  )
345
873
  when {
346
- context has injection_confidence && context.injection_confidence >= 75
874
+ context has path &&
875
+ (context.path like "*/.ssh/*" ||
876
+ context.path like "*/.aws/*" ||
877
+ context.path like "*/.gnupg/*" ||
878
+ context.path like "*/.config/gcloud/*" ||
879
+ context.path like "*/.azure/*" ||
880
+ context.path like "*.pem" ||
881
+ context.path like "*/id_rsa*" ||
882
+ context.path like "*/id_ed25519*" ||
883
+ context.path like "*/id_ecdsa*")
347
884
  };
348
885
 
349
- // Block prompts with jailbreak attempts
350
- @id("semantic-block-jailbreak")
351
- @name("Block jailbreak attempts")
352
- @description("Detect and block jailbreak and bypass attempts against AI agents (OWASP LLM02)")
886
+ // ---------------------------------------------------------------------------
887
+ // Section 3: Tool Risk Scoring
888
+ // Uses computed tool risk scores from the detection engine to dynamically
889
+ // assess and block risky tool operations.
890
+ // ---------------------------------------------------------------------------
891
+
892
+ // Block tools with very high computed risk
893
+ @id("tools-block-high-risk-score")
894
+ @name("Block high-risk tool operations")
895
+ @description("Block tool operations when the computed risk score exceeds 90/100. The risk score combines tool type, argument analysis, context, and historical behavior into a single metric.")
353
896
  @severity("critical")
354
- @tags("jailbreak,bypass,security,owasp-llm02,baseline")
355
- @reject_message("Your prompt was blocked because jailbreak or bypass patterns were detected by YARA scanning. This is a security measure to prevent circumvention of AI safety controls.")
897
+ @tags("tool-risk,dynamic,security,owasp-llm06,owasp-asi02")
898
+ @reject_message("Tool execution blocked: this operation scored 90+ on the risk assessment. The combination of tool type, arguments, and context indicates a high-risk operation.")
356
899
  forbid (
357
900
  principal,
358
- action == Overwatch::Action::"process_prompt",
901
+ action == Overwatch::Action::"call_tool",
359
902
  resource
360
903
  )
361
904
  when {
362
- context has yara_threats && context.yara_threats.contains("jailbreak")
905
+ context has tool_risk_score && context.tool_risk_score >= 90
363
906
  };
364
907
 
365
- // Block prompts with high jailbreak confidence score
366
- @id("semantic-block-jailbreak-score")
367
- @name("Block high-confidence jailbreak")
368
- @description("Block content when jailbreak classifier confidence exceeds threshold (75/100)")
908
+ // Block tools classified as dangerous
909
+ @id("tools-block-dangerous-category")
910
+ @name("Block dangerous tool category")
911
+ @description("Block all tools classified as 'dangerous' by the detection engine. The dangerous category includes tools with unrestricted system access, code execution, or network capabilities.")
369
912
  @severity("critical")
370
- @tags("jailbreak,confidence,security,owasp-llm02")
371
- @reject_message("Your prompt was blocked because a high-confidence jailbreak attempt was detected.")
913
+ @tags("tool-category,dangerous,security,owasp-llm06,nist-ac-6")
914
+ @reject_message("Tool execution blocked: this tool is classified as 'dangerous' due to its unrestricted system access, code execution, or network capabilities. Use a safer alternative.")
372
915
  forbid (
373
916
  principal,
374
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
917
+ action == Overwatch::Action::"call_tool",
375
918
  resource
376
919
  )
377
920
  when {
378
- context has jailbreak_confidence && context.jailbreak_confidence >= 75
921
+ context has tool_category && context.tool_category == "dangerous"
379
922
  };
380
923
 
381
- // Block prompts with high severity semantic threats
382
- @id("semantic-block-high-severity")
383
- @name("Block high severity threats")
384
- @description("Block prompts when semantic threat scanners detect high severity issues (severity >= 3)")
924
+ // Block sensitive tools when any threat is detected (zero-tolerance threshold)
925
+ @id("tools-block-sensitive-with-threats")
926
+ @name("Block sensitive tools with threats")
927
+ @description("Block sensitive tools (file write, shell, network) when any threats are detected. Sensitive tools with concurrent threats indicate an attack leveraging tool capabilities for malicious purposes.")
385
928
  @severity("high")
386
- @tags("semantic,severity,security")
387
- @reject_message("Your prompt was blocked because semantic threat scanners detected high severity issues in the content. Review your prompt for manipulative or adversarial patterns.")
929
+ @tags("tool-category,sensitive,security,owasp-asi02,defense-in-depth")
930
+ @reject_message("Sensitive tool execution blocked: threats were detected alongside a sensitive tool operation. Sensitive tools require zero threat context to execute.")
388
931
  forbid (
389
932
  principal,
390
- action == Overwatch::Action::"process_prompt",
933
+ action == Overwatch::Action::"call_tool",
391
934
  resource
392
935
  )
393
936
  when {
394
- context has threat_categories && context has max_threat_severity &&
395
- context.threat_categories.contains("semantic") &&
396
- context.max_threat_severity >= 3
937
+ context has tool_is_sensitive && context.tool_is_sensitive &&
938
+ context has threat_count && context.threat_count > 0
397
939
  };
398
940
 
399
- // Block prompts with critical threat level
400
- @id("semantic-block-critical")
401
- @name("Block critical threats")
402
- @description("Block all content when any scanner detects critical severity threats")
403
- @severity("critical")
404
- @tags("critical,baseline,security")
405
- @reject_message("Your prompt was blocked because security scanners detected a critical-severity threat. This content cannot be processed.")
941
+ // ---------------------------------------------------------------------------
942
+ // Section 4: Threat-Based Tool Blocking
943
+ // Blocks tool calls based on threat severity from detection engines.
944
+ // ---------------------------------------------------------------------------
945
+
946
+ // Block tool calls with high severity threats
947
+ @id("tools-block-high-severity-threats")
948
+ @name("Block tool calls with high severity threats")
949
+ @description("Prevent tool execution when high or critical severity threats (>= 3) are detected in content. Tools must not execute when the content they operate on is flagged as dangerous.")
950
+ @severity("high")
951
+ @tags("tools,threats,severity,security,defense-in-depth")
952
+ @reject_message("Tool execution was blocked because high or critical severity threats were detected in the content by security scanners.")
406
953
  forbid (
407
954
  principal,
408
- action == Overwatch::Action::"process_prompt",
955
+ action == Overwatch::Action::"call_tool",
409
956
  resource
410
957
  )
411
958
  when {
412
- context has highest_severity && context.highest_severity == "critical"
959
+ context has threat_count && context has max_threat_severity &&
960
+ context.threat_count > 0 && context.max_threat_severity >= 3
413
961
  };
414
962
 
415
- // Block tool calls with prompt injection detected
416
- @id("semantic-block-tool-injection")
417
- @name("Block tool calls with injection")
418
- @description("Prevent tool execution when prompt injection patterns are detected in content")
963
+ // ---------------------------------------------------------------------------
964
+ // Section 5: Command Injection Detection
965
+ // Blocks tool calls containing command injection patterns in arguments.
966
+ // Ref: AIShellJack (41-84% success rate, 314 payloads)
967
+ // ---------------------------------------------------------------------------
968
+
969
+ // Block detected command injection patterns
970
+ @id("tools-block-command-injection")
971
+ @name("Block command injection in tool calls")
972
+ @description("Block tool calls when command injection patterns are detected in arguments — reverse shells, privilege escalation, code execution, and data exfiltration commands. Ref: AIShellJack attack (41-84% success rate).")
419
973
  @severity("critical")
420
- @tags("injection,tools,security,owasp-llm01")
421
- @reject_message("Tool execution was blocked because prompt injection patterns were detected in the content by YARA scanning.")
974
+ @tags("command-injection,shell,security,mitre-t1059,owasp-asi02")
975
+ @reject_message("Tool execution blocked: command injection pattern detected in tool arguments. This may be a shell injection attack attempting to execute unauthorized commands.")
422
976
  forbid (
423
977
  principal,
424
978
  action == Overwatch::Action::"call_tool",
425
979
  resource
426
980
  )
427
981
  when {
428
- context has yara_threats && context.yara_threats.contains("prompt_injection")
982
+ context has detected_threats &&
983
+ context.detected_threats.contains("command_injection")
429
984
  };
430
985
  `;
431
- const OVERWATCH_TOOLS_DEFAULT_CEDAR = `// =============================================================================
432
- // Tool Permissioning Policy (Default)
986
+ const OVERWATCH_AGENT_SECURITY_DEFAULT_CEDAR = `// =============================================================================
987
+ // Agent Security Policy (Default)
433
988
  // =============================================================================
434
- // Controls access to IDE tools, shell execution, file system paths, and MCP
435
- // operations. Blocks dangerous command execution tools and restricts access
436
- // to sensitive system directories and credential files.
989
+ // Detects and blocks tool poisoning, rug pull attacks, indirect prompt injection,
990
+ // and MCP supply chain threats targeting AI coding agents.
437
991
  //
438
- // Compliance: NIST 800-53 AC-3, AC-6, CM-7 | OWASP A01, A03
439
- // MITRE ATT&CK T1059 (Command/Scripting Interpreter)
440
- // MITRE ATT&CK T1005 (Data from Local System)
441
- // Category: tools
992
+ // These are agentic AI-specific attack vectors (OWASP Agentic Top 10) where tool
993
+ // descriptions, server responses, or behavioral drift manipulate agent behavior.
994
+ //
995
+ // Compliance:
996
+ // OWASP LLM01 (Prompt Injection) | OWASP LLM06 (Excessive Agency)
997
+ // OWASP ASI01 (Agent Goal Hijack) | OWASP ASI02 (Tool Misuse)
998
+ // OWASP ASI04 (Supply Chain) | OWASP MCP01-05
999
+ // MITRE ATLAS AML.T0051 (Prompt Injection) | AML.T0080 (Memory Manipulation)
1000
+ //
1001
+ // Category: agent_security
442
1002
  // Namespace: Overwatch
443
1003
  // =============================================================================
444
1004
 
445
1005
  // ---------------------------------------------------------------------------
446
- // Section 1: Dangerous Tool Blocking
1006
+ // Tool Poisoning — hidden instructions in tool descriptions/arguments
1007
+ // Ref: Invariant Labs (April 2025), 84.2% success rate with auto-approval
447
1008
  // ---------------------------------------------------------------------------
448
1009
 
449
- // Block shell and command execution tools
450
- @id("tools-block-shell-execution")
451
- @name("Block shell and command execution")
452
- @description("Block direct shell, bash, and command execution tools to prevent command injection (MITRE T1059)")
1010
+ // Block tool calls with tool poisoning risk
1011
+ @id("as-block-tool-poisoning")
1012
+ @name("Block tool poisoning")
1013
+ @description("Block tool execution when hidden instructions are detected in tool descriptions or arguments (score >= 70/100). Catches authority hijack, system prompt injection, and hidden instruction patterns. Adjust the threshold to tune sensitivity — lower catches more but may flag legitimate tools with instructional descriptions (OWASP ASI01).")
453
1014
  @severity("critical")
454
- @tags("shell,command-injection,execution,nist-cm-7,mitre-t1059,baseline")
455
- @reject_message("Tool execution was blocked because direct shell and command execution tools (shell, bash, terminal, system.exec) are restricted to prevent command injection attacks.")
1015
+ @tags("tool-poisoning,agent-security,owasp-asi01,mitre-aml-t0051")
1016
+ @reject_message("Tool execution blocked: hidden manipulation instructions detected in tool description or arguments. This may be a tool poisoning attack (OWASP ASI01).")
456
1017
  forbid (
457
1018
  principal,
458
1019
  action == Overwatch::Action::"call_tool",
459
1020
  resource
460
1021
  )
461
1022
  when {
462
- context has tool_name &&
463
- (context.tool_name == "shell" ||
464
- context.tool_name == "bash" ||
465
- context.tool_name == "sh" ||
466
- context.tool_name == "terminal" ||
467
- context.tool_name == "system.exec" ||
468
- context.tool_name == "process.spawn")
1023
+ context has tool_poisoning_score && context.tool_poisoning_score >= 70
469
1024
  };
470
1025
 
471
- // Block destructive file operations
472
- @id("tools-block-destructive-ops")
473
- @name("Block destructive file operations")
474
- @description("Block file deletion and other destructive tool operations to prevent data loss")
475
- @severity("high")
476
- @tags("file,delete,destructive,nist-ac-3")
477
- @reject_message("Tool execution was blocked because destructive file operations (delete, rmdir, unlink) are restricted to prevent data loss.")
1026
+ // Block MCP server connections with poisoning risk
1027
+ @id("as-block-server-poisoning")
1028
+ @name("Block poisoned MCP servers")
1029
+ @description("Block connections to MCP servers when tool poisoning patterns are detected in tool descriptions (score >= 60). Lower threshold for servers since poisoning affects all tools on the server.")
1030
+ @severity("critical")
1031
+ @tags("tool-poisoning,mcp-security,owasp-asi04,owasp-mcp02")
1032
+ @reject_message("MCP server connection blocked: tool poisoning patterns detected in server tool descriptions. Review server tools before connecting.")
478
1033
  forbid (
479
1034
  principal,
480
- action == Overwatch::Action::"call_tool",
1035
+ action == Overwatch::Action::"connect_server",
481
1036
  resource
482
1037
  )
483
1038
  when {
484
- context has tool_name &&
485
- (context.tool_name == "fs.delete" ||
486
- context.tool_name == "fs.rmdir" ||
487
- context.tool_name == "fs.unlink")
1039
+ context has tool_poisoning_score && context.tool_poisoning_score >= 60
1040
+ };
1041
+
1042
+ // ---------------------------------------------------------------------------
1043
+ // Rug Pull — tool behavior changes after trust establishment
1044
+ // Ref: Acuvity (2025), tools approved once then silently redefined
1045
+ // ---------------------------------------------------------------------------
1046
+
1047
+ // Block tool calls with behavioral drift (rug pull)
1048
+ @id("as-block-rug-pull")
1049
+ @name("Block rug pull attacks")
1050
+ @description("Block tool execution when behavioral drift is detected — tool behavior diverges significantly from established patterns (score >= 70/100). Defends against tools that are approved once then silently redefined to act maliciously. Adjust the threshold to tune sensitivity (OWASP ASI04).")
1051
+ @severity("critical")
1052
+ @tags("rug-pull,agent-security,owasp-asi04,behavioral-drift")
1053
+ @reject_message("Tool execution blocked: tool behavior has changed significantly from its established pattern. This may be a rug pull attack where a tool was silently redefined after initial approval.")
1054
+ forbid (
1055
+ principal,
1056
+ action in [Overwatch::Action::"call_tool", Overwatch::Action::"connect_server"],
1057
+ resource
1058
+ )
1059
+ when {
1060
+ context has rug_pull_score && context.rug_pull_score >= 70
488
1061
  };
489
1062
 
490
1063
  // ---------------------------------------------------------------------------
491
- // Section 2: Sensitive Path Blocking
1064
+ // Indirect Prompt Injection — injection via tool outputs and retrieved content
1065
+ // Ref: EchoLeak CVE-2025-32711, IDEsaster (30+ CVEs in AI IDEs)
492
1066
  // ---------------------------------------------------------------------------
493
1067
 
494
- // Block access to sensitive system paths and credential files
495
- @id("tools-block-sensitive-paths")
496
- @name("Block access to sensitive system paths")
497
- @description("Prevent access to system directories, credential files, SSH keys, and cloud config (MITRE T1005, T1552.001)")
498
- @severity("high")
499
- @tags("file,path,system,security,nist-ac-6,mitre-t1005")
500
- @reject_message("Access to this path was blocked because it targets a sensitive system directory or credential file (/etc, /proc, /sys, .ssh, .aws, .gnupg, or private key files).")
1068
+ // Block prompts with indirect injection from tool outputs
1069
+ @id("as-block-indirect-injection")
1070
+ @name("Block indirect prompt injection")
1071
+ @description("Block when indirect prompt injection is detected in tool outputs, file contents, or retrieved documents (score >= 70). Defends against OWASP LLM01 and ASI01.")
1072
+ @severity("critical")
1073
+ @tags("indirect-injection,owasp-llm01,owasp-asi01,mitre-aml-t0051")
1074
+ @reject_message("Content blocked: indirect prompt injection detected in tool output or retrieved content. An external source may be attempting to hijack agent behavior.")
1075
+ forbid (
1076
+ principal,
1077
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool", Overwatch::Action::"connect_server"],
1078
+ resource
1079
+ )
1080
+ when {
1081
+ context has indirect_injection_score && context.indirect_injection_score >= 70
1082
+ };
1083
+
1084
+ // Strict indirect injection for sensitive tool calls
1085
+ @id("as-block-indirect-injection-sensitive-tools")
1086
+ @name("Block indirect injection on sensitive tools")
1087
+ @description("Lower threshold (>= 50) for indirect injection when the tool is classified as sensitive (shell, file write, network). Even moderate injection risk on sensitive tools warrants blocking.")
1088
+ @severity("critical")
1089
+ @tags("indirect-injection,sensitive-tools,owasp-asi02")
1090
+ @reject_message("Sensitive tool execution blocked: moderate indirect injection risk detected. Sensitive tools require higher confidence that content is safe.")
501
1091
  forbid (
502
1092
  principal,
503
- action in [Overwatch::Action::"read_file", Overwatch::Action::"write_file", Overwatch::Action::"call_tool"],
1093
+ action == Overwatch::Action::"call_tool",
504
1094
  resource
505
1095
  )
506
1096
  when {
507
- context has path &&
508
- (context.path like "/etc/*" ||
509
- context.path like "/var/*" ||
510
- context.path like "/proc/*" ||
511
- context.path like "/sys/*" ||
512
- context.path like "/root/*" ||
513
- context.path like "*/.ssh/*" ||
514
- context.path like "*/.aws/*" ||
515
- context.path like "*/.gnupg/*" ||
516
- context.path like "*.pem" ||
517
- context.path like "*/id_rsa*" ||
518
- context.path like "*/id_ed25519*")
1097
+ context has indirect_injection_score && context.indirect_injection_score >= 50 &&
1098
+ context has tool_is_sensitive && context.tool_is_sensitive
519
1099
  };
520
1100
 
521
1101
  // ---------------------------------------------------------------------------
522
- // Section 3: Threat-Based Tool Blocking
1102
+ // MCP Supply Chain — unverified servers, risky configs
1103
+ // Ref: OWASP MCP Top 10, OWASP ASI04, MITRE AML.T0082
523
1104
  // ---------------------------------------------------------------------------
524
1105
 
525
- // Block tool calls with high severity threats detected
526
- @id("tools-block-high-severity-threats")
527
- @name("Block tool calls with high severity threats")
528
- @description("Prevent tool execution when high or critical severity threats are detected in content")
1106
+ // Block unverified MCP server tool calls with detected threats
1107
+ @id("as-block-unverified-threats")
1108
+ @name("Block unverified server threats")
1109
+ @description("Block tool calls from unverified MCP servers when any threat is detected. Unverified servers with threats are high-risk supply chain vectors.")
529
1110
  @severity("high")
530
- @tags("tools,threats,severity,security")
531
- @reject_message("Tool execution was blocked because high or critical severity threats were detected in the content by security scanners.")
1111
+ @tags("mcp-trust,owasp-asi04,owasp-mcp02,supply-chain")
1112
+ @reject_message("Tool execution blocked: the MCP server is unverified and security threats were detected. Only use tools from verified or trusted servers.")
532
1113
  forbid (
533
1114
  principal,
534
1115
  action == Overwatch::Action::"call_tool",
535
1116
  resource
536
1117
  )
537
1118
  when {
538
- context has threat_count && context has max_threat_severity &&
539
- context.threat_count > 0 && context.max_threat_severity >= 3
1119
+ context has mcp_server_verified && context.mcp_server_verified == false &&
1120
+ context has threat_count && context.threat_count > 0
540
1121
  };
541
- `;
542
- const OVERWATCH_TRUST_SAFETY_DEFAULT_CEDAR = `// =============================================================================
543
- // Content Safety Policy (Default)
544
- // =============================================================================
545
- // Detects and blocks violent, harmful, hateful, sexual, and profane content
546
- // using trust/safety classification scores from Javelin/Lakera/LlamaGuard.
547
- //
548
- // Compliance: NIST 800-53 SI-4 | EU AI Act Art. 52
549
- // Category: trust_safety
550
- // Namespace: Overwatch
551
- // =============================================================================
552
1122
 
553
- // Block prompts with high violence score
554
- @id("ts-block-violence")
555
- @name("Block violent content")
556
- @description("Block content when violence detection score exceeds threshold (80/100)")
557
- @severity("critical")
558
- @tags("violence,content-safety,trust-safety,nist-si-4")
559
- @reject_message("Your prompt was blocked because it contains violent content. Please rephrase without violence-related language.")
1123
+ // Block connections to MCP servers with risky configurations
1124
+ @id("as-block-mcp-config-risk")
1125
+ @name("Block risky MCP server configs")
1126
+ @description("Block MCP server connections when risky configuration patterns are detected (inline code execution, mixed transports, proxy patterns). Score >= 70.")
1127
+ @severity("high")
1128
+ @tags("mcp-config,owasp-mcp03,owasp-asi04,supply-chain")
1129
+ @reject_message("MCP server connection blocked: risky server configuration detected (e.g., inline code execution, mixed transports). Review the server configuration before connecting.")
560
1130
  forbid (
561
1131
  principal,
562
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
1132
+ action == Overwatch::Action::"connect_server",
563
1133
  resource
564
1134
  )
565
1135
  when {
566
- context has violence_score && context.violence_score >= 80
1136
+ context has mcp_config_risk && context.mcp_config_risk &&
1137
+ context has mcp_risk_score && context.mcp_risk_score >= 70
567
1138
  };
568
1139
 
569
- // Block prompts with weapons content
570
- @id("ts-block-weapons")
571
- @name("Block weapons content")
572
- @description("Block content when weapons detection score exceeds threshold (80/100)")
573
- @severity("critical")
574
- @tags("weapons,content-safety,trust-safety,nist-si-4")
575
- @reject_message("Your prompt was blocked because it contains weapons-related content. Please rephrase without weapons references.")
1140
+ // Block connections to unverified MCP servers entirely
1141
+ @id("as-block-unverified-server-connect")
1142
+ @name("Block unverified MCP server connections")
1143
+ @description("Block connections to MCP servers that are not from a verified registry. This prevents supply chain attacks via malicious MCP servers.")
1144
+ @severity("high")
1145
+ @tags("mcp-trust,owasp-asi04,owasp-mcp05,supply-chain")
1146
+ @reject_message("MCP server connection blocked: server is not from a verified registry. Add the server to your verified list or contact your admin.")
576
1147
  forbid (
577
1148
  principal,
578
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
1149
+ action == Overwatch::Action::"connect_server",
579
1150
  resource
580
1151
  )
581
1152
  when {
582
- context has weapons_score && context.weapons_score >= 80
1153
+ context has mcp_server_verified && context.mcp_server_verified == false
583
1154
  };
1155
+ `;
1156
+ const OVERWATCH_ENCODING_DEFAULT_CEDAR = `// =============================================================================
1157
+ // Encoding & Unicode Attack Detection Policy (Default)
1158
+ // =============================================================================
1159
+ // Detects and blocks invisible Unicode characters, bidirectional text
1160
+ // overrides, tag characters, and other encoding-based attack vectors used
1161
+ // to hide malicious instructions from human review while remaining
1162
+ // visible to AI model tokenizers.
1163
+ //
1164
+ // Attack vectors:
1165
+ // - Zero-width characters (U+200B, U+200C, U+200D, U+FEFF) hiding instructions
1166
+ // - Bidirectional overrides (U+202A-U+202E) creating visually misleading text
1167
+ // - Tag characters (U+E0001-U+E007F) embedding invisible payloads
1168
+ // - Variation selectors used as steganographic channels
1169
+ // - Homoglyph attacks using lookalike Unicode characters
1170
+ //
1171
+ // Ref: EchoLeak CVE-2025-32711 (invisible prompt injection via Unicode)
1172
+ // Rules File Backdoor (Pillar Security, March 2025)
1173
+ // Unicode-based prompt injection in Claude Code (CERT-2025)
1174
+ //
1175
+ // Compliance:
1176
+ // OWASP LLM01 (Prompt Injection) — encoding evasion
1177
+ // OWASP ASI01 (Agent Goal Hijack) — hidden instructions
1178
+ // MITRE ATLAS AML.T0051 (LLM Prompt Injection)
1179
+ // NIST 800-53 SI-10 (Information Input Validation)
1180
+ //
1181
+ // Category: encoding
1182
+ // Namespace: Overwatch
1183
+ // =============================================================================
584
1184
 
585
- // Block prompts with hate speech
586
- @id("ts-block-hate-speech")
587
- @name("Block hate speech")
588
- @description("Block content when hate speech detection score exceeds threshold (75/100)")
589
- @severity("critical")
590
- @tags("hate-speech,content-safety,trust-safety,nist-si-4")
591
- @reject_message("Your prompt was blocked because it contains hate speech. Please rephrase without hateful or discriminatory language.")
1185
+ // ---------------------------------------------------------------------------
1186
+ // Section 1: Invisible Character Detection in Prompts
1187
+ // Blocks prompts containing suspicious invisible Unicode patterns.
1188
+ // ---------------------------------------------------------------------------
1189
+
1190
+ // Block prompts with invisible characters above risk threshold
1191
+ @id("encoding-block-invisible-prompt")
1192
+ @name("Block invisible characters in prompts")
1193
+ @description("Block prompts when invisible Unicode characters are detected with a risk score >= 50. Invisible chars (zero-width joiners, bidi overrides, tag characters) can hide malicious instructions from human review while being processed by AI models. Ref: EchoLeak CVE-2025-32711.")
1194
+ @severity("high")
1195
+ @tags("unicode,invisible-chars,encoding,owasp-llm01,owasp-asi01,mitre-aml-t0051,nist-si-10")
1196
+ @reject_message("Your prompt was blocked because suspicious invisible Unicode characters were detected. These characters (zero-width, bidirectional overrides, tag characters) can be used to hide malicious instructions. Please remove non-visible characters and resubmit.")
592
1197
  forbid (
593
1198
  principal,
594
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
1199
+ action == Overwatch::Action::"process_prompt",
595
1200
  resource
596
1201
  )
597
1202
  when {
598
- context has hate_speech_score && context.hate_speech_score >= 75
1203
+ context has contains_invisible_chars && context.contains_invisible_chars &&
1204
+ context has invisible_chars_score && context.invisible_chars_score >= 50
599
1205
  };
600
1206
 
601
- // Block prompts with criminal content
602
- @id("ts-block-crime")
603
- @name("Block criminal content")
604
- @description("Block content when criminal activity detection score exceeds threshold (80/100)")
605
- @severity("high")
606
- @tags("crime,content-safety,trust-safety,nist-si-4")
607
- @reject_message("Your prompt was blocked because it contains content related to criminal activity.")
1207
+ // ---------------------------------------------------------------------------
1208
+ // Section 2: Invisible Characters in Tool Calls
1209
+ // Any invisible characters in tool arguments are suspicious — tool args
1210
+ // should be plain text/JSON. Lower threshold than prompts.
1211
+ // ---------------------------------------------------------------------------
1212
+
1213
+ // Block tool calls with any invisible characters
1214
+ @id("encoding-block-invisible-tool")
1215
+ @name("Block invisible characters in tool calls")
1216
+ @description("Block tool execution when invisible Unicode characters are detected in tool arguments or content. Tool arguments should be plain text/JSON — invisible characters in tool calls are almost certainly malicious payload injection.")
1217
+ @severity("critical")
1218
+ @tags("unicode,invisible-chars,tools,encoding,owasp-asi01,owasp-asi02")
1219
+ @reject_message("Tool execution blocked: invisible Unicode characters detected in tool arguments. Tool calls should contain only plain text — invisible characters indicate payload injection or encoding evasion.")
608
1220
  forbid (
609
1221
  principal,
610
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
1222
+ action == Overwatch::Action::"call_tool",
611
1223
  resource
612
1224
  )
613
1225
  when {
614
- context has crime_score && context.crime_score >= 80
1226
+ context has contains_invisible_chars && context.contains_invisible_chars
615
1227
  };
616
1228
 
617
- // Block prompts with sexual content
618
- @id("ts-block-sexual")
619
- @name("Block sexual content")
620
- @description("Block content when sexual content detection score exceeds threshold (80/100)")
1229
+ // ---------------------------------------------------------------------------
1230
+ // Section 3: Invisible Characters in File Operations
1231
+ // Blocks file reads/writes with encoding attacks to prevent persistence
1232
+ // of invisible payloads in the codebase.
1233
+ // ---------------------------------------------------------------------------
1234
+
1235
+ // Block file writes with invisible characters
1236
+ @id("encoding-block-invisible-file-write")
1237
+ @name("Block invisible characters in file writes")
1238
+ @description("Block file writes when invisible Unicode characters are detected. Prevents persistence of invisible payloads in source code, config files, or documentation where they could later be processed by AI agents. Ref: Rules File Backdoor attack (Pillar Security).")
621
1239
  @severity("high")
622
- @tags("sexual,content-safety,trust-safety,eu-ai-act")
623
- @reject_message("Your prompt was blocked because it contains sexual content.")
1240
+ @tags("unicode,invisible-chars,file-write,encoding,owasp-asi01")
1241
+ @reject_message("File write blocked: invisible Unicode characters detected in content. Writing invisible characters to files can create persistent backdoors that affect AI agents processing those files later.")
624
1242
  forbid (
625
1243
  principal,
626
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
1244
+ action == Overwatch::Action::"write_file",
627
1245
  resource
628
1246
  )
629
1247
  when {
630
- context has sexual_score && context.sexual_score >= 80
1248
+ context has contains_invisible_chars && context.contains_invisible_chars
631
1249
  };
632
1250
 
633
- // Block prompts with excessive profanity
634
- @id("ts-block-profanity")
635
- @name("Block profanity")
636
- @description("Block content when profanity detection score exceeds threshold (90/100)")
637
- @severity("medium")
638
- @tags("profanity,content-safety,trust-safety")
639
- @reject_message("Your prompt was blocked due to excessive profanity. Please rephrase in a professional manner.")
1251
+ // Block MCP server connections with invisible characters in config
1252
+ @id("encoding-block-invisible-server")
1253
+ @name("Block invisible characters in server config")
1254
+ @description("Block MCP server connections when invisible Unicode characters are detected in server configuration or responses. Invisible chars in server data indicate a compromised or malicious MCP server.")
1255
+ @severity("critical")
1256
+ @tags("unicode,invisible-chars,mcp,encoding,owasp-mcp02,owasp-asi04")
1257
+ @reject_message("MCP server connection blocked: invisible Unicode characters detected in server data. This may indicate a compromised MCP server using encoding attacks to inject hidden instructions.")
640
1258
  forbid (
641
1259
  principal,
642
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
1260
+ action == Overwatch::Action::"connect_server",
643
1261
  resource
644
1262
  )
645
1263
  when {
646
- context has profanity_score && context.profanity_score >= 90
1264
+ context has contains_invisible_chars && context.contains_invisible_chars
647
1265
  };
648
1266
  `;
649
- const OVERWATCH_AGENT_SECURITY_DEFAULT_CEDAR = `// =============================================================================
650
- // Agent Security Policy (Default)
1267
+ const OVERWATCH_BEHAVIORAL_DEFAULT_CEDAR = `// =============================================================================
1268
+ // Behavioral Analysis Policy (Default)
651
1269
  // =============================================================================
652
- // Detects and blocks tool poisoning, rug pull attacks, and indirect prompt
653
- // injection targeting AI coding agents. These are agentic AI-specific attack
654
- // vectors where tool descriptions or server responses manipulate agent behavior.
1270
+ // Detects and blocks suspicious agent behavioral patterns including tool call
1271
+ // loops, data exfiltration sequences, credential theft chains, and destructive
1272
+ // operation patterns. Operates on session-level behavioral signals rather
1273
+ // than single-request content analysis.
655
1274
  //
656
- // Compliance: OWASP LLM09 (Improper Output Handling) | MITRE ATLAS AML.T0054
657
- // Category: agent_security
1275
+ // Attack vectors:
1276
+ // - Tool call loops: Agent stuck in retry loop or manipulation-induced recursion
1277
+ // - Data exfiltration: Read sensitive data → send to external endpoint sequence
1278
+ // - Secret exfiltration: Read credentials → curl/fetch external URL
1279
+ // - Credential theft: Access .ssh/.aws → encode/compress → network tool
1280
+ // - Destructive sequences: Bulk delete, permission changes, config overwrites
1281
+ //
1282
+ // Ref: OWASP LLM10 (Unbounded Consumption) — loop/recursion attacks
1283
+ // OWASP ASI02 (Tool Misuse) — tool abuse sequences
1284
+ // OWASP ASI08 (Lack of Monitoring) — behavioral anomaly detection
1285
+ // GlassWorm Attack (35,800+ installations, cross-agent propagation)
1286
+ // MITRE ATLAS AML.T0080 (Memory Manipulation)
1287
+ //
1288
+ // Compliance:
1289
+ // OWASP LLM10 (Unbounded Consumption)
1290
+ // OWASP ASI02 (Tool Misuse)
1291
+ // OWASP ASI08 (Lack of Monitoring & Logging)
1292
+ // MITRE ATLAS AML.T0080 (AI Memory Manipulation)
1293
+ // MITRE ATT&CK T1041 (Exfiltration Over C2 Channel)
1294
+ // NIST 800-53 AU-6 (Audit Review, Analysis, and Reporting)
1295
+ // NIST 800-53 SI-4 (Information System Monitoring)
1296
+ //
1297
+ // Category: behavioral
658
1298
  // Namespace: Overwatch
659
1299
  // =============================================================================
660
1300
 
661
- // Block tool calls with high tool poisoning risk
662
- @id("as-block-tool-poisoning")
663
- @name("Block tool poisoning")
664
- @description("Block tool execution when tool description contains manipulation patterns (score >= 70/100)")
1301
+ // ---------------------------------------------------------------------------
1302
+ // Section 1: Tool Call Loop Detection
1303
+ // Detects agents trapped in infinite loops — either through manipulation
1304
+ // (adversarial prompt inducing repetitive behavior) or bugs (retry storms).
1305
+ // Ref: OWASP LLM10 (Unbounded Consumption)
1306
+ // ---------------------------------------------------------------------------
1307
+
1308
+ // Block tool calls in detected loops (5+ consecutive same-tool calls)
1309
+ @id("behavioral-block-loop")
1310
+ @name("Block tool call loops")
1311
+ @description("Block tool execution when a loop is detected — 5 or more consecutive calls to the same tool. This indicates either adversarial manipulation inducing repetitive agent behavior or a bug causing retry storms. Both waste compute and can cause damage. Adjust the threshold (default 5) to match your workflow — lower for stricter enforcement, higher for agents that legitimately retry (OWASP LLM10).")
1312
+ @severity("high")
1313
+ @tags("loop-detection,behavioral,owasp-llm10,owasp-asi02,nist-si-4")
1314
+ @reject_message("Tool execution blocked: repetitive tool call loop detected (5+ consecutive calls to the same tool). This may indicate adversarial manipulation or a system error. The agent session should be reviewed.")
1315
+ forbid (
1316
+ principal,
1317
+ action == Overwatch::Action::"call_tool",
1318
+ resource
1319
+ )
1320
+ when {
1321
+ context has loop_detected && context.loop_detected &&
1322
+ context has loop_count && context.loop_count >= 5
1323
+ };
1324
+
1325
+ // ---------------------------------------------------------------------------
1326
+ // Section 2: Data Exfiltration Detection
1327
+ // Detects sequences where an agent reads sensitive data and then attempts
1328
+ // to send it to an external endpoint.
1329
+ // Ref: GlassWorm attack, MITRE T1041
1330
+ // ---------------------------------------------------------------------------
1331
+
1332
+ // Block data exfiltration patterns
1333
+ @id("behavioral-block-data-exfil")
1334
+ @name("Block data exfiltration")
1335
+ @description("Block tool execution when a data exfiltration pattern is detected — the agent reads sensitive local data (files, configs, source code) followed by a network operation sending data externally. This is the hallmark of autonomous agent compromise (GlassWorm, EchoLeak).")
665
1336
  @severity("critical")
666
- @tags("tool-poisoning,agent-security,owasp-llm09")
667
- @reject_message("Tool execution was blocked because the tool description contains manipulation patterns that could compromise agent behavior.")
1337
+ @tags("data-exfiltration,behavioral,owasp-asi02,mitre-t1041,nist-si-4")
1338
+ @reject_message("Tool execution blocked: data exfiltration pattern detected. The agent appears to be reading sensitive data and sending it to an external endpoint. This is a critical security event — the agent session has been terminated.")
668
1339
  forbid (
669
1340
  principal,
670
1341
  action == Overwatch::Action::"call_tool",
671
1342
  resource
672
1343
  )
673
1344
  when {
674
- context has tool_poisoning_score && context.tool_poisoning_score >= 70
1345
+ context has suspicious_pattern && context.suspicious_pattern &&
1346
+ context has pattern_type && context.pattern_type == "data_exfiltration"
675
1347
  };
676
1348
 
677
- // Block tool calls with rug pull detection
678
- @id("as-block-rug-pull")
679
- @name("Block rug pull attacks")
680
- @description("Block tool execution when tool behavior diverges from advertised capabilities (score >= 70/100)")
1349
+ // Block secret exfiltration (credential-specific exfil)
1350
+ @id("behavioral-block-secret-exfil")
1351
+ @name("Block secret exfiltration")
1352
+ @description("Block tool execution when a secret exfiltration pattern is detected — the agent accesses credential files (.env, .aws, tokens) followed by a network tool call. This is a targeted credential theft sequence.")
681
1353
  @severity("critical")
682
- @tags("rug-pull,agent-security,mcp-security")
683
- @reject_message("Tool execution was blocked because the tool's actual behavior diverges from its advertised capabilities.")
1354
+ @tags("secret-exfiltration,behavioral,owasp-asi02,mitre-t1552,mitre-t1041")
1355
+ @reject_message("Tool execution blocked: secret exfiltration pattern detected. The agent accessed credential files and is attempting to send them externally. This is a targeted credential theft attack.")
684
1356
  forbid (
685
1357
  principal,
686
- action in [Overwatch::Action::"call_tool", Overwatch::Action::"connect_server"],
1358
+ action == Overwatch::Action::"call_tool",
687
1359
  resource
688
1360
  )
689
1361
  when {
690
- context has rug_pull_score && context.rug_pull_score >= 70
1362
+ context has suspicious_pattern && context.suspicious_pattern &&
1363
+ context has pattern_type && context.pattern_type == "secret_exfiltration"
691
1364
  };
692
1365
 
693
- // Block MCP server connections with high poisoning risk
694
- @id("as-block-server-poisoning")
695
- @name("Block poisoned MCP servers")
696
- @description("Block connections to MCP servers when tool poisoning patterns are detected (score >= 60/100)")
1366
+ // Block credential theft chains
1367
+ @id("behavioral-block-credential-theft")
1368
+ @name("Block credential theft")
1369
+ @description("Block tool execution when a credential theft chain is detected accessing SSH keys, cloud credentials, or API tokens followed by encoding, compression, or transfer operations. Multi-step attack pattern for autonomous credential harvesting.")
697
1370
  @severity("critical")
698
- @tags("tool-poisoning,mcp-security,agent-security")
699
- @reject_message("Connection to this MCP server was blocked because tool poisoning patterns were detected in its tool descriptions.")
1371
+ @tags("credential-theft,behavioral,owasp-asi02,mitre-t1552,mitre-t1555")
1372
+ @reject_message("Tool execution blocked: credential theft chain detected. The agent is performing a multi-step operation to harvest and exfiltrate credentials (SSH keys, cloud tokens, API keys). Session terminated.")
700
1373
  forbid (
701
1374
  principal,
702
- action == Overwatch::Action::"connect_server",
1375
+ action == Overwatch::Action::"call_tool",
703
1376
  resource
704
1377
  )
705
1378
  when {
706
- context has tool_poisoning_score && context.tool_poisoning_score >= 60
1379
+ context has suspicious_pattern && context.suspicious_pattern &&
1380
+ context has pattern_type && context.pattern_type == "credential_theft"
707
1381
  };
708
1382
 
709
- // Block prompts with indirect injection patterns
710
- @id("as-block-indirect-injection")
711
- @name("Block indirect prompt injection")
712
- @description("Block content when indirect prompt injection is detected in tool outputs or retrieved documents (score >= 70/100)")
1383
+ // ---------------------------------------------------------------------------
1384
+ // Section 3: Destructive Sequence Detection
1385
+ // Detects sequences of destructive operations that could damage the workspace.
1386
+ // ---------------------------------------------------------------------------
1387
+
1388
+ // Block destructive operation sequences
1389
+ @id("behavioral-block-destructive-sequence")
1390
+ @name("Block destructive sequences")
1391
+ @description("Block tool execution when a destructive operation sequence is detected — bulk file deletions, permission changes, config overwrites, or repository manipulation patterns. Prevents agent-initiated workspace damage.")
713
1392
  @severity("critical")
714
- @tags("indirect-injection,agent-security,owasp-llm01")
715
- @reject_message("This content was blocked because indirect prompt injection patterns were detected in tool outputs or retrieved documents.")
1393
+ @tags("destructive,behavioral,owasp-asi02,nist-si-4")
1394
+ @reject_message("Tool execution blocked: destructive operation sequence detected. The agent is performing a pattern of destructive operations (bulk deletions, permission changes, config overwrites) that could damage the workspace.")
716
1395
  forbid (
717
1396
  principal,
718
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool", Overwatch::Action::"connect_server"],
1397
+ action == Overwatch::Action::"call_tool",
719
1398
  resource
720
1399
  )
721
1400
  when {
722
- context has indirect_injection_score && context.indirect_injection_score >= 70
1401
+ context has suspicious_pattern && context.suspicious_pattern &&
1402
+ context has pattern_type && context.pattern_type == "destructive_sequence"
723
1403
  };
724
1404
 
725
- // Block unverified MCP server tool calls with any detected threats
726
- @id("as-block-unverified-threats")
727
- @name("Block unverified server threats")
728
- @description("Block tool calls from unverified MCP servers when any threat is detected")
1405
+ // ---------------------------------------------------------------------------
1406
+ // Section 4: Sequence Risk Scoring
1407
+ // Uses computed sequence risk scores for dynamic behavioral assessment.
1408
+ // ---------------------------------------------------------------------------
1409
+
1410
+ // Block high-risk behavioral sequences
1411
+ @id("behavioral-block-high-risk-sequence")
1412
+ @name("Block high-risk behavioral sequences")
1413
+ @description("Block tool execution when the computed sequence risk score exceeds 80/100. The score aggregates behavioral signals including action history, tool combination analysis, and deviation from normal patterns. High scores indicate coordinated multi-step attacks.")
729
1414
  @severity("high")
730
- @tags("mcp-trust,agent-security,unverified")
731
- @reject_message("Tool execution was blocked because the MCP server is unverified and threats were detected in the content.")
1415
+ @tags("sequence-risk,behavioral,dynamic,owasp-asi08,nist-au-6")
1416
+ @reject_message("Tool execution blocked: high-risk behavioral sequence detected (risk score 80+). The pattern of agent actions indicates a coordinated attack. This session requires human review before continuing.")
732
1417
  forbid (
733
1418
  principal,
734
1419
  action == Overwatch::Action::"call_tool",
735
1420
  resource
736
1421
  )
737
1422
  when {
738
- context has mcp_server_verified && context.mcp_server_verified == false &&
739
- context has threat_count && context.threat_count > 0
1423
+ context has sequence_risk && context.sequence_risk >= 80
740
1424
  };
741
1425
  `;
742
1426
  const OVERWATCH_TOOLS_MCP_ALLOWLIST_CEDAR = `// MCP Server Allowlist Template
@@ -883,40 +1567,47 @@ permit (
883
1567
  resource
884
1568
  );
885
1569
  `;
886
- const OVERWATCH_ORG_TEAM_PERMISSIONS_CEDAR = `// Team-Based Permissions (ReBAC)
887
- // Grant IDE access based on team membership using entity hierarchy
1570
+ const OVERWATCH_ORG_TEAM_PERMISSIONS_CEDAR = `// =============================================================================
1571
+ // Project-Based Permissions (ReBAC)
1572
+ // =============================================================================
1573
+ // Grant IDE access based on project scope using entity hierarchy.
1574
+ // With the aligned schema, principals (User, Agent) are flat — scoping is
1575
+ // done via resource hierarchy instead of principal hierarchy.
1576
+ //
888
1577
  // Category: organization
889
1578
  // Namespace: Overwatch
890
1579
  //
891
1580
  // Entity hierarchy required:
892
- // Organization::"acme-corp"
893
- // └── Team::"dev-team" (in Organization)
894
- // └── Agent::"claude" (in Team)
895
- // └── Team::"support-team" (in Organization)
896
- // └── Agent::"claude-support" (in Team)
897
-
898
- // Dev Team: Full IDE access - all actions permitted
899
- @id("team-dev-full-access")
900
- @name("Dev team full IDE access")
901
- @description("Grant development team agents full IDE access including tools, prompts, file operations, and server connections")
1581
+ // Account::"acme-corp"
1582
+ // └── Project::"dev-project" (in Account)
1583
+ // └── Project::"support-project" (in Account)
1584
+ //
1585
+ // Resources (Tool, Server, FilePath, LlmPrompt) are parented under Project,
1586
+ // so \`resource in Project::"..."\` matches all resources in that project.
1587
+ // =============================================================================
1588
+
1589
+ // Dev Project: Full IDE access - all actions permitted on all resources
1590
+ @id("project-dev-full-access")
1591
+ @name("Dev project full IDE access")
1592
+ @description("Grant full IDE access to all resources within the dev project including tools, prompts, file operations, and server connections")
902
1593
  @severity("medium")
903
- @tags("rebac,team,dev,permissions,organization")
1594
+ @tags("rebac,project,dev,permissions,organization")
904
1595
  permit (
905
- principal in Overwatch::Team::"dev-team",
1596
+ principal,
906
1597
  action,
907
- resource
1598
+ resource in Overwatch::Project::"dev-project"
908
1599
  );
909
1600
 
910
- // Support Team: Read-only access - process prompts and read files only
911
- @id("team-support-read-only")
912
- @name("Support team read-only access")
913
- @description("Grant support team agents read-only access limited to prompt processing and file reading")
1601
+ // Support Project: Read-only access - process prompts and read files only
1602
+ @id("project-support-read-only")
1603
+ @name("Support project read-only access")
1604
+ @description("Grant read-only access to support project resources limited to prompt processing and file reading")
914
1605
  @severity("medium")
915
- @tags("rebac,team,support,read-only,organization")
1606
+ @tags("rebac,project,support,read-only,organization")
916
1607
  permit (
917
- principal in Overwatch::Team::"support-team",
1608
+ principal,
918
1609
  action in [Overwatch::Action::"process_prompt", Overwatch::Action::"read_file"],
919
- resource
1610
+ resource in Overwatch::Project::"support-project"
920
1611
  );
921
1612
  `;
922
1613
  const OVERWATCH_ORG_AGENT_GUARDRAILS_CEDAR = `// Agent-Specific Guardrails
@@ -940,7 +1631,7 @@ forbid (
940
1631
  resource
941
1632
  )
942
1633
  when {
943
- context.yara_threats.contains("prompt_injection")
1634
+ context.detected_threats.contains("prompt_injection")
944
1635
  };
945
1636
 
946
1637
  // Cursor: Block PII leakage
@@ -968,7 +1659,9 @@ export const OVERWATCH_CATEGORIES = [
968
1659
  { id: 'tools', name: 'Tool Permissioning', description: 'Control access to shell execution, file operations, MCP servers, and sensitive system paths' },
969
1660
  { id: 'organization', name: 'Organization Rules', description: 'Apply organization-wide policy baselines, team permissions, and agent-specific guardrails' },
970
1661
  { id: 'trust_safety', name: 'Content Safety', description: 'Detect and control violent, harmful, hateful, sexual, and profane content using trust/safety classification scores' },
971
- { id: 'agent_security', name: 'Agent Security', description: 'Detect tool poisoning, rug pull attacks, and indirect prompt injection targeting AI agents' },
1662
+ { id: 'agent_security', name: 'Agent Security', description: 'Detect and block tool poisoning, rug pull attacks, indirect prompt injection, and MCP supply chain threats targeting AI agents' },
1663
+ { id: 'encoding', name: 'Encoding & Unicode Attacks', description: 'Detect invisible Unicode characters, bidirectional text overrides, and encoded injection payloads used to hide malicious instructions' },
1664
+ { id: 'behavioral', name: 'Behavioral Analysis', description: 'Detect suspicious action sequences, tool call loops, data exfiltration patterns, and credential theft chains across agent sessions' },
972
1665
  ];
973
1666
  // =============================================================================
974
1667
  // DEFAULT POLICIES
@@ -984,71 +1677,85 @@ export const OVERWATCH_DEFAULTS = [
984
1677
  tags: ['baseline', 'permit-default', 'organization'],
985
1678
  isActive: true,
986
1679
  },
1680
+ {
1681
+ id: 'semantic-default',
1682
+ name: 'Semantic Threat Detection',
1683
+ description: 'Detect and block prompt injection, jailbreak attempts, and high-severity threats using detection rules and ML classifiers',
1684
+ category: 'semantic',
1685
+ cedarText: OVERWATCH_SEMANTIC_DEFAULT_CEDAR,
1686
+ severity: 'critical',
1687
+ tags: ['prompt-injection', 'jailbreak', 'owasp-llm01', 'owasp-llm02', 'security', 'baseline'],
1688
+ isActive: true,
1689
+ },
1690
+ {
1691
+ id: 'trust-safety-default',
1692
+ name: 'Content Safety',
1693
+ description: 'Detect and block violent, harmful, hateful, sexual, and profane content using ML classification scores',
1694
+ category: 'trust_safety',
1695
+ cedarText: OVERWATCH_TRUST_SAFETY_DEFAULT_CEDAR,
1696
+ severity: 'critical',
1697
+ tags: ['violence', 'weapons', 'hate-speech', 'crime', 'sexual', 'profanity', 'content-safety', 'baseline'],
1698
+ isActive: true,
1699
+ },
1700
+ ];
1701
+ // =============================================================================
1702
+ // ALL TEMPLATES
1703
+ // =============================================================================
1704
+ export const OVERWATCH_TEMPLATES = [
987
1705
  {
988
1706
  id: 'secrets-default',
989
1707
  name: 'Secrets Detection',
990
- description: 'Detect and block credential leakage across prompts, tool calls, file operations, and AI response content',
1708
+ description: 'Detect and block credential leakage across prompts, tool calls, file operations, and AI responses using multi-layered detection',
991
1709
  category: 'secrets',
992
1710
  cedarText: OVERWATCH_SECRETS_DEFAULT_CEDAR,
993
1711
  severity: 'critical',
994
1712
  tags: ['api-keys', 'tokens', 'credentials', 'aws', 'github', 'ssh', 'baseline'],
995
- isActive: true,
996
1713
  },
997
1714
  {
998
1715
  id: 'pii-default',
999
1716
  name: 'PII Detection',
1000
- description: 'Detect and block credit card numbers, SSN, and other sensitive personal information in prompts and tool calls',
1717
+ description: 'Detect and block credit card numbers, SSNs, health data, and other PII in prompts, tool calls, and file operations',
1001
1718
  category: 'pii',
1002
1719
  cedarText: OVERWATCH_PII_DEFAULT_CEDAR,
1003
1720
  severity: 'critical',
1004
- tags: ['pii', 'privacy', 'compliance', 'pci-dss', 'gdpr', 'baseline'],
1005
- isActive: true,
1006
- },
1007
- {
1008
- id: 'semantic-default',
1009
- name: 'Semantic Threat Detection',
1010
- description: 'Detect and block prompt injection, jailbreak attempts, and high-severity AI security threats',
1011
- category: 'semantic',
1012
- cedarText: OVERWATCH_SEMANTIC_DEFAULT_CEDAR,
1013
- severity: 'critical',
1014
- tags: ['prompt-injection', 'jailbreak', 'owasp-llm01', 'security', 'baseline'],
1015
- isActive: true,
1721
+ tags: ['pii', 'privacy', 'compliance', 'pci-dss', 'gdpr', 'hipaa', 'baseline'],
1016
1722
  },
1017
1723
  {
1018
1724
  id: 'tools-default',
1019
1725
  name: 'Tool Permissioning',
1020
- description: 'Block dangerous shell execution, restrict sensitive file paths, and enforce threat-based tool access controls',
1726
+ description: 'Block dangerous shell execution, restrict sensitive file paths, enforce tool risk scoring, and detect command injection in tool arguments',
1021
1727
  category: 'tools',
1022
1728
  cedarText: OVERWATCH_TOOLS_DEFAULT_CEDAR,
1023
1729
  severity: 'critical',
1024
- tags: ['shell', 'command-injection', 'file-access', 'mitre-t1059', 'baseline'],
1025
- isActive: false,
1026
- },
1027
- {
1028
- id: 'trust-safety-default',
1029
- name: 'Content Safety',
1030
- description: 'Detect and block violent, harmful, hateful, sexual, and profane content using classification scores',
1031
- category: 'trust_safety',
1032
- cedarText: OVERWATCH_TRUST_SAFETY_DEFAULT_CEDAR,
1033
- severity: 'critical',
1034
- tags: ['violence', 'weapons', 'hate-speech', 'crime', 'sexual', 'profanity', 'content-safety', 'baseline'],
1035
- isActive: true,
1730
+ tags: ['shell', 'command-injection', 'file-access', 'tool-risk', 'mitre-t1059', 'owasp-llm06', 'baseline'],
1036
1731
  },
1037
1732
  {
1038
1733
  id: 'agent-security-default',
1039
1734
  name: 'Agent Security',
1040
- description: 'Detect and block tool poisoning, rug pull attacks, and indirect prompt injection targeting AI agents',
1735
+ description: 'Detect and block tool poisoning, rug pull attacks, indirect prompt injection, and MCP supply chain threats via Shield detection',
1041
1736
  category: 'agent_security',
1042
1737
  cedarText: OVERWATCH_AGENT_SECURITY_DEFAULT_CEDAR,
1043
1738
  severity: 'critical',
1044
- tags: ['tool-poisoning', 'rug-pull', 'indirect-injection', 'mcp-security', 'agent-security', 'baseline'],
1045
- isActive: true,
1739
+ tags: ['tool-poisoning', 'rug-pull', 'indirect-injection', 'mcp-security', 'agent-security', 'owasp-asi01', 'owasp-asi04', 'baseline'],
1740
+ },
1741
+ {
1742
+ id: 'encoding-default',
1743
+ name: 'Encoding Attack Detection',
1744
+ description: 'Detect and block invisible Unicode characters, bidirectional overrides, and encoding-based injection attacks across prompts, tools, and files',
1745
+ category: 'encoding',
1746
+ cedarText: OVERWATCH_ENCODING_DEFAULT_CEDAR,
1747
+ severity: 'high',
1748
+ tags: ['unicode', 'invisible-chars', 'bidi-override', 'encoding', 'owasp-llm01', 'baseline'],
1749
+ },
1750
+ {
1751
+ id: 'behavioral-default',
1752
+ name: 'Behavioral Analysis',
1753
+ description: 'Detect and block tool call loops, data exfiltration sequences, credential theft chains, and destructive operation patterns',
1754
+ category: 'behavioral',
1755
+ cedarText: OVERWATCH_BEHAVIORAL_DEFAULT_CEDAR,
1756
+ severity: 'high',
1757
+ tags: ['loop-detection', 'data-exfiltration', 'credential-theft', 'behavioral', 'owasp-llm10', 'owasp-asi02', 'baseline'],
1046
1758
  },
1047
- ];
1048
- // =============================================================================
1049
- // ALL TEMPLATES
1050
- // =============================================================================
1051
- export const OVERWATCH_TEMPLATES = [
1052
1759
  {
1053
1760
  id: 'tools-mcp-allowlist',
1054
1761
  name: 'MCP Server Allowlist',
@@ -1111,7 +1818,7 @@ export const OVERWATCH_TEMPLATES = [
1111
1818
  export const OVERWATCH_TEMPLATES_JSON = `{
1112
1819
  "service": "overwatch",
1113
1820
  "version": "3.0.0",
1114
- "description": "Overwatch policy templates for IDE security",
1821
+ "description": "Overwatch policy templates for IDE agent security",
1115
1822
  "categories": [
1116
1823
  {
1117
1824
  "id": "secrets",
@@ -1146,7 +1853,17 @@ export const OVERWATCH_TEMPLATES_JSON = `{
1146
1853
  {
1147
1854
  "id": "agent_security",
1148
1855
  "name": "Agent Security",
1149
- "description": "Detect tool poisoning, rug pull attacks, and indirect prompt injection targeting AI agents"
1856
+ "description": "Detect and block tool poisoning, rug pull attacks, indirect prompt injection, and MCP supply chain threats targeting AI agents"
1857
+ },
1858
+ {
1859
+ "id": "encoding",
1860
+ "name": "Encoding & Unicode Attacks",
1861
+ "description": "Detect invisible Unicode characters, bidirectional text overrides, and encoded injection payloads used to hide malicious instructions"
1862
+ },
1863
+ {
1864
+ "id": "behavioral",
1865
+ "name": "Behavioral Analysis",
1866
+ "description": "Detect suspicious action sequences, tool call loops, data exfiltration patterns, and credential theft chains across agent sessions"
1150
1867
  }
1151
1868
  ],
1152
1869
  "defaults": [
@@ -1160,68 +1877,82 @@ export const OVERWATCH_TEMPLATES_JSON = `{
1160
1877
  "tags": ["baseline", "permit-default", "organization"],
1161
1878
  "is_active": true
1162
1879
  },
1880
+ {
1881
+ "id": "semantic-default",
1882
+ "name": "Semantic Threat Detection",
1883
+ "description": "Detect and block prompt injection, jailbreak attempts, and high-severity threats using detection rules and ML classifiers",
1884
+ "category": "semantic",
1885
+ "file": "defaults/semantic.cedar",
1886
+ "severity": "critical",
1887
+ "tags": ["prompt-injection", "jailbreak", "owasp-llm01", "owasp-llm02", "security", "baseline"],
1888
+ "is_active": true
1889
+ },
1890
+ {
1891
+ "id": "trust-safety-default",
1892
+ "name": "Content Safety",
1893
+ "description": "Detect and block violent, harmful, hateful, sexual, and profane content using ML classification scores",
1894
+ "category": "trust_safety",
1895
+ "file": "defaults/trust_safety.cedar",
1896
+ "severity": "critical",
1897
+ "tags": ["violence", "weapons", "hate-speech", "crime", "sexual", "profanity", "content-safety", "baseline"],
1898
+ "is_active": true
1899
+ }
1900
+ ],
1901
+ "templates": [
1163
1902
  {
1164
1903
  "id": "secrets-default",
1165
1904
  "name": "Secrets Detection",
1166
- "description": "Detect and block credential leakage across prompts, tool calls, file operations, and AI response content",
1905
+ "description": "Detect and block credential leakage across prompts, tool calls, file operations, and AI responses using multi-layered detection",
1167
1906
  "category": "secrets",
1168
1907
  "file": "defaults/secrets.cedar",
1169
1908
  "severity": "critical",
1170
- "tags": ["api-keys", "tokens", "credentials", "aws", "github", "ssh", "baseline"],
1171
- "is_active": true
1909
+ "tags": ["api-keys", "tokens", "credentials", "aws", "github", "ssh", "baseline"]
1172
1910
  },
1173
1911
  {
1174
1912
  "id": "pii-default",
1175
1913
  "name": "PII Detection",
1176
- "description": "Detect and block credit card numbers, SSN, and other sensitive personal information in prompts and tool calls",
1914
+ "description": "Detect and block credit card numbers, SSNs, health data, and other PII in prompts, tool calls, and file operations",
1177
1915
  "category": "pii",
1178
1916
  "file": "defaults/pii.cedar",
1179
1917
  "severity": "critical",
1180
- "tags": ["pii", "privacy", "compliance", "pci-dss", "gdpr", "baseline"],
1181
- "is_active": true
1182
- },
1183
- {
1184
- "id": "semantic-default",
1185
- "name": "Semantic Threat Detection",
1186
- "description": "Detect and block prompt injection, jailbreak attempts, and high-severity AI security threats",
1187
- "category": "semantic",
1188
- "file": "defaults/semantic.cedar",
1189
- "severity": "critical",
1190
- "tags": ["prompt-injection", "jailbreak", "owasp-llm01", "security", "baseline"],
1191
- "is_active": true
1918
+ "tags": ["pii", "privacy", "compliance", "pci-dss", "gdpr", "hipaa", "baseline"]
1192
1919
  },
1193
1920
  {
1194
1921
  "id": "tools-default",
1195
1922
  "name": "Tool Permissioning",
1196
- "description": "Block dangerous shell execution, restrict sensitive file paths, and enforce threat-based tool access controls",
1923
+ "description": "Block dangerous shell execution, restrict sensitive file paths, enforce tool risk scoring, and detect command injection in tool arguments",
1197
1924
  "category": "tools",
1198
1925
  "file": "defaults/tools.cedar",
1199
1926
  "severity": "critical",
1200
- "tags": ["shell", "command-injection", "file-access", "mitre-t1059", "baseline"],
1201
- "is_active": false
1202
- },
1203
- {
1204
- "id": "trust-safety-default",
1205
- "name": "Content Safety",
1206
- "description": "Detect and block violent, harmful, hateful, sexual, and profane content using classification scores",
1207
- "category": "trust_safety",
1208
- "file": "defaults/trust_safety.cedar",
1209
- "severity": "critical",
1210
- "tags": ["violence", "weapons", "hate-speech", "crime", "sexual", "profanity", "content-safety", "baseline"],
1211
- "is_active": true
1927
+ "tags": ["shell", "command-injection", "file-access", "tool-risk", "mitre-t1059", "owasp-llm06", "baseline"]
1212
1928
  },
1213
1929
  {
1214
1930
  "id": "agent-security-default",
1215
1931
  "name": "Agent Security",
1216
- "description": "Detect and block tool poisoning, rug pull attacks, and indirect prompt injection targeting AI agents",
1932
+ "description": "Detect and block tool poisoning, rug pull attacks, indirect prompt injection, and MCP supply chain threats via Shield detection",
1217
1933
  "category": "agent_security",
1218
1934
  "file": "defaults/agent_security.cedar",
1219
1935
  "severity": "critical",
1220
- "tags": ["tool-poisoning", "rug-pull", "indirect-injection", "mcp-security", "agent-security", "baseline"],
1221
- "is_active": true
1222
- }
1223
- ],
1224
- "templates": [
1936
+ "tags": ["tool-poisoning", "rug-pull", "indirect-injection", "mcp-security", "agent-security", "owasp-asi01", "owasp-asi04", "baseline"]
1937
+ },
1938
+ {
1939
+ "id": "encoding-default",
1940
+ "name": "Encoding Attack Detection",
1941
+ "description": "Detect and block invisible Unicode characters, bidirectional overrides, and encoding-based injection attacks across prompts, tools, and files",
1942
+ "category": "encoding",
1943
+ "file": "defaults/encoding_attacks.cedar",
1944
+ "severity": "high",
1945
+ "tags": ["unicode", "invisible-chars", "bidi-override", "encoding", "owasp-llm01", "baseline"]
1946
+ },
1947
+ {
1948
+ "id": "behavioral-default",
1949
+ "name": "Behavioral Analysis",
1950
+ "description": "Detect and block tool call loops, data exfiltration sequences, credential theft chains, and destructive operation patterns",
1951
+ "category": "behavioral",
1952
+ "file": "defaults/behavioral.cedar",
1953
+ "severity": "high",
1954
+ "tags": ["loop-detection", "data-exfiltration", "credential-theft", "behavioral", "owasp-llm10", "owasp-asi02", "baseline"]
1955
+ },
1225
1956
  {
1226
1957
  "id": "tools-mcp-allowlist",
1227
1958
  "name": "MCP Server Allowlist",