@highflame/policy 2.1.7 → 2.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. package/_schemas/guardrails/context.json +78 -0
  2. package/_schemas/guardrails/schema.cedarschema +28 -1
  3. package/_schemas/guardrails/templates/profiles/a2a_security/cross_origin.cedar +105 -0
  4. package/_schemas/guardrails/templates/profiles/a2a_security/escalation_detection.cedar +113 -0
  5. package/_schemas/guardrails/templates/profiles/a2a_security/identity_enforcement.cedar +118 -0
  6. package/_schemas/guardrails/templates/profiles/a2a_security/inter_agent_injection.cedar +134 -0
  7. package/_schemas/guardrails/templates/profiles/a2a_security/supply_chain.cedar +117 -0
  8. package/_schemas/guardrails/templates/profiles/advanced_detection/pii.cedar +73 -0
  9. package/_schemas/guardrails/templates/profiles/advanced_detection/secrets.cedar +66 -0
  10. package/_schemas/guardrails/templates/profiles/advanced_detection/threat_severity.cedar +35 -0
  11. package/_schemas/guardrails/templates/profiles/code_agent/encoding.cedar +55 -0
  12. package/_schemas/guardrails/templates/profiles/code_agent/path_security.cedar +148 -0
  13. package/_schemas/guardrails/templates/profiles/code_agent/supply_chain.cedar +120 -0
  14. package/_schemas/guardrails/templates/templates.json +121 -6
  15. package/_schemas/overwatch/schema.cedarschema +6 -0
  16. package/dist/guardrails-context.gen.d.ts +5 -0
  17. package/dist/guardrails-context.gen.js +5 -0
  18. package/dist/guardrails-defaults.gen.js +1315 -6
  19. package/dist/overwatch-defaults.gen.d.ts +1 -1
  20. package/dist/overwatch-defaults.gen.js +508 -1206
  21. package/dist/service-schemas.gen.d.ts +2 -2
  22. package/dist/service-schemas.gen.js +47 -1
  23. package/package.json +1 -1
@@ -32,325 +32,27 @@ permit (
32
32
  resource
33
33
  );
34
34
  `;
35
- const OVERWATCH_SEMANTIC_DEFAULT_CEDAR = `// =============================================================================
36
- // Semantic Threat Detection Policy (Default)
37
- // =============================================================================
38
- // Detects and blocks prompt injection, jailbreak attempts, and high-severity
39
- // AI security threats. Uses multi-layered detection:
40
- //
41
- // 1. Detection engine rule triggers (detected_threats) — pattern-based
42
- // 2. ML classifier confidence scores (injection_confidence, jailbreak_confidence)
43
- // 3. Threat severity aggregation (max_threat_severity, highest_severity)
44
- // 4. Cross-action enforcement (prompts + tool calls + file operations)
45
- //
46
- // Compliance:
47
- // OWASP LLM01 (Prompt Injection) — direct + indirect
48
- // OWASP LLM02 (Insecure Output Handling) — response manipulation
49
- // OWASP ASI01 (Agent Goal Hijack) — behavioral manipulation
50
- // MITRE ATLAS AML.T0051 (LLM Prompt Injection)
51
- // MITRE ATLAS AML.T0054 (LLM Jailbreak)
52
- // NIST 800-53 SI-3 (Malicious Code Protection)
53
- // NIST 800-53 SI-4 (Information System Monitoring)
54
- //
55
- // Category: semantic
56
- // Namespace: Overwatch
57
- // =============================================================================
58
-
59
- // ---------------------------------------------------------------------------
60
- // Section 1: Prompt Injection Detection
61
- // Blocks direct prompt injection — adversarial input designed to override
62
- // system instructions and hijack agent behavior.
63
- // Ref: OWASP LLM01, MITRE AML.T0051, 62% of LLM apps vulnerable (2024)
64
- // ---------------------------------------------------------------------------
65
-
66
- // Block content with prompt injection patterns detected by rules
67
- @id("semantic-block-injection")
68
- @name("Block prompt injection")
69
- @description("Block prompts and tool calls when detection engine rules identify prompt injection patterns. Catches instruction override, role assumption, delimiter injection, and other manipulation techniques in both user input and tool arguments (OWASP LLM01).")
70
- @severity("critical")
71
- @tags("injection,security,owasp-llm01,mitre-aml-t0051,baseline")
72
- @reject_message("Content was blocked because prompt injection patterns were detected. This is a security measure to prevent manipulation of AI agent behavior (OWASP LLM01).")
73
- forbid (
74
- principal,
75
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
76
- resource
77
- )
78
- when {
79
- context has detected_threats && context.detected_threats.contains("prompt_injection")
80
- };
81
-
82
- // Block content with high ML injection confidence
83
- @id("semantic-block-injection-score")
84
- @name("Block high-confidence injection")
85
- @description("Block content when the ML injection classifier confidence exceeds threshold (75/100). Catches novel injection techniques that evade pattern-based detection — polymorphic payloads, encoding tricks, and obfuscated instructions.")
86
- @severity("critical")
87
- @tags("injection,ml-classifier,security,owasp-llm01,mitre-aml-t0051")
88
- @reject_message("Your content was blocked because the ML classifier detected prompt injection with high confidence. This appears to be an attempt to manipulate agent behavior.")
89
- forbid (
90
- principal,
91
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
92
- resource
93
- )
94
- when {
95
- context has injection_confidence && context.injection_confidence >= 75
96
- };
97
-
98
- // ---------------------------------------------------------------------------
99
- // Section 2: Jailbreak Detection
100
- // Blocks jailbreak attempts — adversarial input designed to bypass AI safety
101
- // guardrails and elicit restricted outputs.
102
- // Ref: OWASP LLM02, MITRE AML.T0054, DAN/JailbreakChat/etc.
103
- // ---------------------------------------------------------------------------
104
-
105
- // Block prompts with jailbreak attempts detected by rules
106
- @id("semantic-block-jailbreak")
107
- @name("Block jailbreak attempts")
108
- @description("Block prompts when detection engine rules identify jailbreak patterns: DAN-style prompts, role-play exploits, safety bypass instructions, and constraint removal attempts (OWASP LLM02).")
109
- @severity("critical")
110
- @tags("jailbreak,bypass,security,owasp-llm02,mitre-aml-t0054,baseline")
111
- @reject_message("Your prompt was blocked because jailbreak patterns were detected. This is a security measure to prevent circumvention of AI safety controls (OWASP LLM02).")
112
- forbid (
113
- principal,
114
- action == Overwatch::Action::"process_prompt",
115
- resource
116
- )
117
- when {
118
- context has detected_threats && context.detected_threats.contains("jailbreak")
119
- };
120
-
121
- // Block prompts with high ML jailbreak confidence
122
- @id("semantic-block-jailbreak-score")
123
- @name("Block high-confidence jailbreak")
124
- @description("Block content when the ML jailbreak classifier confidence exceeds threshold (75/100). Catches sophisticated jailbreak techniques including multi-turn manipulation, encoded payloads, and novel prompt structures.")
125
- @severity("critical")
126
- @tags("jailbreak,ml-classifier,security,owasp-llm02,mitre-aml-t0054")
127
- @reject_message("Your content was blocked because the ML classifier detected a jailbreak attempt with high confidence. This appears to be an attempt to bypass safety guardrails.")
128
- forbid (
129
- principal,
130
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
131
- resource
132
- )
133
- when {
134
- context has jailbreak_confidence && context.jailbreak_confidence >= 75
135
- };
136
-
137
- // ---------------------------------------------------------------------------
138
- // Section 3: Threat Severity Aggregation
139
- // Blocks based on aggregated threat severity from all detection engines.
140
- // These act as catch-all rules for threats that don't match specific patterns.
141
- // ---------------------------------------------------------------------------
142
-
143
- // Block any content with critical severity threats
144
- @id("semantic-block-critical")
145
- @name("Block critical threats")
146
- @description("Block all content when any detection engine reports critical severity. This is the ultimate catch-all — critical threats are blocked regardless of type or source.")
147
- @severity("critical")
148
- @tags("critical,baseline,security,catch-all")
149
- @reject_message("Your content was blocked because security scanners detected a critical-severity threat. This content cannot be processed.")
150
- forbid (
151
- principal,
152
- action,
153
- resource
154
- )
155
- when {
156
- context has highest_severity && context.highest_severity == "critical"
157
- };
158
-
159
- // Block prompts with high severity semantic threats
160
- @id("semantic-block-high-severity")
161
- @name("Block high severity threats")
162
- @description("Block prompts when threat detection reports high severity (>= 3) in semantic categories. Catches threats that are individually below critical but collectively indicate adversarial intent.")
163
- @severity("high")
164
- @tags("semantic,severity,security,defense-in-depth")
165
- @reject_message("Your prompt was blocked because security scanners detected high severity issues in the content. Review your prompt for manipulative or adversarial patterns.")
166
- forbid (
167
- principal,
168
- action == Overwatch::Action::"process_prompt",
169
- resource
170
- )
171
- when {
172
- context has threat_categories && context has max_threat_severity &&
173
- context.threat_categories.contains("semantic") &&
174
- context.max_threat_severity >= 3
175
- };
176
-
177
- // Block tool calls with multiple concurrent threats
178
- @id("semantic-block-multi-threat-tools")
179
- @name("Block multi-threat tool calls")
180
- @description("Block tool execution when multiple distinct threats are detected simultaneously (3+). Multiple concurrent threats in a tool call strongly indicate an adversarial attack chain.")
181
- @severity("high")
182
- @tags("multi-threat,tools,security,defense-in-depth")
183
- @reject_message("Tool execution was blocked because multiple security threats were detected simultaneously. This pattern indicates a potential attack chain.")
184
- forbid (
185
- principal,
186
- action == Overwatch::Action::"call_tool",
187
- resource
188
- )
189
- when {
190
- context has threat_count && context.threat_count >= 3
191
- };
192
- `;
193
- const OVERWATCH_TRUST_SAFETY_DEFAULT_CEDAR = `// =============================================================================
194
- // Content Safety Policy (Default)
195
- // =============================================================================
196
- // Detects and blocks violent, harmful, hateful, sexual, and profane content
197
- // using trust/safety ML classification scores.
198
- //
199
- // The detection engine runs multiple ML classifiers (toxicity, content safety)
200
- // and produces normalized scores (0-100) for each category. These policies
201
- // enforce enterprise-appropriate content thresholds.
202
- //
203
- // Compliance:
204
- // NIST 800-53 SI-4 (Information System Monitoring)
205
- // EU AI Act Art. 52 (Transparency for AI Systems)
206
- // OWASP LLM02 (Insecure Output Handling) — unsafe content generation
207
- // ISO 42001 (AI Management System) — responsible AI use
208
- //
209
- // Category: trust_safety
210
- // Namespace: Overwatch
211
- // =============================================================================
212
-
213
- // ---------------------------------------------------------------------------
214
- // Section 1: Violence & Weapons
215
- // Blocks content promoting, describing, or instructing violence and weapons.
216
- // ---------------------------------------------------------------------------
217
-
218
- // Block content with high violence score
219
- @id("ts-block-violence")
220
- @name("Block violent content")
221
- @description("Block content when the ML violence detection score exceeds threshold (80/100). Catches graphic violence descriptions, instructions for causing harm, and violent threat language.")
222
- @severity("critical")
223
- @tags("violence,content-safety,trust-safety,nist-si-4,iso-42001")
224
- @reject_message("Content blocked: violent content detected. AI agents in enterprise environments must not process or generate violent content. Please rephrase without violence-related language.")
225
- forbid (
226
- principal,
227
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
228
- resource
229
- )
230
- when {
231
- context has violence_score && context.violence_score >= 80
232
- };
233
-
234
- // Block content with weapons references
235
- @id("ts-block-weapons")
236
- @name("Block weapons content")
237
- @description("Block content when the ML weapons detection score exceeds threshold (80/100). Catches weapon manufacturing instructions, procurement guidance, and detailed weapon specifications.")
238
- @severity("critical")
239
- @tags("weapons,content-safety,trust-safety,nist-si-4")
240
- @reject_message("Content blocked: weapons-related content detected. AI agents must not process weapons manufacturing, procurement, or detailed specification content.")
241
- forbid (
242
- principal,
243
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
244
- resource
245
- )
246
- when {
247
- context has weapons_score && context.weapons_score >= 80
248
- };
249
-
250
- // ---------------------------------------------------------------------------
251
- // Section 2: Hate Speech & Discrimination
252
- // Blocks hateful, discriminatory, and dehumanizing content.
253
- // Lower threshold (75) — enterprises have zero tolerance for hate speech.
254
- // ---------------------------------------------------------------------------
255
-
256
- // Block hate speech content
257
- @id("ts-block-hate-speech")
258
- @name("Block hate speech")
259
- @description("Block content when the ML hate speech detection score exceeds threshold (75/100). Lower threshold than other categories because enterprises have zero tolerance for discriminatory content. Catches slurs, dehumanizing language, and targeted harassment.")
260
- @severity("critical")
261
- @tags("hate-speech,content-safety,trust-safety,nist-si-4,eu-ai-act")
262
- @reject_message("Content blocked: hate speech or discriminatory content detected. AI agents in enterprise environments must not process or generate hateful, discriminatory, or dehumanizing content.")
263
- forbid (
264
- principal,
265
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
266
- resource
267
- )
268
- when {
269
- context has hate_speech_score && context.hate_speech_score >= 75
270
- };
271
-
272
- // ---------------------------------------------------------------------------
273
- // Section 3: Criminal Content
274
- // Blocks content related to criminal activities and illegal operations.
275
- // ---------------------------------------------------------------------------
276
-
277
- // Block criminal content
278
- @id("ts-block-crime")
279
- @name("Block criminal content")
280
- @description("Block content when the ML criminal activity detection score exceeds threshold (80/100). Catches illegal activity instructions, fraud techniques, and content promoting criminal behavior.")
281
- @severity("high")
282
- @tags("crime,content-safety,trust-safety,nist-si-4")
283
- @reject_message("Content blocked: criminal activity content detected. AI agents must not process content related to illegal activities, fraud, or other criminal behavior.")
284
- forbid (
285
- principal,
286
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
287
- resource
288
- )
289
- when {
290
- context has crime_score && context.crime_score >= 80
291
- };
292
-
293
- // ---------------------------------------------------------------------------
294
- // Section 4: Sexual Content
295
- // Blocks sexually explicit or inappropriate content.
296
- // ---------------------------------------------------------------------------
297
-
298
- // Block sexual content
299
- @id("ts-block-sexual")
300
- @name("Block sexual content")
301
- @description("Block content when the ML sexual content detection score exceeds threshold (80/100). Ensures enterprise AI agents do not process or generate sexually explicit material.")
302
- @severity("high")
303
- @tags("sexual,content-safety,trust-safety,eu-ai-act,iso-42001")
304
- @reject_message("Content blocked: sexual content detected. AI agents in enterprise environments must not process sexually explicit material.")
305
- forbid (
306
- principal,
307
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
308
- resource
309
- )
310
- when {
311
- context has sexual_score && context.sexual_score >= 80
312
- };
313
-
314
- // ---------------------------------------------------------------------------
315
- // Section 5: Profanity
316
- // Blocks excessive profanity in enterprise environments.
317
- // Higher threshold (90) — allows normal expression while blocking abuse.
318
- // ---------------------------------------------------------------------------
319
-
320
- // Block excessive profanity
321
- @id("ts-block-profanity")
322
- @name("Block profanity")
323
- @description("Block content when the ML profanity detection score exceeds threshold (90/100). Higher threshold than other safety categories — allows normal expression while blocking abusive or harassing language patterns.")
324
- @severity("medium")
325
- @tags("profanity,content-safety,trust-safety")
326
- @reject_message("Content blocked: excessive profanity detected. Please rephrase in a professional manner appropriate for enterprise AI interactions.")
327
- forbid (
328
- principal,
329
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
330
- resource
331
- )
332
- when {
333
- context has profanity_score && context.profanity_score >= 90
334
- };
335
- `;
336
35
  const OVERWATCH_SECRETS_DEFAULT_CEDAR = `// =============================================================================
337
36
  // Secrets Detection Policy (Default)
338
37
  // =============================================================================
339
- // Detects and blocks credential leakage across prompts, tool calls, file
340
- // operations, and AI response content. Uses multi-layered detection:
38
+ // Detects and blocks credential leakage across prompts, tool calls, and file
39
+ // operations using YARA rule name matching and file path patterns.
40
+ //
41
+ // Detection layers:
42
+ // 1. YARA rule triggers (detected_threats) — pattern-based, always available
43
+ // 2. Sensitive file path blocking (.env, credential directories)
341
44
  //
342
- // 1. Detection engine boolean (contains_secrets) — fastest, broadest catch
343
- // 2. Granular secret type matching (secret_types) — type-specific blocking
344
- // 3. Detection rule pattern matching (detected_threats) — named rule triggers
345
- // 4. Sensitive file path blocking (.env, credentials files)
346
- // 5. Response content pattern matching (defense-in-depth for AI outputs)
45
+ // YARA rules matched:
46
+ // secrets_leakage — AWS keys, GitHub tokens, private key headers, API keys
47
+ // ssh_key_exposure — SSH private key content and key file paths
48
+ // pem_file_access — PEM/certificate private keys and key files
49
+ // environment_variable_leakage — OPENAI_API_KEY, HF_TOKEN, generic *_API_KEY
347
50
  //
348
51
  // Compliance:
349
52
  // NIST 800-53 SC-28 (Protection of Information at Rest)
350
53
  // NIST 800-53 IA-5 (Authenticator Management)
351
54
  // OWASP LLM07 (Insecure Plugin Design) — secrets in tool args
352
55
  // MITRE ATT&CK T1552 (Unsecured Credentials)
353
- // MITRE ATT&CK T1555 (Credentials from Password Stores)
354
56
  // CIS Benchmark 1.4 (Secrets Management)
355
57
  //
356
58
  // Category: secrets
@@ -358,1070 +60,764 @@ const OVERWATCH_SECRETS_DEFAULT_CEDAR = `// ====================================
358
60
  // =============================================================================
359
61
 
360
62
  // ---------------------------------------------------------------------------
361
- // Section 1: Detection Engine — Primary Secret Detection
362
- // These fire when the detection pipeline identifies secrets in any content.
63
+ // Section 1: YARA Rule — secrets_leakage
64
+ // Detects AWS access keys, GitHub tokens, private key headers, API key
65
+ // assignments, and bearer tokens in content.
363
66
  // ---------------------------------------------------------------------------
364
67
 
365
68
  // Block prompts containing detected secrets
366
- @id("secrets-block-prompts")
367
- @name("Block prompts with secrets")
368
- @description("Block prompts when detection engines identify API keys, tokens, or credential patterns. First line of defense against accidental secret exposure in user input.")
369
- @severity("critical")
370
- @tags("secrets,credentials,prompts,nist-sc-28,nist-ia-5")
371
- @reject_message("Your prompt was blocked because it contains detected secrets such as API keys, tokens, or credentials. Remove all secrets before resubmitting.")
69
+ @id("secrets-block-leakage-prompt")
70
+ @name("Block secrets in prompts")
71
+ @description("Block prompts when YARA detects exposed secrets — AWS access keys (AKIA...), GitHub tokens (ghp_...), private key headers, bearer tokens, or API key assignments (api_key=sk-proj-...).")
72
+ @severity("high")
73
+ @tags("secrets,credentials,yara,process-prompt,nist-sc-28,nist-ia-5")
74
+ @reject_message("Prompt blocked: exposed secrets detected (AWS keys, tokens, private keys, or API key assignments). Remove sensitive credentials before submitting.")
372
75
  forbid (
373
76
  principal,
374
77
  action == Overwatch::Action::"process_prompt",
375
78
  resource
376
79
  )
377
80
  when {
378
- context has contains_secrets && context.contains_secrets
81
+ context.detected_threats.contains("secrets_leakage")
379
82
  };
380
83
 
381
- // Block file reads and tool calls when secrets are detected
382
- @id("secrets-block-reads-and-tools")
383
- @name("Block file reads and tool calls with secrets")
384
- @description("Prevent file reads and tool execution when secrets or credentials are detected in content. Blocks exfiltration of secrets via file operations and tool arguments.")
84
+ // Block tool calls containing detected secrets
85
+ @id("secrets-block-leakage-tool")
86
+ @name("Block secrets in tool calls")
87
+ @description("Block tool execution when YARA detects exposed secrets in tool arguments or command content.")
385
88
  @severity("high")
386
- @tags("secrets,file-access,tools,credentials,nist-sc-28,mitre-t1552")
387
- @reject_message("This operation was blocked because secrets or credentials were detected in the content. File reads and tool calls are restricted when credential exposure is identified.")
89
+ @tags("secrets,credentials,yara,call-tool,nist-sc-28,mitre-t1552")
90
+ @reject_message("Tool execution blocked: exposed secrets detected in command or arguments. Remove sensitive credentials before executing.")
388
91
  forbid (
389
92
  principal,
390
- action in [Overwatch::Action::"read_file", Overwatch::Action::"call_tool"],
391
- resource
392
- )
393
- when {
394
- context has contains_secrets && context.contains_secrets
395
- };
396
-
397
- // Block file writes containing secrets
398
- @id("secrets-block-file-writes")
399
- @name("Block file writes with secrets")
400
- @description("Prevent writing files that contain secrets. Stops credential persistence to disk where they could be committed to version control or accessed by other tools.")
401
- @severity("critical")
402
- @tags("secrets,file-write,credentials,nist-sc-28,cis-1.4")
403
- @reject_message("File write was blocked because secrets or credentials were detected in the content. Credentials should never be written to files — use a secrets manager or environment variables.")
404
- forbid (
405
- principal,
406
- action == Overwatch::Action::"write_file",
93
+ action == Overwatch::Action::"call_tool",
407
94
  resource
408
95
  )
409
96
  when {
410
- context has contains_secrets && context.contains_secrets
97
+ context has detected_threats && context.detected_threats.contains("secrets_leakage")
411
98
  };
412
99
 
413
100
  // ---------------------------------------------------------------------------
414
- // Section 2: Granular Secret Type Blocking
415
- // Blocks specific high-risk credential types identified by the detection
416
- // engine's pattern-matching (e.g., AWS keys, GitHub tokens, SSH keys).
101
+ // Section 2: YARA Rule — ssh_key_exposure
102
+ // Detects SSH private key content (BEGIN OPENSSH/RSA PRIVATE KEY) and
103
+ // SSH key file paths (/.ssh/id_rsa, id_ed25519).
417
104
  // ---------------------------------------------------------------------------
418
105
 
419
- // Block high-risk secret types across all actions
420
- @id("secrets-block-high-risk-types")
421
- @name("Block high-risk credential types")
422
- @description("Block content containing cloud provider keys (AWS, GCP, Azure), GitHub tokens, SSH private keys, or database connection strings. These credential types pose the highest exfiltration risk.")
423
- @severity("critical")
424
- @tags("secrets,aws,github,ssh,cloud,nist-ia-5,mitre-t1552")
425
- @reject_message("Content blocked: high-risk credentials detected (cloud provider keys, GitHub tokens, SSH keys, or database credentials). Use a secrets manager — never pass credentials through AI agents.")
426
- forbid (
427
- principal,
428
- action,
429
- resource
430
- )
431
- when {
432
- context has secret_types &&
433
- (context.secret_types.contains("aws_access_key") ||
434
- context.secret_types.contains("aws_secret_key") ||
435
- context.secret_types.contains("gcp_service_account") ||
436
- context.secret_types.contains("azure_client_secret") ||
437
- context.secret_types.contains("github_token") ||
438
- context.secret_types.contains("github_pat") ||
439
- context.secret_types.contains("ssh_private_key") ||
440
- context.secret_types.contains("database_url"))
441
- };
442
-
443
- // Block API keys and bearer tokens across all actions
444
- @id("secrets-block-api-keys")
445
- @name("Block API keys and bearer tokens")
446
- @description("Block content containing generic API keys, bearer tokens, JWT tokens, and OAuth credentials. These are the most commonly leaked credential types in AI agent interactions.")
447
- @severity("high")
448
- @tags("secrets,api-key,bearer,jwt,oauth,nist-ia-5")
449
- @reject_message("Content blocked: API keys, bearer tokens, or OAuth credentials detected. These must never be passed through AI agent prompts or tool calls.")
450
- forbid (
451
- principal,
452
- action,
453
- resource
454
- )
455
- when {
456
- context has secret_types &&
457
- (context.secret_types.contains("api_key") ||
458
- context.secret_types.contains("bearer_token") ||
459
- context.secret_types.contains("jwt_token") ||
460
- context.secret_types.contains("oauth_token") ||
461
- context.secret_types.contains("oauth_secret"))
462
- };
463
-
464
- // Block when multiple secrets are detected (bulk exposure)
465
- @id("secrets-block-bulk-exposure")
466
- @name("Block bulk secret exposure")
467
- @description("Block content when 3 or more distinct secrets are found. Multiple secrets in a single request indicates either a configuration dump, .env file paste, or credential harvesting attempt.")
106
+ // Block SSH key exposure across tool calls and file operations
107
+ @id("secrets-block-ssh-keys")
108
+ @name("Block SSH key exposure")
109
+ @description("Block when YARA detects SSH private key content or SSH key file paths. Covers tool calls, file reads, and file writes. AI agents must not access SSH credentials.")
468
110
  @severity("critical")
469
- @tags("secrets,bulk,data-exfiltration,nist-sc-28,mitre-t1552")
470
- @reject_message("Content blocked: multiple credentials detected (3+). This appears to be a bulk credential exposure — configuration dumps and credential lists must never be passed through AI agents.")
111
+ @tags("secrets,ssh,credentials,yara,nist-ia-5,mitre-t1552")
112
+ @reject_message("Blocked: SSH private key content or key file path detected. AI agents must not access SSH credentials.")
471
113
  forbid (
472
114
  principal,
473
- action,
115
+ action in [Overwatch::Action::"call_tool", Overwatch::Action::"read_file", Overwatch::Action::"write_file"],
474
116
  resource
475
117
  )
476
118
  when {
477
- context has secret_count && context.secret_count >= 3
119
+ context has detected_threats && context.detected_threats.contains("ssh_key_exposure")
478
120
  };
479
121
 
480
122
  // ---------------------------------------------------------------------------
481
- // Section 3: Detection Rule Pattern Matching
482
- // Catches specific named detection rules that fire for credential exposure.
123
+ // Section 3: YARA Rule — pem_file_access
124
+ // Detects PEM private key content (BEGIN ENCRYPTED/RSA/EC/DSA PRIVATE KEY)
125
+ // and key file paths (.pem, .key, .p12, .pfx).
483
126
  // ---------------------------------------------------------------------------
484
127
 
485
- // Block content flagged by detection engine credential rules
486
- @id("secrets-block-detected-credentials")
487
- @name("Block detected credential patterns")
488
- @description("Block content flagged by detection engine rules for credential exposure, API key leaks, JWT tokens, and bearer tokens. Defense-in-depth behind contains_secrets.")
128
+ // Block PEM/certificate key exposure
129
+ @id("secrets-block-pem-keys")
130
+ @name("Block PEM/certificate key exposure")
131
+ @description("Block when YARA detects PEM private key content or certificate key file paths (.pem, .key, .p12, .pfx). AI agents must not access certificate credentials.")
489
132
  @severity("critical")
490
- @tags("secrets,credentials,jwt,bearer,nist-ia-5,mitre-t1552")
491
- @reject_message("Content blocked: detection engines identified credential patterns including secret exposure, credential leaks, API keys, or token exposure.")
492
- forbid (
493
- principal,
494
- action,
495
- resource
496
- )
497
- when {
498
- context has detected_threats &&
499
- (context.detected_threats.contains("secret_exposure") ||
500
- context.detected_threats.contains("credential_leak") ||
501
- context.detected_threats.contains("api_key_exposure") ||
502
- context.detected_threats.contains("jwt_token_exposure") ||
503
- context.detected_threats.contains("bearer_token_leak"))
504
- };
505
-
506
- // ---------------------------------------------------------------------------
507
- // Section 4: Sensitive File Path Protection
508
- // Blocks access to files that commonly contain secrets.
509
- // ---------------------------------------------------------------------------
510
-
511
- // Block .env file access across all operations
512
- @id("secrets-block-env-files")
513
- @name("Block .env file access")
514
- @description("Block access to .env files that commonly contain secrets, API keys, and database credentials. Environment files are the #1 source of accidental credential exposure in development workflows.")
515
- @severity("high")
516
- @tags("secrets,env-files,config,nist-sc-28,mitre-t1552,cis-1.4")
517
- @reject_message("Access to .env files is blocked because they commonly contain secrets, API keys, and database credentials. Use a secrets manager instead of .env files.")
518
- forbid (
519
- principal,
520
- action in [Overwatch::Action::"read_file", Overwatch::Action::"write_file", Overwatch::Action::"call_tool"],
521
- resource
522
- )
523
- when {
524
- context has path && context.path like "*.env*"
525
- };
526
-
527
- // Block access to known credential files
528
- @id("secrets-block-credential-files")
529
- @name("Block credential file access")
530
- @description("Block access to common credential files: .netrc, .npmrc, .pypirc, credentials, config files in cloud provider directories. These files often contain hardcoded tokens and passwords.")
531
- @severity("high")
532
- @tags("secrets,credential-files,config,nist-sc-28,mitre-t1555")
533
- @reject_message("Access to this credential file is blocked. Files like .netrc, .npmrc, .pypirc, and cloud provider config files commonly contain hardcoded credentials.")
133
+ @tags("secrets,certificates,pem,yara,nist-ia-5,mitre-t1552")
134
+ @reject_message("Blocked: PEM private key or certificate key file detected. AI agents must not access certificate credentials.")
534
135
  forbid (
535
136
  principal,
536
- action in [Overwatch::Action::"read_file", Overwatch::Action::"write_file", Overwatch::Action::"call_tool"],
137
+ action in [Overwatch::Action::"call_tool", Overwatch::Action::"read_file", Overwatch::Action::"write_file"],
537
138
  resource
538
139
  )
539
140
  when {
540
- context has path &&
541
- (context.path like "*/.netrc" ||
542
- context.path like "*/.npmrc" ||
543
- context.path like "*/.pypirc" ||
544
- context.path like "*/.docker/config.json" ||
545
- context.path like "*/.kube/config" ||
546
- context.path like "*/.config/gcloud/*" ||
547
- context.path like "*/credentials.json" ||
548
- context.path like "*/service-account*.json")
141
+ context has detected_threats && context.detected_threats.contains("pem_file_access")
549
142
  };
550
143
 
551
- // NOTE: Response content secret detection (AWS keys, GitHub tokens, private keys
552
- // in AI outputs) is handled by the detection engine's secrets scanner, which
553
- // analyzes all content types including tool responses. The contains_secrets and
554
- // secret_types rules above cover this case.
555
- `;
556
- const OVERWATCH_PII_DEFAULT_CEDAR = `// =============================================================================
557
- // PII Detection Policy (Default)
558
- // =============================================================================
559
- // Detects and blocks personally identifiable information across prompts, tool
560
- // calls, file operations, and AI responses. Uses multi-layered detection:
561
- //
562
- // 1. PII boolean flag (pii_detected) — broadest catch from detection engine
563
- // 2. Granular PII type matching (pii_types) — type-specific blocking
564
- // 3. ML classifier confidence (pii_confidence) — catches novel PII patterns
565
- // 4. Detection rule triggers (detected_threats) — named rule matches
566
- // 5. File operation PII blocking — prevents PII persistence to disk
567
- //
568
- // Compliance:
569
- // PCI DSS 3.4, 4.1 (Payment Card Data)
570
- // GDPR Art. 32 (Security of Processing)
571
- // HIPAA §164.312 (Technical Safeguards)
572
- // NIST 800-53 SI-4 (Information System Monitoring)
573
- // CCPA §1798.150 (Data Protection)
574
- // OWASP LLM06 (Sensitive Information Disclosure)
575
- //
576
- // Category: pii
577
- // Namespace: Overwatch
578
- // =============================================================================
579
-
580
144
  // ---------------------------------------------------------------------------
581
- // Section 1: Detection EnginePrimary PII Detection
582
- // Fires when the detection pipeline identifies PII in any content.
145
+ // Section 4: YARA Rule — environment_variable_leakage
146
+ // Detects OPENAI_API_KEY=sk-..., HF_TOKEN=hf_..., and generic
147
+ // <NAME>_API_KEY=<value> (16+ chars) patterns.
583
148
  // ---------------------------------------------------------------------------
584
149
 
585
- // Block prompts containing any detected PII
586
- @id("pii-block-any-detected")
587
- @name("Block prompts with PII")
588
- @description("Block prompts when the detection engine identifies any PII patterns. This is the broadest PII catch fires before type-specific rules.")
589
- @severity("critical")
590
- @tags("pii,privacy,data-protection,gdpr-art-32,owasp-llm06")
591
- @reject_message("Your prompt was blocked because personally identifiable information was detected. Remove all PII (names, addresses, SSNs, credit cards, etc.) before resubmitting.")
592
- forbid (
593
- principal,
594
- action == Overwatch::Action::"process_prompt",
595
- resource
596
- )
597
- when {
598
- context has pii_detected && context.pii_detected
599
- };
600
-
601
- // Block tool calls containing PII
602
- @id("pii-block-tool-calls")
603
- @name("Block tool calls with PII")
604
- @description("Prevent tool execution when PII patterns are detected in tool arguments or content. Stops PII from being passed to external tools, MCP servers, or shell commands.")
150
+ // Block environment variable secret exposure
151
+ @id("secrets-block-env-vars")
152
+ @name("Block environment variable leakage")
153
+ @description("Block when YARA detects environment variable secret assignments OPENAI_API_KEY, HF_TOKEN, or generic <NAME>_API_KEY=<value> patterns with 16+ character values.")
605
154
  @severity("high")
606
- @tags("pii,tools,data-protection,owasp-llm06")
607
- @reject_message("Tool execution was blocked because personally identifiable information was detected in the content. PII must be removed before tool calls are permitted.")
608
- forbid (
609
- principal,
610
- action == Overwatch::Action::"call_tool",
611
- resource
612
- )
613
- when {
614
- context has pii_detected && context.pii_detected
615
- };
616
-
617
- // ---------------------------------------------------------------------------
618
- // Section 2: Granular PII Type Blocking
619
- // Blocks specific PII types based on regulatory requirements.
620
- // ---------------------------------------------------------------------------
621
-
622
- // Block credit card numbers (PCI DSS compliance)
623
- @id("pii-block-credit-cards")
624
- @name("Block credit card numbers")
625
- @description("Detect and block content containing credit card number patterns. PCI DSS 3.4 requires that PANs are rendered unreadable — AI agents must never process raw card numbers.")
626
- @severity("critical")
627
- @tags("pci,credit-card,payment,compliance,pci-dss-3.4,pci-dss-4.1")
628
- @reject_message("Content blocked: credit card number patterns detected. Sharing payment card data through AI agents violates PCI DSS requirements. Use tokenized card references instead.")
629
- forbid (
630
- principal,
631
- action,
632
- resource
633
- )
634
- when {
635
- (context has pii_types && context.pii_types.contains("credit_card")) ||
636
- (context has detected_threats && context.detected_threats.contains("credit_card"))
637
- };
638
-
639
- // Block Social Security Numbers
640
- @id("pii-block-ssn")
641
- @name("Block Social Security Numbers")
642
- @description("Detect and block content containing SSN patterns (XXX-XX-XXXX and variants). SSNs are high-value identity theft targets — exposure through AI agents is a critical privacy violation.")
643
- @severity("critical")
644
- @tags("ssn,identity,privacy,compliance,nist-si-4")
645
- @reject_message("Content blocked: Social Security Number patterns detected. SSNs are protected personal identifiers that must never be shared through AI agents.")
646
- forbid (
647
- principal,
648
- action,
649
- resource
650
- )
651
- when {
652
- (context has pii_types && context.pii_types.contains("ssn")) ||
653
- (context has detected_threats && context.detected_threats.contains("ssn"))
654
- };
655
-
656
- // Block medical/health records (HIPAA compliance)
657
- @id("pii-block-health-data")
658
- @name("Block health information")
659
- @description("Block content containing medical record numbers, health insurance IDs, or other Protected Health Information (PHI). HIPAA §164.312 requires technical safeguards for PHI.")
660
- @severity("critical")
661
- @tags("phi,hipaa,health,medical,compliance,hipaa-164.312")
662
- @reject_message("Content blocked: Protected Health Information (PHI) detected. Health data must not be processed through AI agents per HIPAA requirements.")
663
- forbid (
664
- principal,
665
- action,
666
- resource
667
- )
668
- when {
669
- context has pii_types &&
670
- (context.pii_types.contains("medical_record") ||
671
- context.pii_types.contains("health_insurance_id"))
672
- };
673
-
674
- // Block bulk PII exposure (multiple PII items)
675
- @id("pii-block-bulk-exposure")
676
- @name("Block bulk PII exposure")
677
- @description("Block content containing 3 or more PII matches. Multiple PII items in a single request indicates a data dump, CSV paste, or data exfiltration attempt.")
678
- @severity("critical")
679
- @tags("pii,bulk,data-exfiltration,gdpr-art-32,ccpa")
680
- @reject_message("Content blocked: multiple PII items detected (3+). Bulk personal data must never be processed through AI agents. Use data masking or tokenization for batch operations.")
681
- forbid (
682
- principal,
683
- action,
684
- resource
685
- )
686
- when {
687
- context has pii_count && context.pii_count >= 3
688
- };
689
-
690
- // ---------------------------------------------------------------------------
691
- // Section 3: ML Classifier — Novel PII Detection
692
- // Catches PII patterns that escape regex detection using ML classifiers.
693
- // ---------------------------------------------------------------------------
694
-
695
- // Block content with high PII classifier confidence
696
- @id("pii-block-high-confidence")
697
- @name("Block high-confidence PII")
698
- @description("Block content when the ML PII classifier confidence exceeds threshold (80/100). Catches novel PII patterns including names, addresses, and identifiers that regex rules may miss.")
699
- @severity("critical")
700
- @tags("pii,confidence,privacy,compliance,ml-classifier")
701
- @reject_message("Content blocked: the ML classifier detected personally identifiable information with high confidence. Even if specific PII types aren't identified, the content appears to contain personal data.")
155
+ @tags("secrets,environment,yara,nist-ia-5")
156
+ @reject_message("Blocked: environment variable secret detected (API keys, tokens). Remove sensitive values before proceeding.")
702
157
  forbid (
703
158
  principal,
704
159
  action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
705
160
  resource
706
161
  )
707
162
  when {
708
- context has pii_confidence && context.pii_confidence >= 80
163
+ context has detected_threats && context.detected_threats.contains("environment_variable_leakage")
709
164
  };
710
165
 
711
166
  // ---------------------------------------------------------------------------
712
- // Section 4: Detection Rule Pattern Matching
713
- // Catches PII detected by named detection rules.
167
+ // Section 5: Sensitive File Path Protection
168
+ // Blocks file read/write access to credential directories and .env files.
169
+ // NOTE: Targets read_file/write_file only — NOT call_tool. The path field
170
+ // is empty for Bash commands (extractor reads tool_input.file_path which is
171
+ // undefined for Bash). SSH key access via Bash is caught by YARA rules above.
714
172
  // ---------------------------------------------------------------------------
715
173
 
716
- // Block content with PII threat category
717
- @id("pii-block-threat-category")
718
- @name("Block PII threat category")
719
- @description("Block content when threat categorization identifies PII. Defense-in-depth behind the pii_detected boolean catches cases where PII is flagged at the threat aggregation layer.")
720
- @severity("high")
721
- @tags("pii,privacy,data-protection,gdpr")
722
- @reject_message("Content blocked: threat scanners detected personally identifiable information. Remove all PII before resubmitting.")
174
+ // Block access to credential directories
175
+ @id("secrets-block-credential-paths")
176
+ @name("Block credential directory access")
177
+ @description("Block file read/write to SSH keys, cloud provider credential directories (.aws, .config/gcloud, .azure), and GPG keys. Certificate/PEM files are covered by the YARA-based secrets-block-pem-keys rule, not by this path rule. Only applies to Read/Write/Edit tools (path is not populated for Bash commands).")
178
+ @severity("critical")
179
+ @tags("secrets,credentials,path,file-access,nist-sc-28,mitre-t1552")
180
+ @reject_message("Blocked: access to credential directory or key file. AI agents must not access .ssh, .aws, .gnupg, .gcloud, or .azure directories.")
723
181
  forbid (
724
182
  principal,
725
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
183
+ action in [Overwatch::Action::"read_file", Overwatch::Action::"write_file"],
726
184
  resource
727
185
  )
728
186
  when {
729
- context has threat_categories && context.threat_categories.contains("pii")
187
+ context has path &&
188
+ (context.path like "*/.ssh/*" ||
189
+ context.path like "*/.aws/*" ||
190
+ context.path like "*/.gnupg/*" ||
191
+ context.path like "*/.config/gcloud/*" ||
192
+ context.path like "*/.azure/*" ||
193
+ context.path like "*/id_rsa*" ||
194
+ context.path like "*/id_ed25519*" ||
195
+ context.path like "*/id_ecdsa*")
730
196
  };
731
197
 
732
- // ---------------------------------------------------------------------------
733
- // Section 5: File Operation PII Blocking
734
- // Prevents PII from being read from or written to disk.
735
- // ---------------------------------------------------------------------------
736
-
737
- // Block file operations containing PII
738
- @id("pii-block-file-ops")
739
- @name("Block file operations with PII")
740
- @description("Block file reads and writes when PII is detected. Prevents agents from reading files containing personal data and from writing PII to new files where it could persist or be version-controlled.")
198
+ // Block .env file access
199
+ @id("secrets-block-env-file-paths")
200
+ @name("Block .env file access")
201
+ @description("Block file read/write to .env files which typically contain secrets, API keys, and database credentials.")
741
202
  @severity("high")
742
- @tags("pii,file-ops,data-protection,gdpr-art-32,nist-si-4")
743
- @reject_message("File operation blocked: personally identifiable information was detected. Files containing PII must not be read or written through AI agents.")
203
+ @tags("secrets,environment,path,file-access,nist-sc-28,cis-1.4")
204
+ @reject_message("Blocked: .env file access. These files typically contain secrets and should not be accessed by AI agents.")
744
205
  forbid (
745
206
  principal,
746
207
  action in [Overwatch::Action::"read_file", Overwatch::Action::"write_file"],
747
208
  resource
748
209
  )
749
210
  when {
750
- context has pii_detected && context.pii_detected
211
+ context has path && context.path like "*.env*"
751
212
  };
752
213
  `;
753
- const OVERWATCH_TOOLS_DEFAULT_CEDAR = `// =============================================================================
754
- // Tool Permissioning Policy (Default)
214
+ const OVERWATCH_SEMANTIC_DEFAULT_CEDAR = `// =============================================================================
215
+ // Semantic Threat Detection Policy (Default)
755
216
  // =============================================================================
756
- // Controls access to IDE tools, shell execution, file system paths, and MCP
757
- // operations. Enforces least-privilege for agent tool usage with multi-layered
758
- // controls:
217
+ // Detects and blocks injection attacks, prompt injection, jailbreak attempts,
218
+ // and unsafe content using multi-layered detection:
219
+ //
220
+ // Tier 1 — YARA rules (always available, no external dependency):
221
+ // command_injection, sql_injection, path_traversal, detect_encoded
222
+ //
223
+ // Tier 2 — Javelin ML classifiers (require Highflame API token):
224
+ // injection_confidence, jailbreak_confidence
225
+ //
226
+ // Tier 3 — Content safety ML scores (require Highflame API token):
227
+ // violence, weapons, hate_speech, crime, sexual, profanity
759
228
  //
760
- // 1. Dangerous tool blocking (shell, command execution, destructive ops)
761
- // 2. Sensitive system path protection (credentials, system dirs)
762
- // 3. Tool risk scoring (computed risk assessment)
763
- // 4. Tool category enforcement (safe/sensitive/dangerous classification)
764
- // 5. Threat-based tool blocking (threat severity gates)
765
- // 6. Command injection detection (reverse shells, code execution, etc.)
766
229
  //
767
230
  // Compliance:
768
- // NIST 800-53 AC-3 (Access Enforcement)
769
- // NIST 800-53 AC-6 (Least Privilege)
770
- // NIST 800-53 CM-7 (Least Functionality)
771
- // OWASP LLM06 (Excessive Agency) — agent tool access control
772
- // OWASP ASI02 (Tool Misuse) — unauthorized tool operations
773
- // MITRE ATT&CK T1059 (Command and Scripting Interpreter)
774
- // MITRE ATT&CK T1005 (Data from Local System)
775
- // MITRE ATT&CK T1552 (Unsecured Credentials)
231
+ // OWASP LLM01 (Prompt Injection) — direct + indirect
232
+ // OWASP LLM02 (Insecure Output Handling) — response manipulation
233
+ // OWASP ASI01 (Agent Goal Hijack) — behavioral manipulation
234
+ // MITRE ATLAS AML.T0051 (LLM Prompt Injection)
235
+ // MITRE ATLAS AML.T0054 (LLM Jailbreak)
236
+ // NIST 800-53 SI-3 (Malicious Code Protection)
237
+ // NIST 800-53 SI-4 (Information System Monitoring)
238
+ // EU AI Act Art. 52 (Transparency for AI Systems)
239
+ // ISO 42001 (AI Management System)
776
240
  //
777
- // Category: tools
241
+ // Category: semantic
778
242
  // Namespace: Overwatch
779
243
  // =============================================================================
780
244
 
781
- // ---------------------------------------------------------------------------
782
- // Section 1: Dangerous Tool Blocking
783
- // Blocks tools classified as inherently dangerous for agent use.
784
- // Ref: OWASP LLM06, MITRE T1059
785
- // ---------------------------------------------------------------------------
786
-
787
- // Block shell and command execution tools
788
- @id("tools-block-shell-execution")
789
- @name("Block shell and command execution")
790
- @description("Block direct shell, bash, and command execution tools. Unrestricted shell access is the #1 risk in AI coding agents — enables command injection, data exfiltration, and arbitrary code execution (MITRE T1059).")
791
- @severity("critical")
792
- @tags("shell,command-injection,execution,nist-cm-7,mitre-t1059,owasp-llm06,baseline")
793
- @reject_message("Tool execution was blocked: direct shell and command execution tools (shell, bash, terminal) are restricted to prevent command injection attacks (MITRE T1059). Use specific, scoped tools instead.")
794
- forbid (
795
- principal,
796
- action == Overwatch::Action::"call_tool",
797
- resource
798
- )
799
- when {
800
- context has tool_name &&
801
- (context.tool_name == "shell" ||
802
- context.tool_name == "bash" ||
803
- context.tool_name == "sh" ||
804
- context.tool_name == "terminal" ||
805
- context.tool_name == "system.exec" ||
806
- context.tool_name == "process.spawn" ||
807
- context.tool_name == "cmd" ||
808
- context.tool_name == "powershell")
809
- };
810
-
811
- // Block destructive file operations
812
- @id("tools-block-destructive-ops")
813
- @name("Block destructive file operations")
814
- @description("Block file deletion, directory removal, and other destructive operations. Agents should not have delete access by default — destructive operations require explicit human approval.")
815
- @severity("high")
816
- @tags("file,delete,destructive,nist-ac-3,owasp-asi02")
817
- @reject_message("Tool execution was blocked: destructive file operations (delete, rmdir, unlink) are restricted to prevent data loss. Request explicit human approval for destructive actions.")
818
- forbid (
819
- principal,
820
- action == Overwatch::Action::"call_tool",
821
- resource
822
- )
823
- when {
824
- context has tool_name &&
825
- (context.tool_name == "fs.delete" ||
826
- context.tool_name == "fs.rmdir" ||
827
- context.tool_name == "fs.unlink" ||
828
- context.tool_name == "fs.remove" ||
829
- context.tool_name == "delete_file" ||
830
- context.tool_name == "remove_directory")
831
- };
832
245
 
833
246
  // ---------------------------------------------------------------------------
834
- // Section 2: Sensitive System Path Protection
835
- // Blocks access to system directories, credential files, and sensitive paths.
836
- // Ref: MITRE T1005, T1552
247
+ // Tier 1: YARA-Based Injection Detection (always available)
248
+ // These fire on actual YARA rule names from the detection engine.
249
+ // No external API dependency — works offline with local YARA scanning.
837
250
  // ---------------------------------------------------------------------------
838
251
 
839
- // Block access to system directories
840
- @id("tools-block-system-paths")
841
- @name("Block system directory access")
842
- @description("Prevent access to sensitive system directories (/etc, /proc, /sys, /root, /var). These directories contain system configuration, process information, and credentials that agents must never access.")
843
- @severity("high")
844
- @tags("file,path,system,security,nist-ac-6,mitre-t1005")
845
- @reject_message("Access blocked: this path targets a sensitive system directory. AI agents are restricted from accessing /etc, /proc, /sys, /root, and /var directories.")
846
- forbid (
847
- principal,
848
- action in [Overwatch::Action::"read_file", Overwatch::Action::"write_file", Overwatch::Action::"call_tool"],
849
- resource
850
- )
851
- when {
852
- context has path &&
853
- (context.path like "/etc/*" ||
854
- context.path like "/proc/*" ||
855
- context.path like "/sys/*" ||
856
- context.path like "/root/*" ||
857
- context.path like "/var/log/*" ||
858
- context.path like "/var/run/*")
859
- };
860
-
861
- // Block access to credential and key directories
862
- @id("tools-block-credential-paths")
863
- @name("Block credential directory access")
864
- @description("Prevent access to SSH keys, cloud provider credentials, GPG keys, and other authentication material directories. These are primary targets for credential theft (MITRE T1552).")
252
+ // Block command injection in tool calls
253
+ @id("semantic-block-command-injection-tool")
254
+ @name("Block command injection in tool calls")
255
+ @description("Block tool execution when YARA detects command injection reverse shells, rm -rf, privilege escalation, code execution, pipe-to-shell, or encoding evasion. Ref: AIShellJack (41-84% success rate).")
865
256
  @severity("critical")
866
- @tags("file,credentials,ssh,aws,security,nist-ac-6,mitre-t1552")
867
- @reject_message("Access blocked: this path targets a credential or key directory (.ssh, .aws, .gnupg, .config/gcloud). AI agents must never access authentication material.")
257
+ @tags("command-injection,yara,call-tool,mitre-t1059,owasp-asi02")
258
+ @reject_message("Tool execution blocked: command injection pattern detected reverse shell, destructive command, privilege escalation, or code execution attempt.")
868
259
  forbid (
869
260
  principal,
870
- action in [Overwatch::Action::"read_file", Overwatch::Action::"write_file", Overwatch::Action::"call_tool"],
261
+ action == Overwatch::Action::"call_tool",
871
262
  resource
872
263
  )
873
264
  when {
874
- context has path &&
875
- (context.path like "*/.ssh/*" ||
876
- context.path like "*/.aws/*" ||
877
- context.path like "*/.gnupg/*" ||
878
- context.path like "*/.config/gcloud/*" ||
879
- context.path like "*/.azure/*" ||
880
- context.path like "*.pem" ||
881
- context.path like "*/id_rsa*" ||
882
- context.path like "*/id_ed25519*" ||
883
- context.path like "*/id_ecdsa*")
265
+ context has detected_threats && context.detected_threats.contains("command_injection")
884
266
  };
885
267
 
886
- // ---------------------------------------------------------------------------
887
- // Section 3: Tool Risk Scoring
888
- // Uses computed tool risk scores from the detection engine to dynamically
889
- // assess and block risky tool operations.
890
- // ---------------------------------------------------------------------------
891
-
892
- // Block tools with very high computed risk
893
- @id("tools-block-high-risk-score")
894
- @name("Block high-risk tool operations")
895
- @description("Block tool operations when the computed risk score exceeds 90/100. The risk score combines tool type, argument analysis, context, and historical behavior into a single metric.")
268
+ // Block command injection in prompts
269
+ @id("semantic-block-command-injection-prompt")
270
+ @name("Block command injection in prompts")
271
+ @description("Block prompts when YARA detects command injection patterns. Catches prompt-level injection where the user or injected content includes shell commands.")
896
272
  @severity("critical")
897
- @tags("tool-risk,dynamic,security,owasp-llm06,owasp-asi02")
898
- @reject_message("Tool execution blocked: this operation scored 90+ on the risk assessment. The combination of tool type, arguments, and context indicates a high-risk operation.")
273
+ @tags("command-injection,yara,process-prompt,mitre-t1059")
274
+ @reject_message("Prompt blocked: command injection pattern detected. The prompt contains shell commands, reverse shells, or code execution patterns.")
899
275
  forbid (
900
276
  principal,
901
- action == Overwatch::Action::"call_tool",
277
+ action == Overwatch::Action::"process_prompt",
902
278
  resource
903
279
  )
904
280
  when {
905
- context has tool_risk_score && context.tool_risk_score >= 90
281
+ context has detected_threats && context.detected_threats.contains("command_injection")
906
282
  };
907
283
 
908
- // Block tools classified as dangerous
909
- @id("tools-block-dangerous-category")
910
- @name("Block dangerous tool category")
911
- @description("Block all tools classified as 'dangerous' by the detection engine. The dangerous category includes tools with unrestricted system access, code execution, or network capabilities.")
912
- @severity("critical")
913
- @tags("tool-category,dangerous,security,owasp-llm06,nist-ac-6")
914
- @reject_message("Tool execution blocked: this tool is classified as 'dangerous' due to its unrestricted system access, code execution, or network capabilities. Use a safer alternative.")
284
+ // Block SQL injection in tool calls
285
+ @id("semantic-block-sql-injection-tool")
286
+ @name("Block SQL injection in tool calls")
287
+ @description("Block tool execution when YARA detects SQL injection tautologies (OR 1=1), UNION SELECT, DROP TABLE, time-based attacks (SLEEP, WAITFOR), or system object access (information_schema).")
288
+ @severity("high")
289
+ @tags("sql-injection,yara,call-tool,database")
290
+ @reject_message("Tool execution blocked: SQL injection pattern detected tautology, UNION attack, destructive SQL, or system object access.")
915
291
  forbid (
916
292
  principal,
917
293
  action == Overwatch::Action::"call_tool",
918
294
  resource
919
295
  )
920
296
  when {
921
- context has tool_category && context.tool_category == "dangerous"
297
+ context has detected_threats && context.detected_threats.contains("sql_injection")
922
298
  };
923
299
 
924
- // Stricter threshold for sensitive tools
925
- @id("tools-block-sensitive-with-threats")
926
- @name("Block sensitive tools with threats")
927
- @description("Block sensitive tools (file write, shell, network) when any threats are detected. Sensitive tools with concurrent threats indicate an attack leveraging tool capabilities for malicious purposes.")
300
+ // Block SQL injection in prompts
301
+ @id("semantic-block-sql-injection-prompt")
302
+ @name("Block SQL injection in prompts")
303
+ @description("Block prompts when YARA detects SQL injection patterns.")
928
304
  @severity("high")
929
- @tags("tool-category,sensitive,security,owasp-asi02,defense-in-depth")
930
- @reject_message("Sensitive tool execution blocked: threats were detected alongside a sensitive tool operation. Sensitive tools require zero threat context to execute.")
305
+ @tags("sql-injection,yara,process-prompt,database")
306
+ @reject_message("Prompt blocked: SQL injection pattern detected.")
931
307
  forbid (
932
308
  principal,
933
- action == Overwatch::Action::"call_tool",
309
+ action == Overwatch::Action::"process_prompt",
934
310
  resource
935
311
  )
936
312
  when {
937
- context has tool_is_sensitive && context.tool_is_sensitive &&
938
- context has threat_count && context.threat_count > 0
313
+ context has detected_threats && context.detected_threats.contains("sql_injection")
939
314
  };
940
315
 
941
- // ---------------------------------------------------------------------------
942
- // Section 4: Threat-Based Tool Blocking
943
- // Blocks tool calls based on threat severity from detection engines.
944
- // ---------------------------------------------------------------------------
945
-
946
- // Block tool calls with high severity threats
947
- @id("tools-block-high-severity-threats")
948
- @name("Block tool calls with high severity threats")
949
- @description("Prevent tool execution when high or critical severity threats (>= 3) are detected in content. Tools must not execute when the content they operate on is flagged as dangerous.")
316
+ // Block path traversal attacks
317
+ @id("semantic-block-path-traversal")
318
+ @name("Block path traversal attacks")
319
+ @description("Block when YARA detects path traversal — 2+ levels of ../ combined with sensitive file targets (/etc/passwd, /etc/shadow) or file read/include operations with traversal.")
950
320
  @severity("high")
951
- @tags("tools,threats,severity,security,defense-in-depth")
952
- @reject_message("Tool execution was blocked because high or critical severity threats were detected in the content by security scanners.")
321
+ @tags("path-traversal,yara,file-access,mitre-t1005")
322
+ @reject_message("Blocked: path traversal attack detected directory traversal sequences targeting sensitive system files.")
953
323
  forbid (
954
324
  principal,
955
- action == Overwatch::Action::"call_tool",
325
+ action in [Overwatch::Action::"call_tool", Overwatch::Action::"read_file", Overwatch::Action::"write_file"],
956
326
  resource
957
327
  )
958
328
  when {
959
- context has threat_count && context has max_threat_severity &&
960
- context.threat_count > 0 && context.max_threat_severity >= 3
329
+ context has detected_threats && context.detected_threats.contains("path_traversal")
961
330
  };
962
331
 
963
- // ---------------------------------------------------------------------------
964
- // Section 5: Command Injection Detection
965
- // Blocks tool calls containing command injection patterns in arguments.
966
- // Ref: AIShellJack (41-84% success rate, 314 payloads)
967
- // ---------------------------------------------------------------------------
968
-
969
- // Block detected command injection patterns
970
- @id("tools-block-command-injection")
971
- @name("Block command injection in tool calls")
972
- @description("Block tool calls when command injection patterns are detected in arguments — reverse shells, privilege escalation, code execution, and data exfiltration commands. Ref: AIShellJack attack (41-84% success rate).")
973
- @severity("critical")
974
- @tags("command-injection,shell,security,mitre-t1059,owasp-asi02")
975
- @reject_message("Tool execution blocked: command injection pattern detected in tool arguments. This may be a shell injection attack attempting to execute unauthorized commands.")
332
+ // Block encoded/obfuscated payloads in tool calls
333
+ @id("semantic-block-encoded")
334
+ @name("Block encoded payloads in tool calls")
335
+ @description("Block tool calls when YARA detects base64-encoded payloads (30+ chars) or hash IOCs. Base64 detection excludes npm package paths to reduce false positives.")
336
+ @severity("medium")
337
+ @tags("encoded,obfuscation,yara,call-tool")
338
+ @reject_message("Tool execution blocked: encoded or obfuscated payload detected. Base64-encoded content or hash IOCs found in tool arguments.")
976
339
  forbid (
977
- principal,
978
- action == Overwatch::Action::"call_tool",
979
- resource
980
- )
981
- when {
982
- context has detected_threats &&
983
- context.detected_threats.contains("command_injection")
984
- };
985
- `;
986
- const OVERWATCH_AGENT_SECURITY_DEFAULT_CEDAR = `// =============================================================================
987
- // Agent Security Policy (Default)
988
- // =============================================================================
989
- // Detects and blocks tool poisoning, rug pull attacks, indirect prompt injection,
990
- // and MCP supply chain threats targeting AI coding agents.
991
- //
992
- // These are agentic AI-specific attack vectors (OWASP Agentic Top 10) where tool
993
- // descriptions, server responses, or behavioral drift manipulate agent behavior.
994
- //
995
- // Compliance:
996
- // OWASP LLM01 (Prompt Injection) | OWASP LLM06 (Excessive Agency)
997
- // OWASP ASI01 (Agent Goal Hijack) | OWASP ASI02 (Tool Misuse)
998
- // OWASP ASI04 (Supply Chain) | OWASP MCP01-05
999
- // MITRE ATLAS AML.T0051 (Prompt Injection) | AML.T0080 (Memory Manipulation)
1000
- //
1001
- // Category: agent_security
1002
- // Namespace: Overwatch
1003
- // =============================================================================
340
+ principal,
341
+ action == Overwatch::Action::"call_tool",
342
+ resource
343
+ )
344
+ when {
345
+ context has detected_threats && context.detected_threats.contains("detect_encoded")
346
+ };
347
+
1004
348
 
1005
349
  // ---------------------------------------------------------------------------
1006
- // Tool Poisoning — hidden instructions in tool descriptions/arguments
1007
- // Ref: Invariant Labs (April 2025), 84.2% success rate with auto-approval
350
+ // Tier 2: Javelin ML-Based Detection (require Highflame API token)
351
+ // Prompt injection and jailbreak classifiers. These are inert without
352
+ // the API token — scores default to 0 (prompts) or are absent (tools).
1008
353
  // ---------------------------------------------------------------------------
1009
354
 
1010
- // Block tool calls with tool poisoning risk
1011
- @id("as-block-tool-poisoning")
1012
- @name("Block tool poisoning")
1013
- @description("Block tool execution when hidden instructions are detected in tool descriptions or arguments (score >= 70/100). Catches authority hijack, system prompt injection, and hidden instruction patterns. Adjust the threshold to tune sensitivity — lower catches more but may flag legitimate tools with instructional descriptions (OWASP ASI01).")
355
+ // Block ML-detected prompt injection
356
+ @id("semantic-block-injection-ml")
357
+ @name("Block ML-detected prompt injection")
358
+ @description("Block when Javelin's ML classifier scores prompt injection confidence >= 75/100. Catches sophisticated injection that pattern matching misses — authority hijack, instruction override, polymorphic payloads.")
1014
359
  @severity("critical")
1015
- @tags("tool-poisoning,agent-security,owasp-asi01,mitre-aml-t0051")
1016
- @reject_message("Tool execution blocked: hidden manipulation instructions detected in tool description or arguments. This may be a tool poisoning attack (OWASP ASI01).")
360
+ @tags("prompt-injection,ml,javelin,owasp-llm01,mitre-aml-t0051")
361
+ @reject_message("Blocked: prompt injection detected by ML classifier (confidence 75+). The content attempts to override agent instructions.")
1017
362
  forbid (
1018
363
  principal,
1019
- action == Overwatch::Action::"call_tool",
364
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
1020
365
  resource
1021
366
  )
1022
367
  when {
1023
- context has tool_poisoning_score && context.tool_poisoning_score >= 70
368
+ context has injection_confidence && context.injection_confidence >= 75
1024
369
  };
1025
370
 
1026
- // Block MCP server connections with poisoning risk
1027
- @id("as-block-server-poisoning")
1028
- @name("Block poisoned MCP servers")
1029
- @description("Block connections to MCP servers when tool poisoning patterns are detected in tool descriptions (score >= 60). Lower threshold for servers since poisoning affects all tools on the server.")
371
+ // Block ML-detected jailbreak attempts
372
+ @id("semantic-block-jailbreak-ml")
373
+ @name("Block ML-detected jailbreak attempts")
374
+ @description("Block when Javelin's ML classifier scores jailbreak confidence >= 75/100. Catches DAN prompts, role-playing bypasses, and safety guideline override attempts.")
1030
375
  @severity("critical")
1031
- @tags("tool-poisoning,mcp-security,owasp-asi04,owasp-mcp02")
1032
- @reject_message("MCP server connection blocked: tool poisoning patterns detected in server tool descriptions. Review server tools before connecting.")
376
+ @tags("jailbreak,ml,javelin,owasp-llm02,mitre-aml-t0054")
377
+ @reject_message("Blocked: jailbreak attempt detected by ML classifier (confidence 75+). The content attempts to bypass agent safety guidelines.")
1033
378
  forbid (
1034
379
  principal,
1035
- action == Overwatch::Action::"connect_server",
380
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
1036
381
  resource
1037
382
  )
1038
383
  when {
1039
- context has tool_poisoning_score && context.tool_poisoning_score >= 60
384
+ context has jailbreak_confidence && context.jailbreak_confidence >= 75
1040
385
  };
1041
386
 
387
+
1042
388
  // ---------------------------------------------------------------------------
1043
- // Rug Pull — tool behavior changes after trust establishment
1044
- // Ref: Acuvity (2025), tools approved once then silently redefined
389
+ // Tier 3: Content Safety (Javelin ML scores, require API token)
390
+ // Trust & safety classification scores for violence, weapons, hate speech,
391
+ // crime, sexual content, and profanity. Ensures enterprise-appropriate
392
+ // content thresholds.
1045
393
  // ---------------------------------------------------------------------------
1046
394
 
1047
- // Block tool calls with behavioral drift (rug pull)
1048
- @id("as-block-rug-pull")
1049
- @name("Block rug pull attacks")
1050
- @description("Block tool execution when behavioral drift is detected — tool behavior diverges significantly from established patterns (score >= 70/100). Defends against tools that are approved once then silently redefined to act maliciously. Adjust the threshold to tune sensitivity (OWASP ASI04).")
395
+ // Block violent content
396
+ @id("semantic-block-violence")
397
+ @name("Block violent content")
398
+ @description("Block content when the ML violence detection score exceeds threshold (80/100). Catches graphic violence descriptions, instructions for causing harm, and violent threat language.")
1051
399
  @severity("critical")
1052
- @tags("rug-pull,agent-security,owasp-asi04,behavioral-drift")
1053
- @reject_message("Tool execution blocked: tool behavior has changed significantly from its established pattern. This may be a rug pull attack where a tool was silently redefined after initial approval.")
400
+ @tags("violence,content-safety,trust-safety,nist-si-4,iso-42001")
401
+ @reject_message("Content blocked: violent content detected. AI agents in enterprise environments must not process or generate violent content.")
1054
402
  forbid (
1055
403
  principal,
1056
- action in [Overwatch::Action::"call_tool", Overwatch::Action::"connect_server"],
404
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
1057
405
  resource
1058
406
  )
1059
407
  when {
1060
- context has rug_pull_score && context.rug_pull_score >= 70
408
+ context has violence_score && context.violence_score >= 80
1061
409
  };
1062
410
 
1063
- // ---------------------------------------------------------------------------
1064
- // Indirect Prompt Injection — injection via tool outputs and retrieved content
1065
- // Ref: EchoLeak CVE-2025-32711, IDEsaster (30+ CVEs in AI IDEs)
1066
- // ---------------------------------------------------------------------------
1067
-
1068
- // Block prompts with indirect injection from tool outputs
1069
- @id("as-block-indirect-injection")
1070
- @name("Block indirect prompt injection")
1071
- @description("Block when indirect prompt injection is detected in tool outputs, file contents, or retrieved documents (score >= 70). Defends against OWASP LLM01 and ASI01.")
411
+ // Block weapons content
412
+ @id("semantic-block-weapons")
413
+ @name("Block weapons content")
414
+ @description("Block content when the ML weapons detection score exceeds threshold (80/100). Catches weapon manufacturing instructions, procurement guidance, and detailed specifications.")
1072
415
  @severity("critical")
1073
- @tags("indirect-injection,owasp-llm01,owasp-asi01,mitre-aml-t0051")
1074
- @reject_message("Content blocked: indirect prompt injection detected in tool output or retrieved content. An external source may be attempting to hijack agent behavior.")
416
+ @tags("weapons,content-safety,trust-safety,nist-si-4")
417
+ @reject_message("Content blocked: weapons-related content detected. AI agents must not process weapons manufacturing, procurement, or detailed specification content.")
1075
418
  forbid (
1076
419
  principal,
1077
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool", Overwatch::Action::"connect_server"],
420
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
1078
421
  resource
1079
422
  )
1080
423
  when {
1081
- context has indirect_injection_score && context.indirect_injection_score >= 70
424
+ context has weapons_score && context.weapons_score >= 80
1082
425
  };
1083
426
 
1084
- // Strict indirect injection for sensitive tool calls
1085
- @id("as-block-indirect-injection-sensitive-tools")
1086
- @name("Block indirect injection on sensitive tools")
1087
- @description("Lower threshold (>= 50) for indirect injection when the tool is classified as sensitive (shell, file write, network). Even moderate injection risk on sensitive tools warrants blocking.")
427
+ // Block hate speech (lower threshold — zero tolerance)
428
+ @id("semantic-block-hate-speech")
429
+ @name("Block hate speech")
430
+ @description("Block content when the ML hate speech detection score exceeds threshold (75/100). Lower threshold than other categories — enterprises have zero tolerance for discriminatory content.")
1088
431
  @severity("critical")
1089
- @tags("indirect-injection,sensitive-tools,owasp-asi02")
1090
- @reject_message("Sensitive tool execution blocked: moderate indirect injection risk detected. Sensitive tools require higher confidence that content is safe.")
432
+ @tags("hate-speech,content-safety,trust-safety,nist-si-4,eu-ai-act")
433
+ @reject_message("Content blocked: hate speech or discriminatory content detected. AI agents must not process or generate hateful, discriminatory, or dehumanizing content.")
1091
434
  forbid (
1092
435
  principal,
1093
- action == Overwatch::Action::"call_tool",
436
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
1094
437
  resource
1095
438
  )
1096
439
  when {
1097
- context has indirect_injection_score && context.indirect_injection_score >= 50 &&
1098
- context has tool_is_sensitive && context.tool_is_sensitive
440
+ context has hate_speech_score && context.hate_speech_score >= 75
1099
441
  };
1100
442
 
1101
- // ---------------------------------------------------------------------------
1102
- // MCP Supply Chain — unverified servers, risky configs
1103
- // Ref: OWASP MCP Top 10, OWASP ASI04, MITRE AML.T0082
1104
- // ---------------------------------------------------------------------------
1105
-
1106
- // Block unverified MCP server tool calls with detected threats
1107
- @id("as-block-unverified-threats")
1108
- @name("Block unverified server threats")
1109
- @description("Block tool calls from unverified MCP servers when any threat is detected. Unverified servers with threats are high-risk supply chain vectors.")
443
+ // Block criminal content
444
+ @id("semantic-block-crime")
445
+ @name("Block criminal content")
446
+ @description("Block content when the ML criminal activity detection score exceeds threshold (80/100). Catches illegal activity instructions, fraud techniques, and content promoting criminal behavior.")
1110
447
  @severity("high")
1111
- @tags("mcp-trust,owasp-asi04,owasp-mcp02,supply-chain")
1112
- @reject_message("Tool execution blocked: the MCP server is unverified and security threats were detected. Only use tools from verified or trusted servers.")
448
+ @tags("crime,content-safety,trust-safety,nist-si-4")
449
+ @reject_message("Content blocked: criminal activity content detected. AI agents must not process content related to illegal activities, fraud, or other criminal behavior.")
1113
450
  forbid (
1114
451
  principal,
1115
- action == Overwatch::Action::"call_tool",
452
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
1116
453
  resource
1117
454
  )
1118
455
  when {
1119
- context has mcp_server_verified && context.mcp_server_verified == false &&
1120
- context has threat_count && context.threat_count > 0
456
+ context has crime_score && context.crime_score >= 80
1121
457
  };
1122
458
 
1123
- // Block connections to MCP servers with risky configurations
1124
- @id("as-block-mcp-config-risk")
1125
- @name("Block risky MCP server configs")
1126
- @description("Block MCP server connections when risky configuration patterns are detected (inline code execution, mixed transports, proxy patterns). Score >= 70.")
459
+ // Block sexual content
460
+ @id("semantic-block-sexual")
461
+ @name("Block sexual content")
462
+ @description("Block content when the ML sexual content detection score exceeds threshold (80/100). Ensures enterprise AI agents do not process sexually explicit material.")
1127
463
  @severity("high")
1128
- @tags("mcp-config,owasp-mcp03,owasp-asi04,supply-chain")
1129
- @reject_message("MCP server connection blocked: risky server configuration detected (e.g., inline code execution, mixed transports). Review the server configuration before connecting.")
464
+ @tags("sexual,content-safety,trust-safety,eu-ai-act,iso-42001")
465
+ @reject_message("Content blocked: sexual content detected. AI agents in enterprise environments must not process sexually explicit material.")
1130
466
  forbid (
1131
467
  principal,
1132
- action == Overwatch::Action::"connect_server",
468
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
1133
469
  resource
1134
470
  )
1135
471
  when {
1136
- context has mcp_config_risk && context.mcp_config_risk &&
1137
- context has mcp_risk_score && context.mcp_risk_score >= 70
472
+ context has sexual_score && context.sexual_score >= 80
1138
473
  };
1139
474
 
1140
- // Block connections to unverified MCP servers entirely
1141
- @id("as-block-unverified-server-connect")
1142
- @name("Block unverified MCP server connections")
1143
- @description("Block connections to MCP servers that are not from a verified registry. This prevents supply chain attacks via malicious MCP servers.")
1144
- @severity("high")
1145
- @tags("mcp-trust,owasp-asi04,owasp-mcp05,supply-chain")
1146
- @reject_message("MCP server connection blocked: server is not from a verified registry. Add the server to your verified list or contact your admin.")
475
+ // Block excessive profanity (higher threshold allows normal expression)
476
+ @id("semantic-block-profanity")
477
+ @name("Block profanity")
478
+ @description("Block content when the ML profanity detection score exceeds threshold (90/100). Higher threshold allows normal expression while blocking abusive or harassing language patterns.")
479
+ @severity("medium")
480
+ @tags("profanity,content-safety,trust-safety")
481
+ @reject_message("Content blocked: excessive profanity detected. Please rephrase in a professional manner appropriate for enterprise AI interactions.")
1147
482
  forbid (
1148
483
  principal,
1149
- action == Overwatch::Action::"connect_server",
484
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
1150
485
  resource
1151
486
  )
1152
487
  when {
1153
- context has mcp_server_verified && context.mcp_server_verified == false
488
+ context has profanity_score && context.profanity_score >= 90
1154
489
  };
1155
490
  `;
1156
- const OVERWATCH_ENCODING_DEFAULT_CEDAR = `// =============================================================================
1157
- // Encoding & Unicode Attack Detection Policy (Default)
491
+ const OVERWATCH_TOOLS_DEFAULT_CEDAR = `// =============================================================================
492
+ // Tool Permissioning Policy (Default)
1158
493
  // =============================================================================
1159
- // Detects and blocks invisible Unicode characters, bidirectional text
1160
- // overrides, tag characters, and other encoding-based attack vectors used
1161
- // to hide malicious instructions from human review while remaining
1162
- // visible to AI model tokenizers.
494
+ // Controls access to IDE tools, shell execution, file system paths, and MCP
495
+ // operations. Enforces least-privilege for agent tool usage with:
1163
496
  //
1164
- // Attack vectors:
1165
- // - Zero-width characters (U+200B, U+200C, U+200D, U+FEFF) hiding instructions
1166
- // - Bidirectional overrides (U+202A-U+202E) creating visually misleading text
1167
- // - Tag characters (U+E0001-U+E007F) embedding invisible payloads
1168
- // - Variation selectors used as steganographic channels
1169
- // - Homoglyph attacks using lookalike Unicode characters
497
+ // 1. Shell/command execution blocking (opt-in — inactive by default)
498
+ // 2. Destructive file operation blocking (opt-in — inactive by default)
499
+ // 3. Sensitive system path protection (active)
500
+ // 4. Threat-severity-based tool blocking (active)
1170
501
  //
1171
- // Ref: EchoLeak CVE-2025-32711 (invisible prompt injection via Unicode)
1172
- // Rules File Backdoor (Pillar Security, March 2025)
1173
- // Unicode-based prompt injection in Claude Code (CERT-2025)
502
+ // NOTE: Policies 1-2 are included in this file but classified as opt-in
503
+ // templates in templates.json (not in the defaults array). They are shipped
504
+ // as part of the tool permissioning category but must be explicitly enabled.
1174
505
  //
1175
506
  // Compliance:
1176
- // OWASP LLM01 (Prompt Injection) — encoding evasion
1177
- // OWASP ASI01 (Agent Goal Hijack) — hidden instructions
1178
- // MITRE ATLAS AML.T0051 (LLM Prompt Injection)
1179
- // NIST 800-53 SI-10 (Information Input Validation)
507
+ // NIST 800-53 AC-3 (Access Enforcement)
508
+ // NIST 800-53 AC-6 (Least Privilege)
509
+ // NIST 800-53 CM-7 (Least Functionality)
510
+ // OWASP LLM06 (Excessive Agency)
511
+ // OWASP ASI02 (Tool Misuse)
512
+ // MITRE ATT&CK T1059 (Command and Scripting Interpreter)
513
+ // MITRE ATT&CK T1005 (Data from Local System)
1180
514
  //
1181
- // Category: encoding
515
+ // Category: tools
1182
516
  // Namespace: Overwatch
1183
517
  // =============================================================================
1184
518
 
1185
519
  // ---------------------------------------------------------------------------
1186
- // Section 1: Invisible Character Detection in Prompts
1187
- // Blocks prompts containing suspicious invisible Unicode patterns.
520
+ // Section 1: Shell Blocking (opt-in — inactive by default)
521
+ // Blocks all shell/command execution tools. Enable for high-security
522
+ // environments where shell access is explicitly prohibited.
523
+ // WARNING: Blocks ALL shell use including safe commands (git, npm, echo).
524
+ // Ref: OWASP LLM06, MITRE T1059
1188
525
  // ---------------------------------------------------------------------------
1189
526
 
1190
- // Block prompts with invisible characters above risk threshold
1191
- @id("encoding-block-invisible-prompt")
1192
- @name("Block invisible characters in prompts")
1193
- @description("Block prompts when invisible Unicode characters are detected with a risk score >= 50. Invisible chars (zero-width joiners, bidi overrides, tag characters) can hide malicious instructions from human review while being processed by AI models. Ref: EchoLeak CVE-2025-32711.")
1194
- @severity("high")
1195
- @tags("unicode,invisible-chars,encoding,owasp-llm01,owasp-asi01,mitre-aml-t0051,nist-si-10")
1196
- @reject_message("Your prompt was blocked because suspicious invisible Unicode characters were detected. These characters (zero-width, bidirectional overrides, tag characters) can be used to hide malicious instructions. Please remove non-visible characters and resubmit.")
527
+ // Block shell and command execution tools
528
+ @id("tools-block-shell")
529
+ @name("Block shell and command execution")
530
+ @description("Block direct shell, bash, and command execution tools. Unrestricted shell access enables command injection, data exfiltration, and arbitrary code execution. INACTIVE BY DEFAULT — enable for high-security environments where all shell access is prohibited.")
531
+ @severity("critical")
532
+ @tags("shell,execution,nist-cm-7,mitre-t1059,owasp-llm06,opt-in")
533
+ @reject_message("Tool blocked: shell/command execution is restricted in this environment. Use specific, scoped tools instead.")
1197
534
  forbid (
1198
535
  principal,
1199
- action == Overwatch::Action::"process_prompt",
536
+ action == Overwatch::Action::"call_tool",
1200
537
  resource
1201
538
  )
1202
539
  when {
1203
- context has contains_invisible_chars && context.contains_invisible_chars &&
1204
- context has invisible_chars_score && context.invisible_chars_score >= 50
540
+ context has tool_name &&
541
+ (context.tool_name == "shell" ||
542
+ context.tool_name == "bash" ||
543
+ context.tool_name == "sh" ||
544
+ context.tool_name == "terminal" ||
545
+ context.tool_name == "cmd" ||
546
+ context.tool_name == "powershell")
1205
547
  };
1206
548
 
1207
549
  // ---------------------------------------------------------------------------
1208
- // Section 2: Invisible Characters in Tool Calls
1209
- // Any invisible characters in tool arguments are suspicious — tool args
1210
- // should be plain text/JSON. Lower threshold than prompts.
550
+ // Section 2: Destructive Operations (opt-in — inactive by default)
551
+ // Blocks file deletion tools. Enable when agents should not have delete access.
552
+ // NOTE: Only matches MCP tool names, not Bash rm commands (which use
553
+ // tool_name "shell"). Bash destructive commands are caught by YARA
554
+ // command_injection rule in semantic.cedar.
1211
555
  // ---------------------------------------------------------------------------
1212
556
 
1213
- // Block tool calls with any invisible characters
1214
- @id("encoding-block-invisible-tool")
1215
- @name("Block invisible characters in tool calls")
1216
- @description("Block tool execution when invisible Unicode characters are detected in tool arguments or content. Tool arguments should be plain text/JSON — invisible characters in tool calls are almost certainly malicious payload injection.")
1217
- @severity("critical")
1218
- @tags("unicode,invisible-chars,tools,encoding,owasp-asi01,owasp-asi02")
1219
- @reject_message("Tool execution blocked: invisible Unicode characters detected in tool arguments. Tool calls should contain only plain text — invisible characters indicate payload injection or encoding evasion.")
557
+ // Block destructive file operations
558
+ @id("tools-block-destructive-ops")
559
+ @name("Block destructive file operations")
560
+ @description("Block file deletion and directory removal tools. INACTIVE BY DEFAULT — enable when agents should not have delete access. Only matches MCP tool names; Bash rm commands go through 'shell' tool name.")
561
+ @severity("high")
562
+ @tags("file,delete,destructive,nist-ac-3,owasp-asi02,opt-in")
563
+ @reject_message("Tool blocked: destructive file operations (delete, rmdir) are restricted. Request human approval for destructive actions.")
1220
564
  forbid (
1221
565
  principal,
1222
566
  action == Overwatch::Action::"call_tool",
1223
567
  resource
1224
568
  )
1225
569
  when {
1226
- context has contains_invisible_chars && context.contains_invisible_chars
570
+ context has tool_name &&
571
+ (context.tool_name == "fs.delete" ||
572
+ context.tool_name == "fs.rmdir" ||
573
+ context.tool_name == "fs.unlink" ||
574
+ context.tool_name == "fs.remove" ||
575
+ context.tool_name == "delete_file" ||
576
+ context.tool_name == "remove_directory")
1227
577
  };
1228
578
 
1229
579
  // ---------------------------------------------------------------------------
1230
- // Section 3: Invisible Characters in File Operations
1231
- // Blocks file reads/writes with encoding attacks to prevent persistence
1232
- // of invisible payloads in the codebase.
580
+ // Section 3: Sensitive System Path Protection (active)
581
+ // Blocks file read/write access to system directories.
582
+ // NOTE: Targets read_file/write_file only — NOT call_tool. The path field
583
+ // is empty for Bash commands. Bash access to system files is caught by
584
+ // YARA rules in semantic.cedar (command_injection, path_traversal).
585
+ // Ref: MITRE T1005, T1552
1233
586
  // ---------------------------------------------------------------------------
1234
587
 
1235
- // Block file writes with invisible characters
1236
- @id("encoding-block-invisible-file-write")
1237
- @name("Block invisible characters in file writes")
1238
- @description("Block file writes when invisible Unicode characters are detected. Prevents persistence of invisible payloads in source code, config files, or documentation where they could later be processed by AI agents. Ref: Rules File Backdoor attack (Pillar Security).")
588
+ // Block access to system directories
589
+ @id("tools-block-system-paths")
590
+ @name("Block system directory access")
591
+ @description("Block file read/write to sensitive system directories. Includes Linux (/etc, /proc, /sys, /root, /var) and macOS (/private/etc, /Library, /System) paths. Only applies to Read/Write/Edit tools.")
1239
592
  @severity("high")
1240
- @tags("unicode,invisible-chars,file-write,encoding,owasp-asi01")
1241
- @reject_message("File write blocked: invisible Unicode characters detected in content. Writing invisible characters to files can create persistent backdoors that affect AI agents processing those files later.")
593
+ @tags("file,path,system,nist-ac-6,mitre-t1005")
594
+ @reject_message("Blocked: access to sensitive system directory. AI agents are restricted from system configuration and process directories.")
1242
595
  forbid (
1243
596
  principal,
1244
- action == Overwatch::Action::"write_file",
597
+ action in [Overwatch::Action::"read_file", Overwatch::Action::"write_file"],
1245
598
  resource
1246
599
  )
1247
600
  when {
1248
- context has contains_invisible_chars && context.contains_invisible_chars
601
+ context has path &&
602
+ (context.path like "/etc/*" ||
603
+ context.path like "/proc/*" ||
604
+ context.path like "/sys/*" ||
605
+ context.path like "/root/*" ||
606
+ context.path like "/var/log/*" ||
607
+ context.path like "/var/run/*" ||
608
+ context.path like "/private/etc/*" ||
609
+ context.path like "/private/var/*" ||
610
+ context.path like "/Library/*" ||
611
+ context.path like "/System/*")
1249
612
  };
1250
613
 
1251
- // Block MCP server connections with invisible characters in config
1252
- @id("encoding-block-invisible-server")
1253
- @name("Block invisible characters in server config")
1254
- @description("Block MCP server connections when invisible Unicode characters are detected in server configuration or responses. Invisible chars in server data indicate a compromised or malicious MCP server.")
1255
- @severity("critical")
1256
- @tags("unicode,invisible-chars,mcp,encoding,owasp-mcp02,owasp-asi04")
1257
- @reject_message("MCP server connection blocked: invisible Unicode characters detected in server data. This may indicate a compromised MCP server using encoding attacks to inject hidden instructions.")
614
+ // ---------------------------------------------------------------------------
615
+ // Section 4: Threat-Based Tool Blocking (active)
616
+ // Blocks tool calls based on threat severity from detection engines.
617
+ // This is the primary catch-all — any YARA rule with severity HIGH (3)
618
+ // or CRITICAL (4) triggers this. Provides defense-in-depth behind
619
+ // specific YARA rule policies in semantic.cedar and secrets.cedar.
620
+ // ---------------------------------------------------------------------------
621
+
622
+ // Block tool calls with high/critical severity threats
623
+ @id("tools-block-high-severity")
624
+ @name("Block tool calls with high severity threats")
625
+ @description("Block tool execution when YARA or Javelin detects threats with severity >= HIGH (3). Primary catch-all defense — any YARA rule with severity HIGH or CRITICAL triggers this.")
626
+ @severity("high")
627
+ @tags("tools,threats,severity,defense-in-depth")
628
+ @reject_message("Tool execution blocked: high or critical severity threats detected in content by security scanners.")
1258
629
  forbid (
1259
630
  principal,
1260
- action == Overwatch::Action::"connect_server",
631
+ action == Overwatch::Action::"call_tool",
1261
632
  resource
1262
633
  )
1263
634
  when {
1264
- context has contains_invisible_chars && context.contains_invisible_chars
635
+ context has max_threat_severity && context.max_threat_severity >= 3
1265
636
  };
1266
637
  `;
1267
- const OVERWATCH_BEHAVIORAL_DEFAULT_CEDAR = `// =============================================================================
1268
- // Behavioral Analysis Policy (Default)
638
+ const OVERWATCH_PII_DEFAULT_CEDAR = `// =============================================================================
639
+ // PII Detection Policy (Default)
1269
640
  // =============================================================================
1270
- // Detects and blocks suspicious agent behavioral patterns including tool call
1271
- // loops, data exfiltration sequences, credential theft chains, and destructive
1272
- // operation patterns. Operates on session-level behavioral signals rather
1273
- // than single-request content analysis.
641
+ // Detects and blocks personally identifiable information across prompts, tool
642
+ // calls, and file operations using YARA rule name matching and ML classifiers.
1274
643
  //
1275
- // Attack vectors:
1276
- // - Tool call loops: Agent stuck in retry loop or manipulation-induced recursion
1277
- // - Data exfiltration: Read sensitive data → send to external endpoint sequence
1278
- // - Secret exfiltration: Read credentials → curl/fetch external URL
1279
- // - Credential theft: Access .ssh/.aws → encode/compress → network tool
1280
- // - Destructive sequences: Bulk delete, permission changes, config overwrites
644
+ // Detection layers:
645
+ // 1. YARA rule triggers (detected_threats) — pattern-based, always available
1281
646
  //
1282
- // Ref: OWASP LLM10 (Unbounded Consumption) — loop/recursion attacks
1283
- // OWASP ASI02 (Tool Misuse) — tool abuse sequences
1284
- // OWASP ASI08 (Lack of Monitoring) — behavioral anomaly detection
1285
- // GlassWorm Attack (35,800+ installations, cross-agent propagation)
1286
- // MITRE ATLAS AML.T0080 (Memory Manipulation)
647
+ // YARA rules matched:
648
+ // pii_ssn — US Social Security Numbers (XXX-XX-XXXX)
649
+ // pii_credit_card — Credit card numbers (13-19 digits)
650
+ // pii_passport — Passport numbers (1-2 letters + 6-9 digits)
651
+ // pii_iban — International Bank Account Numbers
652
+ // pii_email — Email addresses
653
+ // pii_phone_us — US phone numbers
654
+ // pii_date_of_birth — Dates in MM/DD/YYYY format
655
+ // pii_ip_address — IPv4 addresses
1287
656
  //
1288
657
  // Compliance:
1289
- // OWASP LLM10 (Unbounded Consumption)
1290
- // OWASP ASI02 (Tool Misuse)
1291
- // OWASP ASI08 (Lack of Monitoring & Logging)
1292
- // MITRE ATLAS AML.T0080 (AI Memory Manipulation)
1293
- // MITRE ATT&CK T1041 (Exfiltration Over C2 Channel)
1294
- // NIST 800-53 AU-6 (Audit Review, Analysis, and Reporting)
658
+ // PCI DSS 3.4, 4.1 (Payment Card Data)
659
+ // GDPR Art. 32 (Security of Processing)
660
+ // HIPAA §164.312 (Technical Safeguards)
1295
661
  // NIST 800-53 SI-4 (Information System Monitoring)
662
+ // CCPA §1798.150 (Data Protection)
663
+ // OWASP LLM06 (Sensitive Information Disclosure)
1296
664
  //
1297
- // Category: behavioral
665
+ // Category: pii
1298
666
  // Namespace: Overwatch
1299
667
  // =============================================================================
1300
668
 
1301
669
  // ---------------------------------------------------------------------------
1302
- // Section 1: Tool Call Loop Detection
1303
- // Detects agents trapped in infinite loops — either through manipulation
1304
- // (adversarial prompt inducing repetitive behavior) or bugs (retry storms).
1305
- // Ref: OWASP LLM10 (Unbounded Consumption)
670
+ // Section 1: Critical PII — Identity Theft Risk
671
+ // SSNs and credit card numbers are the highest-risk PII types.
672
+ // Blocked across all actions (prompts, tool calls, file reads/writes).
1306
673
  // ---------------------------------------------------------------------------
1307
674
 
1308
- // Block tool calls in detected loops (5+ consecutive same-tool calls)
1309
- @id("behavioral-block-loop")
1310
- @name("Block tool call loops")
1311
- @description("Block tool execution when a loop is detected — 5 or more consecutive calls to the same tool. This indicates either adversarial manipulation inducing repetitive agent behavior or a bug causing retry storms. Both waste compute and can cause damage. Adjust the threshold (default 5) to match your workflow — lower for stricter enforcement, higher for agents that legitimately retry (OWASP LLM10).")
1312
- @severity("high")
1313
- @tags("loop-detection,behavioral,owasp-llm10,owasp-asi02,nist-si-4")
1314
- @reject_message("Tool execution blocked: repetitive tool call loop detected (5+ consecutive calls to the same tool). This may indicate adversarial manipulation or a system error. The agent session should be reviewed.")
675
+ // Block Social Security Numbers
676
+ @id("pii-block-ssn")
677
+ @name("Block Social Security Numbers")
678
+ @description("Block content containing SSN patterns (XXX-XX-XXXX). SSNs are high-value identity theft targets — exposure through AI agents is a critical privacy violation.")
679
+ @severity("critical")
680
+ @tags("pii,ssn,identity,yara,pci-dss,nist-si-4")
681
+ @reject_message("Content blocked: Social Security Number patterns detected. SSNs must never be processed through AI agents.")
1315
682
  forbid (
1316
683
  principal,
1317
- action == Overwatch::Action::"call_tool",
684
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool", Overwatch::Action::"read_file", Overwatch::Action::"write_file"],
1318
685
  resource
1319
686
  )
1320
687
  when {
1321
- context has loop_detected && context.loop_detected &&
1322
- context has loop_count && context.loop_count >= 5
688
+ context has detected_threats && context.detected_threats.contains("pii_ssn")
1323
689
  };
1324
690
 
1325
- // ---------------------------------------------------------------------------
1326
- // Section 2: Data Exfiltration Detection
1327
- // Detects sequences where an agent reads sensitive data and then attempts
1328
- // to send it to an external endpoint.
1329
- // Ref: GlassWorm attack, MITRE T1041
1330
- // ---------------------------------------------------------------------------
1331
-
1332
- // Block data exfiltration patterns
1333
- @id("behavioral-block-data-exfil")
1334
- @name("Block data exfiltration")
1335
- @description("Block tool execution when a data exfiltration pattern is detected — the agent reads sensitive local data (files, configs, source code) followed by a network operation sending data externally. This is the hallmark of autonomous agent compromise (GlassWorm, EchoLeak).")
691
+ // Block credit card numbers (PCI DSS compliance)
692
+ @id("pii-block-credit-card")
693
+ @name("Block credit card numbers")
694
+ @description("Block content containing credit card number patterns (13-19 digits). PCI DSS 3.4 requires PANs are rendered unreadable — AI agents must never process raw card numbers.")
1336
695
  @severity("critical")
1337
- @tags("data-exfiltration,behavioral,owasp-asi02,mitre-t1041,nist-si-4")
1338
- @reject_message("Tool execution blocked: data exfiltration pattern detected. The agent appears to be reading sensitive data and sending it to an external endpoint. This is a critical security event — the agent session has been terminated.")
696
+ @tags("pii,credit-card,payment,yara,pci-dss-3.4,pci-dss-4.1")
697
+ @reject_message("Content blocked: credit card number patterns detected. Sharing payment card data through AI agents violates PCI DSS. Use tokenized references instead.")
1339
698
  forbid (
1340
699
  principal,
1341
- action == Overwatch::Action::"call_tool",
700
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool", Overwatch::Action::"read_file", Overwatch::Action::"write_file"],
1342
701
  resource
1343
702
  )
1344
703
  when {
1345
- context has suspicious_pattern && context.suspicious_pattern &&
1346
- context has pattern_type && context.pattern_type == "data_exfiltration"
704
+ context has detected_threats && context.detected_threats.contains("pii_credit_card")
1347
705
  };
1348
706
 
1349
- // Block secret exfiltration (credential-specific exfil)
1350
- @id("behavioral-block-secret-exfil")
1351
- @name("Block secret exfiltration")
1352
- @description("Block tool execution when a secret exfiltration pattern is detected — the agent accesses credential files (.env, .aws, tokens) followed by a network tool call. This is a targeted credential theft sequence.")
1353
- @severity("critical")
1354
- @tags("secret-exfiltration,behavioral,owasp-asi02,mitre-t1552,mitre-t1041")
1355
- @reject_message("Tool execution blocked: secret exfiltration pattern detected. The agent accessed credential files and is attempting to send them externally. This is a targeted credential theft attack.")
707
+ // ---------------------------------------------------------------------------
708
+ // Section 2: High PII — Government & Financial Identifiers
709
+ // Passport numbers and IBANs are regulated identifiers with high
710
+ // identity theft and financial fraud risk.
711
+ // ---------------------------------------------------------------------------
712
+
713
+ // Block passport numbers
714
+ @id("pii-block-passport")
715
+ @name("Block passport numbers")
716
+ @description("Block content containing passport number patterns (1-2 letters + 6-9 digits). Passport numbers are government-issued identifiers with high identity theft risk.")
717
+ @severity("high")
718
+ @tags("pii,passport,identity,yara,gdpr-art-32")
719
+ @reject_message("Content blocked: passport number patterns detected. Government-issued identifiers must not be processed through AI agents.")
1356
720
  forbid (
1357
721
  principal,
1358
- action == Overwatch::Action::"call_tool",
722
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool", Overwatch::Action::"read_file", Overwatch::Action::"write_file"],
1359
723
  resource
1360
724
  )
1361
725
  when {
1362
- context has suspicious_pattern && context.suspicious_pattern &&
1363
- context has pattern_type && context.pattern_type == "secret_exfiltration"
726
+ context has detected_threats && context.detected_threats.contains("pii_passport")
1364
727
  };
1365
728
 
1366
- // Block credential theft chains
1367
- @id("behavioral-block-credential-theft")
1368
- @name("Block credential theft")
1369
- @description("Block tool execution when a credential theft chain is detected — accessing SSH keys, cloud credentials, or API tokens followed by encoding, compression, or transfer operations. Multi-step attack pattern for autonomous credential harvesting.")
1370
- @severity("critical")
1371
- @tags("credential-theft,behavioral,owasp-asi02,mitre-t1552,mitre-t1555")
1372
- @reject_message("Tool execution blocked: credential theft chain detected. The agent is performing a multi-step operation to harvest and exfiltrate credentials (SSH keys, cloud tokens, API keys). Session terminated.")
729
+ // Block IBAN (International Bank Account Numbers)
730
+ @id("pii-block-iban")
731
+ @name("Block bank account numbers")
732
+ @description("Block content containing IBAN patterns. Bank account numbers are sensitive financial identifiers that must not be exposed through AI agents.")
733
+ @severity("high")
734
+ @tags("pii,iban,financial,yara,gdpr-art-32,pci-dss")
735
+ @reject_message("Content blocked: bank account number (IBAN) patterns detected. Financial account numbers must not be processed through AI agents.")
1373
736
  forbid (
1374
737
  principal,
1375
- action == Overwatch::Action::"call_tool",
738
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool", Overwatch::Action::"read_file", Overwatch::Action::"write_file"],
1376
739
  resource
1377
740
  )
1378
741
  when {
1379
- context has suspicious_pattern && context.suspicious_pattern &&
1380
- context has pattern_type && context.pattern_type == "credential_theft"
742
+ context has detected_threats && context.detected_threats.contains("pii_iban")
1381
743
  };
1382
744
 
1383
745
  // ---------------------------------------------------------------------------
1384
- // Section 3: Destructive Sequence Detection
1385
- // Detects sequences of destructive operations that could damage the workspace.
746
+ // Section 3: Medium PII — Contact Information
747
+ // Email addresses and phone numbers. Only blocked in prompts and tool calls
748
+ // (not file ops — too common in source code, configs, and test fixtures).
1386
749
  // ---------------------------------------------------------------------------
1387
750
 
1388
- // Block destructive operation sequences
1389
- @id("behavioral-block-destructive-sequence")
1390
- @name("Block destructive sequences")
1391
- @description("Block tool execution when a destructive operation sequence is detected — bulk file deletions, permission changes, config overwrites, or repository manipulation patterns. Prevents agent-initiated workspace damage.")
1392
- @severity("critical")
1393
- @tags("destructive,behavioral,owasp-asi02,nist-si-4")
1394
- @reject_message("Tool execution blocked: destructive operation sequence detected. The agent is performing a pattern of destructive operations (bulk deletions, permission changes, config overwrites) that could damage the workspace.")
751
+ // Block email addresses
752
+ @id("pii-block-email")
753
+ @name("Block email addresses")
754
+ @description("Block prompts and tool calls containing email address patterns. Prevents accidental sharing of personal email addresses with AI agents.")
755
+ @severity("medium")
756
+ @tags("pii,email,contact,yara,gdpr-art-32")
757
+ @reject_message("Content blocked: email address patterns detected. Remove personal email addresses before submitting to AI agents.")
1395
758
  forbid (
1396
759
  principal,
1397
- action == Overwatch::Action::"call_tool",
760
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
761
+ resource
762
+ )
763
+ when {
764
+ context has detected_threats && context.detected_threats.contains("pii_email")
765
+ };
766
+
767
+ // Block US phone numbers
768
+ @id("pii-block-phone")
769
+ @name("Block phone numbers")
770
+ @description("Block prompts and tool calls containing US phone number patterns. Prevents accidental sharing of personal phone numbers with AI agents.")
771
+ @severity("medium")
772
+ @tags("pii,phone,contact,yara,ccpa")
773
+ @reject_message("Content blocked: phone number patterns detected. Remove personal phone numbers before submitting to AI agents.")
774
+ forbid (
775
+ principal,
776
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
1398
777
  resource
1399
778
  )
1400
779
  when {
1401
- context has suspicious_pattern && context.suspicious_pattern &&
1402
- context has pattern_type && context.pattern_type == "destructive_sequence"
780
+ context has detected_threats && context.detected_threats.contains("pii_phone_us")
781
+ };
782
+
783
+ // Block dates of birth
784
+ @id("pii-block-dob")
785
+ @name("Block dates of birth")
786
+ @description("Block prompts and tool calls containing date of birth patterns (MM/DD/YYYY). Date of birth combined with other identifiers enables identity theft.")
787
+ @severity("medium")
788
+ @tags("pii,dob,identity,yara,hipaa-164.312")
789
+ @reject_message("Content blocked: date of birth patterns detected. Remove personal dates before submitting to AI agents.")
790
+ forbid (
791
+ principal,
792
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
793
+ resource
794
+ )
795
+ when {
796
+ context has detected_threats && context.detected_threats.contains("pii_date_of_birth")
1403
797
  };
1404
798
 
1405
799
  // ---------------------------------------------------------------------------
1406
- // Section 4: Sequence Risk Scoring
1407
- // Uses computed sequence risk scores for dynamic behavioral assessment.
800
+ // Section 4: Low PII — Network Identifiers
801
+ // IP addresses are extremely common in code, configs, and logs.
802
+ // Only blocked in prompts to catch accidental data dumps.
1408
803
  // ---------------------------------------------------------------------------
1409
804
 
1410
- // Block high-risk behavioral sequences
1411
- @id("behavioral-block-high-risk-sequence")
1412
- @name("Block high-risk behavioral sequences")
1413
- @description("Block tool execution when the computed sequence risk score exceeds 80/100. The score aggregates behavioral signals including action history, tool combination analysis, and deviation from normal patterns. High scores indicate coordinated multi-step attacks.")
1414
- @severity("high")
1415
- @tags("sequence-risk,behavioral,dynamic,owasp-asi08,nist-au-6")
1416
- @reject_message("Tool execution blocked: high-risk behavioral sequence detected (risk score 80+). The pattern of agent actions indicates a coordinated attack. This session requires human review before continuing.")
805
+ // Block IP addresses in prompts
806
+ @id("pii-block-ip-address")
807
+ @name("Block IP addresses in prompts")
808
+ @description("Block prompts containing IPv4 address patterns. Only targets prompts — IP addresses are too common in source code and config files to block in tool calls or file operations.")
809
+ @severity("low")
810
+ @tags("pii,ip-address,network,yara")
811
+ @reject_message("Content blocked: IP address patterns detected in prompt. Remove network identifiers before submitting.")
1417
812
  forbid (
1418
813
  principal,
1419
- action == Overwatch::Action::"call_tool",
814
+ action == Overwatch::Action::"process_prompt",
1420
815
  resource
1421
816
  )
1422
817
  when {
1423
- context has sequence_risk && context.sequence_risk >= 80
818
+ context has detected_threats && context.detected_threats.contains("pii_ip_address")
1424
819
  };
820
+
1425
821
  `;
1426
822
  const OVERWATCH_TOOLS_MCP_ALLOWLIST_CEDAR = `// MCP Server Allowlist Template
1427
823
  // Only allow specific MCP servers to be used
@@ -1653,15 +1049,11 @@ when {
1653
1049
  // CATEGORIES
1654
1050
  // =============================================================================
1655
1051
  export const OVERWATCH_CATEGORIES = [
1656
- { id: 'secrets', name: 'Secrets Detection', description: 'Detect and block credentials, tokens, API keys, and sensitive key patterns in prompts, tool calls, and AI responses' },
1052
+ { id: 'secrets', name: 'Secrets Detection', description: 'Detect and block credentials, tokens, API keys, and sensitive key patterns in prompts, tool calls, and file operations' },
1657
1053
  { id: 'pii', name: 'PII Detection', description: 'Detect and block personally identifiable information (PII) such as credit card numbers, SSNs, and other sensitive data' },
1658
- { id: 'semantic', name: 'Semantic Threat Detection', description: 'Detect and block prompt injection, jailbreak attempts, and high-severity AI security threats' },
1054
+ { id: 'semantic', name: 'Semantic Threat Detection', description: 'Detect and block injection attacks, prompt injection, jailbreak attempts, and unsafe content using YARA rules and ML classifiers' },
1659
1055
  { id: 'tools', name: 'Tool Permissioning', description: 'Control access to shell execution, file operations, MCP servers, and sensitive system paths' },
1660
1056
  { id: 'organization', name: 'Organization Rules', description: 'Apply organization-wide policy baselines, team permissions, and agent-specific guardrails' },
1661
- { id: 'trust_safety', name: 'Content Safety', description: 'Detect and control violent, harmful, hateful, sexual, and profane content using trust/safety classification scores' },
1662
- { id: 'agent_security', name: 'Agent Security', description: 'Detect and block tool poisoning, rug pull attacks, indirect prompt injection, and MCP supply chain threats targeting AI agents' },
1663
- { id: 'encoding', name: 'Encoding & Unicode Attacks', description: 'Detect invisible Unicode characters, bidirectional text overrides, and encoded injection payloads used to hide malicious instructions' },
1664
- { id: 'behavioral', name: 'Behavioral Analysis', description: 'Detect suspicious action sequences, tool call loops, data exfiltration patterns, and credential theft chains across agent sessions' },
1665
1057
  ];
1666
1058
  // =============================================================================
1667
1059
  // DEFAULT POLICIES
@@ -1677,24 +1069,34 @@ export const OVERWATCH_DEFAULTS = [
1677
1069
  tags: ['baseline', 'permit-default', 'organization'],
1678
1070
  isActive: true,
1679
1071
  },
1072
+ {
1073
+ id: 'secrets-default',
1074
+ name: 'Secrets Detection',
1075
+ description: 'Detect and block credential leakage using YARA rule matching (secrets_leakage, ssh_key_exposure, pem_file_access, environment_variable_leakage) and sensitive file path protection',
1076
+ category: 'secrets',
1077
+ cedarText: OVERWATCH_SECRETS_DEFAULT_CEDAR,
1078
+ severity: 'critical',
1079
+ tags: ['secrets', 'credentials', 'aws', 'github', 'ssh', 'pem', 'yara', 'baseline'],
1080
+ isActive: true,
1081
+ },
1680
1082
  {
1681
1083
  id: 'semantic-default',
1682
1084
  name: 'Semantic Threat Detection',
1683
- description: 'Detect and block prompt injection, jailbreak attempts, and high-severity threats using detection rules and ML classifiers',
1085
+ description: 'Detect and block injection attacks (command, SQL, path traversal), prompt injection, jailbreak, and unsafe content (violence, hate speech, etc.) using YARA rules and ML classifiers',
1684
1086
  category: 'semantic',
1685
1087
  cedarText: OVERWATCH_SEMANTIC_DEFAULT_CEDAR,
1686
1088
  severity: 'critical',
1687
- tags: ['prompt-injection', 'jailbreak', 'owasp-llm01', 'owasp-llm02', 'security', 'baseline'],
1089
+ tags: ['injection', 'jailbreak', 'content-safety', 'yara', 'ml', 'owasp-llm01', 'owasp-llm02', 'baseline'],
1688
1090
  isActive: true,
1689
1091
  },
1690
1092
  {
1691
- id: 'trust-safety-default',
1692
- name: 'Content Safety',
1693
- description: 'Detect and block violent, harmful, hateful, sexual, and profane content using ML classification scores',
1694
- category: 'trust_safety',
1695
- cedarText: OVERWATCH_TRUST_SAFETY_DEFAULT_CEDAR,
1696
- severity: 'critical',
1697
- tags: ['violence', 'weapons', 'hate-speech', 'crime', 'sexual', 'profanity', 'content-safety', 'baseline'],
1093
+ id: 'tools-default',
1094
+ name: 'Tool Permissioning',
1095
+ description: 'Block access to sensitive system paths and tool calls with high-severity threats. Includes opt-in shell blocking and destructive operation blocking.',
1096
+ category: 'tools',
1097
+ cedarText: OVERWATCH_TOOLS_DEFAULT_CEDAR,
1098
+ severity: 'high',
1099
+ tags: ['tools', 'file-access', 'system-paths', 'severity', 'baseline'],
1698
1100
  isActive: true,
1699
1101
  },
1700
1102
  ];
@@ -1702,15 +1104,6 @@ export const OVERWATCH_DEFAULTS = [
1702
1104
  // ALL TEMPLATES
1703
1105
  // =============================================================================
1704
1106
  export const OVERWATCH_TEMPLATES = [
1705
- {
1706
- id: 'secrets-default',
1707
- name: 'Secrets Detection',
1708
- description: 'Detect and block credential leakage across prompts, tool calls, file operations, and AI responses using multi-layered detection',
1709
- category: 'secrets',
1710
- cedarText: OVERWATCH_SECRETS_DEFAULT_CEDAR,
1711
- severity: 'critical',
1712
- tags: ['api-keys', 'tokens', 'credentials', 'aws', 'github', 'ssh', 'baseline'],
1713
- },
1714
1107
  {
1715
1108
  id: 'pii-default',
1716
1109
  name: 'PII Detection',
@@ -1720,42 +1113,6 @@ export const OVERWATCH_TEMPLATES = [
1720
1113
  severity: 'critical',
1721
1114
  tags: ['pii', 'privacy', 'compliance', 'pci-dss', 'gdpr', 'hipaa', 'baseline'],
1722
1115
  },
1723
- {
1724
- id: 'tools-default',
1725
- name: 'Tool Permissioning',
1726
- description: 'Block dangerous shell execution, restrict sensitive file paths, enforce tool risk scoring, and detect command injection in tool arguments',
1727
- category: 'tools',
1728
- cedarText: OVERWATCH_TOOLS_DEFAULT_CEDAR,
1729
- severity: 'critical',
1730
- tags: ['shell', 'command-injection', 'file-access', 'tool-risk', 'mitre-t1059', 'owasp-llm06', 'baseline'],
1731
- },
1732
- {
1733
- id: 'agent-security-default',
1734
- name: 'Agent Security',
1735
- description: 'Detect and block tool poisoning, rug pull attacks, indirect prompt injection, and MCP supply chain threats via Shield detection',
1736
- category: 'agent_security',
1737
- cedarText: OVERWATCH_AGENT_SECURITY_DEFAULT_CEDAR,
1738
- severity: 'critical',
1739
- tags: ['tool-poisoning', 'rug-pull', 'indirect-injection', 'mcp-security', 'agent-security', 'owasp-asi01', 'owasp-asi04', 'baseline'],
1740
- },
1741
- {
1742
- id: 'encoding-default',
1743
- name: 'Encoding Attack Detection',
1744
- description: 'Detect and block invisible Unicode characters, bidirectional overrides, and encoding-based injection attacks across prompts, tools, and files',
1745
- category: 'encoding',
1746
- cedarText: OVERWATCH_ENCODING_DEFAULT_CEDAR,
1747
- severity: 'high',
1748
- tags: ['unicode', 'invisible-chars', 'bidi-override', 'encoding', 'owasp-llm01', 'baseline'],
1749
- },
1750
- {
1751
- id: 'behavioral-default',
1752
- name: 'Behavioral Analysis',
1753
- description: 'Detect and block tool call loops, data exfiltration sequences, credential theft chains, and destructive operation patterns',
1754
- category: 'behavioral',
1755
- cedarText: OVERWATCH_BEHAVIORAL_DEFAULT_CEDAR,
1756
- severity: 'high',
1757
- tags: ['loop-detection', 'data-exfiltration', 'credential-theft', 'behavioral', 'owasp-llm10', 'owasp-asi02', 'baseline'],
1758
- },
1759
1116
  {
1760
1117
  id: 'tools-mcp-allowlist',
1761
1118
  name: 'MCP Server Allowlist',
@@ -1817,13 +1174,13 @@ export const OVERWATCH_TEMPLATES = [
1817
1174
  /** Raw templates.json metadata for the Overwatch service. */
1818
1175
  export const OVERWATCH_TEMPLATES_JSON = `{
1819
1176
  "service": "overwatch",
1820
- "version": "3.0.0",
1177
+ "version": "4.0.0",
1821
1178
  "description": "Overwatch policy templates for IDE agent security",
1822
1179
  "categories": [
1823
1180
  {
1824
1181
  "id": "secrets",
1825
1182
  "name": "Secrets Detection",
1826
- "description": "Detect and block credentials, tokens, API keys, and sensitive key patterns in prompts, tool calls, and AI responses"
1183
+ "description": "Detect and block credentials, tokens, API keys, and sensitive key patterns in prompts, tool calls, and file operations"
1827
1184
  },
1828
1185
  {
1829
1186
  "id": "pii",
@@ -1833,7 +1190,7 @@ export const OVERWATCH_TEMPLATES_JSON = `{
1833
1190
  {
1834
1191
  "id": "semantic",
1835
1192
  "name": "Semantic Threat Detection",
1836
- "description": "Detect and block prompt injection, jailbreak attempts, and high-severity AI security threats"
1193
+ "description": "Detect and block injection attacks, prompt injection, jailbreak attempts, and unsafe content using YARA rules and ML classifiers"
1837
1194
  },
1838
1195
  {
1839
1196
  "id": "tools",
@@ -1844,26 +1201,6 @@ export const OVERWATCH_TEMPLATES_JSON = `{
1844
1201
  "id": "organization",
1845
1202
  "name": "Organization Rules",
1846
1203
  "description": "Apply organization-wide policy baselines, team permissions, and agent-specific guardrails"
1847
- },
1848
- {
1849
- "id": "trust_safety",
1850
- "name": "Content Safety",
1851
- "description": "Detect and control violent, harmful, hateful, sexual, and profane content using trust/safety classification scores"
1852
- },
1853
- {
1854
- "id": "agent_security",
1855
- "name": "Agent Security",
1856
- "description": "Detect and block tool poisoning, rug pull attacks, indirect prompt injection, and MCP supply chain threats targeting AI agents"
1857
- },
1858
- {
1859
- "id": "encoding",
1860
- "name": "Encoding & Unicode Attacks",
1861
- "description": "Detect invisible Unicode characters, bidirectional text overrides, and encoded injection payloads used to hide malicious instructions"
1862
- },
1863
- {
1864
- "id": "behavioral",
1865
- "name": "Behavioral Analysis",
1866
- "description": "Detect suspicious action sequences, tool call loops, data exfiltration patterns, and credential theft chains across agent sessions"
1867
1204
  }
1868
1205
  ],
1869
1206
  "defaults": [
@@ -1877,37 +1214,38 @@ export const OVERWATCH_TEMPLATES_JSON = `{
1877
1214
  "tags": ["baseline", "permit-default", "organization"],
1878
1215
  "is_active": true
1879
1216
  },
1217
+ {
1218
+ "id": "secrets-default",
1219
+ "name": "Secrets Detection",
1220
+ "description": "Detect and block credential leakage using YARA rule matching (secrets_leakage, ssh_key_exposure, pem_file_access, environment_variable_leakage) and sensitive file path protection",
1221
+ "category": "secrets",
1222
+ "file": "defaults/secrets.cedar",
1223
+ "severity": "critical",
1224
+ "tags": ["secrets", "credentials", "aws", "github", "ssh", "pem", "yara", "baseline"],
1225
+ "is_active": true
1226
+ },
1880
1227
  {
1881
1228
  "id": "semantic-default",
1882
1229
  "name": "Semantic Threat Detection",
1883
- "description": "Detect and block prompt injection, jailbreak attempts, and high-severity threats using detection rules and ML classifiers",
1230
+ "description": "Detect and block injection attacks (command, SQL, path traversal), prompt injection, jailbreak, and unsafe content (violence, hate speech, etc.) using YARA rules and ML classifiers",
1884
1231
  "category": "semantic",
1885
1232
  "file": "defaults/semantic.cedar",
1886
1233
  "severity": "critical",
1887
- "tags": ["prompt-injection", "jailbreak", "owasp-llm01", "owasp-llm02", "security", "baseline"],
1234
+ "tags": ["injection", "jailbreak", "content-safety", "yara", "ml", "owasp-llm01", "owasp-llm02", "baseline"],
1888
1235
  "is_active": true
1889
1236
  },
1890
1237
  {
1891
- "id": "trust-safety-default",
1892
- "name": "Content Safety",
1893
- "description": "Detect and block violent, harmful, hateful, sexual, and profane content using ML classification scores",
1894
- "category": "trust_safety",
1895
- "file": "defaults/trust_safety.cedar",
1896
- "severity": "critical",
1897
- "tags": ["violence", "weapons", "hate-speech", "crime", "sexual", "profanity", "content-safety", "baseline"],
1238
+ "id": "tools-default",
1239
+ "name": "Tool Permissioning",
1240
+ "description": "Block access to sensitive system paths and tool calls with high-severity threats. Includes opt-in shell blocking and destructive operation blocking.",
1241
+ "category": "tools",
1242
+ "file": "defaults/tools.cedar",
1243
+ "severity": "high",
1244
+ "tags": ["tools", "file-access", "system-paths", "severity", "baseline"],
1898
1245
  "is_active": true
1899
1246
  }
1900
1247
  ],
1901
1248
  "templates": [
1902
- {
1903
- "id": "secrets-default",
1904
- "name": "Secrets Detection",
1905
- "description": "Detect and block credential leakage across prompts, tool calls, file operations, and AI responses using multi-layered detection",
1906
- "category": "secrets",
1907
- "file": "defaults/secrets.cedar",
1908
- "severity": "critical",
1909
- "tags": ["api-keys", "tokens", "credentials", "aws", "github", "ssh", "baseline"]
1910
- },
1911
1249
  {
1912
1250
  "id": "pii-default",
1913
1251
  "name": "PII Detection",
@@ -1917,42 +1255,6 @@ export const OVERWATCH_TEMPLATES_JSON = `{
1917
1255
  "severity": "critical",
1918
1256
  "tags": ["pii", "privacy", "compliance", "pci-dss", "gdpr", "hipaa", "baseline"]
1919
1257
  },
1920
- {
1921
- "id": "tools-default",
1922
- "name": "Tool Permissioning",
1923
- "description": "Block dangerous shell execution, restrict sensitive file paths, enforce tool risk scoring, and detect command injection in tool arguments",
1924
- "category": "tools",
1925
- "file": "defaults/tools.cedar",
1926
- "severity": "critical",
1927
- "tags": ["shell", "command-injection", "file-access", "tool-risk", "mitre-t1059", "owasp-llm06", "baseline"]
1928
- },
1929
- {
1930
- "id": "agent-security-default",
1931
- "name": "Agent Security",
1932
- "description": "Detect and block tool poisoning, rug pull attacks, indirect prompt injection, and MCP supply chain threats via Shield detection",
1933
- "category": "agent_security",
1934
- "file": "defaults/agent_security.cedar",
1935
- "severity": "critical",
1936
- "tags": ["tool-poisoning", "rug-pull", "indirect-injection", "mcp-security", "agent-security", "owasp-asi01", "owasp-asi04", "baseline"]
1937
- },
1938
- {
1939
- "id": "encoding-default",
1940
- "name": "Encoding Attack Detection",
1941
- "description": "Detect and block invisible Unicode characters, bidirectional overrides, and encoding-based injection attacks across prompts, tools, and files",
1942
- "category": "encoding",
1943
- "file": "defaults/encoding_attacks.cedar",
1944
- "severity": "high",
1945
- "tags": ["unicode", "invisible-chars", "bidi-override", "encoding", "owasp-llm01", "baseline"]
1946
- },
1947
- {
1948
- "id": "behavioral-default",
1949
- "name": "Behavioral Analysis",
1950
- "description": "Detect and block tool call loops, data exfiltration sequences, credential theft chains, and destructive operation patterns",
1951
- "category": "behavioral",
1952
- "file": "defaults/behavioral.cedar",
1953
- "severity": "high",
1954
- "tags": ["loop-detection", "data-exfiltration", "credential-theft", "behavioral", "owasp-llm10", "owasp-asi02", "baseline"]
1955
- },
1956
1258
  {
1957
1259
  "id": "tools-mcp-allowlist",
1958
1260
  "name": "MCP Server Allowlist",