@highflame/policy 2.1.2 → 2.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -32,6 +32,307 @@ permit (
32
32
  resource
33
33
  );
34
34
  `;
35
+ const OVERWATCH_SEMANTIC_DEFAULT_CEDAR = `// =============================================================================
36
+ // Semantic Threat Detection Policy (Default)
37
+ // =============================================================================
38
+ // Detects and blocks prompt injection, jailbreak attempts, and high-severity
39
+ // AI security threats. Uses multi-layered detection:
40
+ //
41
+ // 1. Detection engine rule triggers (detected_threats) — pattern-based
42
+ // 2. ML classifier confidence scores (injection_confidence, jailbreak_confidence)
43
+ // 3. Threat severity aggregation (max_threat_severity, highest_severity)
44
+ // 4. Cross-action enforcement (prompts + tool calls + file operations)
45
+ //
46
+ // Compliance:
47
+ // OWASP LLM01 (Prompt Injection) — direct + indirect
48
+ // OWASP LLM02 (Insecure Output Handling) — response manipulation
49
+ // OWASP ASI01 (Agent Goal Hijack) — behavioral manipulation
50
+ // MITRE ATLAS AML.T0051 (LLM Prompt Injection)
51
+ // MITRE ATLAS AML.T0054 (LLM Jailbreak)
52
+ // NIST 800-53 SI-3 (Malicious Code Protection)
53
+ // NIST 800-53 SI-4 (Information System Monitoring)
54
+ //
55
+ // Category: semantic
56
+ // Namespace: Overwatch
57
+ // =============================================================================
58
+
59
+ // ---------------------------------------------------------------------------
60
+ // Section 1: Prompt Injection Detection
61
+ // Blocks direct prompt injection — adversarial input designed to override
62
+ // system instructions and hijack agent behavior.
63
+ // Ref: OWASP LLM01, MITRE AML.T0051, 62% of LLM apps vulnerable (2024)
64
+ // ---------------------------------------------------------------------------
65
+
66
+ // Block content with prompt injection patterns detected by rules
67
+ @id("semantic-block-injection")
68
+ @name("Block prompt injection")
69
+ @description("Block prompts and tool calls when detection engine rules identify prompt injection patterns. Catches instruction override, role assumption, delimiter injection, and other manipulation techniques in both user input and tool arguments (OWASP LLM01).")
70
+ @severity("critical")
71
+ @tags("injection,security,owasp-llm01,mitre-aml-t0051,baseline")
72
+ @reject_message("Content was blocked because prompt injection patterns were detected. This is a security measure to prevent manipulation of AI agent behavior (OWASP LLM01).")
73
+ forbid (
74
+ principal,
75
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
76
+ resource
77
+ )
78
+ when {
79
+ context has detected_threats && context.detected_threats.contains("prompt_injection")
80
+ };
81
+
82
+ // Block content with high ML injection confidence
83
+ @id("semantic-block-injection-score")
84
+ @name("Block high-confidence injection")
85
+ @description("Block content when the ML injection classifier confidence exceeds threshold (75/100). Catches novel injection techniques that evade pattern-based detection — polymorphic payloads, encoding tricks, and obfuscated instructions.")
86
+ @severity("critical")
87
+ @tags("injection,ml-classifier,security,owasp-llm01,mitre-aml-t0051")
88
+ @reject_message("Your content was blocked because the ML classifier detected prompt injection with high confidence. This appears to be an attempt to manipulate agent behavior.")
89
+ forbid (
90
+ principal,
91
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
92
+ resource
93
+ )
94
+ when {
95
+ context has injection_confidence && context.injection_confidence >= 75
96
+ };
97
+
98
+ // ---------------------------------------------------------------------------
99
+ // Section 2: Jailbreak Detection
100
+ // Blocks jailbreak attempts — adversarial input designed to bypass AI safety
101
+ // guardrails and elicit restricted outputs.
102
+ // Ref: OWASP LLM02, MITRE AML.T0054, DAN/JailbreakChat/etc.
103
+ // ---------------------------------------------------------------------------
104
+
105
+ // Block prompts with jailbreak attempts detected by rules
106
+ @id("semantic-block-jailbreak")
107
+ @name("Block jailbreak attempts")
108
+ @description("Block prompts when detection engine rules identify jailbreak patterns: DAN-style prompts, role-play exploits, safety bypass instructions, and constraint removal attempts (OWASP LLM02).")
109
+ @severity("critical")
110
+ @tags("jailbreak,bypass,security,owasp-llm02,mitre-aml-t0054,baseline")
111
+ @reject_message("Your prompt was blocked because jailbreak patterns were detected. This is a security measure to prevent circumvention of AI safety controls (OWASP LLM02).")
112
+ forbid (
113
+ principal,
114
+ action == Overwatch::Action::"process_prompt",
115
+ resource
116
+ )
117
+ when {
118
+ context has detected_threats && context.detected_threats.contains("jailbreak")
119
+ };
120
+
121
+ // Block prompts with high ML jailbreak confidence
122
+ @id("semantic-block-jailbreak-score")
123
+ @name("Block high-confidence jailbreak")
124
+ @description("Block content when the ML jailbreak classifier confidence exceeds threshold (75/100). Catches sophisticated jailbreak techniques including multi-turn manipulation, encoded payloads, and novel prompt structures.")
125
+ @severity("critical")
126
+ @tags("jailbreak,ml-classifier,security,owasp-llm02,mitre-aml-t0054")
127
+ @reject_message("Your content was blocked because the ML classifier detected a jailbreak attempt with high confidence. This appears to be an attempt to bypass safety guardrails.")
128
+ forbid (
129
+ principal,
130
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
131
+ resource
132
+ )
133
+ when {
134
+ context has jailbreak_confidence && context.jailbreak_confidence >= 75
135
+ };
136
+
137
+ // ---------------------------------------------------------------------------
138
+ // Section 3: Threat Severity Aggregation
139
+ // Blocks based on aggregated threat severity from all detection engines.
140
+ // These act as catch-all rules for threats that don't match specific patterns.
141
+ // ---------------------------------------------------------------------------
142
+
143
+ // Block any content with critical severity threats
144
+ @id("semantic-block-critical")
145
+ @name("Block critical threats")
146
+ @description("Block all content when any detection engine reports critical severity. This is the ultimate catch-all — critical threats are blocked regardless of type or source.")
147
+ @severity("critical")
148
+ @tags("critical,baseline,security,catch-all")
149
+ @reject_message("Your content was blocked because security scanners detected a critical-severity threat. This content cannot be processed.")
150
+ forbid (
151
+ principal,
152
+ action,
153
+ resource
154
+ )
155
+ when {
156
+ context has highest_severity && context.highest_severity == "critical"
157
+ };
158
+
159
+ // Block prompts with high severity semantic threats
160
+ @id("semantic-block-high-severity")
161
+ @name("Block high severity threats")
162
+ @description("Block prompts when threat detection reports high severity (>= 3) in semantic categories. Catches threats that are individually below critical but collectively indicate adversarial intent.")
163
+ @severity("high")
164
+ @tags("semantic,severity,security,defense-in-depth")
165
+ @reject_message("Your prompt was blocked because security scanners detected high severity issues in the content. Review your prompt for manipulative or adversarial patterns.")
166
+ forbid (
167
+ principal,
168
+ action == Overwatch::Action::"process_prompt",
169
+ resource
170
+ )
171
+ when {
172
+ context has threat_categories && context has max_threat_severity &&
173
+ context.threat_categories.contains("semantic") &&
174
+ context.max_threat_severity >= 3
175
+ };
176
+
177
+ // Block tool calls with multiple concurrent threats
178
+ @id("semantic-block-multi-threat-tools")
179
+ @name("Block multi-threat tool calls")
180
+ @description("Block tool execution when multiple distinct threats are detected simultaneously (3+). Multiple concurrent threats in a tool call strongly indicate an adversarial attack chain.")
181
+ @severity("high")
182
+ @tags("multi-threat,tools,security,defense-in-depth")
183
+ @reject_message("Tool execution was blocked because multiple security threats were detected simultaneously. This pattern indicates a potential attack chain.")
184
+ forbid (
185
+ principal,
186
+ action == Overwatch::Action::"call_tool",
187
+ resource
188
+ )
189
+ when {
190
+ context has threat_count && context.threat_count >= 3
191
+ };
192
+ `;
193
+ const OVERWATCH_TRUST_SAFETY_DEFAULT_CEDAR = `// =============================================================================
194
+ // Content Safety Policy (Default)
195
+ // =============================================================================
196
+ // Detects and blocks violent, harmful, hateful, sexual, and profane content
197
+ // using trust/safety ML classification scores.
198
+ //
199
+ // The detection engine runs multiple ML classifiers (toxicity, content safety)
200
+ // and produces normalized scores (0-100) for each category. These policies
201
+ // enforce enterprise-appropriate content thresholds.
202
+ //
203
+ // Compliance:
204
+ // NIST 800-53 SI-4 (Information System Monitoring)
205
+ // EU AI Act Art. 52 (Transparency for AI Systems)
206
+ // OWASP LLM02 (Insecure Output Handling) — unsafe content generation
207
+ // ISO 42001 (AI Management System) — responsible AI use
208
+ //
209
+ // Category: trust_safety
210
+ // Namespace: Overwatch
211
+ // =============================================================================
212
+
213
+ // ---------------------------------------------------------------------------
214
+ // Section 1: Violence & Weapons
215
+ // Blocks content promoting, describing, or instructing violence and weapons.
216
+ // ---------------------------------------------------------------------------
217
+
218
+ // Block content with high violence score
219
+ @id("ts-block-violence")
220
+ @name("Block violent content")
221
+ @description("Block content when the ML violence detection score exceeds threshold (80/100). Catches graphic violence descriptions, instructions for causing harm, and violent threat language.")
222
+ @severity("critical")
223
+ @tags("violence,content-safety,trust-safety,nist-si-4,iso-42001")
224
+ @reject_message("Content blocked: violent content detected. AI agents in enterprise environments must not process or generate violent content. Please rephrase without violence-related language.")
225
+ forbid (
226
+ principal,
227
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
228
+ resource
229
+ )
230
+ when {
231
+ context has violence_score && context.violence_score >= 80
232
+ };
233
+
234
+ // Block content with weapons references
235
+ @id("ts-block-weapons")
236
+ @name("Block weapons content")
237
+ @description("Block content when the ML weapons detection score exceeds threshold (80/100). Catches weapon manufacturing instructions, procurement guidance, and detailed weapon specifications.")
238
+ @severity("critical")
239
+ @tags("weapons,content-safety,trust-safety,nist-si-4")
240
+ @reject_message("Content blocked: weapons-related content detected. AI agents must not process weapons manufacturing, procurement, or detailed specification content.")
241
+ forbid (
242
+ principal,
243
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
244
+ resource
245
+ )
246
+ when {
247
+ context has weapons_score && context.weapons_score >= 80
248
+ };
249
+
250
+ // ---------------------------------------------------------------------------
251
+ // Section 2: Hate Speech & Discrimination
252
+ // Blocks hateful, discriminatory, and dehumanizing content.
253
+ // Lower threshold (75) — enterprises have zero tolerance for hate speech.
254
+ // ---------------------------------------------------------------------------
255
+
256
+ // Block hate speech content
257
+ @id("ts-block-hate-speech")
258
+ @name("Block hate speech")
259
+ @description("Block content when the ML hate speech detection score exceeds threshold (75/100). Lower threshold than other categories because enterprises have zero tolerance for discriminatory content. Catches slurs, dehumanizing language, and targeted harassment.")
260
+ @severity("critical")
261
+ @tags("hate-speech,content-safety,trust-safety,nist-si-4,eu-ai-act")
262
+ @reject_message("Content blocked: hate speech or discriminatory content detected. AI agents in enterprise environments must not process or generate hateful, discriminatory, or dehumanizing content.")
263
+ forbid (
264
+ principal,
265
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
266
+ resource
267
+ )
268
+ when {
269
+ context has hate_speech_score && context.hate_speech_score >= 75
270
+ };
271
+
272
+ // ---------------------------------------------------------------------------
273
+ // Section 3: Criminal Content
274
+ // Blocks content related to criminal activities and illegal operations.
275
+ // ---------------------------------------------------------------------------
276
+
277
+ // Block criminal content
278
+ @id("ts-block-crime")
279
+ @name("Block criminal content")
280
+ @description("Block content when the ML criminal activity detection score exceeds threshold (80/100). Catches illegal activity instructions, fraud techniques, and content promoting criminal behavior.")
281
+ @severity("high")
282
+ @tags("crime,content-safety,trust-safety,nist-si-4")
283
+ @reject_message("Content blocked: criminal activity content detected. AI agents must not process content related to illegal activities, fraud, or other criminal behavior.")
284
+ forbid (
285
+ principal,
286
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
287
+ resource
288
+ )
289
+ when {
290
+ context has crime_score && context.crime_score >= 80
291
+ };
292
+
293
+ // ---------------------------------------------------------------------------
294
+ // Section 4: Sexual Content
295
+ // Blocks sexually explicit or inappropriate content.
296
+ // ---------------------------------------------------------------------------
297
+
298
+ // Block sexual content
299
+ @id("ts-block-sexual")
300
+ @name("Block sexual content")
301
+ @description("Block content when the ML sexual content detection score exceeds threshold (80/100). Ensures enterprise AI agents do not process or generate sexually explicit material.")
302
+ @severity("high")
303
+ @tags("sexual,content-safety,trust-safety,eu-ai-act,iso-42001")
304
+ @reject_message("Content blocked: sexual content detected. AI agents in enterprise environments must not process sexually explicit material.")
305
+ forbid (
306
+ principal,
307
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
308
+ resource
309
+ )
310
+ when {
311
+ context has sexual_score && context.sexual_score >= 80
312
+ };
313
+
314
+ // ---------------------------------------------------------------------------
315
+ // Section 5: Profanity
316
+ // Blocks excessive profanity in enterprise environments.
317
+ // Higher threshold (90) — allows normal expression while blocking abuse.
318
+ // ---------------------------------------------------------------------------
319
+
320
+ // Block excessive profanity
321
+ @id("ts-block-profanity")
322
+ @name("Block profanity")
323
+ @description("Block content when the ML profanity detection score exceeds threshold (90/100). Higher threshold than other safety categories — allows normal expression while blocking abusive or harassing language patterns.")
324
+ @severity("medium")
325
+ @tags("profanity,content-safety,trust-safety")
326
+ @reject_message("Content blocked: excessive profanity detected. Please rephrase in a professional manner appropriate for enterprise AI interactions.")
327
+ forbid (
328
+ principal,
329
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
330
+ resource
331
+ )
332
+ when {
333
+ context has profanity_score && context.profanity_score >= 90
334
+ };
335
+ `;
35
336
  const OVERWATCH_SECRETS_DEFAULT_CEDAR = `// =============================================================================
36
337
  // Secrets Detection Policy (Default)
37
338
  // =============================================================================
@@ -439,172 +740,14 @@ when {
439
740
  @description("Block file reads and writes when PII is detected. Prevents agents from reading files containing personal data and from writing PII to new files where it could persist or be version-controlled.")
440
741
  @severity("high")
441
742
  @tags("pii,file-ops,data-protection,gdpr-art-32,nist-si-4")
442
- @reject_message("File operation blocked: personally identifiable information was detected. Files containing PII must not be read or written through AI agents.")
443
- forbid (
444
- principal,
445
- action in [Overwatch::Action::"read_file", Overwatch::Action::"write_file"],
446
- resource
447
- )
448
- when {
449
- context has pii_detected && context.pii_detected
450
- };
451
- `;
452
- const OVERWATCH_SEMANTIC_DEFAULT_CEDAR = `// =============================================================================
453
- // Semantic Threat Detection Policy (Default)
454
- // =============================================================================
455
- // Detects and blocks prompt injection, jailbreak attempts, and high-severity
456
- // AI security threats. Uses multi-layered detection:
457
- //
458
- // 1. Detection engine rule triggers (detected_threats) — pattern-based
459
- // 2. ML classifier confidence scores (injection_confidence, jailbreak_confidence)
460
- // 3. Threat severity aggregation (max_threat_severity, highest_severity)
461
- // 4. Cross-action enforcement (prompts + tool calls + file operations)
462
- //
463
- // Compliance:
464
- // OWASP LLM01 (Prompt Injection) — direct + indirect
465
- // OWASP LLM02 (Insecure Output Handling) — response manipulation
466
- // OWASP ASI01 (Agent Goal Hijack) — behavioral manipulation
467
- // MITRE ATLAS AML.T0051 (LLM Prompt Injection)
468
- // MITRE ATLAS AML.T0054 (LLM Jailbreak)
469
- // NIST 800-53 SI-3 (Malicious Code Protection)
470
- // NIST 800-53 SI-4 (Information System Monitoring)
471
- //
472
- // Category: semantic
473
- // Namespace: Overwatch
474
- // =============================================================================
475
-
476
- // ---------------------------------------------------------------------------
477
- // Section 1: Prompt Injection Detection
478
- // Blocks direct prompt injection — adversarial input designed to override
479
- // system instructions and hijack agent behavior.
480
- // Ref: OWASP LLM01, MITRE AML.T0051, 62% of LLM apps vulnerable (2024)
481
- // ---------------------------------------------------------------------------
482
-
483
- // Block content with prompt injection patterns detected by rules
484
- @id("semantic-block-injection")
485
- @name("Block prompt injection")
486
- @description("Block prompts and tool calls when detection engine rules identify prompt injection patterns. Catches instruction override, role assumption, delimiter injection, and other manipulation techniques in both user input and tool arguments (OWASP LLM01).")
487
- @severity("critical")
488
- @tags("injection,security,owasp-llm01,mitre-aml-t0051,baseline")
489
- @reject_message("Content was blocked because prompt injection patterns were detected. This is a security measure to prevent manipulation of AI agent behavior (OWASP LLM01).")
490
- forbid (
491
- principal,
492
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
493
- resource
494
- )
495
- when {
496
- context has detected_threats && context.detected_threats.contains("prompt_injection")
497
- };
498
-
499
- // Block content with high ML injection confidence
500
- @id("semantic-block-injection-score")
501
- @name("Block high-confidence injection")
502
- @description("Block content when the ML injection classifier confidence exceeds threshold (75/100). Catches novel injection techniques that evade pattern-based detection — polymorphic payloads, encoding tricks, and obfuscated instructions.")
503
- @severity("critical")
504
- @tags("injection,ml-classifier,security,owasp-llm01,mitre-aml-t0051")
505
- @reject_message("Your content was blocked because the ML classifier detected prompt injection with high confidence. This appears to be an attempt to manipulate agent behavior.")
506
- forbid (
507
- principal,
508
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
509
- resource
510
- )
511
- when {
512
- context has injection_confidence && context.injection_confidence >= 75
513
- };
514
-
515
- // ---------------------------------------------------------------------------
516
- // Section 2: Jailbreak Detection
517
- // Blocks jailbreak attempts — adversarial input designed to bypass AI safety
518
- // guardrails and elicit restricted outputs.
519
- // Ref: OWASP LLM02, MITRE AML.T0054, DAN/JailbreakChat/etc.
520
- // ---------------------------------------------------------------------------
521
-
522
- // Block prompts with jailbreak attempts detected by rules
523
- @id("semantic-block-jailbreak")
524
- @name("Block jailbreak attempts")
525
- @description("Block prompts when detection engine rules identify jailbreak patterns: DAN-style prompts, role-play exploits, safety bypass instructions, and constraint removal attempts (OWASP LLM02).")
526
- @severity("critical")
527
- @tags("jailbreak,bypass,security,owasp-llm02,mitre-aml-t0054,baseline")
528
- @reject_message("Your prompt was blocked because jailbreak patterns were detected. This is a security measure to prevent circumvention of AI safety controls (OWASP LLM02).")
529
- forbid (
530
- principal,
531
- action == Overwatch::Action::"process_prompt",
532
- resource
533
- )
534
- when {
535
- context has detected_threats && context.detected_threats.contains("jailbreak")
536
- };
537
-
538
- // Block prompts with high ML jailbreak confidence
539
- @id("semantic-block-jailbreak-score")
540
- @name("Block high-confidence jailbreak")
541
- @description("Block content when the ML jailbreak classifier confidence exceeds threshold (75/100). Catches sophisticated jailbreak techniques including multi-turn manipulation, encoded payloads, and novel prompt structures.")
542
- @severity("critical")
543
- @tags("jailbreak,ml-classifier,security,owasp-llm02,mitre-aml-t0054")
544
- @reject_message("Your content was blocked because the ML classifier detected a jailbreak attempt with high confidence. This appears to be an attempt to bypass safety guardrails.")
545
- forbid (
546
- principal,
547
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
548
- resource
549
- )
550
- when {
551
- context has jailbreak_confidence && context.jailbreak_confidence >= 75
552
- };
553
-
554
- // ---------------------------------------------------------------------------
555
- // Section 3: Threat Severity Aggregation
556
- // Blocks based on aggregated threat severity from all detection engines.
557
- // These act as catch-all rules for threats that don't match specific patterns.
558
- // ---------------------------------------------------------------------------
559
-
560
- // Block any content with critical severity threats
561
- @id("semantic-block-critical")
562
- @name("Block critical threats")
563
- @description("Block all content when any detection engine reports critical severity. This is the ultimate catch-all — critical threats are blocked regardless of type or source.")
564
- @severity("critical")
565
- @tags("critical,baseline,security,catch-all")
566
- @reject_message("Your content was blocked because security scanners detected a critical-severity threat. This content cannot be processed.")
567
- forbid (
568
- principal,
569
- action,
570
- resource
571
- )
572
- when {
573
- context has highest_severity && context.highest_severity == "critical"
574
- };
575
-
576
- // Block prompts with high severity semantic threats
577
- @id("semantic-block-high-severity")
578
- @name("Block high severity threats")
579
- @description("Block prompts when threat detection reports high severity (>= 3) in semantic categories. Catches threats that are individually below critical but collectively indicate adversarial intent.")
580
- @severity("high")
581
- @tags("semantic,severity,security,defense-in-depth")
582
- @reject_message("Your prompt was blocked because security scanners detected high severity issues in the content. Review your prompt for manipulative or adversarial patterns.")
583
- forbid (
584
- principal,
585
- action == Overwatch::Action::"process_prompt",
586
- resource
587
- )
588
- when {
589
- context has threat_categories && context has max_threat_severity &&
590
- context.threat_categories.contains("semantic") &&
591
- context.max_threat_severity >= 3
592
- };
593
-
594
- // Block tool calls with multiple concurrent threats
595
- @id("semantic-block-multi-threat-tools")
596
- @name("Block multi-threat tool calls")
597
- @description("Block tool execution when multiple distinct threats are detected simultaneously (3+). Multiple concurrent threats in a tool call strongly indicate an adversarial attack chain.")
598
- @severity("high")
599
- @tags("multi-threat,tools,security,defense-in-depth")
600
- @reject_message("Tool execution was blocked because multiple security threats were detected simultaneously. This pattern indicates a potential attack chain.")
743
+ @reject_message("File operation blocked: personally identifiable information was detected. Files containing PII must not be read or written through AI agents.")
601
744
  forbid (
602
745
  principal,
603
- action == Overwatch::Action::"call_tool",
746
+ action in [Overwatch::Action::"read_file", Overwatch::Action::"write_file"],
604
747
  resource
605
748
  )
606
749
  when {
607
- context has threat_count && context.threat_count >= 3
750
+ context has pii_detected && context.pii_detected
608
751
  };
609
752
  `;
610
753
  const OVERWATCH_TOOLS_DEFAULT_CEDAR = `// =============================================================================
@@ -840,149 +983,6 @@ when {
840
983
  context.detected_threats.contains("command_injection")
841
984
  };
842
985
  `;
843
- const OVERWATCH_TRUST_SAFETY_DEFAULT_CEDAR = `// =============================================================================
844
- // Content Safety Policy (Default)
845
- // =============================================================================
846
- // Detects and blocks violent, harmful, hateful, sexual, and profane content
847
- // using trust/safety ML classification scores.
848
- //
849
- // The detection engine runs multiple ML classifiers (toxicity, content safety)
850
- // and produces normalized scores (0-100) for each category. These policies
851
- // enforce enterprise-appropriate content thresholds.
852
- //
853
- // Compliance:
854
- // NIST 800-53 SI-4 (Information System Monitoring)
855
- // EU AI Act Art. 52 (Transparency for AI Systems)
856
- // OWASP LLM02 (Insecure Output Handling) — unsafe content generation
857
- // ISO 42001 (AI Management System) — responsible AI use
858
- //
859
- // Category: trust_safety
860
- // Namespace: Overwatch
861
- // =============================================================================
862
-
863
- // ---------------------------------------------------------------------------
864
- // Section 1: Violence & Weapons
865
- // Blocks content promoting, describing, or instructing violence and weapons.
866
- // ---------------------------------------------------------------------------
867
-
868
- // Block content with high violence score
869
- @id("ts-block-violence")
870
- @name("Block violent content")
871
- @description("Block content when the ML violence detection score exceeds threshold (80/100). Catches graphic violence descriptions, instructions for causing harm, and violent threat language.")
872
- @severity("critical")
873
- @tags("violence,content-safety,trust-safety,nist-si-4,iso-42001")
874
- @reject_message("Content blocked: violent content detected. AI agents in enterprise environments must not process or generate violent content. Please rephrase without violence-related language.")
875
- forbid (
876
- principal,
877
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
878
- resource
879
- )
880
- when {
881
- context has violence_score && context.violence_score >= 80
882
- };
883
-
884
- // Block content with weapons references
885
- @id("ts-block-weapons")
886
- @name("Block weapons content")
887
- @description("Block content when the ML weapons detection score exceeds threshold (80/100). Catches weapon manufacturing instructions, procurement guidance, and detailed weapon specifications.")
888
- @severity("critical")
889
- @tags("weapons,content-safety,trust-safety,nist-si-4")
890
- @reject_message("Content blocked: weapons-related content detected. AI agents must not process weapons manufacturing, procurement, or detailed specification content.")
891
- forbid (
892
- principal,
893
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
894
- resource
895
- )
896
- when {
897
- context has weapons_score && context.weapons_score >= 80
898
- };
899
-
900
- // ---------------------------------------------------------------------------
901
- // Section 2: Hate Speech & Discrimination
902
- // Blocks hateful, discriminatory, and dehumanizing content.
903
- // Lower threshold (75) — enterprises have zero tolerance for hate speech.
904
- // ---------------------------------------------------------------------------
905
-
906
- // Block hate speech content
907
- @id("ts-block-hate-speech")
908
- @name("Block hate speech")
909
- @description("Block content when the ML hate speech detection score exceeds threshold (75/100). Lower threshold than other categories because enterprises have zero tolerance for discriminatory content. Catches slurs, dehumanizing language, and targeted harassment.")
910
- @severity("critical")
911
- @tags("hate-speech,content-safety,trust-safety,nist-si-4,eu-ai-act")
912
- @reject_message("Content blocked: hate speech or discriminatory content detected. AI agents in enterprise environments must not process or generate hateful, discriminatory, or dehumanizing content.")
913
- forbid (
914
- principal,
915
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
916
- resource
917
- )
918
- when {
919
- context has hate_speech_score && context.hate_speech_score >= 75
920
- };
921
-
922
- // ---------------------------------------------------------------------------
923
- // Section 3: Criminal Content
924
- // Blocks content related to criminal activities and illegal operations.
925
- // ---------------------------------------------------------------------------
926
-
927
- // Block criminal content
928
- @id("ts-block-crime")
929
- @name("Block criminal content")
930
- @description("Block content when the ML criminal activity detection score exceeds threshold (80/100). Catches illegal activity instructions, fraud techniques, and content promoting criminal behavior.")
931
- @severity("high")
932
- @tags("crime,content-safety,trust-safety,nist-si-4")
933
- @reject_message("Content blocked: criminal activity content detected. AI agents must not process content related to illegal activities, fraud, or other criminal behavior.")
934
- forbid (
935
- principal,
936
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
937
- resource
938
- )
939
- when {
940
- context has crime_score && context.crime_score >= 80
941
- };
942
-
943
- // ---------------------------------------------------------------------------
944
- // Section 4: Sexual Content
945
- // Blocks sexually explicit or inappropriate content.
946
- // ---------------------------------------------------------------------------
947
-
948
- // Block sexual content
949
- @id("ts-block-sexual")
950
- @name("Block sexual content")
951
- @description("Block content when the ML sexual content detection score exceeds threshold (80/100). Ensures enterprise AI agents do not process or generate sexually explicit material.")
952
- @severity("high")
953
- @tags("sexual,content-safety,trust-safety,eu-ai-act,iso-42001")
954
- @reject_message("Content blocked: sexual content detected. AI agents in enterprise environments must not process sexually explicit material.")
955
- forbid (
956
- principal,
957
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
958
- resource
959
- )
960
- when {
961
- context has sexual_score && context.sexual_score >= 80
962
- };
963
-
964
- // ---------------------------------------------------------------------------
965
- // Section 5: Profanity
966
- // Blocks excessive profanity in enterprise environments.
967
- // Higher threshold (90) — allows normal expression while blocking abuse.
968
- // ---------------------------------------------------------------------------
969
-
970
- // Block excessive profanity
971
- @id("ts-block-profanity")
972
- @name("Block profanity")
973
- @description("Block content when the ML profanity detection score exceeds threshold (90/100). Higher threshold than other safety categories — allows normal expression while blocking abusive or harassing language patterns.")
974
- @severity("medium")
975
- @tags("profanity,content-safety,trust-safety")
976
- @reject_message("Content blocked: excessive profanity detected. Please rephrase in a professional manner appropriate for enterprise AI interactions.")
977
- forbid (
978
- principal,
979
- action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
980
- resource
981
- )
982
- when {
983
- context has profanity_score && context.profanity_score >= 90
984
- };
985
- `;
986
986
  const OVERWATCH_AGENT_SECURITY_DEFAULT_CEDAR = `// =============================================================================
987
987
  // Agent Security Policy (Default)
988
988
  // =============================================================================
@@ -1677,6 +1677,31 @@ export const OVERWATCH_DEFAULTS = [
1677
1677
  tags: ['baseline', 'permit-default', 'organization'],
1678
1678
  isActive: true,
1679
1679
  },
1680
+ {
1681
+ id: 'semantic-default',
1682
+ name: 'Semantic Threat Detection',
1683
+ description: 'Detect and block prompt injection, jailbreak attempts, and high-severity threats using detection rules and ML classifiers',
1684
+ category: 'semantic',
1685
+ cedarText: OVERWATCH_SEMANTIC_DEFAULT_CEDAR,
1686
+ severity: 'critical',
1687
+ tags: ['prompt-injection', 'jailbreak', 'owasp-llm01', 'owasp-llm02', 'security', 'baseline'],
1688
+ isActive: true,
1689
+ },
1690
+ {
1691
+ id: 'trust-safety-default',
1692
+ name: 'Content Safety',
1693
+ description: 'Detect and block violent, harmful, hateful, sexual, and profane content using ML classification scores',
1694
+ category: 'trust_safety',
1695
+ cedarText: OVERWATCH_TRUST_SAFETY_DEFAULT_CEDAR,
1696
+ severity: 'critical',
1697
+ tags: ['violence', 'weapons', 'hate-speech', 'crime', 'sexual', 'profanity', 'content-safety', 'baseline'],
1698
+ isActive: true,
1699
+ },
1700
+ ];
1701
+ // =============================================================================
1702
+ // ALL TEMPLATES
1703
+ // =============================================================================
1704
+ export const OVERWATCH_TEMPLATES = [
1680
1705
  {
1681
1706
  id: 'secrets-default',
1682
1707
  name: 'Secrets Detection',
@@ -1685,7 +1710,6 @@ export const OVERWATCH_DEFAULTS = [
1685
1710
  cedarText: OVERWATCH_SECRETS_DEFAULT_CEDAR,
1686
1711
  severity: 'critical',
1687
1712
  tags: ['api-keys', 'tokens', 'credentials', 'aws', 'github', 'ssh', 'baseline'],
1688
- isActive: true,
1689
1713
  },
1690
1714
  {
1691
1715
  id: 'pii-default',
@@ -1695,17 +1719,6 @@ export const OVERWATCH_DEFAULTS = [
1695
1719
  cedarText: OVERWATCH_PII_DEFAULT_CEDAR,
1696
1720
  severity: 'critical',
1697
1721
  tags: ['pii', 'privacy', 'compliance', 'pci-dss', 'gdpr', 'hipaa', 'baseline'],
1698
- isActive: true,
1699
- },
1700
- {
1701
- id: 'semantic-default',
1702
- name: 'Semantic Threat Detection',
1703
- description: 'Detect and block prompt injection, jailbreak attempts, and high-severity threats using detection rules and ML classifiers',
1704
- category: 'semantic',
1705
- cedarText: OVERWATCH_SEMANTIC_DEFAULT_CEDAR,
1706
- severity: 'critical',
1707
- tags: ['prompt-injection', 'jailbreak', 'owasp-llm01', 'owasp-llm02', 'security', 'baseline'],
1708
- isActive: true,
1709
1722
  },
1710
1723
  {
1711
1724
  id: 'tools-default',
@@ -1715,17 +1728,6 @@ export const OVERWATCH_DEFAULTS = [
1715
1728
  cedarText: OVERWATCH_TOOLS_DEFAULT_CEDAR,
1716
1729
  severity: 'critical',
1717
1730
  tags: ['shell', 'command-injection', 'file-access', 'tool-risk', 'mitre-t1059', 'owasp-llm06', 'baseline'],
1718
- isActive: true,
1719
- },
1720
- {
1721
- id: 'trust-safety-default',
1722
- name: 'Content Safety',
1723
- description: 'Detect and block violent, harmful, hateful, sexual, and profane content using ML classification scores',
1724
- category: 'trust_safety',
1725
- cedarText: OVERWATCH_TRUST_SAFETY_DEFAULT_CEDAR,
1726
- severity: 'critical',
1727
- tags: ['violence', 'weapons', 'hate-speech', 'crime', 'sexual', 'profanity', 'content-safety', 'baseline'],
1728
- isActive: true,
1729
1731
  },
1730
1732
  {
1731
1733
  id: 'agent-security-default',
@@ -1735,7 +1737,6 @@ export const OVERWATCH_DEFAULTS = [
1735
1737
  cedarText: OVERWATCH_AGENT_SECURITY_DEFAULT_CEDAR,
1736
1738
  severity: 'critical',
1737
1739
  tags: ['tool-poisoning', 'rug-pull', 'indirect-injection', 'mcp-security', 'agent-security', 'owasp-asi01', 'owasp-asi04', 'baseline'],
1738
- isActive: true,
1739
1740
  },
1740
1741
  {
1741
1742
  id: 'encoding-default',
@@ -1745,7 +1746,6 @@ export const OVERWATCH_DEFAULTS = [
1745
1746
  cedarText: OVERWATCH_ENCODING_DEFAULT_CEDAR,
1746
1747
  severity: 'high',
1747
1748
  tags: ['unicode', 'invisible-chars', 'bidi-override', 'encoding', 'owasp-llm01', 'baseline'],
1748
- isActive: true,
1749
1749
  },
1750
1750
  {
1751
1751
  id: 'behavioral-default',
@@ -1755,13 +1755,7 @@ export const OVERWATCH_DEFAULTS = [
1755
1755
  cedarText: OVERWATCH_BEHAVIORAL_DEFAULT_CEDAR,
1756
1756
  severity: 'high',
1757
1757
  tags: ['loop-detection', 'data-exfiltration', 'credential-theft', 'behavioral', 'owasp-llm10', 'owasp-asi02', 'baseline'],
1758
- isActive: true,
1759
1758
  },
1760
- ];
1761
- // =============================================================================
1762
- // ALL TEMPLATES
1763
- // =============================================================================
1764
- export const OVERWATCH_TEMPLATES = [
1765
1759
  {
1766
1760
  id: 'tools-mcp-allowlist',
1767
1761
  name: 'MCP Server Allowlist',
@@ -1883,6 +1877,28 @@ export const OVERWATCH_TEMPLATES_JSON = `{
1883
1877
  "tags": ["baseline", "permit-default", "organization"],
1884
1878
  "is_active": true
1885
1879
  },
1880
+ {
1881
+ "id": "semantic-default",
1882
+ "name": "Semantic Threat Detection",
1883
+ "description": "Detect and block prompt injection, jailbreak attempts, and high-severity threats using detection rules and ML classifiers",
1884
+ "category": "semantic",
1885
+ "file": "defaults/semantic.cedar",
1886
+ "severity": "critical",
1887
+ "tags": ["prompt-injection", "jailbreak", "owasp-llm01", "owasp-llm02", "security", "baseline"],
1888
+ "is_active": true
1889
+ },
1890
+ {
1891
+ "id": "trust-safety-default",
1892
+ "name": "Content Safety",
1893
+ "description": "Detect and block violent, harmful, hateful, sexual, and profane content using ML classification scores",
1894
+ "category": "trust_safety",
1895
+ "file": "defaults/trust_safety.cedar",
1896
+ "severity": "critical",
1897
+ "tags": ["violence", "weapons", "hate-speech", "crime", "sexual", "profanity", "content-safety", "baseline"],
1898
+ "is_active": true
1899
+ }
1900
+ ],
1901
+ "templates": [
1886
1902
  {
1887
1903
  "id": "secrets-default",
1888
1904
  "name": "Secrets Detection",
@@ -1890,8 +1906,7 @@ export const OVERWATCH_TEMPLATES_JSON = `{
1890
1906
  "category": "secrets",
1891
1907
  "file": "defaults/secrets.cedar",
1892
1908
  "severity": "critical",
1893
- "tags": ["api-keys", "tokens", "credentials", "aws", "github", "ssh", "baseline"],
1894
- "is_active": true
1909
+ "tags": ["api-keys", "tokens", "credentials", "aws", "github", "ssh", "baseline"]
1895
1910
  },
1896
1911
  {
1897
1912
  "id": "pii-default",
@@ -1900,18 +1915,7 @@ export const OVERWATCH_TEMPLATES_JSON = `{
1900
1915
  "category": "pii",
1901
1916
  "file": "defaults/pii.cedar",
1902
1917
  "severity": "critical",
1903
- "tags": ["pii", "privacy", "compliance", "pci-dss", "gdpr", "hipaa", "baseline"],
1904
- "is_active": true
1905
- },
1906
- {
1907
- "id": "semantic-default",
1908
- "name": "Semantic Threat Detection",
1909
- "description": "Detect and block prompt injection, jailbreak attempts, and high-severity threats using detection rules and ML classifiers",
1910
- "category": "semantic",
1911
- "file": "defaults/semantic.cedar",
1912
- "severity": "critical",
1913
- "tags": ["prompt-injection", "jailbreak", "owasp-llm01", "owasp-llm02", "security", "baseline"],
1914
- "is_active": true
1918
+ "tags": ["pii", "privacy", "compliance", "pci-dss", "gdpr", "hipaa", "baseline"]
1915
1919
  },
1916
1920
  {
1917
1921
  "id": "tools-default",
@@ -1920,18 +1924,7 @@ export const OVERWATCH_TEMPLATES_JSON = `{
1920
1924
  "category": "tools",
1921
1925
  "file": "defaults/tools.cedar",
1922
1926
  "severity": "critical",
1923
- "tags": ["shell", "command-injection", "file-access", "tool-risk", "mitre-t1059", "owasp-llm06", "baseline"],
1924
- "is_active": true
1925
- },
1926
- {
1927
- "id": "trust-safety-default",
1928
- "name": "Content Safety",
1929
- "description": "Detect and block violent, harmful, hateful, sexual, and profane content using ML classification scores",
1930
- "category": "trust_safety",
1931
- "file": "defaults/trust_safety.cedar",
1932
- "severity": "critical",
1933
- "tags": ["violence", "weapons", "hate-speech", "crime", "sexual", "profanity", "content-safety", "baseline"],
1934
- "is_active": true
1927
+ "tags": ["shell", "command-injection", "file-access", "tool-risk", "mitre-t1059", "owasp-llm06", "baseline"]
1935
1928
  },
1936
1929
  {
1937
1930
  "id": "agent-security-default",
@@ -1940,8 +1933,7 @@ export const OVERWATCH_TEMPLATES_JSON = `{
1940
1933
  "category": "agent_security",
1941
1934
  "file": "defaults/agent_security.cedar",
1942
1935
  "severity": "critical",
1943
- "tags": ["tool-poisoning", "rug-pull", "indirect-injection", "mcp-security", "agent-security", "owasp-asi01", "owasp-asi04", "baseline"],
1944
- "is_active": true
1936
+ "tags": ["tool-poisoning", "rug-pull", "indirect-injection", "mcp-security", "agent-security", "owasp-asi01", "owasp-asi04", "baseline"]
1945
1937
  },
1946
1938
  {
1947
1939
  "id": "encoding-default",
@@ -1950,8 +1942,7 @@ export const OVERWATCH_TEMPLATES_JSON = `{
1950
1942
  "category": "encoding",
1951
1943
  "file": "defaults/encoding_attacks.cedar",
1952
1944
  "severity": "high",
1953
- "tags": ["unicode", "invisible-chars", "bidi-override", "encoding", "owasp-llm01", "baseline"],
1954
- "is_active": true
1945
+ "tags": ["unicode", "invisible-chars", "bidi-override", "encoding", "owasp-llm01", "baseline"]
1955
1946
  },
1956
1947
  {
1957
1948
  "id": "behavioral-default",
@@ -1960,11 +1951,8 @@ export const OVERWATCH_TEMPLATES_JSON = `{
1960
1951
  "category": "behavioral",
1961
1952
  "file": "defaults/behavioral.cedar",
1962
1953
  "severity": "high",
1963
- "tags": ["loop-detection", "data-exfiltration", "credential-theft", "behavioral", "owasp-llm10", "owasp-asi02", "baseline"],
1964
- "is_active": true
1965
- }
1966
- ],
1967
- "templates": [
1954
+ "tags": ["loop-detection", "data-exfiltration", "credential-theft", "behavioral", "owasp-llm10", "owasp-asi02", "baseline"]
1955
+ },
1968
1956
  {
1969
1957
  "id": "tools-mcp-allowlist",
1970
1958
  "name": "MCP Server Allowlist",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@highflame/policy",
3
- "version": "2.1.2",
3
+ "version": "2.1.3",
4
4
  "description": "Highflame Cedar policy types and engine wrapper",
5
5
  "readme": "README.md",
6
6
  "main": "dist/index.js",