@highflame/policy 2.0.7 → 2.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,7 +9,7 @@
9
9
  * Overwatch policy category identifiers.
10
10
  * Maps to UI tab names in Studio.
11
11
  */
12
- export type OverwatchCategory = 'secrets' | 'pii' | 'semantic' | 'tools' | 'organization';
12
+ export type OverwatchCategory = 'secrets' | 'pii' | 'semantic' | 'tools' | 'organization' | 'trust_safety' | 'agent_security';
13
13
 
14
14
  /**
15
15
  * Category metadata for UI display.
@@ -66,6 +66,32 @@ export interface OverwatchTemplate {
66
66
  // EMBEDDED CEDAR POLICY TEXT
67
67
  // =============================================================================
68
68
 
69
+ const OVERWATCH_BASELINE_DEFAULT_CEDAR = `// =============================================================================
70
+ // Baseline Permit Policy (Default)
71
+ // =============================================================================
72
+ // Permits all actions by default. Threat-specific forbid policies override
73
+ // this to block when YARA, Javelin, or other scanners detect issues.
74
+ //
75
+ // Cedar is default-deny: without at least one permit rule, every request
76
+ // is denied regardless of forbid rules. This baseline ensures the system
77
+ // is "allow unless blocked" rather than "block everything".
78
+ //
79
+ // Category: organization
80
+ // Namespace: Overwatch
81
+ // =============================================================================
82
+
83
+ @id("baseline-permit-all")
84
+ @name("Permit all actions by default")
85
+ @description("Baseline permit for all actions — threat-specific forbid policies override this when threats are detected")
86
+ @severity("low")
87
+ @tags("baseline,permit-default,organization")
88
+ permit (
89
+ principal,
90
+ action,
91
+ resource
92
+ );
93
+ `;
94
+
69
95
  const OVERWATCH_SECRETS_DEFAULT_CEDAR = `// =============================================================================
70
96
  // Secrets Detection Policy (Default)
71
97
  // =============================================================================
@@ -293,6 +319,22 @@ when {
293
319
  context has threat_categories && context.threat_categories.contains("pii")
294
320
  };
295
321
 
322
+ // Block prompts with high PII confidence score
323
+ @id("pii-block-high-confidence")
324
+ @name("Block high-confidence PII")
325
+ @description("Block content when PII classifier confidence exceeds threshold (80/100)")
326
+ @severity("critical")
327
+ @tags("pii,confidence,privacy,compliance")
328
+ @reject_message("Your content was blocked because personally identifiable information was detected with high confidence.")
329
+ forbid (
330
+ principal,
331
+ action == Overwatch::Action::"process_prompt",
332
+ resource
333
+ )
334
+ when {
335
+ context has pii_confidence && context.pii_confidence >= 80
336
+ };
337
+
296
338
  // Block PII leakage via tool calls
297
339
  @id("pii-block-tool-calls")
298
340
  @name("Block tool calls with PII")
@@ -337,6 +379,22 @@ when {
337
379
  context has yara_threats && context.yara_threats.contains("prompt_injection")
338
380
  };
339
381
 
382
+ // Block prompts and tool calls with high injection confidence score
383
+ @id("semantic-block-injection-score")
384
+ @name("Block high-confidence injection")
385
+ @description("Block content when injection classifier confidence exceeds threshold (75/100)")
386
+ @severity("critical")
387
+ @tags("injection,confidence,security,owasp-llm01")
388
+ @reject_message("Your prompt was blocked because a high-confidence prompt injection pattern was detected.")
389
+ forbid (
390
+ principal,
391
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
392
+ resource
393
+ )
394
+ when {
395
+ context has injection_confidence && context.injection_confidence >= 75
396
+ };
397
+
340
398
  // Block prompts with jailbreak attempts
341
399
  @id("semantic-block-jailbreak")
342
400
  @name("Block jailbreak attempts")
@@ -352,6 +410,22 @@ when {
352
410
  context has yara_threats && context.yara_threats.contains("jailbreak")
353
411
  };
354
412
 
413
+ // Block prompts and tool calls with high jailbreak confidence score
414
+ @id("semantic-block-jailbreak-score")
415
+ @name("Block high-confidence jailbreak")
416
+ @description("Block content when jailbreak classifier confidence exceeds threshold (75/100)")
417
+ @severity("critical")
418
+ @tags("jailbreak,confidence,security,owasp-llm02")
419
+ @reject_message("Your prompt was blocked because a high-confidence jailbreak attempt was detected.")
420
+ forbid (
421
+ principal,
422
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
423
+ resource
424
+ )
425
+ when {
426
+ context has jailbreak_confidence && context.jailbreak_confidence >= 75
427
+ };
428
+
355
429
  // Block prompts with high severity semantic threats
356
430
  @id("semantic-block-high-severity")
357
431
  @name("Block high severity threats")
@@ -508,6 +582,208 @@ when {
508
582
  };
509
583
  `;
510
584
 
585
+ const OVERWATCH_TRUST_SAFETY_DEFAULT_CEDAR = `// =============================================================================
586
+ // Content Safety Policy (Default)
587
+ // =============================================================================
588
+ // Detects and blocks violent, harmful, hateful, sexual, and profane content
589
+ // using trust/safety classification scores from Javelin/Lakera/LlamaGuard.
590
+ //
591
+ // Compliance: NIST 800-53 SI-4 | EU AI Act Art. 52
592
+ // Category: trust_safety
593
+ // Namespace: Overwatch
594
+ // =============================================================================
595
+
596
+ // Block prompts and tool calls with high violence score
597
+ @id("ts-block-violence")
598
+ @name("Block violent content")
599
+ @description("Block content when violence detection score exceeds threshold (80/100)")
600
+ @severity("critical")
601
+ @tags("violence,content-safety,trust-safety,nist-si-4")
602
+ @reject_message("Your prompt was blocked because it contains violent content. Please rephrase without violence-related language.")
603
+ forbid (
604
+ principal,
605
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
606
+ resource
607
+ )
608
+ when {
609
+ context has violence_score && context.violence_score >= 80
610
+ };
611
+
612
+ // Block prompts and tool calls with weapons content
613
+ @id("ts-block-weapons")
614
+ @name("Block weapons content")
615
+ @description("Block content when weapons detection score exceeds threshold (80/100)")
616
+ @severity("critical")
617
+ @tags("weapons,content-safety,trust-safety,nist-si-4")
618
+ @reject_message("Your prompt was blocked because it contains weapons-related content. Please rephrase without weapons references.")
619
+ forbid (
620
+ principal,
621
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
622
+ resource
623
+ )
624
+ when {
625
+ context has weapons_score && context.weapons_score >= 80
626
+ };
627
+
628
+ // Block prompts and tool calls with hate speech
629
+ @id("ts-block-hate-speech")
630
+ @name("Block hate speech")
631
+ @description("Block content when hate speech detection score exceeds threshold (75/100)")
632
+ @severity("critical")
633
+ @tags("hate-speech,content-safety,trust-safety,nist-si-4")
634
+ @reject_message("Your prompt was blocked because it contains hate speech. Please rephrase without hateful or discriminatory language.")
635
+ forbid (
636
+ principal,
637
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
638
+ resource
639
+ )
640
+ when {
641
+ context has hate_speech_score && context.hate_speech_score >= 75
642
+ };
643
+
644
+ // Block prompts and tool calls with criminal content
645
+ @id("ts-block-crime")
646
+ @name("Block criminal content")
647
+ @description("Block content when criminal activity detection score exceeds threshold (80/100)")
648
+ @severity("high")
649
+ @tags("crime,content-safety,trust-safety,nist-si-4")
650
+ @reject_message("Your prompt was blocked because it contains content related to criminal activity.")
651
+ forbid (
652
+ principal,
653
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
654
+ resource
655
+ )
656
+ when {
657
+ context has crime_score && context.crime_score >= 80
658
+ };
659
+
660
+ // Block prompts and tool calls with sexual content
661
+ @id("ts-block-sexual")
662
+ @name("Block sexual content")
663
+ @description("Block content when sexual content detection score exceeds threshold (80/100)")
664
+ @severity("high")
665
+ @tags("sexual,content-safety,trust-safety,eu-ai-act")
666
+ @reject_message("Your prompt was blocked because it contains sexual content.")
667
+ forbid (
668
+ principal,
669
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
670
+ resource
671
+ )
672
+ when {
673
+ context has sexual_score && context.sexual_score >= 80
674
+ };
675
+
676
+ // Block prompts and tool calls with excessive profanity
677
+ @id("ts-block-profanity")
678
+ @name("Block profanity")
679
+ @description("Block content when profanity detection score exceeds threshold (90/100)")
680
+ @severity("medium")
681
+ @tags("profanity,content-safety,trust-safety")
682
+ @reject_message("Your prompt was blocked due to excessive profanity. Please rephrase in a professional manner.")
683
+ forbid (
684
+ principal,
685
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool"],
686
+ resource
687
+ )
688
+ when {
689
+ context has profanity_score && context.profanity_score >= 90
690
+ };
691
+ `;
692
+
693
+ const OVERWATCH_AGENT_SECURITY_DEFAULT_CEDAR = `// =============================================================================
694
+ // Agent Security Policy (Default)
695
+ // =============================================================================
696
+ // Detects and blocks tool poisoning, rug pull attacks, and indirect prompt
697
+ // injection targeting AI coding agents. These are agentic AI-specific attack
698
+ // vectors where tool descriptions or server responses manipulate agent behavior.
699
+ //
700
+ // Compliance: OWASP LLM09 (Improper Output Handling) | MITRE ATLAS AML.T0054
701
+ // Category: agent_security
702
+ // Namespace: Overwatch
703
+ // =============================================================================
704
+
705
+ // Block tool calls with high tool poisoning risk
706
+ @id("as-block-tool-poisoning")
707
+ @name("Block tool poisoning")
708
+ @description("Block tool execution when tool description contains manipulation patterns (score >= 70/100)")
709
+ @severity("critical")
710
+ @tags("tool-poisoning,agent-security,owasp-llm09")
711
+ @reject_message("Tool execution was blocked because the tool description contains manipulation patterns that could compromise agent behavior.")
712
+ forbid (
713
+ principal,
714
+ action == Overwatch::Action::"call_tool",
715
+ resource
716
+ )
717
+ when {
718
+ context has tool_poisoning_score && context.tool_poisoning_score >= 70
719
+ };
720
+
721
+ // Block tool calls and MCP server connections when a rug pull is detected
722
+ @id("as-block-rug-pull")
723
+ @name("Block rug pull attacks")
724
+ @description("Block tool execution when tool behavior diverges from advertised capabilities (score >= 70/100)")
725
+ @severity("critical")
726
+ @tags("rug-pull,agent-security,mcp-security")
727
+ @reject_message("Tool execution was blocked because the tool's actual behavior diverges from its advertised capabilities.")
728
+ forbid (
729
+ principal,
730
+ action in [Overwatch::Action::"call_tool", Overwatch::Action::"connect_server"],
731
+ resource
732
+ )
733
+ when {
734
+ context has rug_pull_score && context.rug_pull_score >= 70
735
+ };
736
+
737
+ // Block MCP server connections with high poisoning risk
738
+ @id("as-block-server-poisoning")
739
+ @name("Block poisoned MCP servers")
740
+ @description("Block connections to MCP servers when tool poisoning patterns are detected (score >= 60/100)")
741
+ @severity("critical")
742
+ @tags("tool-poisoning,mcp-security,agent-security")
743
+ @reject_message("Connection to this MCP server was blocked because tool poisoning patterns were detected in its tool descriptions.")
744
+ forbid (
745
+ principal,
746
+ action == Overwatch::Action::"connect_server",
747
+ resource
748
+ )
749
+ when {
750
+ context has tool_poisoning_score && context.tool_poisoning_score >= 60
751
+ };
752
+
753
+ // Block prompts, tool calls, and MCP server connections with indirect injection patterns
754
+ @id("as-block-indirect-injection")
755
+ @name("Block indirect prompt injection")
756
+ @description("Block content when indirect prompt injection is detected in tool outputs or retrieved documents (score >= 70/100)")
757
+ @severity("critical")
758
+ @tags("indirect-injection,agent-security,owasp-llm01")
759
+ @reject_message("This content was blocked because indirect prompt injection patterns were detected in tool outputs or retrieved documents.")
760
+ forbid (
761
+ principal,
762
+ action in [Overwatch::Action::"process_prompt", Overwatch::Action::"call_tool", Overwatch::Action::"connect_server"],
763
+ resource
764
+ )
765
+ when {
766
+ context has indirect_injection_score && context.indirect_injection_score >= 70
767
+ };
768
+
769
+ // Block unverified MCP server tool calls with any detected threats
770
+ @id("as-block-unverified-threats")
771
+ @name("Block unverified server threats")
772
+ @description("Block tool calls from unverified MCP servers when any threat is detected")
773
+ @severity("high")
774
+ @tags("mcp-trust,agent-security,unverified")
775
+ @reject_message("Tool execution was blocked because the MCP server is unverified and threats were detected in the content.")
776
+ forbid (
777
+ principal,
778
+ action == Overwatch::Action::"call_tool",
779
+ resource
780
+ )
781
+ when {
782
+ context has mcp_server_verified && context.mcp_server_verified == false &&
783
+ context has threat_count && context.threat_count > 0
784
+ };
785
+ `;
786
+
511
787
  const OVERWATCH_TOOLS_MCP_ALLOWLIST_CEDAR = `// MCP Server Allowlist Template
512
788
  // Only allow specific MCP servers to be used
513
789
  // Category: tools
@@ -661,6 +937,8 @@ export const OVERWATCH_CATEGORIES: OverwatchCategoryInfo[] = [
661
937
  { id: 'semantic', name: 'Semantic Threat Detection', description: 'Detect and block prompt injection, jailbreak attempts, and high-severity AI security threats' },
662
938
  { id: 'tools', name: 'Tool Permissioning', description: 'Control access to shell execution, file operations, MCP servers, and sensitive system paths' },
663
939
  { id: 'organization', name: 'Organization Rules', description: 'Apply organization-wide policy baselines, team permissions, and agent-specific guardrails' },
940
+ { id: 'trust_safety', name: 'Content Safety', description: 'Detect and control violent, harmful, hateful, sexual, and profane content using trust/safety classification scores' },
941
+ { id: 'agent_security', name: 'Agent Security', description: 'Detect tool poisoning, rug pull attacks, and indirect prompt injection targeting AI agents' },
664
942
  ];
665
943
 
666
944
  // =============================================================================
@@ -668,6 +946,16 @@ export const OVERWATCH_CATEGORIES: OverwatchCategoryInfo[] = [
668
946
  // =============================================================================
669
947
 
670
948
  export const OVERWATCH_DEFAULTS: OverwatchDefaultPolicy[] = [
949
+ {
950
+ id: 'baseline-default',
951
+ name: 'Baseline Permit',
952
+ description: 'Permits all actions by default — threat-specific forbid policies override this when threats are detected',
953
+ category: 'organization',
954
+ cedarText: OVERWATCH_BASELINE_DEFAULT_CEDAR,
955
+ severity: 'low',
956
+ tags: ['baseline', 'permit-default', 'organization'],
957
+ isActive: true,
958
+ },
671
959
  {
672
960
  id: 'secrets-default',
673
961
  name: 'Secrets Detection',
@@ -708,6 +996,26 @@ export const OVERWATCH_DEFAULTS: OverwatchDefaultPolicy[] = [
708
996
  tags: ['shell', 'command-injection', 'file-access', 'mitre-t1059', 'baseline'],
709
997
  isActive: false,
710
998
  },
999
+ {
1000
+ id: 'trust-safety-default',
1001
+ name: 'Content Safety',
1002
+ description: 'Detect and block violent, harmful, hateful, sexual, and profane content using classification scores',
1003
+ category: 'trust_safety',
1004
+ cedarText: OVERWATCH_TRUST_SAFETY_DEFAULT_CEDAR,
1005
+ severity: 'critical',
1006
+ tags: ['violence', 'weapons', 'hate-speech', 'crime', 'sexual', 'profanity', 'content-safety', 'baseline'],
1007
+ isActive: true,
1008
+ },
1009
+ {
1010
+ id: 'agent-security-default',
1011
+ name: 'Agent Security',
1012
+ description: 'Detect and block tool poisoning, rug pull attacks, and indirect prompt injection targeting AI agents',
1013
+ category: 'agent_security',
1014
+ cedarText: OVERWATCH_AGENT_SECURITY_DEFAULT_CEDAR,
1015
+ severity: 'critical',
1016
+ tags: ['tool-poisoning', 'rug-pull', 'indirect-injection', 'mcp-security', 'agent-security', 'baseline'],
1017
+ isActive: true,
1018
+ },
711
1019
  ];
712
1020
 
713
1021
  // =============================================================================
@@ -769,7 +1077,7 @@ export const OVERWATCH_TEMPLATES: OverwatchTemplate[] = [
769
1077
  /** Raw templates.json metadata for the Overwatch service. */
770
1078
  export const OVERWATCH_TEMPLATES_JSON: string = `{
771
1079
  "service": "overwatch",
772
- "version": "2.0.0",
1080
+ "version": "3.0.0",
773
1081
  "description": "Overwatch policy templates for IDE security",
774
1082
  "categories": [
775
1083
  {
@@ -796,9 +1104,29 @@ export const OVERWATCH_TEMPLATES_JSON: string = `{
796
1104
  "id": "organization",
797
1105
  "name": "Organization Rules",
798
1106
  "description": "Apply organization-wide policy baselines, team permissions, and agent-specific guardrails"
1107
+ },
1108
+ {
1109
+ "id": "trust_safety",
1110
+ "name": "Content Safety",
1111
+ "description": "Detect and control violent, harmful, hateful, sexual, and profane content using trust/safety classification scores"
1112
+ },
1113
+ {
1114
+ "id": "agent_security",
1115
+ "name": "Agent Security",
1116
+ "description": "Detect tool poisoning, rug pull attacks, and indirect prompt injection targeting AI agents"
799
1117
  }
800
1118
  ],
801
1119
  "defaults": [
1120
+ {
1121
+ "id": "baseline-default",
1122
+ "name": "Baseline Permit",
1123
+ "description": "Permits all actions by default — threat-specific forbid policies override this when threats are detected",
1124
+ "category": "organization",
1125
+ "file": "defaults/baseline.cedar",
1126
+ "severity": "low",
1127
+ "tags": ["baseline", "permit-default", "organization"],
1128
+ "is_active": true
1129
+ },
802
1130
  {
803
1131
  "id": "secrets-default",
804
1132
  "name": "Secrets Detection",
@@ -838,6 +1166,26 @@ export const OVERWATCH_TEMPLATES_JSON: string = `{
838
1166
  "severity": "critical",
839
1167
  "tags": ["shell", "command-injection", "file-access", "mitre-t1059", "baseline"],
840
1168
  "is_active": false
1169
+ },
1170
+ {
1171
+ "id": "trust-safety-default",
1172
+ "name": "Content Safety",
1173
+ "description": "Detect and block violent, harmful, hateful, sexual, and profane content using classification scores",
1174
+ "category": "trust_safety",
1175
+ "file": "defaults/trust_safety.cedar",
1176
+ "severity": "critical",
1177
+ "tags": ["violence", "weapons", "hate-speech", "crime", "sexual", "profanity", "content-safety", "baseline"],
1178
+ "is_active": true
1179
+ },
1180
+ {
1181
+ "id": "agent-security-default",
1182
+ "name": "Agent Security",
1183
+ "description": "Detect and block tool poisoning, rug pull attacks, and indirect prompt injection targeting AI agents",
1184
+ "category": "agent_security",
1185
+ "file": "defaults/agent_security.cedar",
1186
+ "severity": "critical",
1187
+ "tags": ["tool-poisoning", "rug-pull", "indirect-injection", "mcp-security", "agent-security", "baseline"],
1188
+ "is_active": true
841
1189
  }
842
1190
  ],
843
1191
  "templates": [
@@ -21,14 +21,14 @@ import {
21
21
  // =============================================================================
22
22
 
23
23
  describe("Overwatch defaults data", () => {
24
- test("should have 5 categories", () => {
25
- expect(OVERWATCH_CATEGORIES).toHaveLength(5);
24
+ test("should have 7 categories", () => {
25
+ expect(OVERWATCH_CATEGORIES).toHaveLength(7);
26
26
  const ids = OVERWATCH_CATEGORIES.map((c) => c.id);
27
- expect(ids).toEqual(["secrets", "pii", "semantic", "tools", "organization"]);
27
+ expect(ids).toEqual(["secrets", "pii", "semantic", "tools", "organization", "trust_safety", "agent_security"]);
28
28
  });
29
29
 
30
- test("should have 4 default policies", () => {
31
- expect(OVERWATCH_DEFAULTS).toHaveLength(4);
30
+ test("should have 7 default policies", () => {
31
+ expect(OVERWATCH_DEFAULTS).toHaveLength(7);
32
32
  });
33
33
 
34
34
  test("should have 5 templates", () => {
@@ -216,6 +216,22 @@ describe('Service-Specific Schemas', () => {
216
216
  max_threat_severity: 1,
217
217
  contains_secrets: false,
218
218
  response_content: '',
219
+ // Trust/Safety scores
220
+ violence_score: 0,
221
+ weapons_score: 0,
222
+ hate_speech_score: 0,
223
+ crime_score: 0,
224
+ sexual_score: 0,
225
+ profanity_score: 0,
226
+ // Detector confidence
227
+ pii_confidence: 0,
228
+ injection_confidence: 0,
229
+ jailbreak_confidence: 0,
230
+ // Agent security
231
+ tool_poisoning_score: 0,
232
+ rug_pull_score: 0,
233
+ indirect_injection_score: 0,
234
+ mcp_server_verified: false,
219
235
  },
220
236
  entities,
221
237
  });
@@ -397,6 +413,22 @@ describe('Service-Specific Schemas', () => {
397
413
  max_threat_severity: 2,
398
414
  contains_secrets: false,
399
415
  response_content: '',
416
+ // Trust/Safety scores
417
+ violence_score: 0,
418
+ weapons_score: 0,
419
+ hate_speech_score: 0,
420
+ crime_score: 0,
421
+ sexual_score: 0,
422
+ profanity_score: 0,
423
+ // Detector confidence
424
+ pii_confidence: 0,
425
+ injection_confidence: 0,
426
+ jailbreak_confidence: 0,
427
+ // Agent security
428
+ tool_poisoning_score: 0,
429
+ rug_pull_score: 0,
430
+ indirect_injection_score: 0,
431
+ mcp_server_verified: false,
400
432
  },
401
433
  entities,
402
434
  });