@highflame/policy 2.0.10 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/_schemas/guardrails/context.json +435 -0
  2. package/_schemas/guardrails/schema.cedarschema +225 -0
  3. package/_schemas/guardrails/templates/defaults/agentic_safety.cedar +94 -0
  4. package/_schemas/guardrails/templates/defaults/baseline.cedar +24 -0
  5. package/_schemas/guardrails/templates/defaults/injection.cedar +70 -0
  6. package/_schemas/guardrails/templates/defaults/pii.cedar +48 -0
  7. package/_schemas/guardrails/templates/defaults/secrets.cedar +40 -0
  8. package/_schemas/guardrails/templates/defaults/semantic.cedar +59 -0
  9. package/_schemas/guardrails/templates/defaults/tool_risk.cedar +58 -0
  10. package/_schemas/guardrails/templates/defaults/toxicity.cedar +76 -0
  11. package/_schemas/guardrails/templates/mcp_tool_permissions.cedar +84 -0
  12. package/_schemas/guardrails/templates/profiles/chat_assistant/privacy.cedar +22 -0
  13. package/_schemas/guardrails/templates/profiles/chat_assistant/security.cedar +35 -0
  14. package/_schemas/guardrails/templates/profiles/chat_assistant/trust_safety.cedar +43 -0
  15. package/_schemas/guardrails/templates/profiles/chat_assistant.cedar +85 -0
  16. package/_schemas/guardrails/templates/profiles/code_agent/agentic_security.cedar +109 -0
  17. package/_schemas/guardrails/templates/profiles/code_agent/security.cedar +22 -0
  18. package/_schemas/guardrails/templates/profiles/code_agent.cedar +125 -0
  19. package/_schemas/guardrails/templates/profiles/data_pipeline/agentic_security.cedar +38 -0
  20. package/_schemas/guardrails/templates/profiles/data_pipeline/privacy.cedar +40 -0
  21. package/_schemas/guardrails/templates/profiles/data_pipeline/security.cedar +49 -0
  22. package/_schemas/guardrails/templates/profiles/data_pipeline.cedar +111 -0
  23. package/_schemas/guardrails/templates/templates.json +213 -0
  24. package/dist/builder.d.ts +61 -0
  25. package/dist/builder.js +4 -2
  26. package/dist/entities.gen.d.ts +4 -0
  27. package/dist/entities.gen.js +4 -0
  28. package/dist/explain.d.ts +178 -0
  29. package/dist/explain.js +390 -0
  30. package/dist/guardrails-context.gen.d.ts +49 -0
  31. package/dist/guardrails-context.gen.js +50 -0
  32. package/dist/guardrails-defaults.gen.d.ts +61 -0
  33. package/dist/guardrails-defaults.gen.js +1278 -0
  34. package/dist/guardrails-entities.gen.d.ts +11 -0
  35. package/dist/guardrails-entities.gen.js +37 -0
  36. package/dist/index.d.ts +6 -1
  37. package/dist/index.js +6 -1
  38. package/dist/overwatch-defaults.gen.js +98 -0
  39. package/dist/parser.js +136 -4
  40. package/dist/schema.gen.d.ts +1 -1
  41. package/dist/schema.gen.js +6 -0
  42. package/dist/service-schemas.gen.d.ts +10 -0
  43. package/dist/service-schemas.gen.js +337 -1
  44. package/dist/types.d.ts +6 -1
  45. package/dist/types.js +6 -1
  46. package/package.json +5 -1
@@ -0,0 +1,94 @@
1
+ // =============================================================================
2
+ // Agentic Safety Policy
3
+ // =============================================================================
4
+ // Blocks suspicious agentic behavior patterns:
5
+ // - Infinite loops (repeated tool calls)
6
+ // - Suspicious action sequences (data exfiltration patterns)
7
+ // - Budget violations (token/cost limits)
8
+ //
9
+ // Context keys used (normalized by projection layer):
10
+ // - loop_detected: bool - Whether tool call loop detected
11
+ // - loop_count: Long - Number of consecutive repeated calls
12
+ // - suspicious_pattern: bool - Whether suspicious sequence detected
13
+ // - pattern_type: String - Type of pattern (e.g., "data_exfiltration")
14
+ // - sequence_risk: Long (0-100) - Risk score from sequence analysis
15
+ // - budget_exceeded: bool - Whether token budget exceeded
16
+ // - budget_remaining_pct: Long (0-100) - Remaining budget percentage
17
+ //
18
+ // Category: agentic-security
19
+ // Namespace: Guardrails
20
+ // =============================================================================
21
+
// Policy: agentic-block-loops
// Denies call_tool requests once the loop flag is set in context AND more
// than three consecutive repeated calls have been counted. Both conditions
// must hold; multiple `when` clauses are conjunctive.
@id("agentic-block-loops")
@name("Block tool call loops")
@description("Forbids repeated tool calls indicating infinite loop")
@severity("high")
@tags("agentic,loops,safety")
forbid (principal, action == Guardrails::Action::"call_tool", resource)
when { context has loop_detected && context.loop_detected == true }
when { context has loop_count && context.loop_count > 3 };
35
+
// Policy: agentic-block-exfiltration
// Denies any action when a suspicious sequence has been flagged and its
// pattern_type is one of the three exfiltration categories listed below.
@id("agentic-block-exfiltration")
@name("Block data exfiltration patterns")
@description("Forbids suspicious action sequences like read → http_post")
@severity("critical")
@tags("agentic,exfiltration,security")
forbid (principal, action, resource)
when { context has suspicious_pattern && context.suspicious_pattern == true }
when {
    context has pattern_type &&
    ["data_exfiltration", "secret_exfiltration", "db_exfiltration"].contains(context.pattern_type)
};
54
+
// Policy: agentic-block-high-sequence-risk
// Denies any action whose sequence_risk value is strictly greater than 80.
// Requests that carry no sequence_risk key are not matched by this rule.
@id("agentic-block-high-sequence-risk")
@name("Block high-risk action sequences")
@description("Forbids action sequences with risk score above 80")
@severity("high")
@tags("agentic,patterns,security")
forbid (principal, action, resource)
when {
    context has sequence_risk &&
    context.sequence_risk > 80
};
67
+
// Policy: agentic-block-budget-exceeded
// Denies any action when the context reports budget_exceeded as true.
@id("agentic-block-budget-exceeded")
@name("Block budget violations")
@description("Forbids requests when token budget is exceeded")
@severity("medium")
@tags("agentic,budget,cost-control")
forbid (principal, action, resource)
when {
    context has budget_exceeded &&
    context.budget_exceeded == true
};
80
+
// Policy: agentic-block-low-budget
// Denies any action when budget_remaining_pct lies strictly between 0 and 5.
// NOTE(review): a value of exactly 0 is NOT matched here — presumably that
// case is left to agentic-block-budget-exceeded, or 0 denotes "unset";
// confirm against the projection layer.
@id("agentic-block-low-budget")
@name("Block requests when budget critically low")
@description("Forbids requests when remaining budget falls below 5%")
@severity("medium")
@tags("agentic,budget,cost-control")
forbid (principal, action, resource)
when {
    context has budget_remaining_pct &&
    context.budget_remaining_pct > 0 &&
    context.budget_remaining_pct < 5
};
@@ -0,0 +1,24 @@
1
+ // =============================================================================
2
+ // Baseline Permit Policy (Default)
3
+ // =============================================================================
4
+ // Permits all actions by default. Threat-specific forbid policies override
5
+ // this to block when detectors find security issues.
6
+ //
7
+ // Cedar is default-deny: without at least one permit rule, every request
8
+ // is denied regardless of forbid rules. This baseline ensures the system
9
+ // is "allow unless blocked" rather than "block everything".
10
+ //
11
+ // Category: organization
12
+ // Namespace: Guardrails
13
+ // =============================================================================
14
+
// Policy: baseline-permit-all
// Unconditional permit for every principal, action and resource. It exists so
// that requests not matched by any forbid policy are allowed rather than
// implicitly denied.
@id("baseline-permit-all")
@name("Permit all actions by default")
@description("Baseline permit for all actions — threat-specific forbid policies override this when threats are detected")
@severity("low")
@tags("baseline,permit-default,organization")
permit (principal, action, resource);
@@ -0,0 +1,70 @@
1
+ // =============================================================================
2
+ // Injection & Jailbreak Detection Policy
3
+ // =============================================================================
4
+ // Blocks prompt injection, jailbreak attempts, and command injection.
5
+ // Uses ML-based confidence scores from normalized context.
6
+ //
7
+ // Context keys used (normalized by projection layer):
8
+ // - injection_score: Long (0-100) - Overall injection confidence
9
+ // - jailbreak_score: Long (0-100) - Jailbreak attempt confidence
10
+ // - injection_type: String - Type of injection detected
11
+ // - contains_invisible_chars: Bool - Invisible Unicode characters detected
12
+ // - invisible_chars_score: Long (0-100) - Invisible character density
13
+ //
14
+ // Category: security
15
+ // Namespace: Guardrails
16
+ // =============================================================================
17
+
// Policy: injection-block-high-confidence
// Denies any action whose injection_score is strictly greater than 85.
@id("injection-block-high-confidence")
@name("Block high-confidence injection attempts")
@description("Forbids requests with injection confidence above 85%")
@severity("high")
@tags("injection,jailbreak,security")
forbid (principal, action, resource)
when {
    context has injection_score &&
    context.injection_score > 85
};
30
+
// Policy: jailbreak-block-high-confidence
// Denies any action whose jailbreak_score is strictly greater than 80.
@id("jailbreak-block-high-confidence")
@name("Block high-confidence jailbreak attempts")
@description("Forbids requests with jailbreak confidence above 80%")
@severity("high")
@tags("jailbreak,security")
forbid (principal, action, resource)
when {
    context has jailbreak_score &&
    context.jailbreak_score > 80
};
43
+
// Policy: injection-combined-threshold
// Denies any action when BOTH scores are present and BOTH exceed 60. A single
// elevated score is not enough for this rule.
@id("injection-combined-threshold")
@name("Block combined injection and jailbreak")
@description("Forbids requests with moderate scores in both injection and jailbreak")
@severity("high")
@tags("injection,jailbreak,security")
forbid (principal, action, resource)
when {
    // Presence checks first so neither comparison can touch a missing key.
    context has injection_score &&
    context has jailbreak_score &&
    context.injection_score > 60 &&
    context.jailbreak_score > 60
};
57
+
// Policy: injection-invisible-chars
// Denies any action when invisible characters were flagged AND the
// invisible_chars_score is strictly greater than 50.
@id("injection-invisible-chars")
@name("Block invisible character injection")
@description("Forbids requests containing invisible Unicode characters (zero-width joiners, etc.) commonly used for prompt injection")
@severity("high")
@tags("injection,unicode,security")
forbid (principal, action, resource)
when { context has contains_invisible_chars && context.contains_invisible_chars == true }
when { context has invisible_chars_score && context.invisible_chars_score > 50 };
@@ -0,0 +1,48 @@
1
+ // =============================================================================
2
+ // PII (Personally Identifiable Information) Policy
3
+ // =============================================================================
4
+ // Blocks content containing PII in output responses.
5
+ // Typically applied to LLM outputs to prevent data leakage.
6
+ //
7
+ // Context keys used (normalized by projection layer):
8
+ // - pii_detected: bool - Whether PII was found
9
+ // - pii_count: Long - Number of PII matches
10
+ // - pii_types: Set<String> - Types of PII detected
11
+ // - direction: String - "input" or "output"
12
+ //
13
+ // Category: privacy
14
+ // Namespace: Guardrails
15
+ // =============================================================================
16
+
// Policy: pii-block-output
// Denies process_prompt requests in the "output" direction when PII was
// detected.
// `direction` is now presence-checked like every other context key: reading a
// missing attribute raises a Cedar evaluation error, which causes the forbid
// to be skipped with an error diagnostic instead of evaluating cleanly.
@id("pii-block-output")
@name("Block PII in outputs")
@description("Forbids LLM responses that contain PII (prevents data leakage)")
@severity("high")
@tags("pii,privacy,data-protection")
forbid (
    principal,
    action == Guardrails::Action::"process_prompt",
    resource
) when {
    context has pii_detected && context.pii_detected == true &&
    context has direction && context.direction == "output"
};
30
+
// Policy: pii-block-sensitive-types
// Denies process_prompt requests in the "output" direction when pii_types
// contains "ssn", "credit_card" or "passport".
// `direction` is presence-checked before it is read so that a request without
// that key evaluates to "no match" rather than raising an evaluation error.
@id("pii-block-sensitive-types")
@name("Block sensitive PII types")
@description("Forbids outputs containing SSN, credit cards, or passport numbers")
@severity("critical")
@tags("pii,privacy,sensitive-data")
forbid (
    principal,
    action == Guardrails::Action::"process_prompt",
    resource
) when {
    context has direction && context.direction == "output" &&
    context has pii_types &&
    (
        context.pii_types.contains("ssn") ||
        context.pii_types.contains("credit_card") ||
        context.pii_types.contains("passport")
    )
};
@@ -0,0 +1,40 @@
1
+ // =============================================================================
2
+ // Secrets Detection Policy
3
+ // =============================================================================
4
+ // Blocks requests containing API keys, tokens, credentials, or other secrets.
5
+ // Applies to both input prompts and output responses.
6
+ //
7
+ // Context keys used (normalized by projection layer):
8
+ // - contains_secrets: bool - Whether secrets were detected
9
+ // - secret_count: Long - Number of secret matches
10
+ // - secret_types: Set<String> - Types of secrets found
11
+ //
12
+ // Category: security
13
+ // Namespace: Guardrails
14
+ // =============================================================================
15
+
// Policy: secrets-block-all
// Denies any action when the context reports contains_secrets as true.
@id("secrets-block-all")
@name("Block content containing secrets")
@description("Forbids requests that contain API keys, tokens, or credentials")
@severity("critical")
@tags("secrets,security,data-leak")
forbid (principal, action, resource)
when {
    context has contains_secrets &&
    context.contains_secrets == true
};
28
+
// Policy: secrets-block-high-count
// Denies any action when secret_count is strictly greater than 2.
@id("secrets-block-high-count")
@name("Block multiple secrets")
@description("Forbids requests with multiple secret matches (potential data dump)")
@severity("critical")
@tags("secrets,security,data-leak")
forbid (principal, action, resource)
when {
    context has secret_count &&
    context.secret_count > 2
};
@@ -0,0 +1,59 @@
1
+ // =============================================================================
2
+ // Semantic Topic Enforcement Policy
3
+ // =============================================================================
4
+ // Blocks content based on semantic topic classification. Enables per-application
5
+ // content restrictions (e.g., medical agent cannot discuss controlled substances).
6
+ //
7
+ // Context keys used (populated by topic classifier detector):
8
+ // - content_topics: Set<String> - Semantic topics detected in content
9
+ // - topic_confidence: Long (0-100) - Classifier confidence score
10
+ //
11
+ // Category: semantic
12
+ // Namespace: Guardrails
13
+ // =============================================================================
14
+
// Policy: semantic-block-dangerous-topics
// Denies process_prompt requests whose content_topics set contains
// "weapons_manufacturing", "explosive_materials" or "illegal_synthesis".
// content_topics is presence-checked first: when the key is absent the policy
// simply does not match, instead of raising an evaluation error on every
// request that carries no topic data.
@id("semantic-block-dangerous-topics")
@name("Block dangerous content topics")
@description("Forbids content classified under dangerous topics such as weapons manufacturing, explosives, or illegal synthesis")
@severity("critical")
@tags("semantic,compliance,safety")
forbid (
    principal,
    action == Guardrails::Action::"process_prompt",
    resource
) when {
    context has content_topics &&
    (
        context.content_topics.contains("weapons_manufacturing") ||
        context.content_topics.contains("explosive_materials") ||
        context.content_topics.contains("illegal_synthesis")
    )
};
29
+
// Policy: semantic-block-controlled-substances
// Denies process_prompt requests whose content_topics set contains
// "controlled_substances", "drug_manufacturing" or "precursor_chemicals".
// content_topics is presence-checked first so that requests without topic
// data evaluate to "no match" rather than an evaluation error.
@id("semantic-block-controlled-substances")
@name("Block controlled substance content")
@description("Forbids content discussing controlled substances, drug manufacturing, or precursor chemicals")
@severity("high")
@tags("semantic,compliance,medical")
forbid (
    principal,
    action == Guardrails::Action::"process_prompt",
    resource
) when {
    context has content_topics &&
    (
        context.content_topics.contains("controlled_substances") ||
        context.content_topics.contains("drug_manufacturing") ||
        context.content_topics.contains("precursor_chemicals")
    )
};
44
+
// Policy: semantic-block-high-confidence-restricted
// Denies process_prompt requests when topic_confidence is strictly greater
// than 80 AND content_topics contains "illegal_activity", "financial_fraud"
// or "social_engineering".
// Both context keys are presence-checked before use so that a request lacking
// either one evaluates to "no match" rather than an evaluation error.
@id("semantic-block-high-confidence-restricted")
@name("Block high-confidence restricted topics")
@description("Forbids content with high-confidence classification in any restricted topic category")
@severity("high")
@tags("semantic,compliance")
forbid (
    principal,
    action == Guardrails::Action::"process_prompt",
    resource
) when {
    context has topic_confidence && context.topic_confidence > 80 &&
    context has content_topics &&
    (
        context.content_topics.contains("illegal_activity") ||
        context.content_topics.contains("financial_fraud") ||
        context.content_topics.contains("social_engineering")
    )
};
@@ -0,0 +1,58 @@
1
+ // =============================================================================
2
+ // Tool Risk Policy
3
+ // =============================================================================
4
+ // Blocks dangerous tool calls based on risk scoring.
5
+ // Considers tool sensitivity, argument patterns, and MCP verification.
6
+ //
7
+ // Context keys used (normalized by projection layer):
8
+ // - tool_name: String - Name of the tool
9
+ // - tool_risk_score: Long (0-100) - Computed risk score
10
+ // - tool_is_sensitive: bool - Whether tool is classified as sensitive
11
+ // - tool_category: String - "safe" | "sensitive" | "dangerous"
12
+ //
13
+ // Category: agentic-security
14
+ // Namespace: Guardrails
15
+ // =============================================================================
16
+
// Policy: tool-block-dangerous
// Denies call_tool requests when either the tool_risk_score is strictly
// greater than 85 or the tool_category is "dangerous". Either signal alone is
// sufficient.
@id("tool-block-dangerous")
@name("Block dangerous tools")
@description("Forbids tools classified as dangerous (risk > 85)")
@severity("critical")
@tags("tools,agentic,security")
forbid (principal, action == Guardrails::Action::"call_tool", resource)
when {
    (
        context has tool_risk_score &&
        context.tool_risk_score > 85
    ) || (
        context has tool_category &&
        context.tool_category == "dangerous"
    )
};
30
+
// Policy: tool-block-shell-commands
// Denies call_tool requests whose tool_name is exactly "shell" or
// "execute_command".
@id("tool-block-shell-commands")
@name("Block shell command execution")
@description("Forbids direct shell/execute_command tool calls")
@severity("high")
@tags("tools,shell,security")
forbid (principal, action == Guardrails::Action::"call_tool", resource)
when {
    context has tool_name &&
    ["shell", "execute_command"].contains(context.tool_name)
};
45
+
// Policy: tool-block-sensitive-high-risk
// Denies call_tool requests when the tool is flagged as sensitive AND its
// tool_risk_score is strictly greater than 70.
@id("tool-block-sensitive-high-risk")
@name("Block high-risk sensitive tools")
@description("Forbids sensitive tool calls with elevated risk scores")
@severity("high")
@tags("tools,agentic,security")
forbid (principal, action == Guardrails::Action::"call_tool", resource)
when { context has tool_is_sensitive && context.tool_is_sensitive == true }
when { context has tool_risk_score && context.tool_risk_score > 70 };
@@ -0,0 +1,76 @@
1
+ // =============================================================================
2
+ // Toxicity & Content Moderation Policy
3
+ // =============================================================================
4
+ // Blocks toxic, harmful, or inappropriate content.
5
+ // Covers violence, hate speech, sexual content, weapons, crime, and profanity.
6
+ //
7
+ // Context keys used (normalized by projection layer):
8
+ // - violence_score: Long (0-100) - Violence reference score
9
+ // - hate_speech_score: Long (0-100) - Hate speech score
10
+ // - sexual_score: Long (0-100) - Sexual content score
11
+ // - weapons_score: Long (0-100) - Weapons reference score
12
+ // - crime_score: Long (0-100) - Criminal activity score
13
+ // - profanity_score: Long (0-100) - Profanity score
14
+ //
15
+ // Category: trust-safety
16
+ // Namespace: Guardrails
17
+ // =============================================================================
18
+
// Policy: toxicity-block-critical
// Denies any action when at least one of the six toxicity scores is present
// and strictly greater than 90.
@id("toxicity-block-critical")
@name("Block critical toxicity")
@description("Forbids content with any toxicity score above 90")
@severity("critical")
@tags("toxicity,trust-safety")
forbid (principal, action, resource)
when {
    (context has violence_score    && context.violence_score    > 90) ||
    (context has hate_speech_score && context.hate_speech_score > 90) ||
    (context has sexual_score      && context.sexual_score      > 90) ||
    (context has weapons_score     && context.weapons_score     > 90) ||
    (context has crime_score       && context.crime_score       > 90) ||
    (context has profanity_score   && context.profanity_score   > 90)
};
36
+
// Policy: toxicity-block-hate-speech
// Denies any action whose hate_speech_score is strictly greater than 75.
@id("toxicity-block-hate-speech")
@name("Block hate speech")
@description("Forbids content with hate speech above 75")
@severity("high")
@tags("hate-speech,trust-safety")
forbid (principal, action, resource)
when {
    context has hate_speech_score &&
    context.hate_speech_score > 75
};
49
+
// Policy: toxicity-block-combined
// Denies any action when at least two of the six toxicity scores are present
// and each is strictly greater than 60. The condition enumerates all 15
// unordered pairs; each pair checks presence of both keys before comparing.
@id("toxicity-block-combined")
@name("Block combined moderate toxicity")
@description("Forbids content with two or more moderate toxicity scores above 60")
@severity("high")
@tags("toxicity,trust-safety")
forbid (principal, action, resource)
when {
    // --- pairs that include violence_score ---
    (context has violence_score && context has hate_speech_score &&
     context.violence_score > 60 && context.hate_speech_score > 60) ||
    (context has violence_score && context has sexual_score &&
     context.violence_score > 60 && context.sexual_score > 60) ||
    (context has violence_score && context has weapons_score &&
     context.violence_score > 60 && context.weapons_score > 60) ||
    (context has violence_score && context has crime_score &&
     context.violence_score > 60 && context.crime_score > 60) ||
    (context has violence_score && context has profanity_score &&
     context.violence_score > 60 && context.profanity_score > 60) ||
    // --- remaining pairs that include hate_speech_score ---
    (context has hate_speech_score && context has sexual_score &&
     context.hate_speech_score > 60 && context.sexual_score > 60) ||
    (context has hate_speech_score && context has weapons_score &&
     context.hate_speech_score > 60 && context.weapons_score > 60) ||
    (context has hate_speech_score && context has crime_score &&
     context.hate_speech_score > 60 && context.crime_score > 60) ||
    (context has hate_speech_score && context has profanity_score &&
     context.hate_speech_score > 60 && context.profanity_score > 60) ||
    // --- remaining pairs that include sexual_score ---
    (context has sexual_score && context has weapons_score &&
     context.sexual_score > 60 && context.weapons_score > 60) ||
    (context has sexual_score && context has crime_score &&
     context.sexual_score > 60 && context.crime_score > 60) ||
    (context has sexual_score && context has profanity_score &&
     context.sexual_score > 60 && context.profanity_score > 60) ||
    // --- remaining pairs that include weapons_score ---
    (context has weapons_score && context has crime_score &&
     context.weapons_score > 60 && context.crime_score > 60) ||
    (context has weapons_score && context has profanity_score &&
     context.weapons_score > 60 && context.profanity_score > 60) ||
    // --- final pair ---
    (context has crime_score && context has profanity_score &&
     context.crime_score > 60 && context.profanity_score > 60)
};
@@ -0,0 +1,84 @@
1
+ // =============================================================================
2
+ // MCP Tool Permissions Template
3
+ // =============================================================================
4
+ // Per-tool access control for MCP servers. Allows fine-grained permissioning:
5
+ // allow specific tools from a server while denying others.
6
+ //
7
+ // Example: Allow read_issues from GitHub but deny create_issues.
8
+ //
9
+ // Context keys used:
10
+ // - mcp_server: String - MCP server name
11
+ // - mcp_tool: String - Tool name within the server
12
+ // - tool_name: String - Normalized tool name
13
+ // - mcp_server_verified: Bool - Whether server is verified
14
+ //
15
+ // Category: agentic_security
16
+ // Namespace: Guardrails
17
+ // =============================================================================
18
+
19
+ // -- GitHub MCP: Read-only access -------------------------------------------
20
+
// Policy: mcp-github-allow-read
// Permits call_tool requests addressed to the "github" MCP server when the
// tool_name is one of the six read-only tools listed below.
// Both context keys are presence-checked before use so that requests lacking
// mcp_server or tool_name (e.g. non-MCP tool calls) evaluate to "no match"
// rather than raising an evaluation error.
@id("mcp-github-allow-read")
@name("Allow read-only GitHub tools")
@description("Permit read operations from GitHub MCP server (issues, PRs, code search)")
@severity("medium")
@tags("mcp,github,read-only,least-privilege")
permit (
    principal,
    action == Guardrails::Action::"call_tool",
    resource
) when {
    context has mcp_server && context.mcp_server == "github" &&
    context has tool_name &&
    (
        context.tool_name == "read_issues" ||
        context.tool_name == "get_issue" ||
        context.tool_name == "list_repos" ||
        context.tool_name == "get_pull_request" ||
        context.tool_name == "search_code" ||
        context.tool_name == "get_file_contents"
    )
};
39
+
// Policy: mcp-github-deny-write
// Denies every call_tool request to the "github" MCP server EXCEPT the
// read-only tools that mcp-github-allow-read permits.
//
// Why the carve-out is required: in Cedar a matching forbid always overrides
// any matching permit. A blanket `mcp_server == "github"` forbid therefore
// blocks the read-only tools too, which makes the read-only permit dead and
// contradicts this policy's own description (write operations only).
//
// A GitHub call that carries no tool_name cannot be shown to be read-only, so
// it is denied. Keep the list below in sync with mcp-github-allow-read.
@id("mcp-github-deny-write")
@name("Deny write GitHub tools")
@description("Block create/update/delete operations on GitHub MCP server")
@severity("high")
@tags("mcp,github,write-block,least-privilege")
forbid (
    principal,
    action == Guardrails::Action::"call_tool",
    resource
) when {
    context has mcp_server && context.mcp_server == "github" &&
    !(
        context has tool_name &&
        (
            context.tool_name == "read_issues" ||
            context.tool_name == "get_issue" ||
            context.tool_name == "list_repos" ||
            context.tool_name == "get_pull_request" ||
            context.tool_name == "search_code" ||
            context.tool_name == "get_file_contents"
        )
    )
};
52
+
53
+ // -- Organization-wide MCP server exclusions --------------------------------
54
+
// Policy: mcp-exclude-server
// Denies call_tool requests addressed to any MCP server on the exclusion
// list. The two names below are placeholders; edit the list to match your
// organization's exclusions.
// mcp_server is presence-checked first so that tool calls without an MCP
// server in context evaluate to "no match" rather than an evaluation error.
@id("mcp-exclude-server")
@name("Exclude specific MCP servers")
@description("Block all tool calls from excluded MCP servers (org-wide exclusion list)")
@severity("critical")
@tags("mcp,exclusion,org-wide,block")
forbid (
    principal,
    action == Guardrails::Action::"call_tool",
    resource
) when {
    context has mcp_server &&
    (
        // Add server names to block across the organization.
        // Modify this list to match your exclusion requirements.
        context.mcp_server == "untrusted-server" ||
        context.mcp_server == "deprecated-server"
    )
};
70
+
71
+ // -- Block unverified MCP servers -------------------------------------------
72
+
// Policy: mcp-block-unverified
// Denies call_tool requests when the context explicitly reports
// mcp_server_verified as false.
// The key is presence-checked first so that a request without it evaluates to
// "no match" rather than raising an evaluation error.
// NOTE(review): a request that omits mcp_server_verified entirely is NOT
// blocked by this rule — confirm the projection layer always sets the key for
// MCP tool calls if fail-closed behaviour is required.
@id("mcp-block-unverified")
@name("Block unverified MCP servers")
@description("Deny all tool calls from MCP servers not in the verified registry")
@severity("high")
@tags("mcp,trust,verification")
forbid (
    principal,
    action == Guardrails::Action::"call_tool",
    resource
) when {
    context has mcp_server_verified &&
    context.mcp_server_verified == false
};
@@ -0,0 +1,22 @@
1
+ // =============================================================================
2
+ // Chat Assistant — Privacy
3
+ // =============================================================================
4
+ // PII protection for customer-facing chatbots.
5
+ // Blocks PII in both user inputs and assistant outputs.
6
+ //
7
+ // Category: privacy
8
+ // Namespace: Guardrails
9
+ // =============================================================================
10
+
// Policy: chat-pii-block-input-output
// Denies any action when pii_detected is true. There is no direction check,
// so the rule applies regardless of whether the content is input or output.
@id("chat-pii-block-input-output")
@name("Block PII in both directions")
@description("Blocks PII in both user inputs and assistant outputs for chat applications")
@severity("high")
@tags("profile,chat-assistant,pii,privacy")
forbid (principal, action, resource)
when {
    context has pii_detected &&
    context.pii_detected == true
};
@@ -0,0 +1,35 @@
1
+ // =============================================================================
2
+ // Chat Assistant — Security
3
+ // =============================================================================
4
+ // Aggressive injection and jailbreak defense for customer-facing chatbots.
5
+ // Lower thresholds than defaults — public-facing chat is high-risk for attacks.
6
+ //
7
+ // Category: security
8
+ // Namespace: Guardrails
9
+ // =============================================================================
10
+
// Policy: chat-injection-lower-threshold
// Denies any action whose injection_score is strictly greater than 70.
@id("chat-injection-lower-threshold")
@name("Aggressive injection defense for chat")
@description("Lower injection threshold for public-facing chat — blocks above 70")
@severity("high")
@tags("profile,chat-assistant,injection,security")
forbid (principal, action, resource)
when {
    context has injection_score &&
    context.injection_score > 70
};
23
+
// Policy: chat-jailbreak-lower-threshold
// Denies any action whose jailbreak_score is strictly greater than 65.
@id("chat-jailbreak-lower-threshold")
@name("Aggressive jailbreak defense for chat")
@description("Lower jailbreak threshold for public-facing chat — blocks above 65")
@severity("high")
@tags("profile,chat-assistant,jailbreak,security")
forbid (principal, action, resource)
when {
    context has jailbreak_score &&
    context.jailbreak_score > 65
};
@@ -0,0 +1,43 @@
1
+ // =============================================================================
2
+ // Chat Assistant — Trust & Safety
3
+ // =============================================================================
4
+ // Strict content moderation for customer-facing chatbots.
5
+ // Lower toxicity thresholds and topic restrictions for public-facing content.
6
+ //
7
+ // Category: trust_safety
8
+ // Namespace: Guardrails
9
+ // =============================================================================
10
+
// Policy: chat-toxicity-strict
// Denies any action when at least one of violence, hate speech, sexual or
// profanity score is present and strictly greater than 70. The weapons and
// crime scores are not consulted by this rule.
@id("chat-toxicity-strict")
@name("Strict toxicity for chat")
@description("Lower thresholds for customer-facing content — blocks toxicity above 70")
@severity("critical")
@tags("profile,chat-assistant,toxicity,trust-safety")
forbid (principal, action, resource)
when {
    (context has violence_score    && context.violence_score    > 70) ||
    (context has hate_speech_score && context.hate_speech_score > 70) ||
    (context has sexual_score      && context.sexual_score      > 70) ||
    (context has profanity_score   && context.profanity_score   > 70)
};
26
+
// Policy: chat-topic-restriction
// Denies any action when topic_confidence is strictly greater than 70 AND
// content_topics contains one of the four restricted topics listed below.
@id("chat-topic-restriction")
@name("Block restricted topics in chat")
@description("Prevents chat assistants from discussing dangerous or regulated topics")
@severity("high")
@tags("profile,chat-assistant,semantic,compliance")
forbid (principal, action, resource)
when { context has topic_confidence && context.topic_confidence > 70 }
when {
    context has content_topics &&
    (
        context.content_topics.contains("weapons_manufacturing") ||
        context.content_topics.contains("illegal_activity") ||
        context.content_topics.contains("controlled_substances") ||
        context.content_topics.contains("financial_fraud")
    )
};