npm - @highflame/policy - Versions diffs - 2.0.9 → 2.1.0 - Mend

@highflame/policy 2.0.9 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

package/_schemas/guardrails/context.json +435 -0
package/_schemas/guardrails/schema.cedarschema +225 -0
package/_schemas/guardrails/templates/defaults/agentic_safety.cedar +94 -0
package/_schemas/guardrails/templates/defaults/baseline.cedar +24 -0
package/_schemas/guardrails/templates/defaults/injection.cedar +70 -0
package/_schemas/guardrails/templates/defaults/pii.cedar +48 -0
package/_schemas/guardrails/templates/defaults/secrets.cedar +40 -0
package/_schemas/guardrails/templates/defaults/semantic.cedar +59 -0
package/_schemas/guardrails/templates/defaults/tool_risk.cedar +58 -0
package/_schemas/guardrails/templates/defaults/toxicity.cedar +76 -0
package/_schemas/guardrails/templates/mcp_tool_permissions.cedar +84 -0
package/_schemas/guardrails/templates/profiles/chat_assistant/privacy.cedar +22 -0
package/_schemas/guardrails/templates/profiles/chat_assistant/security.cedar +35 -0
package/_schemas/guardrails/templates/profiles/chat_assistant/trust_safety.cedar +43 -0
package/_schemas/guardrails/templates/profiles/chat_assistant.cedar +85 -0
package/_schemas/guardrails/templates/profiles/code_agent/agentic_security.cedar +109 -0
package/_schemas/guardrails/templates/profiles/code_agent/security.cedar +22 -0
package/_schemas/guardrails/templates/profiles/code_agent.cedar +125 -0
package/_schemas/guardrails/templates/profiles/data_pipeline/agentic_security.cedar +38 -0
package/_schemas/guardrails/templates/profiles/data_pipeline/privacy.cedar +40 -0
package/_schemas/guardrails/templates/profiles/data_pipeline/security.cedar +49 -0
package/_schemas/guardrails/templates/profiles/data_pipeline.cedar +111 -0
package/_schemas/guardrails/templates/templates.json +213 -0
package/_schemas/overwatch/context.json +54 -54
package/_schemas/overwatch/schema.cedarschema +77 -68
package/dist/builder.d.ts +106 -13
package/dist/builder.js +103 -34
package/dist/engine.d.ts +20 -2
package/dist/engine.js +50 -20
package/dist/entities.gen.d.ts +4 -0
package/dist/entities.gen.js +4 -0
package/dist/explain.d.ts +150 -0
package/dist/explain.js +363 -0
package/dist/guardrails-context.gen.d.ts +49 -0
package/dist/guardrails-context.gen.js +50 -0
package/dist/guardrails-defaults.gen.d.ts +61 -0
package/dist/guardrails-defaults.gen.js +1278 -0
package/dist/guardrails-entities.gen.d.ts +11 -0
package/dist/guardrails-entities.gen.js +37 -0
package/dist/index.d.ts +6 -1
package/dist/index.js +6 -1
package/dist/overwatch-defaults.gen.js +122 -2
package/dist/parser.js +136 -4
package/dist/schema.gen.d.ts +1 -1
package/dist/schema.gen.js +6 -0
package/dist/service-schemas.gen.d.ts +15 -11
package/dist/service-schemas.gen.js +509 -84
package/dist/types.d.ts +6 -1
package/dist/types.js +6 -1
package/package.json +5 -1

package/_schemas/guardrails/templates/defaults/agentic_safety.cedar ADDED Viewed

@@ -0,0 +1,94 @@
+// =============================================================================
+// Agentic Safety Policy
+// =============================================================================
+// Blocks suspicious agentic behavior patterns:
+// - Infinite loops (repeated tool calls)
+// - Suspicious action sequences (data exfiltration patterns)
+// - Budget violations (token/cost limits)
+//
+// Context keys used (normalized by projection layer):
+// - loop_detected: bool - Whether tool call loop detected
+// - loop_count: Long - Number of consecutive repeated calls
+// - suspicious_pattern: bool - Whether suspicious sequence detected
+// - pattern_type: String - Type of pattern (e.g., "data_exfiltration")
+// - sequence_risk: Long (0-100) - Risk score from sequence analysis
+// - budget_exceeded: bool - Whether token budget exceeded
+// - budget_remaining_pct: Long (0-100) - Remaining budget percentage
+//
+// Category: agentic-security
+// Namespace: Guardrails
+// =============================================================================
+// Stops tool calls once the context reports a detected loop whose repeat
+// count is above three. Each attribute is presence-checked before it is read.
+@id("agentic-block-loops")
+@name("Block tool call loops")
+@description("Forbids repeated tool calls indicating infinite loop")
+@severity("high")
+@tags("agentic,loops,safety")
+forbid (principal, action == Guardrails::Action::"call_tool", resource)
+when {
+    (context has loop_detected && context.loop_detected == true) &&
+    (context has loop_count && context.loop_count > 3)
+};
+// Applies to every action: denies the request when a suspicious pattern is
+// flagged and its type is one of the three exfiltration kinds listed below.
+@id("agentic-block-exfiltration")
+@name("Block data exfiltration patterns")
+@description("Forbids suspicious action sequences like read → http_post")
+@severity("critical")
+@tags("agentic,exfiltration,security")
+forbid (principal, action, resource)
+when {
+    (context has suspicious_pattern && context.suspicious_pattern == true) &&
+    (context has pattern_type &&
+     // Membership test against the blocked pattern types.
+     ["data_exfiltration", "secret_exfiltration", "db_exfiltration"].contains(context.pattern_type))
+};
+// Applies to every action: denies the request when the sequence risk score
+// is present and strictly greater than 80.
+@id("agentic-block-high-sequence-risk")
+@name("Block high-risk action sequences")
+@description("Forbids action sequences with risk score above 80")
+@severity("high")
+@tags("agentic,patterns,security")
+forbid (principal, action, resource)
+when {
+    context has sequence_risk &&
+    context.sequence_risk > 80
+};
+// Applies to every action: denies the request when the context flags the
+// budget as exceeded.
+@id("agentic-block-budget-exceeded")
+@name("Block budget violations")
+@description("Forbids requests when token budget is exceeded")
+@severity("medium")
+@tags("agentic,budget,cost-control")
+forbid (principal, action, resource)
+when {
+    context has budget_exceeded &&
+    context.budget_exceeded == true
+};
+// Applies to every action: denies the request when the remaining budget
+// percentage is strictly below 5 and strictly above 0. A value of exactly 0
+// does not match this rule.
+@id("agentic-block-low-budget")
+@name("Block requests when budget critically low")
+@description("Forbids requests when remaining budget falls below 5%")
+@severity("medium")
+@tags("agentic,budget,cost-control")
+forbid (principal, action, resource)
+when {
+    context has budget_remaining_pct &&
+    (context.budget_remaining_pct < 5 && context.budget_remaining_pct > 0)
+};

package/_schemas/guardrails/templates/defaults/baseline.cedar ADDED Viewed

@@ -0,0 +1,24 @@
+// =============================================================================
+// Baseline Permit Policy (Default)
+// =============================================================================
+// Permits all actions by default. Threat-specific forbid policies override
+// this to block when detectors find security issues.
+//
+// Cedar is default-deny: without at least one permit rule, every request
+// is denied regardless of forbid rules. This baseline ensures the system
+// is "allow unless blocked" rather than "block everything".
+//
+// Category: organization
+// Namespace: Guardrails
+// =============================================================================
+// Unconditional permit: no scope constraints and no conditions, so it matches
+// every principal, action, and resource.
+@id("baseline-permit-all")
+@name("Permit all actions by default")
+@description("Baseline permit for all actions — threat-specific forbid policies override this when threats are detected")
+@severity("low")
+@tags("baseline,permit-default,organization")
+permit (principal, action, resource);

package/_schemas/guardrails/templates/defaults/injection.cedar ADDED Viewed

@@ -0,0 +1,70 @@
+// =============================================================================
+// Injection & Jailbreak Detection Policy
+// =============================================================================
+// Blocks prompt injection, jailbreak attempts, and command injection.
+// Uses ML-based confidence scores from normalized context.
+//
+// Context keys used (normalized by projection layer):
+// - injection_score: Long (0-100) - Overall injection confidence
+// - jailbreak_score: Long (0-100) - Jailbreak attempt confidence
+// - injection_type: String - Type of injection detected
+// - contains_invisible_chars: Bool - Invisible Unicode characters detected
+// - invisible_chars_score: Long (0-100) - Invisible character density
+//
+// Category: security
+// Namespace: Guardrails
+// =============================================================================
+// Applies to every action: denies the request when the injection score is
+// present and strictly greater than 85.
+@id("injection-block-high-confidence")
+@name("Block high-confidence injection attempts")
+@description("Forbids requests with injection confidence above 85%")
+@severity("high")
+@tags("injection,jailbreak,security")
+forbid (principal, action, resource)
+when {
+    context has injection_score &&
+    context.injection_score > 85
+};
+// Applies to every action: denies the request when the jailbreak score is
+// present and strictly greater than 80.
+@id("jailbreak-block-high-confidence")
+@name("Block high-confidence jailbreak attempts")
+@description("Forbids requests with jailbreak confidence above 80%")
+@severity("high")
+@tags("jailbreak,security")
+forbid (principal, action, resource)
+when {
+    context has jailbreak_score &&
+    context.jailbreak_score > 80
+};
+// Applies to every action: denies the request only when BOTH scores are
+// present and each is strictly greater than 60.
+@id("injection-combined-threshold")
+@name("Block combined injection and jailbreak")
+@description("Forbids requests with moderate scores in both injection and jailbreak")
+@severity("high")
+@tags("injection,jailbreak,security")
+forbid (principal, action, resource)
+when {
+    // Presence checks first, then the two threshold comparisons.
+    context has injection_score &&
+    context has jailbreak_score &&
+    context.injection_score > 60 &&
+    context.jailbreak_score > 60
+};
+// Applies to every action: denies the request when invisible characters are
+// flagged AND the invisible-character score is strictly greater than 50.
+@id("injection-invisible-chars")
+@name("Block invisible character injection")
+@description("Forbids requests containing invisible Unicode characters (zero-width joiners, etc.) commonly used for prompt injection")
+@severity("high")
+@tags("injection,unicode,security")
+forbid (principal, action, resource)
+when {
+    (context has contains_invisible_chars && context.contains_invisible_chars == true) &&
+    (context has invisible_chars_score && context.invisible_chars_score > 50)
+};

package/_schemas/guardrails/templates/defaults/pii.cedar ADDED Viewed

@@ -0,0 +1,48 @@
+// =============================================================================
+// PII (Personally Identifiable Information) Policy
+// =============================================================================
+// Blocks content containing PII in output responses.
+// Typically applied to LLM outputs to prevent data leakage.
+//
+// Context keys used (normalized by projection layer):
+// - pii_detected: bool - Whether PII was found
+// - pii_count: Long - Number of PII matches
+// - pii_types: Set<String> - Types of PII detected
+// - direction: String - "input" or "output"
+//
+// Category: privacy
+// Namespace: Guardrails
+// =============================================================================
+// Denies process_prompt when PII was detected and the payload direction is
+// "output". Both attributes are presence-checked before being read: reading a
+// context attribute that is absent raises a Cedar evaluation error, which
+// causes the policy to be skipped and the error reported, instead of the
+// condition simply evaluating to false.
+@id("pii-block-output")
+@name("Block PII in outputs")
+@description("Forbids LLM responses that contain PII (prevents data leakage)")
+@severity("high")
+@tags("pii,privacy,data-protection")
+forbid (
+    principal,
+    action == Guardrails::Action::"process_prompt",
+    resource
+) when {
+    context has pii_detected && context.pii_detected == true &&
+    context has direction && context.direction == "output"
+};
+// Denies process_prompt when the payload direction is "output" and the set of
+// detected PII types includes "ssn", "credit_card", or "passport".
+// `direction` is presence-checked like `pii_types`, so a request that lacks it
+// evaluates to "no match" rather than raising an evaluation error.
+@id("pii-block-sensitive-types")
+@name("Block sensitive PII types")
+@description("Forbids outputs containing SSN, credit cards, or passport numbers")
+@severity("critical")
+@tags("pii,privacy,sensitive-data")
+forbid (
+    principal,
+    action == Guardrails::Action::"process_prompt",
+    resource
+) when {
+    context has direction && context.direction == "output" &&
+    context has pii_types &&
+    (
+        context.pii_types.contains("ssn") ||
+        context.pii_types.contains("credit_card") ||
+        context.pii_types.contains("passport")
+    )
+};

package/_schemas/guardrails/templates/defaults/secrets.cedar ADDED Viewed

@@ -0,0 +1,40 @@
+// =============================================================================
+// Secrets Detection Policy
+// =============================================================================
+// Blocks requests containing API keys, tokens, credentials, or other secrets.
+// Applies to both input prompts and output responses.
+//
+// Context keys used (normalized by projection layer):
+// - contains_secrets: bool - Whether secrets were detected
+// - secret_count: Long - Number of secret matches
+// - secret_types: Set<String> - Types of secrets found
+//
+// Category: security
+// Namespace: Guardrails
+// =============================================================================
+// Applies to every action: denies the request when the context flags that
+// secrets were found.
+@id("secrets-block-all")
+@name("Block content containing secrets")
+@description("Forbids requests that contain API keys, tokens, or credentials")
+@severity("critical")
+@tags("secrets,security,data-leak")
+forbid (principal, action, resource)
+when {
+    context has contains_secrets &&
+    context.contains_secrets == true
+};
+// Applies to every action: denies the request when the secret count is
+// present and strictly greater than 2 (i.e. three or more matches).
+@id("secrets-block-high-count")
+@name("Block multiple secrets")
+@description("Forbids requests with multiple secret matches (potential data dump)")
+@severity("critical")
+@tags("secrets,security,data-leak")
+forbid (principal, action, resource)
+when {
+    context has secret_count &&
+    context.secret_count > 2
+};

package/_schemas/guardrails/templates/defaults/semantic.cedar ADDED Viewed

@@ -0,0 +1,59 @@
+// =============================================================================
+// Semantic Topic Enforcement Policy
+// =============================================================================
+// Blocks content based on semantic topic classification. Enables per-application
+// content restrictions (e.g., medical agent cannot discuss controlled substances).
+//
+// Context keys used (populated by topic classifier detector):
+// - content_topics: Set<String> - Semantic topics detected in content
+// - topic_confidence: Long (0-100) - Classifier confidence score
+//
+// Category: semantic
+// Namespace: Guardrails
+// =============================================================================
+// Denies process_prompt when the detected topic set contains any of the three
+// dangerous topics below. `content_topics` is presence-checked first: when the
+// attribute is absent the condition evaluates to false instead of raising an
+// evaluation error that would cause the policy to be skipped.
+@id("semantic-block-dangerous-topics")
+@name("Block dangerous content topics")
+@description("Forbids content classified under dangerous topics such as weapons manufacturing, explosives, or illegal synthesis")
+@severity("critical")
+@tags("semantic,compliance,safety")
+forbid (
+    principal,
+    action == Guardrails::Action::"process_prompt",
+    resource
+) when {
+    context has content_topics &&
+    (context.content_topics.contains("weapons_manufacturing") ||
+     context.content_topics.contains("explosive_materials") ||
+     context.content_topics.contains("illegal_synthesis"))
+};
+// Denies process_prompt when the detected topic set contains any of the three
+// substance-related topics below. `content_topics` is presence-checked first:
+// when the attribute is absent the condition evaluates to false instead of
+// raising an evaluation error that would cause the policy to be skipped.
+@id("semantic-block-controlled-substances")
+@name("Block controlled substance content")
+@description("Forbids content discussing controlled substances, drug manufacturing, or precursor chemicals")
+@severity("high")
+@tags("semantic,compliance,medical")
+forbid (
+    principal,
+    action == Guardrails::Action::"process_prompt",
+    resource
+) when {
+    context has content_topics &&
+    (context.content_topics.contains("controlled_substances") ||
+     context.content_topics.contains("drug_manufacturing") ||
+     context.content_topics.contains("precursor_chemicals"))
+};
+// Denies process_prompt when the topic confidence is strictly greater than 80
+// and the detected topic set contains one of the restricted topics below.
+// Both attributes are presence-checked before being read, so a request that
+// lacks either one evaluates to "no match" rather than raising an evaluation
+// error that would cause the policy to be skipped.
+@id("semantic-block-high-confidence-restricted")
+@name("Block high-confidence restricted topics")
+@description("Forbids content with high-confidence classification in any restricted topic category")
+@severity("high")
+@tags("semantic,compliance")
+forbid (
+    principal,
+    action == Guardrails::Action::"process_prompt",
+    resource
+) when {
+    context has topic_confidence && context.topic_confidence > 80 &&
+    context has content_topics &&
+    (context.content_topics.contains("illegal_activity") ||
+     context.content_topics.contains("financial_fraud") ||
+     context.content_topics.contains("social_engineering"))
+};

package/_schemas/guardrails/templates/defaults/tool_risk.cedar ADDED Viewed

@@ -0,0 +1,58 @@
+// =============================================================================
+// Tool Risk Policy
+// =============================================================================
+// Blocks dangerous tool calls based on risk scoring.
+// Considers tool sensitivity, argument patterns, and MCP verification.
+//
+// Context keys used (normalized by projection layer):
+// - tool_name: String - Name of the tool
+// - tool_risk_score: Long (0-100) - Computed risk score
+// - tool_is_sensitive: bool - Whether tool is classified as sensitive
+// - tool_category: String - "safe" | "sensitive" | "dangerous"
+//
+// Category: agentic-security
+// Namespace: Guardrails
+// =============================================================================
+// Denies call_tool when EITHER the risk score is strictly greater than 85 OR
+// the tool category equals "dangerous". Each attribute is presence-checked
+// inside its own alternative.
+@id("tool-block-dangerous")
+@name("Block dangerous tools")
+@description("Forbids tools classified as dangerous (risk > 85)")
+@severity("critical")
+@tags("tools,agentic,security")
+forbid (principal, action == Guardrails::Action::"call_tool", resource)
+when {
+    (
+        context has tool_risk_score &&
+        context.tool_risk_score > 85
+    ) || (
+        context has tool_category &&
+        context.tool_category == "dangerous"
+    )
+};
+// Denies call_tool when the tool name is exactly "shell" or
+// "execute_command".
+@id("tool-block-shell-commands")
+@name("Block shell command execution")
+@description("Forbids direct shell/execute_command tool calls")
+@severity("high")
+@tags("tools,shell,security")
+forbid (principal, action == Guardrails::Action::"call_tool", resource)
+when {
+    context has tool_name &&
+    // Membership test against the blocked tool names.
+    ["shell", "execute_command"].contains(context.tool_name)
+};
+// Denies call_tool when the tool is flagged sensitive AND its risk score is
+// strictly greater than 70.
+@id("tool-block-sensitive-high-risk")
+@name("Block high-risk sensitive tools")
+@description("Forbids sensitive tool calls with elevated risk scores")
+@severity("high")
+@tags("tools,agentic,security")
+forbid (principal, action == Guardrails::Action::"call_tool", resource)
+when {
+    (context has tool_is_sensitive && context.tool_is_sensitive == true) &&
+    (context has tool_risk_score && context.tool_risk_score > 70)
+};

package/_schemas/guardrails/templates/defaults/toxicity.cedar ADDED Viewed

@@ -0,0 +1,76 @@
+// =============================================================================
+// Toxicity & Content Moderation Policy
+// =============================================================================
+// Blocks toxic, harmful, or inappropriate content.
+// Covers violence, hate speech, sexual content, weapons, crime, and profanity.
+//
+// Context keys used (normalized by projection layer):
+// - violence_score: Long (0-100) - Violence reference score
+// - hate_speech_score: Long (0-100) - Hate speech score
+// - sexual_score: Long (0-100) - Sexual content score
+// - weapons_score: Long (0-100) - Weapons reference score
+// - crime_score: Long (0-100) - Criminal activity score
+// - profanity_score: Long (0-100) - Profanity score
+//
+// Category: trust-safety
+// Namespace: Guardrails
+// =============================================================================
+// Applies to every action: denies the request when any one of the six
+// toxicity scores is present and strictly greater than 90.
+@id("toxicity-block-critical")
+@name("Block critical toxicity")
+@description("Forbids content with any toxicity score above 90")
+@severity("critical")
+@tags("toxicity,trust-safety")
+forbid (principal, action, resource)
+when {
+    // One alternative per score; each guards its own attribute.
+    (context has violence_score    && context.violence_score    > 90) ||
+    (context has hate_speech_score && context.hate_speech_score > 90) ||
+    (context has sexual_score      && context.sexual_score      > 90) ||
+    (context has weapons_score     && context.weapons_score     > 90) ||
+    (context has crime_score       && context.crime_score       > 90) ||
+    (context has profanity_score   && context.profanity_score   > 90)
+};
+// Applies to every action: denies the request when the hate speech score is
+// present and strictly greater than 75.
+@id("toxicity-block-hate-speech")
+@name("Block hate speech")
+@description("Forbids content with hate speech above 75")
+@severity("high")
+@tags("hate-speech,trust-safety")
+forbid (principal, action, resource)
+when {
+    context has hate_speech_score &&
+    context.hate_speech_score > 75
+};
+// Applies to every action: denies the request when at least two distinct
+// toxicity scores are both present and both strictly greater than 60.
+// The condition enumerates all 15 unordered pairs of the six scores
+// (violence, hate_speech, sexual, weapons, crime, profanity). Within each
+// pair, both presence checks run before either comparison.
+@id("toxicity-block-combined")
+@name("Block combined moderate toxicity")
+@description("Forbids content with two or more moderate toxicity scores above 60")
+@severity("high")
+@tags("toxicity,trust-safety")
+forbid (
+    principal,
+    action,
+    resource
+) when {
+    // Any 2 of 6 toxicity scores above 60 triggers a block
+    // Pairs that include violence_score (5 pairs)
+    (context has violence_score && context has hate_speech_score && context.violence_score > 60 && context.hate_speech_score > 60) ||
+    (context has violence_score && context has sexual_score && context.violence_score > 60 && context.sexual_score > 60) ||
+    (context has violence_score && context has weapons_score && context.violence_score > 60 && context.weapons_score > 60) ||
+    (context has violence_score && context has crime_score && context.violence_score > 60 && context.crime_score > 60) ||
+    (context has violence_score && context has profanity_score && context.violence_score > 60 && context.profanity_score > 60) ||
+    // Remaining pairs that include hate_speech_score (4 pairs)
+    (context has hate_speech_score && context has sexual_score && context.hate_speech_score > 60 && context.sexual_score > 60) ||
+    (context has hate_speech_score && context has weapons_score && context.hate_speech_score > 60 && context.weapons_score > 60) ||
+    (context has hate_speech_score && context has crime_score && context.hate_speech_score > 60 && context.crime_score > 60) ||
+    (context has hate_speech_score && context has profanity_score && context.hate_speech_score > 60 && context.profanity_score > 60) ||
+    // Remaining pairs that include sexual_score (3 pairs)
+    (context has sexual_score && context has weapons_score && context.sexual_score > 60 && context.weapons_score > 60) ||
+    (context has sexual_score && context has crime_score && context.sexual_score > 60 && context.crime_score > 60) ||
+    (context has sexual_score && context has profanity_score && context.sexual_score > 60 && context.profanity_score > 60) ||
+    // Remaining pairs that include weapons_score (2 pairs)
+    (context has weapons_score && context has crime_score && context.weapons_score > 60 && context.crime_score > 60) ||
+    (context has weapons_score && context has profanity_score && context.weapons_score > 60 && context.profanity_score > 60) ||
+    // Final pair: crime_score with profanity_score
+    (context has crime_score && context has profanity_score && context.crime_score > 60 && context.profanity_score > 60)
+};

package/_schemas/guardrails/templates/mcp_tool_permissions.cedar ADDED Viewed

@@ -0,0 +1,84 @@
+// =============================================================================
+// MCP Tool Permissions Template
+// =============================================================================
+// Per-tool access control for MCP servers. Allows fine-grained permissioning:
+// allow specific tools from a server while denying others.
+//
+// Example: Allow read_issues from GitHub but deny create_issues.
+//
+// Context keys used:
+// - mcp_server: String - MCP server name
+// - mcp_tool: String - Tool name within the server
+// - tool_name: String - Normalized tool name
+// - mcp_server_verified: Bool - Whether server is verified
+//
+// Category: agentic_security
+// Namespace: Guardrails
+// =============================================================================
+// -- GitHub MCP: Read-only access -------------------------------------------
+// Permits call_tool for the "github" MCP server when the tool name is one of
+// the six read-only tools listed below. `mcp_server` and `tool_name` are
+// presence-checked before being read, so a request that lacks either one
+// simply does not match instead of raising an evaluation error.
+@id("mcp-github-allow-read")
+@name("Allow read-only GitHub tools")
+@description("Permit read operations from GitHub MCP server (issues, PRs, code search)")
+@severity("medium")
+@tags("mcp,github,read-only,least-privilege")
+permit (
+    principal,
+    action == Guardrails::Action::"call_tool",
+    resource
+) when {
+    context has mcp_server && context.mcp_server == "github" &&
+    context has tool_name &&
+    (context.tool_name == "read_issues" ||
+     context.tool_name == "get_issue" ||
+     context.tool_name == "list_repos" ||
+     context.tool_name == "get_pull_request" ||
+     context.tool_name == "search_code" ||
+     context.tool_name == "get_file_contents")
+};
+// Denies every call_tool request to the "github" MCP server EXCEPT the
+// read-only tools permitted by "mcp-github-allow-read".
+//
+// In Cedar a matching forbid always overrides any matching permit, so a forbid
+// keyed on the server name alone would also block the read-only tools and make
+// the read permit ineffective. The read-only tool names are therefore carved
+// out of this forbid. A GitHub request that carries no tool_name is still
+// denied (fail closed).
+//
+// Keep this list in sync with the tool list in "mcp-github-allow-read".
+@id("mcp-github-deny-write")
+@name("Deny write GitHub tools")
+@description("Block create/update/delete operations on GitHub MCP server")
+@severity("high")
+@tags("mcp,github,write-block,least-privilege")
+forbid (
+    principal,
+    action == Guardrails::Action::"call_tool",
+    resource
+) when {
+    context has mcp_server && context.mcp_server == "github" &&
+    !(context has tool_name &&
+      (context.tool_name == "read_issues" ||
+       context.tool_name == "get_issue" ||
+       context.tool_name == "list_repos" ||
+       context.tool_name == "get_pull_request" ||
+       context.tool_name == "search_code" ||
+       context.tool_name == "get_file_contents"))
+};
+// -- Organization-wide MCP server exclusions --------------------------------
+// Denies call_tool for any MCP server named in the exclusion list below.
+// `mcp_server` is presence-checked first, so a tool call that carries no MCP
+// server name evaluates to "no match" instead of raising an evaluation error.
+@id("mcp-exclude-server")
+@name("Exclude specific MCP servers")
+@description("Block all tool calls from excluded MCP servers (org-wide exclusion list)")
+@severity("critical")
+@tags("mcp,exclusion,org-wide,block")
+forbid (
+    principal,
+    action == Guardrails::Action::"call_tool",
+    resource
+) when {
+    // Add server names to block across the organization.
+    // Modify this list to match your exclusion requirements.
+    context has mcp_server &&
+    (context.mcp_server == "untrusted-server" ||
+     context.mcp_server == "deprecated-server")
+};
+// -- Block unverified MCP servers -------------------------------------------
+// Denies call_tool when the context explicitly marks the MCP server as not
+// verified. `mcp_server_verified` is presence-checked first, so a tool call
+// without that attribute evaluates to "no match" instead of raising an
+// evaluation error.
+// NOTE(review): requests that omit mcp_server_verified are NOT blocked by this
+// rule — confirm the projection layer always sets it for MCP tool calls.
+@id("mcp-block-unverified")
+@name("Block unverified MCP servers")
+@description("Deny all tool calls from MCP servers not in the verified registry")
+@severity("high")
+@tags("mcp,trust,verification")
+forbid (
+    principal,
+    action == Guardrails::Action::"call_tool",
+    resource
+) when {
+    context has mcp_server_verified && context.mcp_server_verified == false
+};

package/_schemas/guardrails/templates/profiles/chat_assistant/privacy.cedar ADDED Viewed

@@ -0,0 +1,22 @@
+// =============================================================================
+// Chat Assistant — Privacy
+// =============================================================================
+// PII protection for customer-facing chatbots.
+// Blocks PII in both user inputs and assistant outputs.
+//
+// Category: privacy
+// Namespace: Guardrails
+// =============================================================================
+// Applies to every action and does not inspect direction: denies the request
+// whenever the context flags that PII was detected.
+@id("chat-pii-block-input-output")
+@name("Block PII in both directions")
+@description("Blocks PII in both user inputs and assistant outputs for chat applications")
+@severity("high")
+@tags("profile,chat-assistant,pii,privacy")
+forbid (principal, action, resource)
+when {
+    context has pii_detected &&
+    context.pii_detected == true
+};

package/_schemas/guardrails/templates/profiles/chat_assistant/security.cedar ADDED Viewed

@@ -0,0 +1,35 @@
+// =============================================================================
+// Chat Assistant — Security
+// =============================================================================
+// Aggressive injection and jailbreak defense for customer-facing chatbots.
+// Lower thresholds than defaults — public-facing chat is high-risk for attacks.
+//
+// Category: security
+// Namespace: Guardrails
+// =============================================================================
+// Applies to every action: denies the request when the injection score is
+// present and strictly greater than 70.
+@id("chat-injection-lower-threshold")
+@name("Aggressive injection defense for chat")
+@description("Lower injection threshold for public-facing chat — blocks above 70")
+@severity("high")
+@tags("profile,chat-assistant,injection,security")
+forbid (principal, action, resource)
+when {
+    context has injection_score &&
+    context.injection_score > 70
+};
+// Applies to every action: denies the request when the jailbreak score is
+// present and strictly greater than 65.
+@id("chat-jailbreak-lower-threshold")
+@name("Aggressive jailbreak defense for chat")
+@description("Lower jailbreak threshold for public-facing chat — blocks above 65")
+@severity("high")
+@tags("profile,chat-assistant,jailbreak,security")
+forbid (principal, action, resource)
+when {
+    context has jailbreak_score &&
+    context.jailbreak_score > 65
+};

package/_schemas/guardrails/templates/profiles/chat_assistant/trust_safety.cedar ADDED Viewed

@@ -0,0 +1,43 @@
+// =============================================================================
+// Chat Assistant — Trust & Safety
+// =============================================================================
+// Strict content moderation for customer-facing chatbots.
+// Lower toxicity thresholds and topic restrictions for public-facing content.
+//
+// Category: trust_safety
+// Namespace: Guardrails
+// =============================================================================
+// Applies to every action: denies the request when any of the violence,
+// hate speech, sexual, or profanity scores is present and strictly greater
+// than 70. The weapons and crime scores are not consulted by this rule.
+@id("chat-toxicity-strict")
+@name("Strict toxicity for chat")
+@description("Lower thresholds for customer-facing content — blocks toxicity above 70")
+@severity("critical")
+@tags("profile,chat-assistant,toxicity,trust-safety")
+forbid (principal, action, resource)
+when {
+    // One alternative per score; each guards its own attribute.
+    (context has violence_score    && context.violence_score    > 70) ||
+    (context has hate_speech_score && context.hate_speech_score > 70) ||
+    (context has sexual_score      && context.sexual_score      > 70) ||
+    (context has profanity_score   && context.profanity_score   > 70)
+};
+// Applies to every action: denies the request when the topic confidence is
+// strictly greater than 70 and the detected topic set shares at least one
+// element with the restricted topics listed below.
+@id("chat-topic-restriction")
+@name("Block restricted topics in chat")
+@description("Prevents chat assistants from discussing dangerous or regulated topics")
+@severity("high")
+@tags("profile,chat-assistant,semantic,compliance")
+forbid (principal, action, resource)
+when {
+    context has topic_confidence && context.topic_confidence > 70 &&
+    context has content_topics &&
+    context.content_topics.containsAny([
+        "weapons_manufacturing",
+        "illegal_activity",
+        "controlled_substances",
+        "financial_fraud"
+    ])
+};