npm - @highflame/policy - Versions diffs - 2.1.2 → 2.1.4 - Mend

@highflame/policy 2.1.2 → 2.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

package/_schemas/guardrails/context.json +37 -73
package/_schemas/overwatch/context.json +211 -1
package/_schemas/palisade/context.json +1 -1
package/_schemas/sentry/context.json +1165 -0
package/_schemas/sentry/schema.cedarschema +388 -0
package/_schemas/sentry/templates/defaults/baseline.cedar +24 -0
package/_schemas/sentry/templates/defaults/content_safety.cedar +232 -0
package/_schemas/sentry/templates/defaults/file_safety.cedar +174 -0
package/_schemas/sentry/templates/defaults/organization.cedar +207 -0
package/_schemas/sentry/templates/defaults/pii.cedar +229 -0
package/_schemas/sentry/templates/defaults/semantic.cedar +167 -0
package/_schemas/sentry/templates/templates.json +93 -0
package/dist/builder.d.ts +32 -0
package/dist/builder.js +6 -6
package/dist/condition-groups.d.ts +69 -0
package/dist/condition-groups.js +305 -0
package/dist/index.d.ts +6 -1
package/dist/index.js +6 -1
package/dist/overwatch-context.gen.d.ts +7 -0
package/dist/overwatch-context.gen.js +7 -0
package/dist/overwatch-defaults.gen.js +358 -370
package/dist/sentry-context.gen.d.ts +76 -0
package/dist/sentry-context.gen.js +77 -0
package/dist/sentry-defaults.gen.d.ts +61 -0
package/dist/sentry-defaults.gen.js +1235 -0
package/dist/sentry-entities.gen.d.ts +11 -0
package/dist/sentry-entities.gen.js +33 -0
package/dist/service-schemas.gen.d.ts +10 -0
package/dist/service-schemas.gen.js +659 -6
package/dist/types.d.ts +6 -1
package/dist/types.js +6 -1
package/package.json +1 -1

package/_schemas/sentry/schema.cedarschema ADDED Viewed

@@ -0,0 +1,388 @@
+// =============================================================================
+// Sentry Cedar Schema
+// =============================================================================
+// Browser Security — monitors AI chat interactions in the browser and enforces
+// data-protection, content-safety, and compliance policies at point of use.
+//
+// Sentry is a lightweight browser extension (JSA) that intercepts:
+//   - Messages sent to AI chat services (ChatGPT, Gemini, Claude, Copilot, etc.)
+//   - AI responses returned to the user
+//   - Cut/paste operations transferring content into AI chats
+//   - File/document uploads into AI chat services
+//
+// Architecture:
+//   User → Browser Extension → Shield Detection Engine → Cedar Policy → Allow/Block
+//
+// Threat Coverage:
+//   - Data Leakage: PII, PHI, credentials, source code, confidential documents
+//   - Content Safety: Violence, hate speech, sexual content, restricted topics
+//   - Prompt Injection: Direct and indirect injection via pasted/uploaded content
+//   - Document Sensitivity: MIP label enforcement, classification-aware blocking
+//   - Compliance: GDPR, HIPAA, PCI DSS, CCPA, EU AI Act
+//
+// Supported AI Services:
+//   - ChatGPT (chat.openai.com)
+//   - Google Gemini (gemini.google.com)
+//   - Claude (claude.ai)
+//   - GitHub Copilot Chat
+//   - Microsoft Copilot
+//   - Custom/enterprise AI chat endpoints
+namespace Sentry {
+// =============================================================================
+// ENTITIES - Tenant Hierarchy (ReBAC)
+// =============================================================================
+// Aligned with Guardrails/Overwatch entity hierarchy (Account -> Project).
+//
+// Entity hierarchy enables Cedar's `in` operator for policy scoping:
+//   Account (org root)
+//     └── Project in [Account]
+//           ├── ChatSession in [Project]
+//           └── Document in [Project]
+//
+// Policy scoping examples:
+//   resource in Sentry::Account::"<uuid>"    → org-wide
+//   resource in Sentry::Project::"<uuid>"    → project-wide
+//   resource == Sentry::ChatSession::"<id>"  → specific session
+/// Account represents an organization (top-level tenant).
+/// Declared without parent types or attributes, so it is the root of the
+/// Account -> Project hierarchy used for `in`-based policy scoping.
+entity Account;
+/// Project represents a project within an account.
+/// A Project may be a member of an Account (`in [Account]`), which is what lets
+/// `resource in Sentry::Account::"<uuid>"` reach resources scoped under the project.
+entity Project in [Account];
+// =============================================================================
+// ENTITIES - Principals
+// =============================================================================
+/// Human user interacting with AI chat in the browser.
+/// This is the only principal type accepted by the actions declared in this
+/// namespace; it is declared without attributes or parent types.
+entity User;
+// =============================================================================
+// ENTITIES - Resources (scoped under Project)
+// =============================================================================
+/// AI chat session — resource for the send_message, receive_response,
+/// paste_content, and upload_file actions. May be a member of a Project.
+entity ChatSession in [Project];
+/// Document or file being uploaded — resource for the upload_file action
+/// (upload_file also accepts ChatSession). May be a member of a Project.
+entity Document in [Project];
+// =============================================================================
+// ACTIONS
+// =============================================================================
+// User sends a message (prompt) to an AI chat service
+// Threat focus: data leakage (PII, secrets, confidential data), injection, content safety
+//
+// Applies to: principal User, resource ChatSession.
+// Attributes marked `?` are optional and may be absent from a request context;
+// policies should guard them with `context has <attr>` before reading them.
+// All other attributes are required.
+action send_message appliesTo {
+  principal: [User],
+  resource: [ChatSession],
+  context: {
+    // --- Core Metadata ---
+    content: String,                  // Raw message content being sent
+    source: String,                   // Browser extension identifier: "sentry"
+    event: String,                    // Event type: "send_message"
+    user_email: String,               // User identifier (SSO/OAuth verified)
+    target_app: String,               // AI service: "chatgpt", "gemini", "claude", "copilot", "custom"
+    target_url?: String,              // Full URL of the AI chat service
+    // --- Aggregated Threat Summary (from Shield NormalizeAggregation) ---
+    threat_count: Long,               // Total threats detected
+    highest_severity: String,         // "critical", "high", "medium", "low", "none"
+    threat_categories: Set<String>,   // Threat category names
+    detected_threats: Set<String>,    // Detection rule names that matched
+    max_threat_severity: Long,        // Numeric severity (0=none, 1=low, 2=medium, 3=high, 4=critical)
+    // --- Secrets Detection (from SecretsDetector) ---
+    contains_secrets: Bool,           // Whether secrets/credentials detected
+    secret_types?: Set<String>,       // Types: "aws_access_key", "github_token", "ssh_private_key", etc.
+    secret_count?: Long,              // Number of distinct secrets found
+    // --- PII Detection (from PIIRegexDetector, normalized) ---
+    pii_detected?: Bool,              // Whether any PII patterns matched
+    pii_types?: Set<String>,          // Types: "ssn", "credit_card", "email", "phone", etc.
+    pii_count?: Long,                 // Number of PII matches
+    pii_confidence?: Long,            // PII detection confidence (0-100)
+    // --- Content Safety Scores (from ToxicityDetector, 0-100) ---
+    // All six scores are required attributes.
+    violence_score: Long,
+    weapons_score: Long,
+    hate_speech_score: Long,
+    crime_score: Long,
+    sexual_score: Long,
+    profanity_score: Long,
+    // --- ML Detector Confidence Scores (0-100) ---
+    injection_score: Long,            // Prompt injection score (max of InjectionDetector + DeepContextDetector)
+    jailbreak_score: Long,            // Jailbreak detection score (max of JailbreakDetector + DeepContextDetector)
+    // --- Topic Classification (from TopicDetector) ---
+    content_topics?: Set<String>,     // Detected topics: "controlled_substances", "weapons_manufacturing", etc.
+    topic_confidence?: Long,          // Topic classifier confidence (0-100)
+    // --- Encoding & Unicode Attacks (from SecurityFiltersDetector, EncodedInjectionDetector) ---
+    contains_invisible_chars?: Bool,  // Zero-width chars, bidi overrides, tag chars
+    invisible_chars_score?: Long,     // Unicode attack severity (0-100)
+    encoded_content_detected?: Bool,  // Base64, hex, unicode, URL encoded content
+    encoded_types?: Set<String>,      // Encoding types detected
+    encoded_count?: Long,             // Number of encoded segments
+    encoded_score?: Long,             // Encoded injection severity (0-100)
+    // --- Code Detection (from CodeDetector) ---
+    contains_code?: Bool,             // Whether content contains source code
+    code_languages?: Set<String>,     // Detected languages: "python", "javascript", etc.
+    code_ratio?: Long,                // Percentage of content that is code (0-100)
+    // --- Language Detection (from LanguageDetector, ScriptDetector) ---
+    detected_language?: String,       // ISO language code
+    is_english?: Bool,
+    language_confidence?: Long,       // 0-100
+    detected_script?: String,         // "latin", "cyrillic", "arabic", "unknown"
+    is_latin_script?: Bool,
+    script_confidence?: Long,         // 0-100
+    // --- Keyword Detection (from KeywordDetector) ---
+    keyword_matched?: Bool,           // Whether any keywords matched
+    keyword_categories?: Set<String>, // Matched keyword categories
+    keyword_count?: Long,             // Number of keyword matches
+    // --- Phishing Detection (from CheckPhishDetector) ---
+    phishing_detected?: Bool,         // Whether phishing URLs detected in content
+    // --- Session Detection History (cross-turn sticky flags) ---
+    // All session_* attributes are optional.
+    session_pii_detected?: Bool,
+    session_pii_types?: Set<String>,
+    session_secrets_detected?: Bool,
+    session_secret_types?: Set<String>,
+    session_injection_detected?: Bool,
+    session_threat_turns?: Long,
+  },
+};
+// AI service responds to the user
+// Threat focus: harmful content in responses, hallucination, data leakage in output
+//
+// Applies to: principal User, resource ChatSession.
+// Attributes marked `?` are optional; guard them with `context has <attr>` in policies.
+// Unlike send_message, this context declares no encoding/Unicode, language, keyword,
+// or topic attributes, and it adds hallucination_score / factuality_score.
+// Per-field comments repeat those on the same-named send_message attributes.
+action receive_response appliesTo {
+  principal: [User],
+  resource: [ChatSession],
+  context: {
+    // --- Core Metadata ---
+    content: String,                  // AI response content
+    source: String,                   // Browser extension identifier: "sentry"
+    event: String,                    // "receive_response"
+    user_email: String,               // User identifier (SSO/OAuth verified)
+    target_app: String,               // AI service: "chatgpt", "gemini", "claude", "copilot", "custom"
+    target_url?: String,              // Full URL of the AI chat service
+    // --- Aggregated Threat Summary ---
+    threat_count: Long,               // Total threats detected
+    highest_severity: String,         // "critical", "high", "medium", "low", "none"
+    threat_categories: Set<String>,   // Threat category names
+    detected_threats: Set<String>,    // Detection rule names that matched
+    max_threat_severity: Long,        // Numeric severity (0=none, 1=low, 2=medium, 3=high, 4=critical)
+    // --- Secrets Detection ---
+    contains_secrets: Bool,           // Whether secrets/credentials detected
+    secret_types?: Set<String>,       // Types: "aws_access_key", "github_token", "ssh_private_key", etc.
+    secret_count?: Long,              // Number of distinct secrets found
+    // --- PII Detection ---
+    pii_detected?: Bool,              // Whether any PII patterns matched
+    pii_types?: Set<String>,          // Types: "ssn", "credit_card", "email", "phone", etc.
+    pii_count?: Long,                 // Number of PII matches
+    pii_confidence?: Long,            // PII detection confidence (0-100)
+    // --- Content Safety Scores (0-100) ---
+    // All six scores are required attributes.
+    violence_score: Long,
+    weapons_score: Long,
+    hate_speech_score: Long,
+    crime_score: Long,
+    sexual_score: Long,
+    profanity_score: Long,
+    // --- ML Detector Scores (0-100) ---
+    injection_score: Long,            // Indirect injection in response content
+    jailbreak_score: Long,
+    // --- Hallucination Detection (from HallucinationDetector) ---
+    hallucination_score?: Long,       // Hallucination confidence (0-100)
+    factuality_score?: Long,          // Factuality score (0-100)
+    // --- Code in Response ---
+    contains_code?: Bool,             // Whether content contains source code
+    code_languages?: Set<String>,     // Detected languages: "python", "javascript", etc.
+    code_ratio?: Long,                // Percentage of content that is code (0-100)
+    // --- Phishing ---
+    phishing_detected?: Bool,         // Whether phishing URLs detected in content
+    // --- Session History (cross-turn sticky flags) ---
+    session_pii_detected?: Bool,
+    session_pii_types?: Set<String>,
+    session_secrets_detected?: Bool,
+    session_secret_types?: Set<String>,
+    session_injection_detected?: Bool,
+    session_threat_turns?: Long,
+  },
+};
+// User pastes content into an AI chat (clipboard, cross-tab, cross-app)
+// Threat focus: data leakage via cut/paste, injection payloads in pasted content
+//
+// Applies to: principal User, resource ChatSession.
+// Attributes marked `?` are optional; guard them with `context has <attr>` in policies.
+// Adds the paste_* attributes; unlike send_message, this context declares no
+// phishing, language, or topic attributes.
+// Per-field comments repeat those on the same-named send_message attributes.
+action paste_content appliesTo {
+  principal: [User],
+  resource: [ChatSession],
+  context: {
+    // --- Core Metadata ---
+    content: String,                  // Pasted content
+    source: String,                   // Browser extension identifier: "sentry"
+    event: String,                    // "paste_content"
+    user_email: String,               // User identifier (SSO/OAuth verified)
+    target_app: String,               // AI service: "chatgpt", "gemini", "claude", "copilot", "custom"
+    target_url?: String,              // Full URL of the AI chat service
+    // --- Paste Context ---
+    paste_source_app?: String,        // Source application (e.g., "outlook", "excel", "vscode", "terminal")
+    paste_source_url?: String,        // Source URL if from another browser tab
+    paste_length?: Long,              // Character length of pasted content
+    // --- Aggregated Threat Summary ---
+    threat_count: Long,               // Total threats detected
+    highest_severity: String,         // "critical", "high", "medium", "low", "none"
+    threat_categories: Set<String>,   // Threat category names
+    detected_threats: Set<String>,    // Detection rule names that matched
+    max_threat_severity: Long,        // Numeric severity (0=none, 1=low, 2=medium, 3=high, 4=critical)
+    // --- Secrets Detection ---
+    contains_secrets: Bool,           // Whether secrets/credentials detected
+    secret_types?: Set<String>,       // Types: "aws_access_key", "github_token", "ssh_private_key", etc.
+    secret_count?: Long,              // Number of distinct secrets found
+    // --- PII Detection ---
+    pii_detected?: Bool,              // Whether any PII patterns matched
+    pii_types?: Set<String>,          // Types: "ssn", "credit_card", "email", "phone", etc.
+    pii_count?: Long,                 // Number of PII matches
+    pii_confidence?: Long,            // PII detection confidence (0-100)
+    // --- Content Safety Scores (0-100) ---
+    // All six scores are required attributes.
+    violence_score: Long,
+    weapons_score: Long,
+    hate_speech_score: Long,
+    crime_score: Long,
+    sexual_score: Long,
+    profanity_score: Long,
+    // --- ML Detector Scores (0-100) ---
+    injection_score: Long,
+    jailbreak_score: Long,
+    // --- Code Detection ---
+    contains_code?: Bool,             // Whether content contains source code
+    code_languages?: Set<String>,     // Detected languages: "python", "javascript", etc.
+    code_ratio?: Long,                // Percentage of content that is code (0-100)
+    // --- Encoding Attacks ---
+    contains_invisible_chars?: Bool,  // Zero-width chars, bidi overrides, tag chars
+    invisible_chars_score?: Long,     // Unicode attack severity (0-100)
+    encoded_content_detected?: Bool,  // Base64, hex, unicode, URL encoded content
+    encoded_types?: Set<String>,      // Encoding types detected
+    encoded_count?: Long,             // Number of encoded segments
+    encoded_score?: Long,             // Encoded injection severity (0-100)
+    // --- Keyword Detection ---
+    keyword_matched?: Bool,           // Whether any keywords matched
+    keyword_categories?: Set<String>, // Matched keyword categories
+    keyword_count?: Long,             // Number of keyword matches
+    // --- Session History (cross-turn sticky flags) ---
+    session_pii_detected?: Bool,
+    session_pii_types?: Set<String>,
+    session_secrets_detected?: Bool,
+    session_secret_types?: Set<String>,
+    session_injection_detected?: Bool,
+    session_threat_turns?: Long,
+  },
+};
+// User uploads a file or document into an AI chat
+// Threat focus: document sensitivity (MIP labels), PII/secrets in files, malware
+//
+// Applies to: principal User, resource Document or ChatSession.
+// Attributes marked `?` are optional; guard them with `context has <attr>` in policies.
+// Adds file metadata and MIP sensitivity attributes; unlike send_message, this
+// context declares no keyword, language, or topic attributes.
+// Per-field comments repeat those on the same-named send_message attributes.
+action upload_file appliesTo {
+  principal: [User],
+  resource: [Document, ChatSession],
+  context: {
+    // --- Core Metadata ---
+    content: String,                  // Extracted file text content (for scanning)
+    source: String,                   // Browser extension identifier: "sentry"
+    event: String,                    // "upload_file"
+    user_email: String,               // User identifier (SSO/OAuth verified)
+    target_app: String,               // AI service: "chatgpt", "gemini", "claude", "copilot", "custom"
+    target_url?: String,              // Full URL of the AI chat service
+    // --- File Metadata ---
+    file_name?: String,               // Original file name
+    file_type?: String,               // MIME type: "application/pdf", "text/csv", etc.
+    file_size_bytes?: Long,           // File size in bytes
+    file_extension?: String,          // Extension: "pdf", "docx", "xlsx", "csv", "txt"
+    // --- Document Sensitivity (MIP Labels) ---
+    mip_label_id?: String,            // Microsoft Information Protection label ID
+    mip_label_name?: String,          // Label display name: "Public", "Internal", "Confidential", "Highly Confidential"
+    sensitivity_level?: String,       // Normalized: "public", "internal", "confidential", "restricted"
+    is_encrypted?: Bool,              // Whether file is encrypted (MIP protection)
+    is_rights_managed?: Bool,         // Whether file has rights management restrictions
+    // --- Aggregated Threat Summary ---
+    threat_count: Long,               // Total threats detected
+    highest_severity: String,         // "critical", "high", "medium", "low", "none"
+    threat_categories: Set<String>,   // Threat category names
+    detected_threats: Set<String>,    // Detection rule names that matched
+    max_threat_severity: Long,        // Numeric severity (0=none, 1=low, 2=medium, 3=high, 4=critical)
+    // --- Secrets Detection ---
+    contains_secrets: Bool,           // Whether secrets/credentials detected
+    secret_types?: Set<String>,       // Types: "aws_access_key", "github_token", "ssh_private_key", etc.
+    secret_count?: Long,              // Number of distinct secrets found
+    // --- PII Detection ---
+    pii_detected?: Bool,              // Whether any PII patterns matched
+    pii_types?: Set<String>,          // Types: "ssn", "credit_card", "email", "phone", etc.
+    pii_count?: Long,                 // Number of PII matches
+    pii_confidence?: Long,            // PII detection confidence (0-100)
+    // --- Content Safety Scores (0-100) ---
+    // All six scores are required attributes.
+    violence_score: Long,
+    weapons_score: Long,
+    hate_speech_score: Long,
+    crime_score: Long,
+    sexual_score: Long,
+    profanity_score: Long,
+    // --- ML Detector Scores (0-100) ---
+    injection_score: Long,            // Prompt injection payloads hidden in documents
+    jailbreak_score: Long,
+    // --- Code Detection ---
+    contains_code?: Bool,             // Whether content contains source code
+    code_languages?: Set<String>,     // Detected languages: "python", "javascript", etc.
+    code_ratio?: Long,                // Percentage of content that is code (0-100)
+    // --- Phishing ---
+    phishing_detected?: Bool,         // Whether phishing URLs detected in content
+    // --- Encoding Attacks ---
+    contains_invisible_chars?: Bool,  // Zero-width chars, bidi overrides, tag chars
+    invisible_chars_score?: Long,     // Unicode attack severity (0-100)
+    encoded_content_detected?: Bool,  // Base64, hex, unicode, URL encoded content
+    encoded_types?: Set<String>,      // Encoding types detected
+    encoded_count?: Long,             // Number of encoded segments
+    encoded_score?: Long,             // Encoded injection severity (0-100)
+    // --- Session History (cross-turn sticky flags) ---
+    session_pii_detected?: Bool,
+    session_pii_types?: Set<String>,
+    session_secrets_detected?: Bool,
+    session_secret_types?: Set<String>,
+    session_injection_detected?: Bool,
+    session_threat_turns?: Long,
+  },
+};
+}

package/_schemas/sentry/templates/defaults/baseline.cedar ADDED Viewed

@@ -0,0 +1,24 @@
+// =============================================================================
+// Baseline Permit Policy (Default)
+// =============================================================================
+// Permits all actions by default. Threat-specific forbid policies override
+// this to block when detection engines identify issues.
+//
+// Cedar is default-deny: without at least one permit rule, every request
+// is denied regardless of forbid rules. This baseline ensures the system
+// is "allow unless blocked" rather than "block everything".
+//
+// Category: organization
+// Namespace: Sentry
+// =============================================================================
+// Catch-all permit: the scope constrains neither principal, action, nor resource,
+// and there is no `when`/`unless` clause, so this policy matches every request.
+// Any forbid policy that also matches still takes precedence over this permit.
+@id("sentry-baseline-permit-all")
+@name("Permit all actions by default")
+@description("Baseline permit for all actions — threat-specific forbid policies override this when threats are detected")
+@severity("low")
+@tags("baseline,permit-default,organization")
+permit (principal, action, resource);

package/_schemas/sentry/templates/defaults/content_safety.cedar ADDED Viewed

@@ -0,0 +1,232 @@
+// =============================================================================
+// Content Safety Policy (Default)
+// =============================================================================
+// Detects and blocks violent, harmful, hateful, sexual, and profane content
+// in AI chat interactions. Includes cut-and-paste safety rules to prevent
+// unsafe content from being transferred into AI services.
+//
+// The detection engine runs ML classifiers (toxicity, content safety) and
+// produces normalized scores (0-100) for each category.
+//
+// Compliance:
+//   NIST 800-53 SI-4 (Information System Monitoring)
+//   EU AI Act Art. 52 (Transparency for AI Systems)
+//   OWASP LLM02 (Insecure Output Handling)
+//   ISO 42001 (AI Management System)
+//
+// Category: content_safety
+// Namespace: Sentry
+// =============================================================================
+// ---------------------------------------------------------------------------
+// Section 1: Violence & Weapons
+// Blocks content promoting, describing, or instructing violence and weapons.
+// Applies to messages, paste, and file uploads.
+// ---------------------------------------------------------------------------
+// Violence: forbid on every user-to-AI input channel (message, paste, upload)
+// once violence_score reaches 80 on the 0-100 scale (the comparison is inclusive).
+@id("sentry-cs-block-violence")
+@name("Block violent content")
+@description("Block content when the ML violence detection score exceeds threshold (80/100). Catches graphic violence descriptions, instructions for causing harm, and violent threat language in messages, pastes, and uploads.")
+@severity("critical")
+@tags("violence,content-safety,trust-safety,nist-si-4,iso-42001")
+@reject_message("Content blocked: violent content detected. AI services must not process violent content in enterprise environments. Please rephrase without violence-related language.")
+forbid (
+    principal,
+    action in [
+        Sentry::Action::"send_message",
+        Sentry::Action::"paste_content",
+        Sentry::Action::"upload_file"
+    ],
+    resource
+)
+when {
+    // Presence guard first, so the score is only compared when it exists.
+    context has violence_score &&
+    context.violence_score >= 80
+};
+// Weapons: forbid messages, pastes, and uploads whose weapons_score is 80 or
+// higher on the 0-100 scale (inclusive comparison).
+@id("sentry-cs-block-weapons")
+@name("Block weapons content")
+@description("Block content when the ML weapons detection score exceeds threshold (80/100). Catches weapon manufacturing instructions, procurement guidance, and detailed specifications.")
+@severity("critical")
+@tags("weapons,content-safety,trust-safety,nist-si-4")
+@reject_message("Content blocked: weapons-related content detected. AI services must not process weapons manufacturing, procurement, or specification content.")
+forbid (
+    principal,
+    action in [
+        Sentry::Action::"send_message",
+        Sentry::Action::"paste_content",
+        Sentry::Action::"upload_file"
+    ],
+    resource
+)
+when {
+    // Presence guard first, so the score is only compared when it exists.
+    context has weapons_score &&
+    context.weapons_score >= 80
+};
+// ---------------------------------------------------------------------------
+// Section 2: Hate Speech & Discrimination
+// Lower threshold (75) — enterprises have zero tolerance for hate speech.
+// ---------------------------------------------------------------------------
+// Hate speech: forbid messages, pastes, and uploads whose hate_speech_score is
+// 75 or higher (inclusive) — a deliberately lower bar than the 80 used for
+// the violence, weapons, crime, and sexual-content policies.
+@id("sentry-cs-block-hate-speech")
+@name("Block hate speech")
+@description("Block content when the ML hate speech score exceeds threshold (75/100). Lower threshold than other categories because enterprises have zero tolerance for discriminatory content. Catches slurs, dehumanizing language, and targeted harassment.")
+@severity("critical")
+@tags("hate-speech,content-safety,trust-safety,nist-si-4,eu-ai-act")
+@reject_message("Content blocked: hate speech or discriminatory content detected. AI services must not process hateful, discriminatory, or dehumanizing content.")
+forbid (
+    principal,
+    action in [
+        Sentry::Action::"send_message",
+        Sentry::Action::"paste_content",
+        Sentry::Action::"upload_file"
+    ],
+    resource
+)
+when {
+    // Presence guard first, so the score is only compared when it exists.
+    context has hate_speech_score &&
+    context.hate_speech_score >= 75
+};
+// ---------------------------------------------------------------------------
+// Section 3: Criminal Content
+// ---------------------------------------------------------------------------
+// Crime: forbid messages, pastes, and uploads whose crime_score is 80 or
+// higher on the 0-100 scale (inclusive comparison).
+@id("sentry-cs-block-crime")
+@name("Block criminal content")
+@description("Block content when the ML criminal activity detection score exceeds threshold (80/100). Catches illegal activity instructions, fraud techniques, and criminal behavior content.")
+@severity("high")
+@tags("crime,content-safety,trust-safety,nist-si-4")
+@reject_message("Content blocked: criminal activity content detected. AI services must not process content related to illegal activities or fraud.")
+forbid (
+    principal,
+    action in [
+        Sentry::Action::"send_message",
+        Sentry::Action::"paste_content",
+        Sentry::Action::"upload_file"
+    ],
+    resource
+)
+when {
+    // Presence guard first, so the score is only compared when it exists.
+    context has crime_score &&
+    context.crime_score >= 80
+};
+// ---------------------------------------------------------------------------
+// Section 4: Sexual Content
+// ---------------------------------------------------------------------------
+// Sexual content: forbid messages, pastes, and uploads whose sexual_score is
+// 80 or higher on the 0-100 scale (inclusive comparison).
+@id("sentry-cs-block-sexual")
+@name("Block sexual content")
+@description("Block content when the ML sexual content score exceeds threshold (80/100). Ensures AI services do not process sexually explicit material in enterprise environments.")
+@severity("high")
+@tags("sexual,content-safety,trust-safety,eu-ai-act,iso-42001")
+@reject_message("Content blocked: sexual content detected. AI services must not process sexually explicit material in enterprise environments.")
+forbid (
+    principal,
+    action in [
+        Sentry::Action::"send_message",
+        Sentry::Action::"paste_content",
+        Sentry::Action::"upload_file"
+    ],
+    resource
+)
+when {
+    // Presence guard first, so the score is only compared when it exists.
+    context has sexual_score &&
+    context.sexual_score >= 80
+};
+// ---------------------------------------------------------------------------
+// Section 5: Profanity
+// Higher threshold (90) — allows normal expression while blocking abuse.
+// ---------------------------------------------------------------------------
+// Profanity: forbid messages and pastes whose profanity_score is 90 or higher
+// (inclusive). Note the scope: upload_file is NOT covered by this policy.
+@id("sentry-cs-block-profanity")
+@name("Block profanity")
+@description("Block content when the ML profanity detection score exceeds threshold (90/100). Higher threshold allows normal expression while blocking abusive or harassing language patterns.")
+@severity("medium")
+@tags("profanity,content-safety,trust-safety")
+@reject_message("Content blocked: excessive profanity detected. Please rephrase in a professional manner.")
+forbid (
+    principal,
+    action in [
+        Sentry::Action::"send_message",
+        Sentry::Action::"paste_content"
+    ],
+    resource
+)
+when {
+    // Presence guard first, so the score is only compared when it exists.
+    context has profanity_score &&
+    context.profanity_score >= 90
+};
+// ---------------------------------------------------------------------------
+// Section 6: Cut & Paste Safety
+// Specific rules for content pasted from external sources into AI chats.
+// Paste operations are a primary vector for data leakage.
+// ---------------------------------------------------------------------------
+// Large paste + threats: forbid a paste longer than 5000 characters (strictly
+// greater than) when at least one threat was detected in it.
+@id("sentry-cs-block-large-paste-threats")
+@name("Block large pastes with threats")
+@description("Block large paste operations (>5000 chars) when any threats are detected. Large pastes with threats likely indicate bulk data dumps from emails, documents, or databases being leaked to AI services.")
+@severity("high")
+@tags("paste-safety,data-leakage,content-safety")
+@reject_message("Large paste operation blocked: security threats were detected in the pasted content. Large data transfers to AI services require threat-free content.")
+forbid (principal, action == Sentry::Action::"paste_content", resource)
+when {
+    // paste_length is optional in the paste_content context, so it must be guarded.
+    context has paste_length &&
+    context.paste_length > 5000 &&
+    context has threat_count &&
+    context.threat_count >= 1
+};
+// Encoded payloads: forbid a paste when encoded content was flagged AND its
+// encoded_score is 60 or higher (inclusive). Both attributes are optional in the
+// paste_content context, hence the two `has` guards.
+@id("sentry-cs-block-paste-encoded")
+@name("Block encoded paste content")
+@description("Block paste operations when encoded injection payloads (base64, hex, unicode) are detected. Attackers use encoding to smuggle injection payloads via clipboard transfer.")
+@severity("high")
+@tags("paste-safety,encoding,injection,content-safety")
+@reject_message("Paste blocked: encoded injection payloads detected in pasted content. Content with hidden encoded instructions cannot be shared with AI services.")
+forbid (principal, action == Sentry::Action::"paste_content", resource)
+when {
+    context has encoded_content_detected &&
+    context.encoded_content_detected &&
+    context has encoded_score &&
+    context.encoded_score >= 60
+};
+// Invisible characters: forbid a paste when invisible Unicode characters were
+// flagged AND invisible_chars_score is 50 or higher (inclusive). Both attributes
+// are optional in the paste_content context, hence the two `has` guards.
+@id("sentry-cs-block-paste-invisible")
+@name("Block paste with invisible characters")
+@description("Block paste operations containing invisible Unicode characters (zero-width, bidi overrides). These can hide malicious instructions that appear invisible to users but are processed by AI models.")
+@severity("high")
+@tags("paste-safety,unicode,invisible-chars,content-safety")
+@reject_message("Paste blocked: invisible Unicode characters detected. Hidden characters can disguise malicious instructions that AI models process but users cannot see.")
+forbid (principal, action == Sentry::Action::"paste_content", resource)
+when {
+    context has contains_invisible_chars &&
+    context.contains_invisible_chars &&
+    context has invisible_chars_score &&
+    context.invisible_chars_score >= 50
+};
+// ---------------------------------------------------------------------------
+// Section 7: AI Response Safety
+// Block harmful content in AI responses before user sees it.
+// ---------------------------------------------------------------------------
+// Block violent or criminal AI responses.
+// Previously only violence_score was checked although the description also
+// promised criminal content; crime_score is now enforced with the same 80/100
+// inclusive threshold used by the input-side crime policy.
+// Hate speech in responses is enforced by the separate
+// "sentry-cs-block-response-hate" policy (threshold 75), so it is not repeated here.
+@id("sentry-cs-block-response-safety")
+@name("Block harmful AI responses")
+@description("Block AI responses containing high-severity violent or criminal content (score of 80/100 or higher). Prevents harmful AI-generated content from reaching users in enterprise environments.")
+@severity("critical")
+@tags("response-safety,content-safety,owasp-llm02")
+@reject_message("AI response blocked: harmful content detected in the response. The AI service generated content that violates enterprise content safety policies.")
+forbid (
+    principal,
+    action == Sentry::Action::"receive_response",
+    resource
+)
+when {
+    // Either category alone is sufficient to block the response.
+    (context has violence_score && context.violence_score >= 80) ||
+    (context has crime_score && context.crime_score >= 80)
+};
+// Hate speech in AI responses: forbid receive_response when hate_speech_score is
+// 75 or higher (inclusive) — the same threshold as the input-side hate speech policy.
+@id("sentry-cs-block-response-hate")
+@name("Block hateful AI responses")
+@description("Block AI responses with hate speech or discriminatory content.")
+@severity("critical")
+@tags("response-safety,hate-speech,content-safety,owasp-llm02")
+@reject_message("AI response blocked: hate speech or discriminatory content detected in the response.")
+forbid (principal, action == Sentry::Action::"receive_response", resource)
+when {
+    // Presence guard first, so the score is only compared when it exists.
+    context has hate_speech_score &&
+    context.hate_speech_score >= 75
+};