@highflame/policy 2.1.36 → 2.1.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. package/_schemas/ai_gateway/context.json +431 -11
  2. package/_schemas/ai_gateway/schema.cedarschema +91 -11
  3. package/_schemas/ai_gateway/templates/defaults/agent_security.cedar +66 -43
  4. package/_schemas/ai_gateway/templates/defaults/baseline.cedar +9 -11
  5. package/_schemas/ai_gateway/templates/defaults/semantic.cedar +63 -40
  6. package/_schemas/ai_gateway/templates/defaults/tools.cedar +48 -36
  7. package/_schemas/ai_gateway/templates/llm_default_allow.cedar +9 -10
  8. package/_schemas/ai_gateway/templates/mcp_server_allowlist.cedar +22 -14
  9. package/_schemas/ai_gateway/templates/mcp_tool_permissions.cedar +29 -27
  10. package/_schemas/ai_gateway/templates/pii_redaction.cedar +38 -33
  11. package/_schemas/ai_gateway/templates/templates.json +42 -47
  12. package/_schemas/guardrails/context.json +12 -12
  13. package/_schemas/guardrails/schema.cedarschema +12 -12
  14. package/_schemas/guardrails/templates/defaults/agent_identity.cedar +60 -56
  15. package/_schemas/guardrails/templates/defaults/agentic_safety.cedar +83 -58
  16. package/_schemas/guardrails/templates/defaults/baseline.cedar +9 -12
  17. package/_schemas/guardrails/templates/defaults/injection.cedar +48 -36
  18. package/_schemas/guardrails/templates/defaults/pii.cedar +27 -20
  19. package/_schemas/guardrails/templates/defaults/secrets.cedar +39 -22
  20. package/_schemas/guardrails/templates/defaults/security_patterns.cedar +38 -25
  21. package/_schemas/guardrails/templates/defaults/semantic.cedar +47 -31
  22. package/_schemas/guardrails/templates/defaults/tool_risk.cedar +34 -26
  23. package/_schemas/guardrails/templates/defaults/toxicity.cedar +57 -47
  24. package/_schemas/guardrails/templates/mcp_tool_permissions.cedar +60 -43
  25. package/_schemas/guardrails/templates/profiles/a2a_security/cross_origin.cedar +29 -42
  26. package/_schemas/guardrails/templates/profiles/a2a_security/escalation_detection.cedar +43 -57
  27. package/_schemas/guardrails/templates/profiles/a2a_security/identity_enforcement.cedar +40 -57
  28. package/_schemas/guardrails/templates/profiles/a2a_security/inter_agent_injection.cedar +48 -62
  29. package/_schemas/guardrails/templates/profiles/a2a_security/supply_chain.cedar +40 -56
  30. package/_schemas/guardrails/templates/profiles/advanced_detection/pii.cedar +24 -34
  31. package/_schemas/guardrails/templates/profiles/advanced_detection/secrets.cedar +45 -37
  32. package/_schemas/guardrails/templates/profiles/advanced_detection/threat_severity.cedar +11 -16
  33. package/_schemas/guardrails/templates/profiles/chat_assistant/privacy.cedar +22 -9
  34. package/_schemas/guardrails/templates/profiles/chat_assistant/security.cedar +27 -15
  35. package/_schemas/guardrails/templates/profiles/chat_assistant/trust_safety.cedar +37 -22
  36. package/_schemas/guardrails/templates/profiles/code_agent/agentic_security.cedar +68 -47
  37. package/_schemas/guardrails/templates/profiles/code_agent/encoding.cedar +17 -21
  38. package/_schemas/guardrails/templates/profiles/code_agent/path_security.cedar +74 -73
  39. package/_schemas/guardrails/templates/profiles/code_agent/security.cedar +13 -9
  40. package/_schemas/guardrails/templates/profiles/code_agent/supply_chain.cedar +36 -58
  41. package/_schemas/guardrails/templates/profiles/data_pipeline/agentic_security.cedar +22 -15
  42. package/_schemas/guardrails/templates/profiles/data_pipeline/data_protection.cedar +52 -0
  43. package/_schemas/guardrails/templates/profiles/data_pipeline/privacy.cedar +41 -18
  44. package/_schemas/guardrails/templates/profiles/data_pipeline/security.cedar +18 -36
  45. package/_schemas/guardrails/templates/profiles/multi_agent/agent_safety.cedar +86 -79
  46. package/_schemas/guardrails/templates/profiles/multi_agent/agent_trust.cedar +73 -70
  47. package/_schemas/guardrails/templates/templates.json +188 -210
  48. package/_schemas/overwatch/context.json +14 -14
  49. package/_schemas/overwatch/schema.cedarschema +12 -12
  50. package/_schemas/sentry/context.json +11 -11
  51. package/_schemas/sentry/schema.cedarschema +11 -11
  52. package/_schemas/sentry/templates/defaults/baseline.cedar +8 -12
  53. package/_schemas/sentry/templates/defaults/clipboard.cedar +43 -42
  54. package/_schemas/sentry/templates/defaults/content_safety.cedar +38 -68
  55. package/_schemas/sentry/templates/defaults/file_safety.cedar +18 -26
  56. package/_schemas/sentry/templates/defaults/organization.cedar +10 -17
  57. package/_schemas/sentry/templates/defaults/pii.cedar +52 -73
  58. package/_schemas/sentry/templates/defaults/secrets.cedar +65 -58
  59. package/_schemas/sentry/templates/defaults/semantic.cedar +40 -59
  60. package/_schemas/sentry/templates/templates.json +46 -46
  61. package/dist/ai_gateway-context.gen.d.ts +18 -4
  62. package/dist/ai_gateway-context.gen.js +18 -4
  63. package/dist/ai_gateway-defaults.gen.d.ts +1 -1
  64. package/dist/ai_gateway-defaults.gen.js +377 -313
  65. package/dist/guardrails-context.gen.d.ts +5 -5
  66. package/dist/guardrails-context.gen.js +5 -5
  67. package/dist/guardrails-defaults.gen.d.ts +1 -1
  68. package/dist/guardrails-defaults.gen.js +2070 -1849
  69. package/dist/overwatch-context.gen.d.ts +5 -5
  70. package/dist/overwatch-context.gen.js +5 -5
  71. package/dist/overwatch-defaults.gen.d.ts +1 -1
  72. package/dist/overwatch-defaults.gen.js +547 -573
  73. package/dist/sentry-context.gen.d.ts +3 -3
  74. package/dist/sentry-context.gen.js +3 -3
  75. package/dist/sentry-defaults.gen.d.ts +1 -1
  76. package/dist/sentry-defaults.gen.js +379 -460
  77. package/dist/service-schemas.gen.d.ts +4 -4
  78. package/dist/service-schemas.gen.js +249 -99
  79. package/package.json +1 -1
@@ -1,76 +1,86 @@
1
1
  // =============================================================================
2
- // Toxicity & Content Moderation Policy
2
+ // Toxicity & Content Moderation (Default)
3
3
  // =============================================================================
4
- // Blocks toxic, harmful, or inappropriate content.
5
- // Covers violence, hate speech, sexual content, weapons, crime, and profanity.
4
+ // Blocks toxic, harmful, or inappropriate content across the six standard
5
+ // toxicity dimensions. Three rules: a strict critical-threshold catch-all,
6
+ // a hate-speech-specific lower threshold, and an "any two moderate signals"
7
+ // combined-toxicity rule.
6
8
  //
7
- // Context keys used (normalized by projection layer):
8
- // - violence_score: Long (0-100) - Violence reference score
9
- // - hate_speech_score: Long (0-100) - Hate speech score
10
- // - sexual_score: Long (0-100) - Sexual content score
11
- // - weapons_score: Long (0-100) - Weapons reference score
12
- // - crime_score: Long (0-100) - Criminal activity score
13
- // - profanity_score: Long (0-100) - Profanity score
9
+ // Context keys consumed:
10
+ // - violence_score: Long (0-100)
11
+ // - hate_speech_score: Long (0-100)
12
+ // - sexual_score: Long (0-100)
13
+ // - weapons_score: Long (0-100)
14
+ // - crime_score: Long (0-100)
15
+ // - profanity_score: Long (0-100)
14
16
  //
15
- // Category: trust-safety
17
+ // Compliance:
18
+ // - EU AI Act, ISO 42001, NIST SI-4
19
+ //
20
+ // Category: trust-safety
16
21
  // Namespace: Guardrails
17
22
  // =============================================================================
18
23
 
19
- @id("toxicity-block-critical")
24
+ @id("trust-safety.block-toxicity-critical")
20
25
  @name("Block critical toxicity")
21
- @description("Forbids content with any toxicity score above 90")
26
+ @description("Blocks process_prompt when any toxicity score is >= 91.")
22
27
  @severity("critical")
23
- @tags("toxicity,trust-safety")
28
+ @tags("category:trust-safety,threat:harmful,detection:ml,compliance:eu-ai-act,compliance:iso-42001")
29
+ @reject_message("Prompt blocked: critical-severity toxic content detected.")
24
30
  forbid (
25
31
  principal,
26
32
  action == Guardrails::Action::"process_prompt",
27
33
  resource
28
- ) when {
29
- (context has violence_score && context.violence_score > 90) ||
30
- (context has hate_speech_score && context.hate_speech_score > 90) ||
31
- (context has sexual_score && context.sexual_score > 90) ||
32
- (context has weapons_score && context.weapons_score > 90) ||
33
- (context has crime_score && context.crime_score > 90) ||
34
- (context has profanity_score && context.profanity_score > 90)
34
+ )
35
+ when {
36
+ (context has violence_score && context.violence_score >= 91) ||
37
+ (context has hate_speech_score && context.hate_speech_score >= 91) ||
38
+ (context has sexual_score && context.sexual_score >= 91) ||
39
+ (context has weapons_score && context.weapons_score >= 91) ||
40
+ (context has crime_score && context.crime_score >= 91) ||
41
+ (context has profanity_score && context.profanity_score >= 91)
35
42
  };
36
43
 
37
- @id("toxicity-block-hate-speech")
44
+ @id("trust-safety.block-hate-speech")
38
45
  @name("Block hate speech")
39
- @description("Forbids content with hate speech above 75")
46
+ @description("Blocks process_prompt when hate_speech_score >= 76.")
40
47
  @severity("high")
41
- @tags("hate-speech,trust-safety")
48
+ @tags("category:trust-safety,threat:hate-speech,detection:ml,compliance:eu-ai-act")
49
+ @reject_message("Prompt blocked: hate speech detected.")
42
50
  forbid (
43
51
  principal,
44
52
  action == Guardrails::Action::"process_prompt",
45
53
  resource
46
- ) when {
47
- context has hate_speech_score && context.hate_speech_score > 75
54
+ )
55
+ when {
56
+ context has hate_speech_score && context.hate_speech_score >= 76
48
57
  };
49
58
 
50
- @id("toxicity-block-combined")
51
- @name("Block combined moderate toxicity")
52
- @description("Forbids content with two or more moderate toxicity scores above 60")
59
+ @id("trust-safety.block-toxicity-combined")
60
+ @name("Block combined toxicity")
61
+ @description("Blocks process_prompt when any two toxicity scores are >= 61.")
53
62
  @severity("high")
54
- @tags("toxicity,trust-safety")
63
+ @tags("category:trust-safety,threat:harmful,detection:aggregate")
64
+ @reject_message("Prompt blocked: multiple moderate toxicity signals were detected together.")
55
65
  forbid (
56
66
  principal,
57
67
  action == Guardrails::Action::"process_prompt",
58
68
  resource
59
- ) when {
60
- // Any 2 of 6 toxicity scores above 60 triggers a block
61
- (context has violence_score && context has hate_speech_score && context.violence_score > 60 && context.hate_speech_score > 60) ||
62
- (context has violence_score && context has sexual_score && context.violence_score > 60 && context.sexual_score > 60) ||
63
- (context has violence_score && context has weapons_score && context.violence_score > 60 && context.weapons_score > 60) ||
64
- (context has violence_score && context has crime_score && context.violence_score > 60 && context.crime_score > 60) ||
65
- (context has violence_score && context has profanity_score && context.violence_score > 60 && context.profanity_score > 60) ||
66
- (context has hate_speech_score && context has sexual_score && context.hate_speech_score > 60 && context.sexual_score > 60) ||
67
- (context has hate_speech_score && context has weapons_score && context.hate_speech_score > 60 && context.weapons_score > 60) ||
68
- (context has hate_speech_score && context has crime_score && context.hate_speech_score > 60 && context.crime_score > 60) ||
69
- (context has hate_speech_score && context has profanity_score && context.hate_speech_score > 60 && context.profanity_score > 60) ||
70
- (context has sexual_score && context has weapons_score && context.sexual_score > 60 && context.weapons_score > 60) ||
71
- (context has sexual_score && context has crime_score && context.sexual_score > 60 && context.crime_score > 60) ||
72
- (context has sexual_score && context has profanity_score && context.sexual_score > 60 && context.profanity_score > 60) ||
73
- (context has weapons_score && context has crime_score && context.weapons_score > 60 && context.crime_score > 60) ||
74
- (context has weapons_score && context has profanity_score && context.weapons_score > 60 && context.profanity_score > 60) ||
75
- (context has crime_score && context has profanity_score && context.crime_score > 60 && context.profanity_score > 60)
69
+ )
70
+ when {
71
+ (context has violence_score && context has hate_speech_score && context.violence_score >= 61 && context.hate_speech_score >= 61) ||
72
+ (context has violence_score && context has sexual_score && context.violence_score >= 61 && context.sexual_score >= 61) ||
73
+ (context has violence_score && context has weapons_score && context.violence_score >= 61 && context.weapons_score >= 61) ||
74
+ (context has violence_score && context has crime_score && context.violence_score >= 61 && context.crime_score >= 61) ||
75
+ (context has violence_score && context has profanity_score && context.violence_score >= 61 && context.profanity_score >= 61) ||
76
+ (context has hate_speech_score && context has sexual_score && context.hate_speech_score >= 61 && context.sexual_score >= 61) ||
77
+ (context has hate_speech_score && context has weapons_score && context.hate_speech_score >= 61 && context.weapons_score >= 61) ||
78
+ (context has hate_speech_score && context has crime_score && context.hate_speech_score >= 61 && context.crime_score >= 61) ||
79
+ (context has hate_speech_score && context has profanity_score && context.hate_speech_score >= 61 && context.profanity_score >= 61) ||
80
+ (context has sexual_score && context has weapons_score && context.sexual_score >= 61 && context.weapons_score >= 61) ||
81
+ (context has sexual_score && context has crime_score && context.sexual_score >= 61 && context.crime_score >= 61) ||
82
+ (context has sexual_score && context has profanity_score && context.sexual_score >= 61 && context.profanity_score >= 61) ||
83
+ (context has weapons_score && context has crime_score && context.weapons_score >= 61 && context.crime_score >= 61) ||
84
+ (context has weapons_score && context has profanity_score && context.weapons_score >= 61 && context.profanity_score >= 61) ||
85
+ (context has crime_score && context has profanity_score && context.crime_score >= 61 && context.profanity_score >= 61)
76
86
  };
@@ -1,86 +1,103 @@
1
1
  // =============================================================================
2
- // MCP Tool Permissions Template
2
+ // MCP Tool Permissions
3
3
  // =============================================================================
4
- // Per-tool access control for MCP servers. Allows fine-grained permissioning:
5
- // allow specific tools from a server while denying others.
4
+ // Per-tool access control for MCP servers — fine-grained permissioning that
5
+ // allows specific tools from a server while denying others.
6
6
  //
7
- // Example: Allow read_issues from GitHub but deny create_issues.
7
+ // Bundled rules cover:
8
+ // - GitHub MCP example: permit read tools, block write tools
9
+ // - Org-wide exclusion list (untrusted / deprecated servers)
10
+ // - Unverified MCP server block
8
11
  //
9
- // Context keys used:
10
- // - mcp_server: String - MCP server name
11
- // - mcp_tool: String - Tool name within the server
12
- // - tool_name: String - Normalized tool name
13
- // - mcp_server_verified: Bool - Whether server is verified
12
+ // Context keys consumed:
13
+ // - mcp_server: String
14
+ // - tool_name: String
15
+ // - mcp_server_verified: Bool
14
16
  //
15
- // Category: agentic_security
17
+ // Category: tools
16
18
  // Namespace: Guardrails
17
19
  // =============================================================================
18
20
 
19
- // -- GitHub MCP: Read-only access -------------------------------------------
21
+ // ---------------------------------------------------------------------------
22
+ // Section 1: Example — GitHub MCP read/write split
23
+ // ---------------------------------------------------------------------------
20
24
 
21
- @id("mcp-github-allow-read")
22
- @name("Allow read-only GitHub tools")
23
- @description("Permit read operations from GitHub MCP server (issues, PRs, code search)")
25
+ @id("tools.allow-github-mcp-read")
26
+ @name("Allow GitHub MCP read tools")
27
+ @description("Permits call_tool when mcp_server is \"github\" and tool_name is a read-only operation.")
24
28
  @severity("medium")
25
- @tags("mcp,github,read-only,least-privilege")
29
+ @tags("category:tools,scope:per-tool,surface:call-tool,posture:deny-default")
26
30
  permit (
27
31
  principal,
28
32
  action == Guardrails::Action::"call_tool",
29
33
  resource
30
- ) when {
34
+ )
35
+ when {
31
36
  context has mcp_server && context.mcp_server == "github" &&
32
37
  context has tool_name &&
33
- (context.tool_name == "read_issues" ||
34
- context.tool_name == "get_issue" ||
35
- context.tool_name == "list_repos" ||
36
- context.tool_name == "get_pull_request" ||
37
- context.tool_name == "search_code" ||
38
- context.tool_name == "get_file_contents")
38
+ (
39
+ context.tool_name == "read_issues" ||
40
+ context.tool_name == "get_issue" ||
41
+ context.tool_name == "list_repos" ||
42
+ context.tool_name == "get_pull_request" ||
43
+ context.tool_name == "search_code" ||
44
+ context.tool_name == "get_file_contents"
45
+ )
39
46
  };
40
47
 
41
- @id("mcp-github-deny-write")
42
- @name("Deny write GitHub tools")
43
- @description("Block create/update/delete operations on GitHub MCP server")
48
+ @id("tools.block-github-mcp-write")
49
+ @name("Block GitHub MCP write tools")
50
+ @description("Blocks call_tool when mcp_server is \"github\"; combine with the read-allow permit above.")
44
51
  @severity("high")
45
- @tags("mcp,github,write-block,least-privilege")
52
+ @tags("category:tools,scope:per-tool,surface:call-tool,posture:deny-default")
53
+ @reject_message("Tool execution blocked: write operations against the GitHub MCP server are not permitted.")
46
54
  forbid (
47
55
  principal,
48
56
  action == Guardrails::Action::"call_tool",
49
57
  resource
50
- ) when {
58
+ )
59
+ when {
51
60
  context has mcp_server && context.mcp_server == "github"
52
61
  };
53
62
 
54
- // -- Organization-wide MCP server exclusions --------------------------------
63
+ // ---------------------------------------------------------------------------
64
+ // Section 2: Org-wide exclusion list
65
+ // ---------------------------------------------------------------------------
55
66
 
56
- @id("mcp-exclude-server")
57
- @name("Exclude specific MCP servers")
58
- @description("Block all tool calls from excluded MCP servers (org-wide exclusion list)")
67
+ @id("tools.exclude-mcp-servers")
68
+ @name("Block excluded MCP servers")
69
+ @description("Blocks call_tool when mcp_server is in the org-wide exclusion list.")
59
70
  @severity("critical")
60
- @tags("mcp,exclusion,org-wide,block")
71
+ @tags("category:tools,scope:org-wide,surface:call-tool,posture:deny-default")
72
+ @reject_message("Tool execution blocked: MCP server is on the org-wide exclusion list.")
61
73
  forbid (
62
74
  principal,
63
75
  action == Guardrails::Action::"call_tool",
64
76
  resource
65
- ) when {
66
- // Add server names to block across the organization.
67
- // Modify this list to match your exclusion requirements.
77
+ )
78
+ when {
68
79
  context has mcp_server &&
69
- (context.mcp_server == "untrusted-server" ||
70
- context.mcp_server == "deprecated-server")
80
+ (
81
+ context.mcp_server == "untrusted-server" ||
82
+ context.mcp_server == "deprecated-server"
83
+ )
71
84
  };
72
85
 
73
- // -- Block unverified MCP servers -------------------------------------------
86
+ // ---------------------------------------------------------------------------
87
+ // Section 3: Unverified server block
88
+ // ---------------------------------------------------------------------------
74
89
 
75
- @id("mcp-block-unverified")
76
- @name("Block unverified MCP servers")
77
- @description("Deny all tool calls from MCP servers not in the verified registry")
90
+ @id("tools.block-unverified-mcp-tools")
91
+ @name("Block unverified MCP server tools")
92
+ @description("Blocks call_tool when mcp_server_verified is false.")
78
93
  @severity("high")
79
- @tags("mcp,trust,verification")
94
+ @tags("category:tools,threat:supply-chain,detection:rule,surface:call-tool")
95
+ @reject_message("Tool execution blocked: MCP server is not from a verified registry.")
80
96
  forbid (
81
97
  principal,
82
98
  action == Guardrails::Action::"call_tool",
83
99
  resource
84
- ) when {
100
+ )
101
+ when {
85
102
  context has mcp_server_verified && context.mcp_server_verified == false
86
103
  };
@@ -1,43 +1,33 @@
1
1
  // =============================================================================
2
2
  // A2A Security — Cross-Origin Trust Boundary Enforcement
3
3
  // =============================================================================
4
- // Detects and blocks confused deputy attacks where an agent from one trust
5
- // domain attempts to operate in another. Cross-origin violations occur when:
6
- // - An agent proxies requests across security domains
7
- // - Mixed-security tool chains span trust boundaries
8
- // - URL injection redirects agent communication to untrusted origins
4
+ // Detects and blocks confused-deputy attacks where an agent from one trust
5
+ // domain attempts to operate in another. Primary signal in A2A
6
+ // communication where independent agents cross separate trust domains.
9
7
  //
10
- // Key A2A distinction from MAS: In multi-agent systems (shared orchestrator),
11
- // cross-origin is unlikely because all agents share a trust context. In A2A
12
- // (independent agents, separate trust domains), cross-origin is the PRIMARY
13
- // signal that trust boundaries are being violated.
8
+ // Shield cross-origin detector scores (discrete):
9
+ // 90 mixed localhost + external | 85 URL injection | 80 proxy/redirect
10
+ // 75 multi-origin tool configs | 70 mixed HTTP/HTTPS | 65 JSON multi-URL
11
+ // 60 generic multi-domain patterns
14
12
  //
15
- // Shield cross-origin detector outputs discrete scores:
16
- // 90 mixed localhost + external domain
17
- // 85 URL injection in parameters
18
- // 80 — proxy/redirect patterns
19
- // 75 — multi-origin tool configs / JSON origin fields
20
- // 70 — mixed HTTP/HTTPS or ws/wss schemes
21
- // 65 — JSON arrays with multiple URLs
22
- // 60 — generic multi-domain patterns
13
+ // Context keys consumed:
14
+ // - cross_origin_detected, cross_origin_score
15
+ // - agent_id, agent_trust_level
16
+ // - tool_is_sensitive
23
17
  //
24
18
  // Compliance:
25
- // OWASP LLM08 (Excessive Agency)
26
- // OWASP ASI03 (Excessive Permissions)
27
- // MITRE ATLAS AML.T0051.002 (Indirect Prompt Injection via delegation)
28
- // NIST 800-53 AC-4 (Information Flow Enforcement)
19
+ // - OWASP LLM08, OWASP ASI03; NIST 800-53 AC-4
29
20
  //
30
- // Category: agent_identity
21
+ // Category: agent-identity
31
22
  // Namespace: Guardrails
32
23
  // =============================================================================
33
24
 
34
- // Block high-confidence cross-origin for any agent
35
- @id("a2a-cross-origin-block-critical")
25
+ @id("agent-identity.a2a-block-cross-origin-critical")
36
26
  @name("Block critical cross-origin from any agent")
37
- @description("Block all agent requests when cross-origin trust boundary violation score exceeds 80. High-confidence cross-origin signals (mixed localhost/external, URL injection, proxy redirects) indicate confused deputy attacks regardless of agent trust level.")
27
+ @description("Blocks process_prompt and call_tool when cross_origin_detected is true and cross_origin_score >= 80.")
38
28
  @severity("critical")
39
- @tags("profile,a2a-security,cross-origin,confused-deputy,trust-boundary,owasp-llm08")
40
- @reject_message("Request blocked: high-confidence cross-origin trust boundary violation detected (score >= 80). An external agent or service is attempting to operate across trust domains. Review the origin chain before retrying.")
29
+ @tags("category:agent-identity,threat:supply-chain,detection:ml,owasp:llm08")
30
+ @reject_message("Request blocked: high-confidence cross-origin trust-boundary violation detected confused-deputy attack.")
41
31
  forbid (
42
32
  principal is Guardrails::Agent,
43
33
  action in [Guardrails::Action::"process_prompt", Guardrails::Action::"call_tool"],
@@ -49,13 +39,12 @@ when {
49
39
  context has cross_origin_score && context.cross_origin_score >= 80
50
40
  };
51
41
 
52
- // Block cross-origin from unverified agents at any signal level
53
- @id("a2a-cross-origin-block-unverified")
42
+ @id("agent-identity.a2a-block-cross-origin-unverified")
54
43
  @name("Block cross-origin from unverified agents")
55
- @description("Unverified agents are blocked from any cross-origin activity at the lowest meaningful detection threshold (score >= 60). Cross-origin from an unverified source is a strong confused deputy indicator — the agent has no attestation AND is crossing trust boundaries.")
44
+ @description("Blocks process_prompt and call_tool when agent_trust_level is \"unverified\", cross_origin_detected is true, and cross_origin_score >= 60.")
56
45
  @severity("high")
57
- @tags("profile,a2a-security,cross-origin,unverified,trust-boundary,owasp-asi03")
58
- @reject_message("Request blocked: cross-origin activity detected from an unverified agent. Unverified agents cannot operate across trust boundaries. Register the agent or use a verified agent.")
46
+ @tags("category:agent-identity,threat:supply-chain,scope:per-agent,detection:ml,owasp:asi03")
47
+ @reject_message("Request blocked: unverified agent attempted cross-origin activity no trust attestation across trust domains.")
59
48
  forbid (
60
49
  principal is Guardrails::Agent,
61
50
  action in [Guardrails::Action::"process_prompt", Guardrails::Action::"call_tool"],
@@ -67,13 +56,12 @@ when {
67
56
  context has cross_origin_score && context.cross_origin_score >= 60
68
57
  };
69
58
 
70
- // Block cross-origin MCP server connections from non-first-party agents
71
- @id("a2a-cross-origin-block-server-connect")
59
+ @id("agent-identity.a2a-block-cross-origin-server-connect")
72
60
  @name("Block cross-origin MCP server connections")
73
- @description("Non-first-party agents cannot connect to MCP servers when cross-origin signals are present (score >= 65). Server-level cross-origin has wide blast radius — a single compromised connection exposes all tools on that server.")
61
+ @description("Blocks connect_server when agent_trust_level is not \"first_party\", cross_origin_detected is true, and cross_origin_score >= 65.")
74
62
  @severity("critical")
75
- @tags("profile,a2a-security,cross-origin,mcp,server,trust-boundary,nist-ac-4")
76
- @reject_message("MCP server connection blocked: cross-origin trust violation detected. Non-first-party agents cannot connect to MCP servers when cross-origin signals are present.")
63
+ @tags("category:agent-identity,threat:supply-chain,scope:per-agent,detection:ml,surface:connect-server")
64
+ @reject_message("MCP server connection blocked: non-first-party agent crossed trust boundaries during connection supply-chain risk.")
77
65
  forbid (
78
66
  principal is Guardrails::Agent,
79
67
  action == Guardrails::Action::"connect_server",
@@ -85,13 +73,12 @@ when {
85
73
  context has cross_origin_score && context.cross_origin_score >= 65
86
74
  };
87
75
 
88
- // Block cross-origin tool calls on sensitive tools from any agent
89
- @id("a2a-cross-origin-block-sensitive-tools")
76
+ @id("agent-identity.a2a-block-cross-origin-sensitive-tools")
90
77
  @name("Block cross-origin on sensitive tools")
91
- @description("Any agent attempting to call sensitive tools while cross-origin signals are present (score >= 60) is blocked. Sensitive tools (write_file, http_post, send_email) amplify the impact of confused deputy attacks — an agent crossing trust boundaries should not have access to high-impact operations.")
78
+ @description("Blocks call_tool when cross_origin_detected is true, cross_origin_score >= 60, and tool_is_sensitive is true.")
92
79
  @severity("high")
93
- @tags("profile,a2a-security,cross-origin,sensitive-tools,confused-deputy,owasp-llm08")
94
- @reject_message("Sensitive tool execution blocked: cross-origin trust violation detected. Tool calls to sensitive tools are blocked when cross-origin signals are present from agent requests.")
80
+ @tags("category:agent-identity,threat:supply-chain,detection:ml,surface:call-tool,owasp:llm08")
81
+ @reject_message("Sensitive tool execution blocked: cross-origin signal present sensitive tools amplify confused-deputy impact.")
95
82
  forbid (
96
83
  principal is Guardrails::Agent,
97
84
  action == Guardrails::Action::"call_tool",
@@ -1,47 +1,36 @@
1
1
  // =============================================================================
2
2
  // A2A Security — Escalation Detection & Circuit Breakers
3
3
  // =============================================================================
4
- // Detects progressive capability escalation across agent turns and applies
5
- // session-level circuit breakers tuned for adversarial A2A communication:
4
+ // Detects progressive capability escalation across agent turns. Lower
5
+ // circuit-breaker thresholds than the multi-agent profile because A2A
6
+ // communication spans separate trust domains with no shared orchestrator
7
+ // to reset session state.
6
8
  //
7
- // 1. SESSION PEAK SCORES: When the maximum injection or jailbreak score
8
- // across all prior turns exceeds 70, non-first-party agents are restricted.
9
- // Unlike MAS boolean flags, A2A uses numeric peaks for graduated response.
10
- //
11
- // 2. CUMULATIVE RISK CIRCUIT BREAKER: When accumulated risk across turns
12
- // exceeds 150, sensitive tools are blocked for non-first-party agents.
13
- // Lower threshold than MAS (200) because cross-system communication has
14
- // higher adversarial surface area.
15
- //
16
- // 3. THREAT TURN ESCALATION: After 3+ turns with detected threats, unverified
17
- // agents are fully locked out. This catches turn-by-turn probing attacks
18
- // where an attacker incrementally tests boundaries.
19
- //
20
- // Key A2A distinction: In MAS, the orchestrator can reset or contain sessions.
21
- // In A2A, independent agents have no shared circuit breaker, so policy must
22
- // enforce escalation detection at the evaluation layer.
9
+ // Context keys consumed:
10
+ // - agent_trust_level
11
+ // - session_max_injection_score, session_max_jailbreak_score: Long (0-100)
12
+ // - session_cumulative_risk_score: Long
13
+ // - session_threat_turns: Long
14
+ // - tool_is_sensitive: Bool
23
15
  //
24
16
  // Compliance:
25
- // OWASP LLM01 (Prompt Injection) — multi-turn variant
26
- // MITRE ATLAS AML.T0051 (LLM Prompt Injection)
27
- // NIST 800-53 SI-4 (System Monitoring)
28
- // NIST 800-53 IR-4 (Incident Handling)
17
+ // - OWASP LLM01 (multi-turn); MITRE ATLAS AML.T0051
18
+ // - NIST 800-53 SI-4, IR-4
29
19
  //
30
- // Category: agent_identity
20
+ // Category: agent-identity
31
21
  // Namespace: Guardrails
32
22
  // =============================================================================
33
23
 
34
- // -----------------------------------------------------------------------------
35
- // Session Peak Score Monitoring
36
- // -----------------------------------------------------------------------------
24
+ // ---------------------------------------------------------------------------
25
+ // Section 1: Session peak score monitoring
26
+ // ---------------------------------------------------------------------------
37
27
 
38
- // Block non-first-party agents when session injection peak is high
39
- @id("a2a-session-injection-peak-block")
40
- @name("Block agents when session injection peak is high")
41
- @description("Block non-first-party agents from tool calls and prompt processing when the maximum injection score across all prior turns exceeds 70. Unlike MAS policies that use boolean session_injection_detected, A2A uses the numeric peak score for graduated response — a session with a prior score of 75 is more dangerous than one with 45, even though both set the boolean flag.")
28
+ @id("agent-identity.a2a-block-session-injection-peak")
29
+ @name("Block agents at high session injection peak")
30
+ @description("Blocks call_tool and process_prompt when agent_trust_level is not \"first_party\" and session_max_injection_score >= 70.")
42
31
  @severity("critical")
43
- @tags("profile,a2a-security,escalation,session-peak,injection,owasp-lml01,nist-si-4")
44
- @reject_message("Request blocked: this session has a prior injection detection peak of 70+. Non-first-party agents are restricted after high injection signals to prevent progressive capability escalation.")
32
+ @tags("category:agent-identity,threat:injection,scope:per-agent,detection:aggregate,owasp:llm01")
33
+ @reject_message("Request blocked: this session previously crossed the injection peak threshold; non-first-party agents are restricted.")
45
34
  forbid (
46
35
  principal is Guardrails::Agent,
47
36
  action in [Guardrails::Action::"call_tool", Guardrails::Action::"process_prompt"],
@@ -52,13 +41,12 @@ when {
52
41
  context has session_max_injection_score && context.session_max_injection_score >= 70
53
42
  };
54
43
 
55
- // Block non-first-party agents when session jailbreak peak is high
56
- @id("a2a-session-jailbreak-peak-block")
57
- @name("Block agents when session jailbreak peak is high")
58
- @description("Block non-first-party agents from prompt processing when the maximum jailbreak score across all prior turns exceeds 70. A high jailbreak peak indicates the session's guardrails may have been partially eroded — subsequent agent requests in this session should be treated as potentially manipulated.")
44
+ @id("agent-identity.a2a-block-session-jailbreak-peak")
45
+ @name("Block agents at high session jailbreak peak")
46
+ @description("Blocks process_prompt when agent_trust_level is not \"first_party\" and session_max_jailbreak_score >= 70.")
59
47
  @severity("critical")
60
- @tags("profile,a2a-security,escalation,session-peak,jailbreak,owasp-lml01,nist-si-4")
61
- @reject_message("Request blocked: this session has a prior jailbreak detection peak of 70+. Non-first-party agents are restricted after high jailbreak signals to prevent progressive manipulation.")
48
+ @tags("category:agent-identity,threat:jailbreak,scope:per-agent,detection:aggregate,owasp:llm02")
49
+ @reject_message("Request blocked: this session previously crossed the jailbreak peak threshold; non-first-party agents are restricted.")
62
50
  forbid (
63
51
  principal is Guardrails::Agent,
64
52
  action == Guardrails::Action::"process_prompt",
@@ -69,17 +57,16 @@ when {
69
57
  context has session_max_jailbreak_score && context.session_max_jailbreak_score >= 70
70
58
  };
71
59
 
72
- // -----------------------------------------------------------------------------
73
- // Cumulative Risk Circuit Breaker
74
- // -----------------------------------------------------------------------------
60
+ // ---------------------------------------------------------------------------
61
+ // Section 2: Cumulative risk circuit breaker
62
+ // ---------------------------------------------------------------------------
75
63
 
76
- // Lock down non-first-party agents at high cumulative risk
77
- @id("a2a-cumulative-risk-agent-lockdown")
78
- @name("Lock down non-first-party agents at high cumulative risk")
79
- @description("Block non-first-party agents from sensitive tools when cumulative session risk exceeds 150. Lower threshold than MAS (200) because A2A sessions with cross-system agents should trip the circuit breaker earlier — independent agents cannot coordinate session recovery the way an orchestrator can.")
64
+ @id("agent-identity.a2a-cumulative-risk-lockdown")
65
+ @name("Block non-first-party agents at high cumulative risk")
66
+ @description("Blocks call_tool when agent_trust_level is not \"first_party\", session_cumulative_risk_score >= 151, and tool_is_sensitive is true.")
80
67
  @severity("high")
81
- @tags("profile,a2a-security,escalation,cumulative-risk,circuit-breaker,nist-ir-4")
82
- @reject_message("Sensitive tool blocked: session cumulative risk exceeds 150. Non-first-party agents are restricted from sensitive tools in elevated-risk sessions to prevent progressive capability gain.")
68
+ @tags("category:agent-identity,scope:per-agent,detection:aggregate,surface:call-tool,posture:catch-all")
69
+ @reject_message("Tool execution blocked: cumulative session risk exceeded the A2A circuit-breaker threshold for sensitive tools.")
83
70
  forbid (
84
71
  principal is Guardrails::Agent,
85
72
  action == Guardrails::Action::"call_tool",
@@ -87,21 +74,20 @@ forbid (
87
74
  )
88
75
  when {
89
76
  context has agent_trust_level && context.agent_trust_level != "first_party" &&
90
- context has session_cumulative_risk_score && context.session_cumulative_risk_score > 150 &&
77
+ context has session_cumulative_risk_score && context.session_cumulative_risk_score >= 151 &&
91
78
  context has tool_is_sensitive && context.tool_is_sensitive == true
92
79
  };
93
80
 
94
- // -----------------------------------------------------------------------------
95
- // Threat Turn Escalation Probing Detection
96
- // -----------------------------------------------------------------------------
81
+ // ---------------------------------------------------------------------------
82
+ // Section 3: Threat turn escalation
83
+ // ---------------------------------------------------------------------------
97
84
 
98
- // Block unverified agents after repeated threat turns
99
- @id("a2a-threat-turn-escalation-block")
85
+ @id("agent-identity.a2a-threat-turn-escalation")
100
86
  @name("Block unverified agents after repeated threat turns")
101
- @description("Block unverified agents from all tool calls after 3+ threat turns are detected in the session. Lower threshold than MAS (5) because repeated threats from an unverified agent's session indicate adversarial probing — the attacker is incrementally testing boundaries. Three threat turns is sufficient evidence of active reconnaissance.")
87
+ @description("Blocks call_tool when agent_trust_level is \"unverified\" and session_threat_turns >= 3.")
102
88
  @severity("critical")
103
- @tags("profile,a2a-security,escalation,threat-turns,probing,unverified,nist-ir-4")
104
- @reject_message("Tool execution blocked: 3+ threat turns detected in this session. Unverified agents are locked out after repeated threat signals to prevent adversarial escalation via turn-by-turn probing.")
89
+ @tags("category:agent-identity,scope:per-agent,detection:aggregate,surface:call-tool,posture:catch-all")
90
+ @reject_message("Tool execution blocked: 3+ threat turns in this session; unverified agents are locked out to stop adversarial probing.")
105
91
  forbid (
106
92
  principal is Guardrails::Agent,
107
93
  action == Guardrails::Action::"call_tool",
@@ -109,5 +95,5 @@ forbid (
109
95
  )
110
96
  when {
111
97
  context has agent_trust_level && context.agent_trust_level == "unverified" &&
112
- context has session_threat_turns && context.session_threat_turns > 2
98
+ context has session_threat_turns && context.session_threat_turns >= 3
113
99
  };