@highflame/policy 2.1.4 → 2.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/README.md +41 -0
  2. package/_schemas/guardrails/context.json +600 -6
  3. package/_schemas/guardrails/schema.cedarschema +79 -3
  4. package/_schemas/guardrails/templates/defaults/agent_identity.cedar +118 -0
  5. package/_schemas/guardrails/templates/defaults/agentic_safety.cedar +4 -4
  6. package/_schemas/guardrails/templates/defaults/injection.cedar +10 -10
  7. package/_schemas/guardrails/templates/defaults/secrets.cedar +2 -2
  8. package/_schemas/guardrails/templates/defaults/security_patterns.cedar +1 -1
  9. package/_schemas/guardrails/templates/defaults/toxicity.cedar +3 -3
  10. package/_schemas/guardrails/templates/profiles/chat_assistant/privacy.cedar +1 -1
  11. package/_schemas/guardrails/templates/profiles/chat_assistant/security.cedar +4 -4
  12. package/_schemas/guardrails/templates/profiles/chat_assistant/trust_safety.cedar +2 -2
  13. package/_schemas/guardrails/templates/profiles/code_agent/agentic_security.cedar +3 -3
  14. package/_schemas/guardrails/templates/profiles/data_pipeline/agentic_security.cedar +1 -1
  15. package/_schemas/guardrails/templates/profiles/data_pipeline/privacy.cedar +2 -2
  16. package/_schemas/guardrails/templates/profiles/data_pipeline/security.cedar +3 -3
  17. package/_schemas/guardrails/templates/profiles/multi_agent/agent_safety.cedar +157 -0
  18. package/_schemas/guardrails/templates/profiles/multi_agent/agent_trust.cedar +140 -0
  19. package/_schemas/guardrails/templates/templates.json +41 -0
  20. package/_schemas/overwatch/context.json +232 -4
  21. package/_schemas/overwatch/schema.cedarschema +42 -4
  22. package/dist/guardrails-context.gen.d.ts +24 -2
  23. package/dist/guardrails-context.gen.js +24 -2
  24. package/dist/guardrails-defaults.gen.d.ts +1 -1
  25. package/dist/guardrails-defaults.gen.js +524 -36
  26. package/dist/overwatch-context.gen.d.ts +10 -0
  27. package/dist/overwatch-context.gen.js +10 -0
  28. package/dist/service-schemas.gen.d.ts +2 -2
  29. package/dist/service-schemas.gen.js +275 -24
  30. package/package.json +1 -1
  31. package/_schemas/guardrails/templates/profiles/chat_assistant.cedar +0 -85
  32. package/_schemas/guardrails/templates/profiles/code_agent.cedar +0 -125
  33. package/_schemas/guardrails/templates/profiles/data_pipeline.cedar +0 -111
@@ -0,0 +1,157 @@
1
+ // =============================================================================
2
+ // Multi-Agent Orchestration — Cross-Turn Agent Safety
3
+ // =============================================================================
4
+ // Session-aware policies that use cross-turn detection history combined with
5
+ // agent identity for defense-in-depth. These policies handle the scenario
6
+ // where one agent in a multi-agent session detects a threat, and subsequent
7
+ // agents in the same session must be restricted accordingly.
8
+ //
9
+ // Key insight: In multi-agent orchestration, Agent A may detect PII in turn 3,
10
+ // and Agent B (a different agent) arrives in turn 5 wanting to call http_post.
11
+ // Cross-turn session flags + agent trust level enable this policy:
12
+ // "If PII was seen AND this agent is not first-party → block network tools."
13
+ //
14
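+ // Context keys (agent_type, session_pii_types and suspicious_pattern are listed for reference only; no policy in this file reads them):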
15
+ // - agent_trust_level: String - Trust tier of the current agent
16
+ // - agent_type: String - Agent classification
17
+ // - session_pii_detected: Bool - PII seen in any prior turn
18
+ // - session_pii_types: Set<String> - PII types accumulated
19
+ // - session_secrets_detected: Bool - Secrets seen in any prior turn
20
+ // - session_injection_detected: Bool - Injection seen in any prior turn
21
+ // - session_command_injection: Bool - Command injection in any prior turn
22
+ // - session_threat_turns: Long - Count of turns with threats
23
+ // - session_cumulative_risk_score: Long - Total accumulated risk
24
+ // - tool_name: String - Tool being called
25
+ // - tool_is_sensitive: Bool - Whether tool is sensitive
26
+ // - suspicious_pattern: Bool - Whether exfiltration pattern detected
27
+ //
28
+ // Category: agent_identity
29
+ // Namespace: Guardrails
30
+ // =============================================================================
31
+
32
+ // -----------------------------------------------------------------------------
33
+ // PII Containment — Prevent Agent Data Leakage
34
+ // -----------------------------------------------------------------------------
35
+
36
+ @id("multi-agent-pii-block-network-tools")
37
+ @name("Block network tools after PII detection for non-first-party agents")
38
+ @description("If PII was detected in any prior turn, block non-first-party agents from calling network-facing tools. Prevents data exfiltration by untrusted agents in sessions containing sensitive data")
39
+ @severity("critical")
40
+ @tags("profile,multi-agent,pii,exfiltration,cross-turn,a2a")
41
+ forbid (
42
+ principal is Guardrails::Agent,
43
+ action == Guardrails::Action::"call_tool",
44
+ resource
45
+ ) when {
46
+ context.agent_trust_level != "first_party" &&
47
+ context has session_pii_detected && context.session_pii_detected == true &&
48
+ context has tool_name &&
49
+ (context.tool_name == "http_post" ||
50
+ context.tool_name == "send_email" ||
51
+ context.tool_name == "http_request" ||
52
+ context.tool_name == "webhook")
53
+ };
54
+
55
+ @id("multi-agent-pii-block-unverified-file-write")
56
+ @name("Block unverified agents from writing files after PII detection")
57
+ @description("If PII was detected in the session, unverified agents cannot write files. Prevents PII persistence by untrusted agents")
58
+ @severity("high")
59
+ @tags("profile,multi-agent,pii,file-write,cross-turn,a2a")
60
+ forbid (
61
+ principal is Guardrails::Agent,
62
+ action == Guardrails::Action::"write_file",
63
+ resource
64
+ ) when {
65
+ context.agent_trust_level == "unverified" &&
66
+ context has session_pii_detected && context.session_pii_detected == true
67
+ };
68
+
69
+ // -----------------------------------------------------------------------------
70
+ // Secrets Containment — Lock Down After Credential Exposure
71
+ // -----------------------------------------------------------------------------
72
+
73
+ @id("multi-agent-secrets-lockdown")
74
+ @name("Lock down sensitive tools after secrets detection")
75
+ @description("If secrets (API keys, tokens) were detected in any prior turn, block all non-first-party agents from sensitive tool calls. Prevents credential exfiltration in compromised sessions")
76
+ @severity("critical")
77
+ @tags("profile,multi-agent,secrets,lockdown,cross-turn,a2a")
78
+ forbid (
79
+ principal is Guardrails::Agent,
80
+ action == Guardrails::Action::"call_tool",
81
+ resource
82
+ ) when {
83
+ context.agent_trust_level != "first_party" &&
84
+ context has session_secrets_detected && context.session_secrets_detected == true &&
85
+ context has tool_is_sensitive && context.tool_is_sensitive == true
86
+ };
87
+
88
+ // -----------------------------------------------------------------------------
89
+ // Injection Escalation — Tighten After Prior Attacks
90
+ // -----------------------------------------------------------------------------
91
+
92
+ @id("multi-agent-post-injection-lockdown")
93
+ @name("Restrict unverified agents after injection detection")
94
+ @description("If injection was detected in any prior turn, block unverified agents from all tool calls. An injection in a prior turn may have poisoned the context, making subsequent unverified agent actions high risk")
95
+ @severity("critical")
96
+ @tags("profile,multi-agent,injection,lockdown,cross-turn,a2a")
97
+ forbid (
98
+ principal is Guardrails::Agent,
99
+ action == Guardrails::Action::"call_tool",
100
+ resource
101
+ ) when {
102
+ context.agent_trust_level == "unverified" &&
103
+ context has session_injection_detected && context.session_injection_detected == true
104
+ };
105
+
106
+ @id("multi-agent-post-command-injection-block-shell")
107
+ @name("Block all agent shell access after command injection")
108
+ @description("If command injection was detected in any prior turn, no agent (regardless of trust) can execute shell commands. Defense against persistent shell compromise")
109
+ @severity("critical")
110
+ @tags("profile,multi-agent,command-injection,shell,cross-turn,a2a")
111
+ forbid (
112
+ principal is Guardrails::Agent,
113
+ action == Guardrails::Action::"call_tool",
114
+ resource
115
+ ) when {
116
+ context has session_command_injection && context.session_command_injection == true &&
117
+ context has tool_name &&
118
+ (context.tool_name == "shell" ||
119
+ context.tool_name == "execute_command" ||
120
+ context.tool_name == "bash")
121
+ };
122
+
123
+ // -----------------------------------------------------------------------------
124
+ // Cumulative Risk — Session-Level Circuit Breaker
125
+ // -----------------------------------------------------------------------------
126
+
127
+ @id("multi-agent-high-cumulative-risk-restrict")
128
+ @name("Restrict non-first-party agents in high-risk sessions")
129
+ @description("When cumulative session risk exceeds 200, restrict non-first-party agents to safe tools only. Acts as a circuit breaker for sessions that have accumulated multiple risk signals across turns")
130
+ @severity("high")
131
+ @tags("profile,multi-agent,cumulative-risk,circuit-breaker,a2a")
132
+ forbid (
133
+ principal is Guardrails::Agent,
134
+ action == Guardrails::Action::"call_tool",
135
+ resource
136
+ ) when {
137
+ context.agent_trust_level != "first_party" &&
138
+ context has session_cumulative_risk_score && context.session_cumulative_risk_score > 200 &&
139
+ context has tool_is_sensitive && context.tool_is_sensitive == true
140
+ };
141
+
142
+ @id("multi-agent-extreme-risk-full-lockdown")
143
+ @name("Full lockdown for unverified agents in extreme-risk sessions")
144
+ @description("When cumulative session risk exceeds 500 or more than 5 threat turns are detected, block ALL tool calls from unverified agents. Emergency circuit breaker for compromised sessions")
145
+ @severity("critical")
146
+ @tags("profile,multi-agent,extreme-risk,lockdown,a2a")
147
+ forbid (
148
+ principal is Guardrails::Agent,
149
+ action == Guardrails::Action::"call_tool",
150
+ resource
151
+ ) when {
152
+ context.agent_trust_level == "unverified" &&
153
+ (
154
+ (context has session_cumulative_risk_score && context.session_cumulative_risk_score > 500) ||
155
+ (context has session_threat_turns && context.session_threat_turns > 5)
156
+ )
157
+ };
@@ -0,0 +1,140 @@
1
+ // =============================================================================
2
+ // Multi-Agent Orchestration — Agent Trust Policies
3
+ // =============================================================================
4
+ // Production-grade trust policies for multi-agent systems where an orchestrator
5
+ // coordinates sub-agents with varying trust levels. These policies enforce
6
+ // least-privilege access: each agent gets only the permissions its trust level
7
+ // and type warrant.
8
+ //
9
+ // Architecture supported:
10
+ // Orchestrator (first_party)
11
+ // ├── Research Agent (verified_third_party, autonomous)
12
+ // ├── Code Agent (first_party, tool_agent)
13
+ // └── External Plugin (unverified, tool_agent)
14
+ //
15
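+ // Context keys (agent_id, agent_framework, agent_publisher and tool_name are listed for reference only; no policy in this file reads them):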
16
+ // - agent_id: String - Unique agent identifier
17
+ // - agent_type: String - orchestrator | autonomous | tool_agent | human_proxy
18
+ // - agent_trust_level: String - first_party | verified_third_party | unverified
19
+ // - agent_framework: String - Agent framework/SDK
20
+ // - agent_publisher: String - Publishing organization
21
+ // - tool_name: String - Tool being called
22
+ // - tool_category: String - safe | sensitive | dangerous
23
+ // - tool_risk_score: Long (0-100) - Computed risk score
24
+ // - tool_is_sensitive: Bool - Whether tool is sensitive
25
+ // - mcp_server_verified: Bool - Whether MCP server is verified
26
+ // - injection_confidence: Long (0-100) - Injection detection score
27
+ // - jailbreak_confidence: Long (0-100) - Jailbreak detection score
28
+ //
29
+ // Category: agent_identity
30
+ // Namespace: Guardrails
31
+ // =============================================================================
32
+
33
+ // -----------------------------------------------------------------------------
34
+ // Tiered Tool Access — The Core A2A Trust Model
35
+ // -----------------------------------------------------------------------------
36
+
37
+ @id("multi-agent-only-first-party-dangerous")
38
+ @name("Only first-party agents can use dangerous tools")
39
+ @description("Dangerous tools (shell, delete_file, run_sql) are restricted to first-party agents only. Third-party and unverified agents are blocked regardless of other signals")
40
+ @severity("critical")
41
+ @tags("profile,multi-agent,trust,tools,a2a")
42
+ forbid (
43
+ principal is Guardrails::Agent,
44
+ action == Guardrails::Action::"call_tool",
45
+ resource
46
+ ) when {
47
+ context.agent_trust_level != "first_party" &&
48
+ context has tool_category && context.tool_category == "dangerous"
49
+ };
50
+
51
+ @id("multi-agent-block-unverified-sensitive")
52
+ @name("Block unverified agents from all sensitive tools")
53
+ @description("Unverified agents can only use tools classified as safe. Sensitive tools (write_file, http_post, send_email) require at least verified_third_party trust")
54
+ @severity("high")
55
+ @tags("profile,multi-agent,trust,tools,a2a")
56
+ forbid (
57
+ principal is Guardrails::Agent,
58
+ action == Guardrails::Action::"call_tool",
59
+ resource
60
+ ) when {
61
+ context.agent_trust_level == "unverified" &&
62
+ context has tool_is_sensitive && context.tool_is_sensitive == true
63
+ };
64
+
65
+ @id("multi-agent-block-unverified-mcp")
66
+ @name("Block unverified agents from unverified MCP servers")
67
+ @description("Unverified agents cannot call tools from unverified MCP servers. Double-unverified (agent + server) presents unacceptable supply chain risk")
68
+ @severity("critical")
69
+ @tags("profile,multi-agent,trust,mcp,a2a")
70
+ forbid (
71
+ principal is Guardrails::Agent,
72
+ action == Guardrails::Action::"call_tool",
73
+ resource
74
+ ) when {
75
+ context.agent_trust_level == "unverified" &&
76
+ context has mcp_server_verified && context.mcp_server_verified == false
77
+ };
78
+
79
+ // -----------------------------------------------------------------------------
80
+ // Autonomous Agent Safeguards
81
+ // -----------------------------------------------------------------------------
82
+
83
+ @id("multi-agent-autonomous-tool-risk-cap")
84
+ @name("Cap tool risk for autonomous agents")
85
+ @description("Autonomous agents (no human in the loop) have a lower tool risk ceiling. Tools with risk > 70 require human oversight that autonomous agents lack")
86
+ @severity("high")
87
+ @tags("profile,multi-agent,autonomous,tools,a2a")
88
+ forbid (
89
+ principal is Guardrails::Agent,
90
+ action == Guardrails::Action::"call_tool",
91
+ resource
92
+ ) when {
93
+ context.agent_type == "autonomous" &&
94
+ context has tool_risk_score && context.tool_risk_score > 70
95
+ };
96
+
97
+ @id("multi-agent-autonomous-injection-defense")
98
+ @name("Enhanced injection defense for autonomous agents")
99
+ @description("Autonomous agents are high-value targets for injection. Lower the threshold to 50 (vs 80 standard) since there is no human to catch false negatives")
100
+ @severity("high")
101
+ @tags("profile,multi-agent,autonomous,injection,a2a")
102
+ forbid (
103
+ principal is Guardrails::Agent,
104
+ action == Guardrails::Action::"process_prompt",
105
+ resource
106
+ ) when {
107
+ context.agent_type == "autonomous" &&
108
+ context has injection_confidence && context.injection_confidence > 50
109
+ };
110
+
111
+ @id("multi-agent-autonomous-jailbreak-defense")
112
+ @name("Enhanced jailbreak defense for autonomous agents")
113
+ @description("Lower jailbreak threshold for autonomous agents. Without human review, we must be more conservative")
114
+ @severity("high")
115
+ @tags("profile,multi-agent,autonomous,jailbreak,a2a")
116
+ forbid (
117
+ principal is Guardrails::Agent,
118
+ action == Guardrails::Action::"process_prompt",
119
+ resource
120
+ ) when {
121
+ context.agent_type == "autonomous" &&
122
+ context has jailbreak_confidence && context.jailbreak_confidence > 50
123
+ };
124
+
125
+ // -----------------------------------------------------------------------------
126
+ // MCP Server Connection Trust
127
+ // -----------------------------------------------------------------------------
128
+
129
+ @id("multi-agent-block-unverified-server-connect")
130
+ @name("Block unverified agents from connecting to MCP servers")
131
+ @description("Unverified agents cannot establish new MCP server connections. Limits blast radius of compromised or rogue agents")
132
+ @severity("high")
133
+ @tags("profile,multi-agent,trust,mcp,connect,a2a")
134
+ forbid (
135
+ principal is Guardrails::Agent,
136
+ action == Guardrails::Action::"connect_server",
137
+ resource
138
+ ) when {
139
+ context.agent_trust_level == "unverified"
140
+ };
@@ -23,6 +23,11 @@
23
23
  "name": "Agentic Security",
24
24
  "description": "Detect tool abuse, data exfiltration patterns, infinite loops, and budget violations"
25
25
  },
26
+ {
27
+ "id": "agent_identity",
28
+ "name": "Agent-to-Agent Security",
29
+ "description": "Trust-based access control for AI agents — tiered permissions by trust level, agent type restrictions, cross-turn session lockdowns for multi-agent orchestration"
30
+ },
26
31
  {
27
32
  "id": "organization",
28
33
  "name": "Organization",
@@ -109,6 +114,16 @@
109
114
  "severity": "critical",
110
115
  "tags": ["command-injection", "path-traversal", "sql-injection", "security"],
111
116
  "is_active": true
117
+ },
118
+ {
119
+ "id": "agent-identity-default",
120
+ "name": "Agent Identity & Trust",
121
+ "description": "Trust-based access control for AI agents: block unverified agents from dangerous/sensitive tools, apply stricter thresholds for autonomous agents, restrict unverified agents after session threats",
122
+ "category": "agent_identity",
123
+ "file": "defaults/agent_identity.cedar",
124
+ "severity": "critical",
125
+ "tags": ["agent-identity", "trust", "a2a", "autonomous", "cross-turn"],
126
+ "is_active": true
112
127
  }
113
128
  ],
114
129
  "templates": [
@@ -192,6 +207,24 @@
192
207
  "file": "profiles/data_pipeline/agentic_security.cedar",
193
208
  "severity": "critical",
194
209
  "tags": ["profile", "data-pipeline", "exfiltration", "tools"]
210
+ },
211
+ {
212
+ "id": "multi-agent-trust",
213
+ "name": "Multi-Agent Orchestration — Agent Trust",
214
+ "description": "Tiered trust policies for multi-agent systems: only first-party agents can use dangerous tools, unverified agents restricted to safe tools, autonomous agents have lower risk ceilings, MCP server connection trust enforcement",
215
+ "category": "agent_identity",
216
+ "file": "profiles/multi_agent/agent_trust.cedar",
217
+ "severity": "critical",
218
+ "tags": ["profile", "multi-agent", "trust", "a2a", "autonomous", "mcp"]
219
+ },
220
+ {
221
+ "id": "multi-agent-safety",
222
+ "name": "Multi-Agent Orchestration — Cross-Turn Safety",
223
+ "description": "Session-aware agent safety policies: PII containment across agents, secrets lockdown, injection escalation response, cumulative risk circuit breakers for multi-agent sessions",
224
+ "category": "agent_identity",
225
+ "file": "profiles/multi_agent/agent_safety.cedar",
226
+ "severity": "critical",
227
+ "tags": ["profile", "multi-agent", "cross-turn", "a2a", "pii", "secrets", "injection", "circuit-breaker"]
195
228
  }
196
229
  ],
197
230
  "profiles": [
@@ -218,6 +251,14 @@
218
251
  "severity": "critical",
219
252
  "tags": ["data-pipeline", "pii", "secrets", "exfiltration"],
220
253
  "template_ids": ["data-pipeline-privacy", "data-pipeline-security", "data-pipeline-agentic-security"]
254
+ },
255
+ {
256
+ "id": "multi-agent",
257
+ "name": "Multi-Agent Orchestration",
258
+ "description": "Production-grade A2A guardrails for multi-agent systems — tiered trust access control, autonomous agent safeguards, cross-turn PII/secrets containment, injection escalation response, cumulative risk circuit breakers",
259
+ "severity": "critical",
260
+ "tags": ["multi-agent", "a2a", "trust", "cross-turn", "circuit-breaker"],
261
+ "template_ids": ["multi-agent-trust", "multi-agent-safety"]
221
262
  }
222
263
  ]
223
264
  }
@@ -179,13 +179,37 @@
179
179
  "key": "injection_confidence",
180
180
  "type": "number",
181
181
  "required": true,
182
- "description": "Prompt injection ML classifier confidence (0-100)"
182
+ "description": "Combined prompt injection confidence (0-100). MAX of all detector scores (Pulse + DeepContext). Use injection_pulse_score / injection_deep_context_score for individual detector control"
183
183
  },
184
184
  {
185
185
  "key": "jailbreak_confidence",
186
186
  "type": "number",
187
187
  "required": true,
188
- "description": "Jailbreak detection ML classifier confidence (0-100)"
188
+ "description": "Combined jailbreak detection confidence (0-100). MAX of all detector scores (Pulse + DeepContext). Use jailbreak_pulse_score / jailbreak_deep_context_score for individual detector control"
189
+ },
190
+ {
191
+ "key": "injection_pulse_score",
192
+ "type": "number",
193
+ "required": false,
194
+ "description": "Highflame single-turn classifier score for prompt injection (0-100). Raw score from Pulse detector before combination with deep-context. Use for per-detector policy control"
195
+ },
196
+ {
197
+ "key": "injection_deep_context_score",
198
+ "type": "number",
199
+ "required": false,
200
+ "description": "DeepContext multi-turn analyzer score for prompt injection (0-100). Tracks injection patterns across conversation history. Generally higher confidence than single-turn"
201
+ },
202
+ {
203
+ "key": "jailbreak_pulse_score",
204
+ "type": "number",
205
+ "required": false,
206
+ "description": "Highflame single-turn classifier score for jailbreak attempts (0-100). Raw score from Pulse detector before combination with deep-context"
207
+ },
208
+ {
209
+ "key": "jailbreak_deep_context_score",
210
+ "type": "number",
211
+ "required": false,
212
+ "description": "DeepContext multi-turn analyzer score for jailbreak attempts (0-100). Detects jailbreak escalation patterns across conversation turns"
189
213
  },
190
214
  {
191
215
  "key": "indirect_injection_score",
@@ -234,6 +258,42 @@
234
258
  "type": "number",
235
259
  "required": false,
236
260
  "description": "Number of turns in the session where threats were detected"
261
+ },
262
+ {
263
+ "key": "session_max_injection_score",
264
+ "type": "number",
265
+ "required": false,
266
+ "description": "Highest prompt injection score seen in any turn of the session (0-100). Use for tiered responses: >75 hard block, >50 restrict permissions"
267
+ },
268
+ {
269
+ "key": "session_max_jailbreak_score",
270
+ "type": "number",
271
+ "required": false,
272
+ "description": "Highest jailbreak detection score seen in any turn of the session (0-100)"
273
+ },
274
+ {
275
+ "key": "session_max_command_injection_score",
276
+ "type": "number",
277
+ "required": false,
278
+ "description": "Highest command injection score seen in any turn of the session (0-100)"
279
+ },
280
+ {
281
+ "key": "session_max_pii_score",
282
+ "type": "number",
283
+ "required": false,
284
+ "description": "Highest PII risk score seen in any turn of the session (0-100)"
285
+ },
286
+ {
287
+ "key": "session_max_secret_score",
288
+ "type": "number",
289
+ "required": false,
290
+ "description": "Highest secret detection score seen in any turn of the session (0-100)"
291
+ },
292
+ {
293
+ "key": "session_cumulative_risk_score",
294
+ "type": "number",
295
+ "required": false,
296
+ "description": "Sum of per-turn risk scores across the session. Catches death-by-a-thousand-cuts where no single turn is high but cumulative risk is significant"
237
297
  }
238
298
  ]
239
299
  },
@@ -431,13 +491,37 @@
431
491
  "key": "injection_confidence",
432
492
  "type": "number",
433
493
  "required": false,
434
- "description": "Prompt injection ML classifier confidence (0-100)"
494
+ "description": "Combined prompt injection confidence (0-100). MAX of all detector scores (Pulse + DeepContext). Use injection_pulse_score / injection_deep_context_score for individual detector control"
435
495
  },
436
496
  {
437
497
  "key": "jailbreak_confidence",
438
498
  "type": "number",
439
499
  "required": false,
440
- "description": "Jailbreak detection ML classifier confidence (0-100)"
500
+ "description": "Combined jailbreak detection confidence (0-100). MAX of all detector scores (Pulse + DeepContext). Use jailbreak_pulse_score / jailbreak_deep_context_score for individual detector control"
501
+ },
502
+ {
503
+ "key": "injection_pulse_score",
504
+ "type": "number",
505
+ "required": false,
506
+ "description": "Highflame single-turn classifier score for prompt injection in tool arguments (0-100). Raw score from Pulse detector before combination with deep-context"
507
+ },
508
+ {
509
+ "key": "injection_deep_context_score",
510
+ "type": "number",
511
+ "required": false,
512
+ "description": "DeepContext multi-turn analyzer score for prompt injection in tool arguments (0-100). Tracks injection patterns across tool call history"
513
+ },
514
+ {
515
+ "key": "jailbreak_pulse_score",
516
+ "type": "number",
517
+ "required": false,
518
+ "description": "Highflame single-turn classifier score for jailbreak in tool arguments (0-100). Raw score from Pulse detector before combination with deep-context"
519
+ },
520
+ {
521
+ "key": "jailbreak_deep_context_score",
522
+ "type": "number",
523
+ "required": false,
524
+ "description": "DeepContext multi-turn analyzer score for jailbreak in tool arguments (0-100). Detects jailbreak escalation patterns across tool call turns"
441
525
  },
442
526
  {
443
527
  "key": "tool_poisoning_score",
@@ -576,6 +660,42 @@
576
660
  "type": "number",
577
661
  "required": false,
578
662
  "description": "Number of turns in the session where threats were detected"
663
+ },
664
+ {
665
+ "key": "session_max_injection_score",
666
+ "type": "number",
667
+ "required": false,
668
+ "description": "Highest prompt injection score seen in any turn of the session (0-100). Use for tiered responses: >75 hard block, >50 restrict permissions"
669
+ },
670
+ {
671
+ "key": "session_max_jailbreak_score",
672
+ "type": "number",
673
+ "required": false,
674
+ "description": "Highest jailbreak detection score seen in any turn of the session (0-100)"
675
+ },
676
+ {
677
+ "key": "session_max_command_injection_score",
678
+ "type": "number",
679
+ "required": false,
680
+ "description": "Highest command injection score seen in any turn of the session (0-100)"
681
+ },
682
+ {
683
+ "key": "session_max_pii_score",
684
+ "type": "number",
685
+ "required": false,
686
+ "description": "Highest PII risk score seen in any turn of the session (0-100)"
687
+ },
688
+ {
689
+ "key": "session_max_secret_score",
690
+ "type": "number",
691
+ "required": false,
692
+ "description": "Highest secret detection score seen in any turn of the session (0-100)"
693
+ },
694
+ {
695
+ "key": "session_cumulative_risk_score",
696
+ "type": "number",
697
+ "required": false,
698
+ "description": "Sum of per-turn risk scores across the session. Catches death-by-a-thousand-cuts where no single turn is high but cumulative risk is significant"
579
699
  }
580
700
  ]
581
701
  },
@@ -732,6 +852,42 @@
732
852
  "type": "number",
733
853
  "required": false,
734
854
  "description": "Number of turns in the session where threats were detected"
855
+ },
856
+ {
857
+ "key": "session_max_injection_score",
858
+ "type": "number",
859
+ "required": false,
860
+ "description": "Highest prompt injection score seen in any turn of the session (0-100). Use for tiered responses: >75 hard block, >50 restrict permissions"
861
+ },
862
+ {
863
+ "key": "session_max_jailbreak_score",
864
+ "type": "number",
865
+ "required": false,
866
+ "description": "Highest jailbreak detection score seen in any turn of the session (0-100)"
867
+ },
868
+ {
869
+ "key": "session_max_command_injection_score",
870
+ "type": "number",
871
+ "required": false,
872
+ "description": "Highest command injection score seen in any turn of the session (0-100)"
873
+ },
874
+ {
875
+ "key": "session_max_pii_score",
876
+ "type": "number",
877
+ "required": false,
878
+ "description": "Highest PII risk score seen in any turn of the session (0-100)"
879
+ },
880
+ {
881
+ "key": "session_max_secret_score",
882
+ "type": "number",
883
+ "required": false,
884
+ "description": "Highest secret detection score seen in any turn of the session (0-100)"
885
+ },
886
+ {
887
+ "key": "session_cumulative_risk_score",
888
+ "type": "number",
889
+ "required": false,
890
+ "description": "Sum of per-turn risk scores across the session. Catches death-by-a-thousand-cuts where no single turn is high but cumulative risk is significant"
735
891
  }
736
892
  ]
737
893
  },
@@ -888,6 +1044,42 @@
888
1044
  "type": "number",
889
1045
  "required": false,
890
1046
  "description": "Number of turns in the session where threats were detected"
1047
+ },
1048
+ {
1049
+ "key": "session_max_injection_score",
1050
+ "type": "number",
1051
+ "required": false,
1052
+ "description": "Highest prompt injection score seen in any turn of the session (0-100). Use for tiered responses: >75 hard block, >50 restrict permissions"
1053
+ },
1054
+ {
1055
+ "key": "session_max_jailbreak_score",
1056
+ "type": "number",
1057
+ "required": false,
1058
+ "description": "Highest jailbreak detection score seen in any turn of the session (0-100)"
1059
+ },
1060
+ {
1061
+ "key": "session_max_command_injection_score",
1062
+ "type": "number",
1063
+ "required": false,
1064
+ "description": "Highest command injection score seen in any turn of the session (0-100)"
1065
+ },
1066
+ {
1067
+ "key": "session_max_pii_score",
1068
+ "type": "number",
1069
+ "required": false,
1070
+ "description": "Highest PII risk score seen in any turn of the session (0-100)"
1071
+ },
1072
+ {
1073
+ "key": "session_max_secret_score",
1074
+ "type": "number",
1075
+ "required": false,
1076
+ "description": "Highest secret detection score seen in any turn of the session (0-100)"
1077
+ },
1078
+ {
1079
+ "key": "session_cumulative_risk_score",
1080
+ "type": "number",
1081
+ "required": false,
1082
+ "description": "Sum of per-turn risk scores across the session. Catches death-by-a-thousand-cuts where no single turn is high but cumulative risk is significant"
891
1083
  }
892
1084
  ]
893
1085
  },
@@ -1050,6 +1242,42 @@
1050
1242
  "type": "number",
1051
1243
  "required": false,
1052
1244
  "description": "Number of turns in the session where threats were detected"
1245
+ },
1246
+ {
1247
+ "key": "session_max_injection_score",
1248
+ "type": "number",
1249
+ "required": false,
1250
+ "description": "Highest prompt injection score seen in any turn of the session (0-100). Use for tiered responses: >75 hard block, >50 restrict permissions"
1251
+ },
1252
+ {
1253
+ "key": "session_max_jailbreak_score",
1254
+ "type": "number",
1255
+ "required": false,
1256
+ "description": "Highest jailbreak detection score seen in any turn of the session (0-100)"
1257
+ },
1258
+ {
1259
+ "key": "session_max_command_injection_score",
1260
+ "type": "number",
1261
+ "required": false,
1262
+ "description": "Highest command injection score seen in any turn of the session (0-100)"
1263
+ },
1264
+ {
1265
+ "key": "session_max_pii_score",
1266
+ "type": "number",
1267
+ "required": false,
1268
+ "description": "Highest PII risk score seen in any turn of the session (0-100)"
1269
+ },
1270
+ {
1271
+ "key": "session_max_secret_score",
1272
+ "type": "number",
1273
+ "required": false,
1274
+ "description": "Highest secret detection score seen in any turn of the session (0-100)"
1275
+ },
1276
+ {
1277
+ "key": "session_cumulative_risk_score",
1278
+ "type": "number",
1279
+ "required": false,
1280
+ "description": "Sum of per-turn risk scores across the session. Catches death-by-a-thousand-cuts where no single turn is high but cumulative risk is significant"
1053
1281
  }
1054
1282
  ]
1055
1283
  }