safetydrift 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- safetydrift-0.1.0/PKG-INFO +282 -0
- safetydrift-0.1.0/README.md +259 -0
- safetydrift-0.1.0/pyproject.toml +51 -0
- safetydrift-0.1.0/safetydrift/__init__.py +58 -0
- safetydrift-0.1.0/safetydrift/__main__.py +3 -0
- safetydrift-0.1.0/safetydrift/classifier.py +173 -0
- safetydrift-0.1.0/safetydrift/markov.py +130 -0
- safetydrift-0.1.0/safetydrift/mcp_server.py +176 -0
- safetydrift-0.1.0/safetydrift/policy.py +178 -0
- safetydrift-0.1.0/safetydrift/session.py +204 -0
- safetydrift-0.1.0/safetydrift.egg-info/PKG-INFO +282 -0
- safetydrift-0.1.0/safetydrift.egg-info/SOURCES.txt +16 -0
- safetydrift-0.1.0/safetydrift.egg-info/dependency_links.txt +1 -0
- safetydrift-0.1.0/safetydrift.egg-info/entry_points.txt +2 -0
- safetydrift-0.1.0/safetydrift.egg-info/requires.txt +5 -0
- safetydrift-0.1.0/safetydrift.egg-info/top_level.txt +1 -0
- safetydrift-0.1.0/setup.cfg +4 -0
- safetydrift-0.1.0/tests/test_safetydrift.py +192 -0
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: safetydrift
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Safety drift prediction for AI agents — the first open-source implementation of the SafetyDrift framework (arXiv:2603.27148)
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/YOUR_USERNAME/driftguard
|
|
7
|
+
Project-URL: Paper, https://arxiv.org/abs/2603.27148
|
|
8
|
+
Project-URL: Issues, https://github.com/YOUR_USERNAME/driftguard/issues
|
|
9
|
+
Keywords: ai-safety,llm-agents,mcp,drift-detection,model-context-protocol,agentic-ai,guardrails
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
Requires-Dist: numpy>=1.26
|
|
20
|
+
Provides-Extra: dev
|
|
21
|
+
Requires-Dist: pytest>=7.4; extra == "dev"
|
|
22
|
+
Requires-Dist: pytest-cov>=4.1; extra == "dev"
|
|
23
|
+
|
|
24
|
+
# SafetyDrift 🛡️
|
|
25
|
+
|
|
26
|
+
**The first open-source implementation of the SafetyDrift framework.**
|
|
27
|
+
|
|
28
|
+
> *"When an agent reads a confidential file, writes a summary, then emails it externally — no single step is unsafe, but the sequence is a data leak."*
|
|
29
|
+
> — SafetyDrift, arXiv:2603.27148 (March 2026)
|
|
30
|
+
|
|
31
|
+
DriftGuard predicts when individually safe AI agent actions are about to **compound into a safety violation**, and intervenes before it happens. It models agent safety trajectories as absorbing Markov chains — giving you a **P(violation within N steps)** score after every tool call.
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## The problem it solves
|
|
36
|
+
|
|
37
|
+
Every major AI agent framework (LangChain, AutoGen, CrewAI, Claude Code) trusts the agent. They check output content. They don't check **accumulated authority** or **trajectory risk**.
|
|
38
|
+
|
|
39
|
+
| Situation | Traditional guardrails | DriftGuard |
|
|
40
|
+
|-----------|----------------------|------------|
|
|
41
|
+
| Agent reads a secret | ✅ Allowed | ✅ Allowed (P=low) |
|
|
42
|
+
| Agent reads a secret, then opens customer CSV | ✅ Allowed | ⚠️ Warns (P rising) |
|
|
43
|
+
| Agent reads a secret, opens CSV, sends email | ✅ Allowed | 🚫 Blocked (P=87%) |
|
|
44
|
+
|
|
45
|
+
The SafetyDrift paper found: **in communication-capable agents, reaching even a mild risk state gives an 85% probability of a safety violation within 5 steps.** DriftGuard makes that prediction in real time, before the violation occurs.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Quick start
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install safetydrift
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from safetydrift import Session, InterventionAction
|
|
57
|
+
|
|
58
|
+
session = Session(task_type="default") # or: technical, information, autonomous, communication
|
|
59
|
+
|
|
60
|
+
# Call before EVERY tool execution
|
|
61
|
+
result = session.gate("read_file", {"path": "/workspace/customer_data.csv"})
|
|
62
|
+
|
|
63
|
+
if result.action == InterventionAction.BLOCK:
|
|
64
|
+
raise RuntimeError(f"DriftGuard blocked: {result.reason}")
|
|
65
|
+
elif result.action == InterventionAction.PAUSE:
|
|
66
|
+
approved = ask_human_for_approval(result.to_dict())
|
|
67
|
+
if not approved:
|
|
68
|
+
raise RuntimeError("Human rejected action")
|
|
69
|
+
|
|
70
|
+
# Safe to proceed
|
|
71
|
+
actually_read_file(...)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## How it works
|
|
77
|
+
|
|
78
|
+
DriftGuard tracks three cumulative safety dimensions per session:
|
|
79
|
+
|
|
80
|
+
| Dimension | Description | Levels |
|
|
81
|
+
|-----------|-------------|--------|
|
|
82
|
+
| **Data Exposure** | Sensitivity of data accessed | None → Public → Internal → Confidential → Sensitive |
|
|
83
|
+
| **Tool Escalation** | Capability level reached | None → Read → Write → Network → External |
|
|
84
|
+
| **Reversibility** | Can actions be undone? | Fully → Mostly → Mixed → Mostly Not → Irreversible |
|
|
85
|
+
|
|
86
|
+
State is **monotonic**: it only ever increases. After each tool call, DriftGuard:
|
|
87
|
+
1. Classifies the call into risk dimensions
|
|
88
|
+
2. Projects the cumulative state forward
|
|
89
|
+
3. Runs Markov chain absorption analysis: **P(violation within N steps)**
|
|
90
|
+
4. Applies the configured policy (WARN / PAUSE / BLOCK)
|
|
91
|
+
|
|
92
|
+
### Markov chain model
|
|
93
|
+
|
|
94
|
+
From the SafetyDrift paper, safety violations follow absorbing Markov chain dynamics:
|
|
95
|
+
|
|
96
|
+
```
|
|
97
|
+
SAFE → LOW → MODERATE → HIGH → CRITICAL → [VIOLATION]
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Every agent will eventually reach a violation if left unsupervised — the practical question is **when**, not **if**. DriftGuard computes the finite-horizon absorption probability:
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
P(violation | state, horizon) = [T^horizon][state, violation_state]
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
where `T` is a task-type-calibrated transition matrix.
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## Demo output
|
|
111
|
+
|
|
112
|
+
```
|
|
113
|
+
Step 1: web_search Risk: MODERATE P(viol): 31.1% ✅ WARN
|
|
114
|
+
Step 2: read_file (config) Risk: MODERATE P(viol): 31.5% ✅ WARN
|
|
115
|
+
Step 3: read_file (customers) Risk: MODERATE P(viol): 31.9% ✅ WARN
|
|
116
|
+
Step 4: write_file (summary) Risk: HIGH P(viol): 54.8% ⏸ PAUSE
|
|
117
|
+
Step 5: send_email Risk: CRITICAL P(viol): 86.8% 🚫 BLOCK
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Each step looks safe in isolation. The sequence is a data leak. DriftGuard catches it at step 4 (pause) and blocks it at step 5.
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## Use as an MCP server
|
|
125
|
+
|
|
126
|
+
DriftGuard ships as a stdio MCP server. Any MCP-compatible agent (Claude Code, Cursor, GitHub Copilot) can call it directly.
|
|
127
|
+
|
|
128
|
+
**Add to your `mcp.json`:**
|
|
129
|
+
```json
|
|
130
|
+
{
|
|
131
|
+
"driftguard": {
|
|
132
|
+
"command": "python",
|
|
133
|
+
"args": ["-m", "safetydrift"]
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
**Available MCP tools:**
|
|
139
|
+
|
|
140
|
+
| Tool | Description |
|
|
141
|
+
|------|-------------|
|
|
142
|
+
| `dg_gate` | Evaluate a tool call. Returns action: ALLOW / WARN / PAUSE / BLOCK |
|
|
143
|
+
| `dg_session_state` | Get current cumulative risk state |
|
|
144
|
+
| `dg_summary` | Get session stats (blocks, pauses, step count) |
|
|
145
|
+
| `dg_reset` | Reset session for a new task |
|
|
146
|
+
|
|
147
|
+
**Example system prompt addition for Claude Code:**
|
|
148
|
+
```
|
|
149
|
+
Before executing any tool that reads files, makes network requests, or
|
|
150
|
+
writes data, call dg_gate with the tool name and arguments. If the result
|
|
151
|
+
action is BLOCK, do not proceed. If PAUSE, describe the action and ask the
|
|
152
|
+
user for approval before continuing.
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
---
|
|
156
|
+
|
|
157
|
+
## Configuration
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from safetydrift import Session
|
|
161
|
+
from safetydrift.policy import PolicyConfig, PolicyThreshold
|
|
162
|
+
from safetydrift import InterventionAction
|
|
163
|
+
|
|
164
|
+
config = PolicyConfig(
|
|
165
|
+
horizon=5, # steps ahead to evaluate
|
|
166
|
+
task_type="communication", # higher baseline risk
|
|
167
|
+
thresholds=[
|
|
168
|
+
PolicyThreshold(0.90, InterventionAction.BLOCK, "Critical risk — blocked"),
|
|
169
|
+
PolicyThreshold(0.60, InterventionAction.PAUSE, "High risk — approval needed"),
|
|
170
|
+
PolicyThreshold(0.30, InterventionAction.WARN, "Elevated risk — warned"),
|
|
171
|
+
PolicyThreshold(0.00, InterventionAction.LOG_ONLY, "Safe — logged"),
|
|
172
|
+
],
|
|
173
|
+
always_block={
|
|
174
|
+
"send_mass_email",
|
|
175
|
+
"delete_production_database",
|
|
176
|
+
"wipe_all_data",
|
|
177
|
+
},
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
session = Session(config=config, task_type="communication")
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
### Custom classifier rules
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from safetydrift import add_rule, ClassifierRule
|
|
187
|
+
from safetydrift import DataExposure, ToolEscalation, Reversibility
|
|
188
|
+
|
|
189
|
+
# Add your own tool patterns
|
|
190
|
+
add_rule(ClassifierRule(
|
|
191
|
+
pattern=r"jira.*create.*ticket",
|
|
192
|
+
data_exposure=DataExposure.INTERNAL,
|
|
193
|
+
tool_escalation=ToolEscalation.EXTERNAL,
|
|
194
|
+
reversibility=Reversibility.MOSTLY,
|
|
195
|
+
description="Create Jira ticket — external but recoverable",
|
|
196
|
+
))
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Task types
|
|
202
|
+
|
|
203
|
+
| Task type | Typical use | Baseline violation rate |
|
|
204
|
+
|-----------|-------------|------------------------|
|
|
205
|
+
| `technical` | Code editing, local file ops | Very low (~1–5% per step) |
|
|
206
|
+
| `information` | Research, browsing, summarising | Low (~8–15%) |
|
|
207
|
+
| `default` | General-purpose agents | Medium (~8%) |
|
|
208
|
+
| `autonomous` | Multi-step autonomous tasks | Medium-high (~12%) |
|
|
209
|
+
| `communication` | Email, messaging, posting agents | High (~18%) |
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## Integration examples
|
|
214
|
+
|
|
215
|
+
### LangChain
|
|
216
|
+
```python
|
|
217
|
+
from safetydrift import Session, InterventionAction
|
|
218
|
+
|
|
219
|
+
session = Session(task_type="default")
|
|
220
|
+
|
|
221
|
+
class GuardedTool(BaseTool):
|
|
222
|
+
def _run(self, *args, **kwargs):
|
|
223
|
+
result = session.gate(self.name, kwargs)
|
|
224
|
+
if result.action == InterventionAction.BLOCK:
|
|
225
|
+
raise ToolException(f"DriftGuard: {result.reason}")
|
|
226
|
+
return self._actual_run(*args, **kwargs)
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### OpenAI Agents SDK
|
|
230
|
+
```python
|
|
231
|
+
from agents import function_tool
|
|
232
|
+
from safetydrift import Session, InterventionAction
|
|
233
|
+
|
|
234
|
+
session = Session(task_type="autonomous")
|
|
235
|
+
|
|
236
|
+
def guarded(fn):
|
|
237
|
+
def wrapper(**kwargs):
|
|
238
|
+
r = session.gate(fn.__name__, kwargs)
|
|
239
|
+
if r.action == InterventionAction.BLOCK:
|
|
240
|
+
return f"[BLOCKED by DriftGuard: {r.reason}]"
|
|
241
|
+
return fn(**kwargs)
|
|
242
|
+
return function_tool(wrapper)
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
---
|
|
246
|
+
|
|
247
|
+
## Background: the SafetyDrift paper
|
|
248
|
+
|
|
249
|
+
This library implements the framework from:
|
|
250
|
+
|
|
251
|
+
> **SafetyDrift: Predicting When AI Agents Cross the Line Before They Actually Do**
|
|
252
|
+
> Aditya Dhodapkar, Farhaan Pishori (March 2026)
|
|
253
|
+
> [arXiv:2603.27148](https://arxiv.org/abs/2603.27148)
|
|
254
|
+
|
|
255
|
+
Key findings we implement:
|
|
256
|
+
- Safety state modelled as an absorbing Markov chain across 3 dimensions
|
|
257
|
+
- Every agent has absorption probability 1.0 — the question is *when*, not *if*
|
|
258
|
+
- Communication tasks: 85% violation probability within 5 steps from mild-risk state
|
|
259
|
+
- Technical tasks: below 5% from any state
|
|
260
|
+
- "Points of no return" are sharply task-dependent
|
|
261
|
+
|
|
262
|
+
---
|
|
263
|
+
|
|
264
|
+
## Contributing
|
|
265
|
+
|
|
266
|
+
The highest-value contributions right now:
|
|
267
|
+
|
|
268
|
+
1. **Real trace data** — If you have agent session traces (with ground truth on whether violations occurred), they can calibrate the transition matrices far better than our heuristic approximation
|
|
269
|
+
2. **Framework adapters** — LangGraph, CrewAI, AutoGen, Google Genkit
|
|
270
|
+
3. **CI/CD integration** — GitHub Actions workflow that gates agent PRs
|
|
271
|
+
|
|
272
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for details.
|
|
273
|
+
|
|
274
|
+
---
|
|
275
|
+
|
|
276
|
+
## License
|
|
277
|
+
|
|
278
|
+
MIT — use it in your agent pipelines, commercial or otherwise.
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
282
|
+
*Inspired by and implementing [SafetyDrift (arXiv:2603.27148)](https://arxiv.org/abs/2603.27148). This project is not affiliated with the paper's authors.*
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
# SafetyDrift 🛡️
|
|
2
|
+
|
|
3
|
+
**The first open-source implementation of the SafetyDrift framework.**
|
|
4
|
+
|
|
5
|
+
> *"When an agent reads a confidential file, writes a summary, then emails it externally — no single step is unsafe, but the sequence is a data leak."*
|
|
6
|
+
> — SafetyDrift, arXiv:2603.27148 (March 2026)
|
|
7
|
+
|
|
8
|
+
DriftGuard predicts when individually safe AI agent actions are about to **compound into a safety violation**, and intervenes before it happens. It models agent safety trajectories as absorbing Markov chains — giving you a **P(violation within N steps)** score after every tool call.
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## The problem it solves
|
|
13
|
+
|
|
14
|
+
Every major AI agent framework (LangChain, AutoGen, CrewAI, Claude Code) trusts the agent. They check output content. They don't check **accumulated authority** or **trajectory risk**.
|
|
15
|
+
|
|
16
|
+
| Situation | Traditional guardrails | DriftGuard |
|
|
17
|
+
|-----------|----------------------|------------|
|
|
18
|
+
| Agent reads a secret | ✅ Allowed | ✅ Allowed (P=low) |
|
|
19
|
+
| Agent reads a secret, then opens customer CSV | ✅ Allowed | ⚠️ Warns (P rising) |
|
|
20
|
+
| Agent reads a secret, opens CSV, sends email | ✅ Allowed | 🚫 Blocked (P=87%) |
|
|
21
|
+
|
|
22
|
+
The SafetyDrift paper found: **in communication-capable agents, reaching even a mild risk state gives an 85% probability of a safety violation within 5 steps.** DriftGuard makes that prediction in real time, before the violation occurs.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick start
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install safetydrift
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
from safetydrift import Session, InterventionAction
|
|
34
|
+
|
|
35
|
+
session = Session(task_type="default") # or: technical, information, autonomous, communication
|
|
36
|
+
|
|
37
|
+
# Call before EVERY tool execution
|
|
38
|
+
result = session.gate("read_file", {"path": "/workspace/customer_data.csv"})
|
|
39
|
+
|
|
40
|
+
if result.action == InterventionAction.BLOCK:
|
|
41
|
+
raise RuntimeError(f"DriftGuard blocked: {result.reason}")
|
|
42
|
+
elif result.action == InterventionAction.PAUSE:
|
|
43
|
+
approved = ask_human_for_approval(result.to_dict())
|
|
44
|
+
if not approved:
|
|
45
|
+
raise RuntimeError("Human rejected action")
|
|
46
|
+
|
|
47
|
+
# Safe to proceed
|
|
48
|
+
actually_read_file(...)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## How it works
|
|
54
|
+
|
|
55
|
+
DriftGuard tracks three cumulative safety dimensions per session:
|
|
56
|
+
|
|
57
|
+
| Dimension | Description | Levels |
|
|
58
|
+
|-----------|-------------|--------|
|
|
59
|
+
| **Data Exposure** | Sensitivity of data accessed | None → Public → Internal → Confidential → Sensitive |
|
|
60
|
+
| **Tool Escalation** | Capability level reached | None → Read → Write → Network → External |
|
|
61
|
+
| **Reversibility** | Can actions be undone? | Fully → Mostly → Mixed → Mostly Not → Irreversible |
|
|
62
|
+
|
|
63
|
+
State is **monotonic**: it only ever increases. After each tool call, DriftGuard:
|
|
64
|
+
1. Classifies the call into risk dimensions
|
|
65
|
+
2. Projects the cumulative state forward
|
|
66
|
+
3. Runs Markov chain absorption analysis: **P(violation within N steps)**
|
|
67
|
+
4. Applies the configured policy (WARN / PAUSE / BLOCK)
|
|
68
|
+
|
|
69
|
+
### Markov chain model
|
|
70
|
+
|
|
71
|
+
From the SafetyDrift paper, safety violations follow absorbing Markov chain dynamics:
|
|
72
|
+
|
|
73
|
+
```
|
|
74
|
+
SAFE → LOW → MODERATE → HIGH → CRITICAL → [VIOLATION]
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Every agent will eventually reach a violation if left unsupervised — the practical question is **when**, not **if**. DriftGuard computes the finite-horizon absorption probability:
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
P(violation | state, horizon) = [T^horizon][state, violation_state]
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
where `T` is a task-type-calibrated transition matrix.
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## Demo output
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
Step 1: web_search Risk: MODERATE P(viol): 31.1% ✅ WARN
|
|
91
|
+
Step 2: read_file (config) Risk: MODERATE P(viol): 31.5% ✅ WARN
|
|
92
|
+
Step 3: read_file (customers) Risk: MODERATE P(viol): 31.9% ✅ WARN
|
|
93
|
+
Step 4: write_file (summary) Risk: HIGH P(viol): 54.8% ⏸ PAUSE
|
|
94
|
+
Step 5: send_email Risk: CRITICAL P(viol): 86.8% 🚫 BLOCK
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Each step looks safe in isolation. The sequence is a data leak. DriftGuard catches it at step 4 (pause) and blocks it at step 5.
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## Use as an MCP server
|
|
102
|
+
|
|
103
|
+
DriftGuard ships as a stdio MCP server. Any MCP-compatible agent (Claude Code, Cursor, GitHub Copilot) can call it directly.
|
|
104
|
+
|
|
105
|
+
**Add to your `mcp.json`:**
|
|
106
|
+
```json
|
|
107
|
+
{
|
|
108
|
+
"driftguard": {
|
|
109
|
+
"command": "python",
|
|
110
|
+
"args": ["-m", "safetydrift"]
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
**Available MCP tools:**
|
|
116
|
+
|
|
117
|
+
| Tool | Description |
|
|
118
|
+
|------|-------------|
|
|
119
|
+
| `dg_gate` | Evaluate a tool call. Returns action: ALLOW / WARN / PAUSE / BLOCK |
|
|
120
|
+
| `dg_session_state` | Get current cumulative risk state |
|
|
121
|
+
| `dg_summary` | Get session stats (blocks, pauses, step count) |
|
|
122
|
+
| `dg_reset` | Reset session for a new task |
|
|
123
|
+
|
|
124
|
+
**Example system prompt addition for Claude Code:**
|
|
125
|
+
```
|
|
126
|
+
Before executing any tool that reads files, makes network requests, or
|
|
127
|
+
writes data, call dg_gate with the tool name and arguments. If the result
|
|
128
|
+
action is BLOCK, do not proceed. If PAUSE, describe the action and ask the
|
|
129
|
+
user for approval before continuing.
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## Configuration
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from safetydrift import Session
|
|
138
|
+
from safetydrift.policy import PolicyConfig, PolicyThreshold
|
|
139
|
+
from safetydrift import InterventionAction
|
|
140
|
+
|
|
141
|
+
config = PolicyConfig(
|
|
142
|
+
horizon=5, # steps ahead to evaluate
|
|
143
|
+
task_type="communication", # higher baseline risk
|
|
144
|
+
thresholds=[
|
|
145
|
+
PolicyThreshold(0.90, InterventionAction.BLOCK, "Critical risk — blocked"),
|
|
146
|
+
PolicyThreshold(0.60, InterventionAction.PAUSE, "High risk — approval needed"),
|
|
147
|
+
PolicyThreshold(0.30, InterventionAction.WARN, "Elevated risk — warned"),
|
|
148
|
+
PolicyThreshold(0.00, InterventionAction.LOG_ONLY, "Safe — logged"),
|
|
149
|
+
],
|
|
150
|
+
always_block={
|
|
151
|
+
"send_mass_email",
|
|
152
|
+
"delete_production_database",
|
|
153
|
+
"wipe_all_data",
|
|
154
|
+
},
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
session = Session(config=config, task_type="communication")
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### Custom classifier rules
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
from safetydrift import add_rule, ClassifierRule
|
|
164
|
+
from safetydrift import DataExposure, ToolEscalation, Reversibility
|
|
165
|
+
|
|
166
|
+
# Add your own tool patterns
|
|
167
|
+
add_rule(ClassifierRule(
|
|
168
|
+
pattern=r"jira.*create.*ticket",
|
|
169
|
+
data_exposure=DataExposure.INTERNAL,
|
|
170
|
+
tool_escalation=ToolEscalation.EXTERNAL,
|
|
171
|
+
reversibility=Reversibility.MOSTLY,
|
|
172
|
+
description="Create Jira ticket — external but recoverable",
|
|
173
|
+
))
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## Task types
|
|
179
|
+
|
|
180
|
+
| Task type | Typical use | Baseline violation rate |
|
|
181
|
+
|-----------|-------------|------------------------|
|
|
182
|
+
| `technical` | Code editing, local file ops | Very low (~1–5% per step) |
|
|
183
|
+
| `information` | Research, browsing, summarising | Low (~8–15%) |
|
|
184
|
+
| `default` | General-purpose agents | Medium (~8%) |
|
|
185
|
+
| `autonomous` | Multi-step autonomous tasks | Medium-high (~12%) |
|
|
186
|
+
| `communication` | Email, messaging, posting agents | High (~18%) |
|
|
187
|
+
|
|
188
|
+
---
|
|
189
|
+
|
|
190
|
+
## Integration examples
|
|
191
|
+
|
|
192
|
+
### LangChain
|
|
193
|
+
```python
|
|
194
|
+
from safetydrift import Session, InterventionAction
|
|
195
|
+
|
|
196
|
+
session = Session(task_type="default")
|
|
197
|
+
|
|
198
|
+
class GuardedTool(BaseTool):
|
|
199
|
+
def _run(self, *args, **kwargs):
|
|
200
|
+
result = session.gate(self.name, kwargs)
|
|
201
|
+
if result.action == InterventionAction.BLOCK:
|
|
202
|
+
raise ToolException(f"DriftGuard: {result.reason}")
|
|
203
|
+
return self._actual_run(*args, **kwargs)
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
### OpenAI Agents SDK
|
|
207
|
+
```python
|
|
208
|
+
from agents import function_tool
|
|
209
|
+
from safetydrift import Session, InterventionAction
|
|
210
|
+
|
|
211
|
+
session = Session(task_type="autonomous")
|
|
212
|
+
|
|
213
|
+
def guarded(fn):
|
|
214
|
+
def wrapper(**kwargs):
|
|
215
|
+
r = session.gate(fn.__name__, kwargs)
|
|
216
|
+
if r.action == InterventionAction.BLOCK:
|
|
217
|
+
return f"[BLOCKED by DriftGuard: {r.reason}]"
|
|
218
|
+
return fn(**kwargs)
|
|
219
|
+
return function_tool(wrapper)
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
---
|
|
223
|
+
|
|
224
|
+
## Background: the SafetyDrift paper
|
|
225
|
+
|
|
226
|
+
This library implements the framework from:
|
|
227
|
+
|
|
228
|
+
> **SafetyDrift: Predicting When AI Agents Cross the Line Before They Actually Do**
|
|
229
|
+
> Aditya Dhodapkar, Farhaan Pishori (March 2026)
|
|
230
|
+
> [arXiv:2603.27148](https://arxiv.org/abs/2603.27148)
|
|
231
|
+
|
|
232
|
+
Key findings we implement:
|
|
233
|
+
- Safety state modelled as an absorbing Markov chain across 3 dimensions
|
|
234
|
+
- Every agent has absorption probability 1.0 — the question is *when*, not *if*
|
|
235
|
+
- Communication tasks: 85% violation probability within 5 steps from mild-risk state
|
|
236
|
+
- Technical tasks: below 5% from any state
|
|
237
|
+
- "Points of no return" are sharply task-dependent
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
241
|
+
## Contributing
|
|
242
|
+
|
|
243
|
+
The highest-value contributions right now:
|
|
244
|
+
|
|
245
|
+
1. **Real trace data** — If you have agent session traces (with ground truth on whether violations occurred), they can calibrate the transition matrices far better than our heuristic approximation
|
|
246
|
+
2. **Framework adapters** — LangGraph, CrewAI, AutoGen, Google Genkit
|
|
247
|
+
3. **CI/CD integration** — GitHub Actions workflow that gates agent PRs
|
|
248
|
+
|
|
249
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for details.
|
|
250
|
+
|
|
251
|
+
---
|
|
252
|
+
|
|
253
|
+
## License
|
|
254
|
+
|
|
255
|
+
MIT — use it in your agent pipelines, commercial or otherwise.
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
*Inspired by and implementing [SafetyDrift (arXiv:2603.27148)](https://arxiv.org/abs/2603.27148). This project is not affiliated with the paper's authors.*
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "safetydrift"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Safety drift prediction for AI agents — the first open-source implementation of the SafetyDrift framework (arXiv:2603.27148)"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
|
|
13
|
+
keywords = [
|
|
14
|
+
"ai-safety", "llm-agents", "mcp", "drift-detection",
|
|
15
|
+
"model-context-protocol", "agentic-ai", "guardrails",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
classifiers = [
|
|
19
|
+
"Development Status :: 3 - Alpha",
|
|
20
|
+
"Intended Audience :: Developers",
|
|
21
|
+
"License :: OSI Approved :: MIT License",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
dependencies = [
|
|
29
|
+
"numpy>=1.26",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
dev = [
|
|
34
|
+
"pytest>=7.4",
|
|
35
|
+
"pytest-cov>=4.1",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[project.urls]
|
|
39
|
+
Homepage = "https://github.com/YOUR_USERNAME/driftguard"
|
|
40
|
+
Paper = "https://arxiv.org/abs/2603.27148"
|
|
41
|
+
Issues = "https://github.com/YOUR_USERNAME/driftguard/issues"
|
|
42
|
+
|
|
43
|
+
[project.scripts]
|
|
44
|
+
driftguard-mcp = "safetydrift.mcp_server:run_stdio"
|
|
45
|
+
|
|
46
|
+
[tool.setuptools.packages.find]
|
|
47
|
+
where = ["."]
|
|
48
|
+
include = ["safetydrift*"]
|
|
49
|
+
|
|
50
|
+
[tool.pytest.ini_options]
|
|
51
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SafetyDrift — Safety drift prediction for AI agents.
|
|
3
|
+
|
|
4
|
+
The first open-source implementation of the SafetyDrift framework
|
|
5
|
+
(Dhodapkar & Pishori, arXiv:2603.27148, March 2026).
|
|
6
|
+
|
|
7
|
+
Predicts when individually safe AI agent actions are about to compound
|
|
8
|
+
into a safety violation, using absorbing Markov chain analysis.
|
|
9
|
+
|
|
10
|
+
Quick start:
|
|
11
|
+
from safetydrift import Session, InterventionAction
|
|
12
|
+
|
|
13
|
+
session = Session(task_type="communication")
|
|
14
|
+
|
|
15
|
+
# Before any tool call:
|
|
16
|
+
result = session.gate("send_email", {"to": "boss@corp.com", "body": "..."})
|
|
17
|
+
|
|
18
|
+
if result.action == InterventionAction.BLOCK:
|
|
19
|
+
raise RuntimeError(f"safetydrift blocked: {result.reason}")
|
|
20
|
+
elif result.action == InterventionAction.PAUSE:
|
|
21
|
+
approved = ask_human_approval(result)
|
|
22
|
+
if not approved:
|
|
23
|
+
raise RuntimeError("Human rejected action")
|
|
24
|
+
|
|
25
|
+
# Safe to proceed
|
|
26
|
+
actually_send_email(...)
|
|
27
|
+
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
from .drift_types import (
|
|
31
|
+
DataExposure,
|
|
32
|
+
DriftAssessment,
|
|
33
|
+
InterventionAction,
|
|
34
|
+
Reversibility,
|
|
35
|
+
RiskLevel,
|
|
36
|
+
SafetyState,
|
|
37
|
+
ToolCall,
|
|
38
|
+
ToolEscalation,
|
|
39
|
+
)
|
|
40
|
+
from .classifier import classify, add_rule, ClassifierRule
|
|
41
|
+
from .markov import violation_probability, steps_to_threshold
|
|
42
|
+
from .policy import PolicyConfig, PolicyEngine, PolicyThreshold
|
|
43
|
+
from .session import Session, AuditEntry
|
|
44
|
+
|
|
45
|
+
__version__ = "0.1.0"
|
|
46
|
+
__all__ = [
|
|
47
|
+
# Types
|
|
48
|
+
"DataExposure", "ToolEscalation", "Reversibility", "RiskLevel",
|
|
49
|
+
"InterventionAction", "SafetyState", "ToolCall", "DriftAssessment",
|
|
50
|
+
# Classifier
|
|
51
|
+
"classify", "add_rule", "ClassifierRule",
|
|
52
|
+
# Markov
|
|
53
|
+
"violation_probability", "steps_to_threshold",
|
|
54
|
+
# Policy
|
|
55
|
+
"PolicyConfig", "PolicyEngine", "PolicyThreshold",
|
|
56
|
+
# Session
|
|
57
|
+
"Session", "AuditEntry",
|
|
58
|
+
]
|