evalguardai 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalguardai-1.1.0/PKG-INFO +362 -0
- evalguardai-1.1.0/README.md +305 -0
- evalguardai-1.1.0/evalguard/__init__.py +42 -0
- evalguardai-1.1.0/evalguard/anthropic.py +182 -0
- evalguardai-1.1.0/evalguard/bedrock.py +280 -0
- evalguardai-1.1.0/evalguard/client.py +516 -0
- evalguardai-1.1.0/evalguard/crewai.py +189 -0
- evalguardai-1.1.0/evalguard/fastapi.py +273 -0
- evalguardai-1.1.0/evalguard/guardrails.py +160 -0
- evalguardai-1.1.0/evalguard/langchain.py +218 -0
- evalguardai-1.1.0/evalguard/nemoclaw.py +251 -0
- evalguardai-1.1.0/evalguard/openai.py +194 -0
- evalguardai-1.1.0/evalguard/types.py +142 -0
- evalguardai-1.1.0/evalguardai.egg-info/PKG-INFO +362 -0
- evalguardai-1.1.0/evalguardai.egg-info/SOURCES.txt +20 -0
- evalguardai-1.1.0/evalguardai.egg-info/dependency_links.txt +1 -0
- evalguardai-1.1.0/evalguardai.egg-info/requires.txt +32 -0
- evalguardai-1.1.0/evalguardai.egg-info/top_level.txt +1 -0
- evalguardai-1.1.0/pyproject.toml +78 -0
- evalguardai-1.1.0/setup.cfg +4 -0
- evalguardai-1.1.0/setup.py +64 -0
- evalguardai-1.1.0/tests/test_client.py +430 -0
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: evalguardai
|
|
3
|
+
Version: 1.1.0
|
|
4
|
+
Summary: Python SDK for EvalGuard -- evaluate, red-team, and guard LLM applications with drop-in framework integrations
|
|
5
|
+
Home-page: https://github.com/EvalGuardAi/evalguard
|
|
6
|
+
Author: EvalGuard
|
|
7
|
+
Author-email: EvalGuard <support@evalguard.ai>
|
|
8
|
+
License: MIT
|
|
9
|
+
Project-URL: Homepage, https://evalguard.ai
|
|
10
|
+
Project-URL: Repository, https://github.com/EvalGuardAi/evalguard
|
|
11
|
+
Project-URL: Documentation, https://docs.evalguard.ai/python-sdk
|
|
12
|
+
Project-URL: Issues, https://github.com/EvalGuardAi/evalguard/issues
|
|
13
|
+
Project-URL: Changelog, https://github.com/EvalGuardAi/evalguard/releases
|
|
14
|
+
Keywords: llm,evaluation,ai,security,red-team,prompt-injection,guardrails,ai-safety,llm-security,agent-evaluation,monitoring,evalguard,openai,anthropic,langchain,bedrock,crewai,fastapi
|
|
15
|
+
Classifier: Development Status :: 4 - Beta
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
25
|
+
Classifier: Topic :: Security
|
|
26
|
+
Classifier: Topic :: Software Development :: Testing
|
|
27
|
+
Classifier: Typing :: Typed
|
|
28
|
+
Requires-Python: >=3.9
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
Requires-Dist: requests>=2.28.0
|
|
31
|
+
Provides-Extra: openai
|
|
32
|
+
Requires-Dist: openai>=1.0.0; extra == "openai"
|
|
33
|
+
Provides-Extra: anthropic
|
|
34
|
+
Requires-Dist: anthropic>=0.18.0; extra == "anthropic"
|
|
35
|
+
Provides-Extra: langchain
|
|
36
|
+
Requires-Dist: langchain-core>=0.1.0; extra == "langchain"
|
|
37
|
+
Provides-Extra: crewai
|
|
38
|
+
Requires-Dist: crewai>=0.1.0; extra == "crewai"
|
|
39
|
+
Provides-Extra: bedrock
|
|
40
|
+
Requires-Dist: boto3>=1.28.0; extra == "bedrock"
|
|
41
|
+
Provides-Extra: fastapi
|
|
42
|
+
Requires-Dist: fastapi>=0.100.0; extra == "fastapi"
|
|
43
|
+
Provides-Extra: all
|
|
44
|
+
Requires-Dist: openai>=1.0.0; extra == "all"
|
|
45
|
+
Requires-Dist: anthropic>=0.18.0; extra == "all"
|
|
46
|
+
Requires-Dist: langchain-core>=0.1.0; extra == "all"
|
|
47
|
+
Requires-Dist: crewai>=0.1.0; extra == "all"
|
|
48
|
+
Requires-Dist: boto3>=1.28.0; extra == "all"
|
|
49
|
+
Requires-Dist: fastapi>=0.100.0; extra == "all"
|
|
50
|
+
Provides-Extra: dev
|
|
51
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
52
|
+
Requires-Dist: pytest-mock>=3.10; extra == "dev"
|
|
53
|
+
Requires-Dist: responses>=0.23; extra == "dev"
|
|
54
|
+
Dynamic: author
|
|
55
|
+
Dynamic: home-page
|
|
56
|
+
Dynamic: requires-python
|
|
57
|
+
|
|
58
|
+
# evalguardai
|
|
59
|
+
|
|
60
|
+
[](https://pypi.org/project/evalguardai/)
|
|
61
|
+
[](https://opensource.org/licenses/MIT)
|
|
62
|
+
[](https://www.python.org/downloads/)
|
|
63
|
+
|
|
64
|
+
Python SDK for [EvalGuard](https://evalguard.ai) -- evaluate, red-team, and guard LLM applications with **drop-in framework integrations**.
|
|
65
|
+
|
|
66
|
+
## Installation
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
# Core SDK
|
|
70
|
+
pip install evalguardai
|
|
71
|
+
|
|
72
|
+
# With framework extras
|
|
73
|
+
pip install evalguardai[openai]
|
|
74
|
+
pip install evalguardai[anthropic]
|
|
75
|
+
pip install evalguardai[langchain]
|
|
76
|
+
pip install evalguardai[bedrock]
|
|
77
|
+
pip install evalguardai[crewai]
|
|
78
|
+
pip install evalguardai[fastapi]
|
|
79
|
+
|
|
80
|
+
# Everything
|
|
81
|
+
pip install evalguardai[all]
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Quick Start
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
from evalguard import EvalGuardClient
|
|
88
|
+
|
|
89
|
+
client = EvalGuardClient(api_key="eg_live_...")
|
|
90
|
+
|
|
91
|
+
# Run an evaluation
|
|
92
|
+
result = client.run_eval({
|
|
93
|
+
"model": "gpt-4o",
|
|
94
|
+
"prompt": "Answer: {{input}}",
|
|
95
|
+
"cases": [
|
|
96
|
+
{"input": "What is 2+2?", "expectedOutput": "4"},
|
|
97
|
+
],
|
|
98
|
+
"scorers": ["exact-match", "contains"],
|
|
99
|
+
})
|
|
100
|
+
print(f"Score: {result['score']}, Pass rate: {result['passRate']}")
|
|
101
|
+
|
|
102
|
+
# Check the firewall
|
|
103
|
+
fw = client.check_firewall("Ignore all previous instructions")
|
|
104
|
+
print(f"Action: {fw['action']}") # "block"
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
## Framework Integrations
|
|
110
|
+
|
|
111
|
+
Every integration is a **drop-in wrapper** -- add two lines and your existing code gets automatic guardrails, traces, and observability.
|
|
112
|
+
|
|
113
|
+
### OpenAI
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from evalguard.openai import wrap
|
|
117
|
+
from openai import OpenAI
|
|
118
|
+
|
|
119
|
+
client = wrap(OpenAI(), api_key="eg_...", project_id="proj_...")
|
|
120
|
+
|
|
121
|
+
# Use exactly like normal -- guardrails are automatic
|
|
122
|
+
response = client.chat.completions.create(
|
|
123
|
+
model="gpt-4o",
|
|
124
|
+
messages=[{"role": "user", "content": "Hello, world!"}],
|
|
125
|
+
)
|
|
126
|
+
print(response.choices[0].message.content)
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
All calls to `chat.completions.create()` are intercepted:
|
|
130
|
+
- **Pre-LLM**: Input is checked for prompt injection, PII, etc.
|
|
131
|
+
- **Post-LLM**: Response + latency + token usage are traced to EvalGuard.
|
|
132
|
+
- **Violations**: Raise `GuardrailViolation` (or log-only with `block_on_violation=False`).
|
|
133
|
+
|
|
134
|
+
### Anthropic
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from evalguard.anthropic import wrap
|
|
138
|
+
from anthropic import Anthropic
|
|
139
|
+
|
|
140
|
+
client = wrap(Anthropic(), api_key="eg_...", project_id="proj_...")
|
|
141
|
+
|
|
142
|
+
response = client.messages.create(
|
|
143
|
+
model="claude-sonnet-4-20250514",
|
|
144
|
+
max_tokens=1024,
|
|
145
|
+
messages=[{"role": "user", "content": "Explain quantum computing"}],
|
|
146
|
+
)
|
|
147
|
+
print(response.content[0].text)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
Intercepts `messages.create()` with the same pre/post guardrail pattern.
|
|
151
|
+
|
|
152
|
+
### LangChain
|
|
153
|
+
|
|
154
|
+
```python
|
|
155
|
+
from evalguard.langchain import EvalGuardCallback
|
|
156
|
+
from langchain_openai import ChatOpenAI
|
|
157
|
+
|
|
158
|
+
callback = EvalGuardCallback(api_key="eg_...", project_id="proj_...")
|
|
159
|
+
|
|
160
|
+
llm = ChatOpenAI(model="gpt-4o", callbacks=[callback])
|
|
161
|
+
result = llm.invoke("What is the capital of France?")
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
Works with **any** LangChain LLM, chat model, or chain that supports callbacks. The callback implements the full LangChain callback protocol without importing LangChain, so it is compatible with all versions (0.1.x through 0.3.x).
|
|
165
|
+
|
|
166
|
+
Traced events:
|
|
167
|
+
- `on_llm_start` / `on_chat_model_start` -- pre-check input
|
|
168
|
+
- `on_llm_end` -- log output trace
|
|
169
|
+
- `on_llm_error` -- log error trace
|
|
170
|
+
|
|
171
|
+
### AWS Bedrock
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
from evalguard.bedrock import wrap
|
|
175
|
+
import boto3
|
|
176
|
+
|
|
177
|
+
bedrock = boto3.client("bedrock-runtime", region_name="us-east-1")
|
|
178
|
+
client = wrap(bedrock, api_key="eg_...", project_id="proj_...")
|
|
179
|
+
|
|
180
|
+
# invoke_model (all Bedrock model families supported)
|
|
181
|
+
import json
|
|
182
|
+
response = client.invoke_model(
|
|
183
|
+
modelId="anthropic.claude-3-sonnet-20240229-v1:0",
|
|
184
|
+
body=json.dumps({
|
|
185
|
+
"messages": [{"role": "user", "content": "Hello"}],
|
|
186
|
+
"max_tokens": 256,
|
|
187
|
+
"anthropic_version": "bedrock-2023-05-31",
|
|
188
|
+
}),
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
# Converse API
|
|
192
|
+
response = client.converse(
|
|
193
|
+
modelId="anthropic.claude-3-sonnet-20240229-v1:0",
|
|
194
|
+
messages=[{"role": "user", "content": [{"text": "Hello"}]}],
|
|
195
|
+
)
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
Supports all Bedrock model families: Anthropic Claude, Amazon Titan, Meta Llama, Cohere, AI21, and Mistral. Both `invoke_model` and `converse` APIs are guarded.
|
|
199
|
+
|
|
200
|
+
### CrewAI
|
|
201
|
+
|
|
202
|
+
```python
|
|
203
|
+
from evalguard.crewai import guard_agent, EvalGuardGuardrail
|
|
204
|
+
from crewai import Agent, Task, Crew
|
|
205
|
+
|
|
206
|
+
# Guard individual agents
|
|
207
|
+
agent = Agent(role="researcher", goal="...", backstory="...")
|
|
208
|
+
agent = guard_agent(agent, api_key="eg_...")
|
|
209
|
+
|
|
210
|
+
# Or use the standalone guardrail
|
|
211
|
+
guardrail = EvalGuardGuardrail(api_key="eg_...", project_id="proj_...")
|
|
212
|
+
result = guardrail.check("User input to validate")
|
|
213
|
+
|
|
214
|
+
# Wrap arbitrary functions
|
|
215
|
+
@guardrail.wrap_function
|
|
216
|
+
def my_tool(query: str) -> str:
|
|
217
|
+
return do_search(query)
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
### FastAPI Middleware
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
from evalguard.fastapi import EvalGuardMiddleware
|
|
224
|
+
from fastapi import FastAPI
|
|
225
|
+
|
|
226
|
+
app = FastAPI()
|
|
227
|
+
app.add_middleware(
|
|
228
|
+
EvalGuardMiddleware,
|
|
229
|
+
api_key="eg_...",
|
|
230
|
+
project_id="proj_...",
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
@app.post("/api/chat")
|
|
234
|
+
async def chat(request: dict):
|
|
235
|
+
# Automatically guarded -- prompt injection blocked with 403
|
|
236
|
+
return {"response": "..."}
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
By default, POST requests to paths containing `/chat`, `/completions`, `/generate`, `/invoke`, or `/messages` are guarded. Customize with `guarded_paths`:
|
|
240
|
+
|
|
241
|
+
```python
|
|
242
|
+
app.add_middleware(
|
|
243
|
+
EvalGuardMiddleware,
|
|
244
|
+
api_key="eg_...",
|
|
245
|
+
guarded_paths={"/api/v1/chat", "/api/v1/generate"},
|
|
246
|
+
)
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
For per-route control:
|
|
250
|
+
|
|
251
|
+
```python
|
|
252
|
+
from evalguard.fastapi import guard_route
|
|
253
|
+
|
|
254
|
+
@app.post("/api/chat")
|
|
255
|
+
@guard_route(api_key="eg_...", rules=["prompt_injection"])
|
|
256
|
+
async def chat(request: Request):
|
|
257
|
+
body = await request.json()
|
|
258
|
+
...
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
### NeMo / Agent Workflows
|
|
262
|
+
|
|
263
|
+
```python
|
|
264
|
+
from evalguard.nemoclaw import EvalGuardAgent
|
|
265
|
+
|
|
266
|
+
agent = EvalGuardAgent(api_key="eg_...", agent_name="support-bot")
|
|
267
|
+
|
|
268
|
+
# Guard any LLM call
|
|
269
|
+
result = agent.guarded_call(
|
|
270
|
+
provider="openai",
|
|
271
|
+
messages=[{"role": "user", "content": "Reset my password"}],
|
|
272
|
+
llm_fn=lambda: openai_client.chat.completions.create(
|
|
273
|
+
model="gpt-4", messages=[{"role": "user", "content": "Reset my password"}]
|
|
274
|
+
),
|
|
275
|
+
)
|
|
276
|
+
|
|
277
|
+
# Multi-step agent sessions
|
|
278
|
+
with agent.session("ticket-123") as session:
|
|
279
|
+
session.check("User says: reset my password")
|
|
280
|
+
result = do_llm_call(...)
|
|
281
|
+
session.log_step("password_reset", input="...", output=str(result))
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
---
|
|
285
|
+
|
|
286
|
+
## Core Guardrail Client
|
|
287
|
+
|
|
288
|
+
All framework integrations share the same underlying `GuardrailClient`:
|
|
289
|
+
|
|
290
|
+
```python
|
|
291
|
+
from evalguard.guardrails import GuardrailClient
|
|
292
|
+
|
|
293
|
+
guard = GuardrailClient(
|
|
294
|
+
api_key="eg_...",
|
|
295
|
+
project_id="proj_...",
|
|
296
|
+
timeout=5.0, # keep low to avoid latency
|
|
297
|
+
fail_open=True, # allow on error (default)
|
|
298
|
+
)
|
|
299
|
+
|
|
300
|
+
# Pre-LLM check
|
|
301
|
+
result = guard.check_input("user prompt here", rules=["prompt_injection", "pii_redact"])
|
|
302
|
+
if not result["allowed"]:
|
|
303
|
+
print("Blocked:", result["violations"])
|
|
304
|
+
|
|
305
|
+
# Post-LLM check
|
|
306
|
+
result = guard.check_output("model response here", rules=["toxic_content"])
|
|
307
|
+
|
|
308
|
+
# Fire-and-forget trace
|
|
309
|
+
guard.log_trace({"model": "gpt-4", "input": "...", "output": "...", "latency_ms": 120})
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
## Error Handling
|
|
313
|
+
|
|
314
|
+
All integrations use **fail-open** semantics by default: if the EvalGuard API is unreachable, requests pass through rather than blocking your application.
|
|
315
|
+
|
|
316
|
+
To fail-closed:
|
|
317
|
+
|
|
318
|
+
```python
|
|
319
|
+
# Framework wrappers
|
|
320
|
+
client = wrap(OpenAI(), api_key="eg_...", block_on_violation=True)
|
|
321
|
+
|
|
322
|
+
# Core client
|
|
323
|
+
guard = GuardrailClient(api_key="eg_...", fail_open=False)
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
Catch violations explicitly:
|
|
327
|
+
|
|
328
|
+
```python
|
|
329
|
+
from evalguard import GuardrailViolation
|
|
330
|
+
|
|
331
|
+
try:
|
|
332
|
+
response = client.chat.completions.create(...)
|
|
333
|
+
except GuardrailViolation as e:
|
|
334
|
+
print(f"Blocked: {e.violations}")
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
## All SDK Methods
|
|
338
|
+
|
|
339
|
+
| Method | Description |
|
|
340
|
+
|---|---|
|
|
341
|
+
| `client.run_eval(config)` | Run an evaluation with scorers and test cases |
|
|
342
|
+
| `client.get_eval(run_id)` | Fetch a specific eval run by ID |
|
|
343
|
+
| `client.list_evals(project_id=None)` | List eval runs, optionally filtered by project |
|
|
344
|
+
| `client.run_scan(config)` | Run a red-team security scan against a model |
|
|
345
|
+
| `client.get_scan(scan_id)` | Fetch a specific security scan by ID |
|
|
346
|
+
| `client.list_scorers()` | List all available evaluation scorers |
|
|
347
|
+
| `client.list_plugins()` | List all available security plugins |
|
|
348
|
+
| `client.check_firewall(input_text, rules=None)` | Check input against firewall rules |
|
|
349
|
+
| `client.run_benchmarks(suites, model)` | Run benchmark suites against a model |
|
|
350
|
+
| `client.export_dpo(run_id)` | Export eval results as DPO training data (JSONL) |
|
|
351
|
+
| `client.export_burp(scan_id)` | Export scan results as Burp Suite XML |
|
|
352
|
+
| `client.get_compliance_report(scan_id, framework)` | Map scan results to a compliance framework |
|
|
353
|
+
| `client.detect_drift(config)` | Detect performance drift between eval runs |
|
|
354
|
+
| `client.generate_guardrails(config)` | Auto-generate firewall rules from scan findings |
|
|
355
|
+
|
|
356
|
+
## Documentation
|
|
357
|
+
|
|
358
|
+
Full documentation at [docs.evalguard.ai/python-sdk](https://docs.evalguard.ai/python-sdk).
|
|
359
|
+
|
|
360
|
+
## License
|
|
361
|
+
|
|
362
|
+
MIT -- see [LICENSE](./LICENSE) for details.
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
# evalguardai
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/evalguardai/)
|
|
4
|
+
[](https://opensource.org/licenses/MIT)
|
|
5
|
+
[](https://www.python.org/downloads/)
|
|
6
|
+
|
|
7
|
+
Python SDK for [EvalGuard](https://evalguard.ai) -- evaluate, red-team, and guard LLM applications with **drop-in framework integrations**.
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
# Core SDK
|
|
13
|
+
pip install evalguardai
|
|
14
|
+
|
|
15
|
+
# With framework extras
|
|
16
|
+
pip install evalguardai[openai]
|
|
17
|
+
pip install evalguardai[anthropic]
|
|
18
|
+
pip install evalguardai[langchain]
|
|
19
|
+
pip install evalguardai[bedrock]
|
|
20
|
+
pip install evalguardai[crewai]
|
|
21
|
+
pip install evalguardai[fastapi]
|
|
22
|
+
|
|
23
|
+
# Everything
|
|
24
|
+
pip install evalguardai[all]
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Quick Start
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from evalguard import EvalGuardClient
|
|
31
|
+
|
|
32
|
+
client = EvalGuardClient(api_key="eg_live_...")
|
|
33
|
+
|
|
34
|
+
# Run an evaluation
|
|
35
|
+
result = client.run_eval({
|
|
36
|
+
"model": "gpt-4o",
|
|
37
|
+
"prompt": "Answer: {{input}}",
|
|
38
|
+
"cases": [
|
|
39
|
+
{"input": "What is 2+2?", "expectedOutput": "4"},
|
|
40
|
+
],
|
|
41
|
+
"scorers": ["exact-match", "contains"],
|
|
42
|
+
})
|
|
43
|
+
print(f"Score: {result['score']}, Pass rate: {result['passRate']}")
|
|
44
|
+
|
|
45
|
+
# Check the firewall
|
|
46
|
+
fw = client.check_firewall("Ignore all previous instructions")
|
|
47
|
+
print(f"Action: {fw['action']}") # "block"
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## Framework Integrations
|
|
53
|
+
|
|
54
|
+
Every integration is a **drop-in wrapper** -- add two lines and your existing code gets automatic guardrails, traces, and observability.
|
|
55
|
+
|
|
56
|
+
### OpenAI
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from evalguard.openai import wrap
|
|
60
|
+
from openai import OpenAI
|
|
61
|
+
|
|
62
|
+
client = wrap(OpenAI(), api_key="eg_...", project_id="proj_...")
|
|
63
|
+
|
|
64
|
+
# Use exactly like normal -- guardrails are automatic
|
|
65
|
+
response = client.chat.completions.create(
|
|
66
|
+
model="gpt-4o",
|
|
67
|
+
messages=[{"role": "user", "content": "Hello, world!"}],
|
|
68
|
+
)
|
|
69
|
+
print(response.choices[0].message.content)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
All calls to `chat.completions.create()` are intercepted:
|
|
73
|
+
- **Pre-LLM**: Input is checked for prompt injection, PII, etc.
|
|
74
|
+
- **Post-LLM**: Response + latency + token usage are traced to EvalGuard.
|
|
75
|
+
- **Violations**: Raise `GuardrailViolation` (or log-only with `block_on_violation=False`).
|
|
76
|
+
|
|
77
|
+
### Anthropic
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from evalguard.anthropic import wrap
|
|
81
|
+
from anthropic import Anthropic
|
|
82
|
+
|
|
83
|
+
client = wrap(Anthropic(), api_key="eg_...", project_id="proj_...")
|
|
84
|
+
|
|
85
|
+
response = client.messages.create(
|
|
86
|
+
model="claude-sonnet-4-20250514",
|
|
87
|
+
max_tokens=1024,
|
|
88
|
+
messages=[{"role": "user", "content": "Explain quantum computing"}],
|
|
89
|
+
)
|
|
90
|
+
print(response.content[0].text)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Intercepts `messages.create()` with the same pre/post guardrail pattern.
|
|
94
|
+
|
|
95
|
+
### LangChain
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from evalguard.langchain import EvalGuardCallback
|
|
99
|
+
from langchain_openai import ChatOpenAI
|
|
100
|
+
|
|
101
|
+
callback = EvalGuardCallback(api_key="eg_...", project_id="proj_...")
|
|
102
|
+
|
|
103
|
+
llm = ChatOpenAI(model="gpt-4o", callbacks=[callback])
|
|
104
|
+
result = llm.invoke("What is the capital of France?")
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Works with **any** LangChain LLM, chat model, or chain that supports callbacks. The callback implements the full LangChain callback protocol without importing LangChain, so it is compatible with all versions (0.1.x through 0.3.x).
|
|
108
|
+
|
|
109
|
+
Traced events:
|
|
110
|
+
- `on_llm_start` / `on_chat_model_start` -- pre-check input
|
|
111
|
+
- `on_llm_end` -- log output trace
|
|
112
|
+
- `on_llm_error` -- log error trace
|
|
113
|
+
|
|
114
|
+
### AWS Bedrock
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from evalguard.bedrock import wrap
|
|
118
|
+
import boto3
|
|
119
|
+
|
|
120
|
+
bedrock = boto3.client("bedrock-runtime", region_name="us-east-1")
|
|
121
|
+
client = wrap(bedrock, api_key="eg_...", project_id="proj_...")
|
|
122
|
+
|
|
123
|
+
# invoke_model (all Bedrock model families supported)
|
|
124
|
+
import json
|
|
125
|
+
response = client.invoke_model(
|
|
126
|
+
modelId="anthropic.claude-3-sonnet-20240229-v1:0",
|
|
127
|
+
body=json.dumps({
|
|
128
|
+
"messages": [{"role": "user", "content": "Hello"}],
|
|
129
|
+
"max_tokens": 256,
|
|
130
|
+
"anthropic_version": "bedrock-2023-05-31",
|
|
131
|
+
}),
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
# Converse API
|
|
135
|
+
response = client.converse(
|
|
136
|
+
modelId="anthropic.claude-3-sonnet-20240229-v1:0",
|
|
137
|
+
messages=[{"role": "user", "content": [{"text": "Hello"}]}],
|
|
138
|
+
)
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
Supports all Bedrock model families: Anthropic Claude, Amazon Titan, Meta Llama, Cohere, AI21, and Mistral. Both `invoke_model` and `converse` APIs are guarded.
|
|
142
|
+
|
|
143
|
+
### CrewAI
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
from evalguard.crewai import guard_agent, EvalGuardGuardrail
|
|
147
|
+
from crewai import Agent, Task, Crew
|
|
148
|
+
|
|
149
|
+
# Guard individual agents
|
|
150
|
+
agent = Agent(role="researcher", goal="...", backstory="...")
|
|
151
|
+
agent = guard_agent(agent, api_key="eg_...")
|
|
152
|
+
|
|
153
|
+
# Or use the standalone guardrail
|
|
154
|
+
guardrail = EvalGuardGuardrail(api_key="eg_...", project_id="proj_...")
|
|
155
|
+
result = guardrail.check("User input to validate")
|
|
156
|
+
|
|
157
|
+
# Wrap arbitrary functions
|
|
158
|
+
@guardrail.wrap_function
|
|
159
|
+
def my_tool(query: str) -> str:
|
|
160
|
+
return do_search(query)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
### FastAPI Middleware
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
from evalguard.fastapi import EvalGuardMiddleware
|
|
167
|
+
from fastapi import FastAPI
|
|
168
|
+
|
|
169
|
+
app = FastAPI()
|
|
170
|
+
app.add_middleware(
|
|
171
|
+
EvalGuardMiddleware,
|
|
172
|
+
api_key="eg_...",
|
|
173
|
+
project_id="proj_...",
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
@app.post("/api/chat")
|
|
177
|
+
async def chat(request: dict):
|
|
178
|
+
# Automatically guarded -- prompt injection blocked with 403
|
|
179
|
+
return {"response": "..."}
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
By default, POST requests to paths containing `/chat`, `/completions`, `/generate`, `/invoke`, or `/messages` are guarded. Customize with `guarded_paths`:
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
app.add_middleware(
|
|
186
|
+
EvalGuardMiddleware,
|
|
187
|
+
api_key="eg_...",
|
|
188
|
+
guarded_paths={"/api/v1/chat", "/api/v1/generate"},
|
|
189
|
+
)
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
For per-route control:
|
|
193
|
+
|
|
194
|
+
```python
|
|
195
|
+
from evalguard.fastapi import guard_route
|
|
196
|
+
|
|
197
|
+
@app.post("/api/chat")
|
|
198
|
+
@guard_route(api_key="eg_...", rules=["prompt_injection"])
|
|
199
|
+
async def chat(request: Request):
|
|
200
|
+
body = await request.json()
|
|
201
|
+
...
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
### NeMo / Agent Workflows
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
from evalguard.nemoclaw import EvalGuardAgent
|
|
208
|
+
|
|
209
|
+
agent = EvalGuardAgent(api_key="eg_...", agent_name="support-bot")
|
|
210
|
+
|
|
211
|
+
# Guard any LLM call
|
|
212
|
+
result = agent.guarded_call(
|
|
213
|
+
provider="openai",
|
|
214
|
+
messages=[{"role": "user", "content": "Reset my password"}],
|
|
215
|
+
llm_fn=lambda: openai_client.chat.completions.create(
|
|
216
|
+
model="gpt-4", messages=[{"role": "user", "content": "Reset my password"}]
|
|
217
|
+
),
|
|
218
|
+
)
|
|
219
|
+
|
|
220
|
+
# Multi-step agent sessions
|
|
221
|
+
with agent.session("ticket-123") as session:
|
|
222
|
+
session.check("User says: reset my password")
|
|
223
|
+
result = do_llm_call(...)
|
|
224
|
+
session.log_step("password_reset", input="...", output=str(result))
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
---
|
|
228
|
+
|
|
229
|
+
## Core Guardrail Client
|
|
230
|
+
|
|
231
|
+
All framework integrations share the same underlying `GuardrailClient`:
|
|
232
|
+
|
|
233
|
+
```python
|
|
234
|
+
from evalguard.guardrails import GuardrailClient
|
|
235
|
+
|
|
236
|
+
guard = GuardrailClient(
|
|
237
|
+
api_key="eg_...",
|
|
238
|
+
project_id="proj_...",
|
|
239
|
+
timeout=5.0, # keep low to avoid latency
|
|
240
|
+
fail_open=True, # allow on error (default)
|
|
241
|
+
)
|
|
242
|
+
|
|
243
|
+
# Pre-LLM check
|
|
244
|
+
result = guard.check_input("user prompt here", rules=["prompt_injection", "pii_redact"])
|
|
245
|
+
if not result["allowed"]:
|
|
246
|
+
print("Blocked:", result["violations"])
|
|
247
|
+
|
|
248
|
+
# Post-LLM check
|
|
249
|
+
result = guard.check_output("model response here", rules=["toxic_content"])
|
|
250
|
+
|
|
251
|
+
# Fire-and-forget trace
|
|
252
|
+
guard.log_trace({"model": "gpt-4", "input": "...", "output": "...", "latency_ms": 120})
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
## Error Handling
|
|
256
|
+
|
|
257
|
+
All integrations use **fail-open** semantics by default: if the EvalGuard API is unreachable, requests pass through rather than blocking your application.
|
|
258
|
+
|
|
259
|
+
To fail-closed:
|
|
260
|
+
|
|
261
|
+
```python
|
|
262
|
+
# Framework wrappers
|
|
263
|
+
client = wrap(OpenAI(), api_key="eg_...", block_on_violation=True)
|
|
264
|
+
|
|
265
|
+
# Core client
|
|
266
|
+
guard = GuardrailClient(api_key="eg_...", fail_open=False)
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
Catch violations explicitly:
|
|
270
|
+
|
|
271
|
+
```python
|
|
272
|
+
from evalguard import GuardrailViolation
|
|
273
|
+
|
|
274
|
+
try:
|
|
275
|
+
response = client.chat.completions.create(...)
|
|
276
|
+
except GuardrailViolation as e:
|
|
277
|
+
print(f"Blocked: {e.violations}")
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
## All SDK Methods
|
|
281
|
+
|
|
282
|
+
| Method | Description |
|
|
283
|
+
|---|---|
|
|
284
|
+
| `client.run_eval(config)` | Run an evaluation with scorers and test cases |
|
|
285
|
+
| `client.get_eval(run_id)` | Fetch a specific eval run by ID |
|
|
286
|
+
| `client.list_evals(project_id=None)` | List eval runs, optionally filtered by project |
|
|
287
|
+
| `client.run_scan(config)` | Run a red-team security scan against a model |
|
|
288
|
+
| `client.get_scan(scan_id)` | Fetch a specific security scan by ID |
|
|
289
|
+
| `client.list_scorers()` | List all available evaluation scorers |
|
|
290
|
+
| `client.list_plugins()` | List all available security plugins |
|
|
291
|
+
| `client.check_firewall(input_text, rules=None)` | Check input against firewall rules |
|
|
292
|
+
| `client.run_benchmarks(suites, model)` | Run benchmark suites against a model |
|
|
293
|
+
| `client.export_dpo(run_id)` | Export eval results as DPO training data (JSONL) |
|
|
294
|
+
| `client.export_burp(scan_id)` | Export scan results as Burp Suite XML |
|
|
295
|
+
| `client.get_compliance_report(scan_id, framework)` | Map scan results to a compliance framework |
|
|
296
|
+
| `client.detect_drift(config)` | Detect performance drift between eval runs |
|
|
297
|
+
| `client.generate_guardrails(config)` | Auto-generate firewall rules from scan findings |
|
|
298
|
+
|
|
299
|
+
## Documentation
|
|
300
|
+
|
|
301
|
+
Full documentation at [docs.evalguard.ai/python-sdk](https://docs.evalguard.ai/python-sdk).
|
|
302
|
+
|
|
303
|
+
## License
|
|
304
|
+
|
|
305
|
+
MIT -- see [LICENSE](./LICENSE) for details.
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""EvalGuard Python SDK -- evaluate, red-team, and guard LLM applications."""
|
|
2
|
+
|
|
3
|
+
from .client import EvalGuardClient, EvalGuardError
|
|
4
|
+
from .guardrails import GuardrailClient, GuardrailViolation
|
|
5
|
+
from .types import (
|
|
6
|
+
BenchmarkResult,
|
|
7
|
+
CaseResult,
|
|
8
|
+
ComplianceReport,
|
|
9
|
+
DriftReport,
|
|
10
|
+
EvalCase,
|
|
11
|
+
EvalResult,
|
|
12
|
+
EvalRun,
|
|
13
|
+
FirewallResult,
|
|
14
|
+
FirewallRule,
|
|
15
|
+
SecurityFinding,
|
|
16
|
+
SecurityScanResult,
|
|
17
|
+
TokenUsage,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
__version__ = "1.1.0"
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
# Core client
|
|
24
|
+
"EvalGuardClient",
|
|
25
|
+
"EvalGuardError",
|
|
26
|
+
# Guardrails
|
|
27
|
+
"GuardrailClient",
|
|
28
|
+
"GuardrailViolation",
|
|
29
|
+
# Types
|
|
30
|
+
"BenchmarkResult",
|
|
31
|
+
"CaseResult",
|
|
32
|
+
"ComplianceReport",
|
|
33
|
+
"DriftReport",
|
|
34
|
+
"EvalCase",
|
|
35
|
+
"EvalResult",
|
|
36
|
+
"EvalRun",
|
|
37
|
+
"FirewallResult",
|
|
38
|
+
"FirewallRule",
|
|
39
|
+
"SecurityFinding",
|
|
40
|
+
"SecurityScanResult",
|
|
41
|
+
"TokenUsage",
|
|
42
|
+
]
|