evalguard-python 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
evalguard/types.py ADDED
@@ -0,0 +1,142 @@
1
+ """EvalGuard Python SDK — Domain types matching the TypeScript definitions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any, Dict, List, Optional
7
+
8
+
9
# ── Enums as string literals ──
#
# These mirror the TypeScript union types one-for-one; at runtime each
# alias is just ``str`` (no validation is performed here).

EvalStatus = str  # one of: "pending", "running", "passed", "failed", "error"
Severity = str  # one of: "critical", "high", "medium", "low", "info"
PlanTier = str  # one of: "free", "pro", "team", "enterprise"
14
+
15
+
16
@dataclass
class TokenUsage:
    """Token counts for a single LLM call.

    Attributes are plain ints; ``total`` is stored as supplied by the
    caller rather than derived from ``prompt + completion``.
    """

    prompt: int  # tokens consumed by the prompt/input
    completion: int  # tokens produced in the completion/output
    total: int  # overall token count as reported upstream
21
+
22
+
23
@dataclass
class EvalCase:
    """One test case belonging to an :class:`EvalRun`.

    Only the identifiers and the raw input are required; everything else
    is populated once the case has actually been executed and scored.
    """

    id: str  # unique id of this case
    eval_run_id: str  # id of the run this case belongs to
    input: str  # raw input/prompt text for the case
    expected_output: Optional[str] = None  # reference answer, if any
    actual_output: Optional[str] = None  # model output once executed
    score: Optional[float] = None  # scorer result once executed
    passed: Optional[bool] = None  # pass/fail verdict once executed
    latency: Optional[float] = None  # call latency (units not specified here — confirm upstream)
    token_usage: Optional[TokenUsage] = None  # per-call token counts, if reported
    metadata: Dict[str, Any] = field(default_factory=dict)  # free-form extras
35
+
36
+
37
@dataclass
class EvalRun:
    """A single evaluation run within a project.

    The first eight fields are required positionally (even the
    ``Optional`` ones — ``score``/``duration`` may be ``None`` while the
    run is still in progress, but must be passed explicitly).
    """

    id: str  # unique run id
    project_id: str  # owning project id
    name: str  # human-readable run name
    status: EvalStatus  # lifecycle state (see EvalStatus alias)
    score: Optional[float]  # aggregate score; None until available
    max_score: float  # maximum attainable score
    duration: Optional[float]  # run duration; None until finished
    created_at: str  # creation timestamp (string form as sent by the API)
    completed_at: Optional[str] = None  # completion timestamp, once done
    metadata: Dict[str, Any] = field(default_factory=dict)  # free-form extras
49
+
50
+
51
@dataclass
class CaseResult:
    """The scored outcome of a single executed case.

    Unlike :class:`EvalCase`, the output/score/latency fields are
    mandatory here — a ``CaseResult`` only exists after execution.
    """

    input: str  # the input that was run
    actual_output: str  # model output produced
    score: float  # numeric score for this case
    passed: bool  # pass/fail verdict
    latency: float  # call latency (units not specified here — confirm upstream)
    expected_output: Optional[str] = None  # reference answer, if any
    scorer_results: Dict[str, Any] = field(default_factory=dict)  # per-scorer breakdown
    token_usage: Optional[TokenUsage] = None  # per-call token counts, if reported
61
+
62
+
63
@dataclass
class EvalResult:
    """Aggregate result of an evaluation: all case outcomes plus totals."""

    cases: List[CaseResult]  # per-case scored results
    score: float  # aggregate score
    max_score: float  # maximum attainable score
    pass_rate: float  # fraction of cases that passed
    total_latency: float  # sum of case latencies
    total_tokens: int  # sum of token usage across cases
71
+
72
+
73
@dataclass
class SecurityFinding:
    """One result row from a red-team security scan.

    NOTE(review): ``passed`` defaults to ``True`` — presumably meaning
    the target resisted this particular probe; confirm against the API
    docs before relying on the default.
    """

    id: str  # unique finding id
    scan_id: str  # id of the scan that produced this finding
    type: str  # finding category (free-form string)
    severity: Severity  # see Severity alias
    title: str  # short human-readable title
    description: str  # longer description of the finding
    input: str  # probe input sent to the target
    output: str  # target's response
    passed: bool = True  # whether the target passed this test
    plugin_id: Optional[str] = None  # security plugin that generated the probe, if any
    strategy_id: Optional[str] = None  # attack strategy used, if any
    metadata: Dict[str, Any] = field(default_factory=dict)  # free-form extras
87
+
88
+
89
@dataclass
class SecurityScanResult:
    """Summary of a completed security scan: findings plus severity tallies."""

    findings: List[SecurityFinding]  # individual findings
    pass_rate: float  # fraction of tests passed
    critical_count: int  # number of critical-severity findings
    high_count: int  # number of high-severity findings
    medium_count: int  # number of medium-severity findings
    low_count: int  # number of low-severity findings
    total_tests: int  # total probes executed
    duration: float  # scan duration (units not specified here — confirm upstream)
99
+
100
+
101
@dataclass
class FirewallResult:
    """Verdict from a firewall check on a single input."""

    action: str  # one of: "allow", "block", "flag"
    reasons: List[Dict[str, Any]]  # rule hits explaining the action
    latency_ms: float  # check latency in milliseconds
106
+
107
+
108
@dataclass
class FirewallRule:
    """Definition of a single firewall rule."""

    id: str  # unique rule id
    name: str  # human-readable rule name
    type: str  # one of: "pii", "injection", "toxic", "topic", "custom"
    enabled: bool  # whether the rule is active
    config: Dict[str, Any] = field(default_factory=dict)  # rule-specific settings
115
+
116
+
117
@dataclass
class ComplianceReport:
    """Scan results mapped onto a compliance framework's controls."""

    framework: str  # framework identifier (free-form string)
    total_controls: int  # controls defined by the framework
    tested_controls: int  # controls exercised by the scan
    passed_controls: int  # controls that passed
    failed_controls: int  # controls that failed
    coverage: float  # tested/total coverage ratio (presumably — confirm upstream)
    findings: List[Dict[str, Any]]  # per-control detail rows
126
+
127
+
128
@dataclass
class DriftReport:
    """Result of comparing eval runs for performance drift."""

    has_drift: bool  # whether drift was detected at all
    overall_delta: float  # aggregate change between runs
    metric_deltas: List[Dict[str, Any]]  # per-metric change rows
    alerts: List[str]  # human-readable alert messages
134
+
135
+
136
@dataclass
class BenchmarkResult:
    """Outcome of running one benchmark suite against one model."""

    suite: str  # benchmark suite name
    model: str  # model identifier tested
    score: float  # aggregate suite score
    cases: List[Dict[str, Any]]  # per-case result rows
    duration: float  # run duration (units not specified here — confirm upstream)
@@ -0,0 +1,362 @@
1
+ Metadata-Version: 2.4
2
+ Name: evalguard-python
3
+ Version: 1.1.0
4
+ Summary: Python SDK for EvalGuard -- evaluate, red-team, and guard LLM applications with drop-in framework integrations
5
+ Home-page: https://github.com/EvalGuardAi/evalguard
6
+ Author: EvalGuard
7
+ Author-email: EvalGuard <support@evalguard.ai>
8
+ License: MIT
9
+ Project-URL: Homepage, https://evalguard.ai
10
+ Project-URL: Repository, https://github.com/EvalGuardAi/evalguard
11
+ Project-URL: Documentation, https://docs.evalguard.ai/python-sdk
12
+ Project-URL: Issues, https://github.com/EvalGuardAi/evalguard/issues
13
+ Project-URL: Changelog, https://github.com/EvalGuardAi/evalguard/releases
14
+ Keywords: llm,evaluation,ai,security,red-team,prompt-injection,guardrails,ai-safety,llm-security,agent-evaluation,monitoring,evalguard,openai,anthropic,langchain,bedrock,crewai,fastapi
15
+ Classifier: Development Status :: 4 - Beta
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: License :: OSI Approved :: MIT License
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Programming Language :: Python :: 3.13
24
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
25
+ Classifier: Topic :: Security
26
+ Classifier: Topic :: Software Development :: Testing
27
+ Classifier: Typing :: Typed
28
+ Requires-Python: >=3.9
29
+ Description-Content-Type: text/markdown
30
+ Requires-Dist: requests>=2.28.0
31
+ Provides-Extra: openai
32
+ Requires-Dist: openai>=1.0.0; extra == "openai"
33
+ Provides-Extra: anthropic
34
+ Requires-Dist: anthropic>=0.18.0; extra == "anthropic"
35
+ Provides-Extra: langchain
36
+ Requires-Dist: langchain-core>=0.1.0; extra == "langchain"
37
+ Provides-Extra: crewai
38
+ Requires-Dist: crewai>=0.1.0; extra == "crewai"
39
+ Provides-Extra: bedrock
40
+ Requires-Dist: boto3>=1.28.0; extra == "bedrock"
41
+ Provides-Extra: fastapi
42
+ Requires-Dist: fastapi>=0.100.0; extra == "fastapi"
43
+ Provides-Extra: all
44
+ Requires-Dist: openai>=1.0.0; extra == "all"
45
+ Requires-Dist: anthropic>=0.18.0; extra == "all"
46
+ Requires-Dist: langchain-core>=0.1.0; extra == "all"
47
+ Requires-Dist: crewai>=0.1.0; extra == "all"
48
+ Requires-Dist: boto3>=1.28.0; extra == "all"
49
+ Requires-Dist: fastapi>=0.100.0; extra == "all"
50
+ Provides-Extra: dev
51
+ Requires-Dist: pytest>=7.0; extra == "dev"
52
+ Requires-Dist: pytest-mock>=3.10; extra == "dev"
53
+ Requires-Dist: responses>=0.23; extra == "dev"
54
+ Dynamic: author
55
+ Dynamic: home-page
56
+ Dynamic: requires-python
57
+
58
+ # evalguard-python
59
+
60
+ [![PyPI version](https://img.shields.io/pypi/v/evalguard-python.svg)](https://pypi.org/project/evalguard-python/)
61
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
62
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
63
+
64
+ Python SDK for [EvalGuard](https://evalguard.ai) -- evaluate, red-team, and guard LLM applications with **drop-in framework integrations**.
65
+
66
+ ## Installation
67
+
68
+ ```bash
69
+ # Core SDK
70
+ pip install evalguard-python
71
+
72
+ # With framework extras
73
+ pip install evalguard-python[openai]
74
+ pip install evalguard-python[anthropic]
75
+ pip install evalguard-python[langchain]
76
+ pip install evalguard-python[bedrock]
77
+ pip install evalguard-python[crewai]
78
+ pip install evalguard-python[fastapi]
79
+
80
+ # Everything
81
+ pip install evalguard-python[all]
82
+ ```
83
+
84
+ ## Quick Start
85
+
86
+ ```python
87
+ from evalguard import EvalGuardClient
88
+
89
+ client = EvalGuardClient(api_key="eg_live_...")
90
+
91
+ # Run an evaluation
92
+ result = client.run_eval({
93
+ "model": "gpt-4o",
94
+ "prompt": "Answer: {{input}}",
95
+ "cases": [
96
+ {"input": "What is 2+2?", "expectedOutput": "4"},
97
+ ],
98
+ "scorers": ["exact-match", "contains"],
99
+ })
100
+ print(f"Score: {result['score']}, Pass rate: {result['passRate']}")
101
+
102
+ # Check the firewall
103
+ fw = client.check_firewall("Ignore all previous instructions")
104
+ print(f"Action: {fw['action']}") # "block"
105
+ ```
106
+
107
+ ---
108
+
109
+ ## Framework Integrations
110
+
111
+ Every integration is a **drop-in wrapper** -- add two lines and your existing code gets automatic guardrails, traces, and observability.
112
+
113
+ ### OpenAI
114
+
115
+ ```python
116
+ from evalguard.openai import wrap
117
+ from openai import OpenAI
118
+
119
+ client = wrap(OpenAI(), api_key="eg_...", project_id="proj_...")
120
+
121
+ # Use exactly like normal -- guardrails are automatic
122
+ response = client.chat.completions.create(
123
+ model="gpt-4o",
124
+ messages=[{"role": "user", "content": "Hello, world!"}],
125
+ )
126
+ print(response.choices[0].message.content)
127
+ ```
128
+
129
+ All calls to `chat.completions.create()` are intercepted:
130
+ - **Pre-LLM**: Input is checked for prompt injection, PII, etc.
131
+ - **Post-LLM**: Response + latency + token usage are traced to EvalGuard.
132
+ - **Violations**: Raise `GuardrailViolation` (or log-only with `block_on_violation=False`).
133
+
134
+ ### Anthropic
135
+
136
+ ```python
137
+ from evalguard.anthropic import wrap
138
+ from anthropic import Anthropic
139
+
140
+ client = wrap(Anthropic(), api_key="eg_...", project_id="proj_...")
141
+
142
+ response = client.messages.create(
143
+ model="claude-sonnet-4-20250514",
144
+ max_tokens=1024,
145
+ messages=[{"role": "user", "content": "Explain quantum computing"}],
146
+ )
147
+ print(response.content[0].text)
148
+ ```
149
+
150
+ Intercepts `messages.create()` with the same pre/post guardrail pattern.
151
+
152
+ ### LangChain
153
+
154
+ ```python
155
+ from evalguard.langchain import EvalGuardCallback
156
+ from langchain_openai import ChatOpenAI
157
+
158
+ callback = EvalGuardCallback(api_key="eg_...", project_id="proj_...")
159
+
160
+ llm = ChatOpenAI(model="gpt-4o", callbacks=[callback])
161
+ result = llm.invoke("What is the capital of France?")
162
+ ```
163
+
164
+ Works with **any** LangChain LLM, chat model, or chain that supports callbacks. The callback implements the full LangChain callback protocol without importing LangChain, so it is compatible with all versions (0.1.x through 0.3.x).
165
+
166
+ Traced events:
167
+ - `on_llm_start` / `on_chat_model_start` -- pre-check input
168
+ - `on_llm_end` -- log output trace
169
+ - `on_llm_error` -- log error trace
170
+
171
+ ### AWS Bedrock
172
+
173
+ ```python
174
+ from evalguard.bedrock import wrap
175
+ import boto3
176
+
177
+ bedrock = boto3.client("bedrock-runtime", region_name="us-east-1")
178
+ client = wrap(bedrock, api_key="eg_...", project_id="proj_...")
179
+
180
+ # invoke_model (all Bedrock model families supported)
181
+ import json
182
+ response = client.invoke_model(
183
+ modelId="anthropic.claude-3-sonnet-20240229-v1:0",
184
+ body=json.dumps({
185
+ "messages": [{"role": "user", "content": "Hello"}],
186
+ "max_tokens": 256,
187
+ "anthropic_version": "bedrock-2023-05-31",
188
+ }),
189
+ )
190
+
191
+ # Converse API
192
+ response = client.converse(
193
+ modelId="anthropic.claude-3-sonnet-20240229-v1:0",
194
+ messages=[{"role": "user", "content": [{"text": "Hello"}]}],
195
+ )
196
+ ```
197
+
198
+ Supports all Bedrock model families: Anthropic Claude, Amazon Titan, Meta Llama, Cohere, AI21, and Mistral. Both `invoke_model` and `converse` APIs are guarded.
199
+
200
+ ### CrewAI
201
+
202
+ ```python
203
+ from evalguard.crewai import guard_agent, EvalGuardGuardrail
204
+ from crewai import Agent, Task, Crew
205
+
206
+ # Guard individual agents
207
+ agent = Agent(role="researcher", goal="...", backstory="...")
208
+ agent = guard_agent(agent, api_key="eg_...")
209
+
210
+ # Or use the standalone guardrail
211
+ guardrail = EvalGuardGuardrail(api_key="eg_...", project_id="proj_...")
212
+ result = guardrail.check("User input to validate")
213
+
214
+ # Wrap arbitrary functions
215
+ @guardrail.wrap_function
216
+ def my_tool(query: str) -> str:
217
+ return do_search(query)
218
+ ```
219
+
220
+ ### FastAPI Middleware
221
+
222
+ ```python
223
+ from evalguard.fastapi import EvalGuardMiddleware
224
+ from fastapi import FastAPI
225
+
226
+ app = FastAPI()
227
+ app.add_middleware(
228
+ EvalGuardMiddleware,
229
+ api_key="eg_...",
230
+ project_id="proj_...",
231
+ )
232
+
233
+ @app.post("/api/chat")
234
+ async def chat(request: dict):
235
+ # Automatically guarded -- prompt injection blocked with 403
236
+ return {"response": "..."}
237
+ ```
238
+
239
+ By default, POST requests to paths containing `/chat`, `/completions`, `/generate`, `/invoke`, or `/messages` are guarded. Customize with `guarded_paths`:
240
+
241
+ ```python
242
+ app.add_middleware(
243
+ EvalGuardMiddleware,
244
+ api_key="eg_...",
245
+ guarded_paths={"/api/v1/chat", "/api/v1/generate"},
246
+ )
247
+ ```
248
+
249
+ For per-route control:
250
+
251
+ ```python
252
+ from evalguard.fastapi import guard_route
253
+
254
+ @app.post("/api/chat")
255
+ @guard_route(api_key="eg_...", rules=["prompt_injection"])
256
+ async def chat(request: Request):
257
+ body = await request.json()
258
+ ...
259
+ ```
260
+
261
+ ### NeMo / Agent Workflows
262
+
263
+ ```python
264
+ from evalguard.nemoclaw import EvalGuardAgent
265
+
266
+ agent = EvalGuardAgent(api_key="eg_...", agent_name="support-bot")
267
+
268
+ # Guard any LLM call
269
+ result = agent.guarded_call(
270
+ provider="openai",
271
+ messages=[{"role": "user", "content": "Reset my password"}],
272
+ llm_fn=lambda: openai_client.chat.completions.create(
273
+ model="gpt-4", messages=[{"role": "user", "content": "Reset my password"}]
274
+ ),
275
+ )
276
+
277
+ # Multi-step agent sessions
278
+ with agent.session("ticket-123") as session:
279
+ session.check("User says: reset my password")
280
+ result = do_llm_call(...)
281
+ session.log_step("password_reset", input="...", output=str(result))
282
+ ```
283
+
284
+ ---
285
+
286
+ ## Core Guardrail Client
287
+
288
+ All framework integrations share the same underlying `GuardrailClient`:
289
+
290
+ ```python
291
+ from evalguard.guardrails import GuardrailClient
292
+
293
+ guard = GuardrailClient(
294
+ api_key="eg_...",
295
+ project_id="proj_...",
296
+ timeout=5.0, # keep low to avoid latency
297
+ fail_open=True, # allow on error (default)
298
+ )
299
+
300
+ # Pre-LLM check
301
+ result = guard.check_input("user prompt here", rules=["prompt_injection", "pii_redact"])
302
+ if not result["allowed"]:
303
+ print("Blocked:", result["violations"])
304
+
305
+ # Post-LLM check
306
+ result = guard.check_output("model response here", rules=["toxic_content"])
307
+
308
+ # Fire-and-forget trace
309
+ guard.log_trace({"model": "gpt-4", "input": "...", "output": "...", "latency_ms": 120})
310
+ ```
311
+
312
+ ## Error Handling
313
+
314
+ All integrations use **fail-open** semantics by default: if the EvalGuard API is unreachable, requests pass through rather than blocking your application.
315
+
316
+ To fail-closed:
317
+
318
+ ```python
319
+ # Framework wrappers
320
+ client = wrap(OpenAI(), api_key="eg_...", block_on_violation=True)
321
+
322
+ # Core client
323
+ guard = GuardrailClient(api_key="eg_...", fail_open=False)
324
+ ```
325
+
326
+ Catch violations explicitly:
327
+
328
+ ```python
329
+ from evalguard import GuardrailViolation
330
+
331
+ try:
332
+ response = client.chat.completions.create(...)
333
+ except GuardrailViolation as e:
334
+ print(f"Blocked: {e.violations}")
335
+ ```
336
+
337
+ ## All SDK Methods
338
+
339
+ | Method | Description |
340
+ |---|---|
341
+ | `client.run_eval(config)` | Run an evaluation with scorers and test cases |
342
+ | `client.get_eval(run_id)` | Fetch a specific eval run by ID |
343
+ | `client.list_evals(project_id=None)` | List eval runs, optionally filtered by project |
344
+ | `client.run_scan(config)` | Run a red-team security scan against a model |
345
+ | `client.get_scan(scan_id)` | Fetch a specific security scan by ID |
346
+ | `client.list_scorers()` | List all available evaluation scorers |
347
+ | `client.list_plugins()` | List all available security plugins |
348
+ | `client.check_firewall(input_text, rules=None)` | Check input against firewall rules |
349
+ | `client.run_benchmarks(suites, model)` | Run benchmark suites against a model |
350
+ | `client.export_dpo(run_id)` | Export eval results as DPO training data (JSONL) |
351
+ | `client.export_burp(scan_id)` | Export scan results as Burp Suite XML |
352
+ | `client.get_compliance_report(scan_id, framework)` | Map scan results to a compliance framework |
353
+ | `client.detect_drift(config)` | Detect performance drift between eval runs |
354
+ | `client.generate_guardrails(config)` | Auto-generate firewall rules from scan findings |
355
+
356
+ ## Documentation
357
+
358
+ Full documentation at [docs.evalguard.ai/python-sdk](https://docs.evalguard.ai/python-sdk).
359
+
360
+ ## License
361
+
362
+ MIT -- see [LICENSE](./LICENSE) for details.
@@ -0,0 +1,15 @@
1
+ evalguard/__init__.py,sha256=IAIMbOw7JydE_MzegoSDFXVmQj8jkzrkqXwCLAuU-78,895
2
+ evalguard/anthropic.py,sha256=XlPho-PKuniJGcGMEJQh9qAxO5b2kn90-TNpORaygRw,6490
3
+ evalguard/bedrock.py,sha256=6_uGpZOW6mATekUIo2tu4zRIc7pXyYhEljTGjECtqFQ,10050
4
+ evalguard/client.py,sha256=txQ3IUD9-ltbhyzU22Aije9LZ8AP20Ze-pT05pkXI7E,25594
5
+ evalguard/crewai.py,sha256=RurHWDl0t0S615Zv5MSHZn2V1MZmUFdRvokOs_h4CLo,5643
6
+ evalguard/fastapi.py,sha256=RedY-i2a-UWw8AcoGgX1rQYKYgZZWTOBsqSKz7nStp0,8921
7
+ evalguard/guardrails.py,sha256=pNf0nSe5gI2DzraPuySTMjaQFun_fXsTSu8lOlTtbSc,5376
8
+ evalguard/langchain.py,sha256=sIO3lP9_eNRIVz_y0b_owkU2Vsy_Jpmjk77hJkBBwag,7482
9
+ evalguard/nemoclaw.py,sha256=DxGHXHvuaPVZWvutvJHca-zdwiu99OLyV2yTsp-jt8Q,7919
10
+ evalguard/openai.py,sha256=SLpnS__233jY6xyhSXXSdqARoeSBZBLa_nmhCvtAZsg,6753
11
+ evalguard/types.py,sha256=6y2L9dC9OhEZPQulSkewZAARo0R5pknI40Ek-7mlbeI,3139
12
+ evalguard_python-1.1.0.dist-info/METADATA,sha256=nwc5Jx-bgSa9Aob_7CMqhDrmQLKWQnAxZe-nbtGvr84,11507
13
+ evalguard_python-1.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
14
+ evalguard_python-1.1.0.dist-info/top_level.txt,sha256=BUIQhUkmgN0RSompUln4ahg7wOdGcO8h06-vwk_I5xA,10
15
+ evalguard_python-1.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ evalguard