evalgate-sdk 3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. evalgate_sdk/__init__.py +707 -0
  2. evalgate_sdk/_version.py +3 -0
  3. evalgate_sdk/assertions.py +1362 -0
  4. evalgate_sdk/auto.py +247 -0
  5. evalgate_sdk/batch.py +174 -0
  6. evalgate_sdk/cache.py +111 -0
  7. evalgate_sdk/ci_context.py +123 -0
  8. evalgate_sdk/cli/__init__.py +111 -0
  9. evalgate_sdk/cli/api.py +261 -0
  10. evalgate_sdk/cli/cli_constants.py +20 -0
  11. evalgate_sdk/cli/commands.py +1041 -0
  12. evalgate_sdk/cli/config.py +228 -0
  13. evalgate_sdk/cli/env.py +43 -0
  14. evalgate_sdk/cli/formatters/types.py +132 -0
  15. evalgate_sdk/cli/golden_commands.py +322 -0
  16. evalgate_sdk/cli/manifest.py +301 -0
  17. evalgate_sdk/cli/new_commands.py +435 -0
  18. evalgate_sdk/cli/policy_packs.py +103 -0
  19. evalgate_sdk/cli/profiles.py +12 -0
  20. evalgate_sdk/cli/regression_gate.py +312 -0
  21. evalgate_sdk/cli/render/__init__.py +1 -0
  22. evalgate_sdk/cli/render/snippet.py +18 -0
  23. evalgate_sdk/cli/render/sort.py +29 -0
  24. evalgate_sdk/cli/report/__init__.py +1 -0
  25. evalgate_sdk/cli/report/build_check_report.py +209 -0
  26. evalgate_sdk/cli/traces.py +186 -0
  27. evalgate_sdk/cli/workspace.py +63 -0
  28. evalgate_sdk/client.py +609 -0
  29. evalgate_sdk/cluster.py +359 -0
  30. evalgate_sdk/collector.py +161 -0
  31. evalgate_sdk/constants.py +6 -0
  32. evalgate_sdk/context.py +151 -0
  33. evalgate_sdk/errors.py +236 -0
  34. evalgate_sdk/export.py +238 -0
  35. evalgate_sdk/formatters/__init__.py +11 -0
  36. evalgate_sdk/formatters/github.py +51 -0
  37. evalgate_sdk/formatters/human.py +68 -0
  38. evalgate_sdk/formatters/json_fmt.py +11 -0
  39. evalgate_sdk/formatters/pr_comment.py +80 -0
  40. evalgate_sdk/golden.py +426 -0
  41. evalgate_sdk/integrations/__init__.py +1 -0
  42. evalgate_sdk/integrations/anthropic.py +99 -0
  43. evalgate_sdk/integrations/autogen.py +62 -0
  44. evalgate_sdk/integrations/crewai.py +61 -0
  45. evalgate_sdk/integrations/langchain.py +100 -0
  46. evalgate_sdk/integrations/openai.py +155 -0
  47. evalgate_sdk/integrations/openai_eval.py +221 -0
  48. evalgate_sdk/local.py +144 -0
  49. evalgate_sdk/logger.py +123 -0
  50. evalgate_sdk/matchers.py +62 -0
  51. evalgate_sdk/otel.py +256 -0
  52. evalgate_sdk/pagination.py +145 -0
  53. evalgate_sdk/py.typed +0 -0
  54. evalgate_sdk/pytest_plugin.py +96 -0
  55. evalgate_sdk/reason_codes.py +103 -0
  56. evalgate_sdk/regression.py +196 -0
  57. evalgate_sdk/replay_decision.py +115 -0
  58. evalgate_sdk/runtime/__init__.py +50 -0
  59. evalgate_sdk/runtime/adapters/__init__.py +1 -0
  60. evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
  61. evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
  62. evalgate_sdk/runtime/context.py +68 -0
  63. evalgate_sdk/runtime/eval.py +318 -0
  64. evalgate_sdk/runtime/execution_mode.py +170 -0
  65. evalgate_sdk/runtime/executor.py +92 -0
  66. evalgate_sdk/runtime/registry.py +125 -0
  67. evalgate_sdk/runtime/run_report.py +249 -0
  68. evalgate_sdk/runtime/types.py +143 -0
  69. evalgate_sdk/snapshot.py +219 -0
  70. evalgate_sdk/streaming.py +124 -0
  71. evalgate_sdk/synthesize.py +226 -0
  72. evalgate_sdk/testing.py +128 -0
  73. evalgate_sdk/types.py +666 -0
  74. evalgate_sdk/utils/__init__.py +1 -0
  75. evalgate_sdk/utils/input_hash.py +42 -0
  76. evalgate_sdk/workflows.py +264 -0
  77. evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
  78. evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
  79. evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
  80. evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,608 @@
1
+ Metadata-Version: 2.4
2
+ Name: evalgate-sdk
3
+ Version: 3.3.1
4
+ Summary: EvalGate Python SDK — CI for AI behavior. Traces, evaluations, assertions, and regression gates for LLM apps.
5
+ Project-URL: Homepage, https://evalgate.com
6
+ Project-URL: Documentation, https://github.com/evalgate/ai-evaluation-platform#readme
7
+ Project-URL: Repository, https://github.com/evalgate/ai-evaluation-platform
8
+ Project-URL: Issues, https://github.com/evalgate/ai-evaluation-platform/issues
9
+ Project-URL: Changelog, https://github.com/evalgate/ai-evaluation-platform/blob/main/src/packages/sdk-python/CHANGELOG.md
10
+ Author-email: EvalGate <team@evalgate.com>
11
+ License-Expression: MIT
12
+ Keywords: ai,anthropic,assertions,ci,evaluation,llm,monitoring,observability,openai,regression,testing,tracing,workflow
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Topic :: Software Development :: Testing
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.10
25
+ Requires-Dist: httpx<1,>=0.27
26
+ Requires-Dist: pydantic<3,>=2.0
27
+ Provides-Extra: all
28
+ Requires-Dist: anthropic>=0.20; extra == 'all'
29
+ Requires-Dist: langchain-core>=0.2; extra == 'all'
30
+ Requires-Dist: openai>=1.0; extra == 'all'
31
+ Requires-Dist: rich>=13; extra == 'all'
32
+ Requires-Dist: typer>=0.12; extra == 'all'
33
+ Provides-Extra: anthropic
34
+ Requires-Dist: anthropic>=0.20; extra == 'anthropic'
35
+ Provides-Extra: cli
36
+ Requires-Dist: rich>=13; extra == 'cli'
37
+ Requires-Dist: typer>=0.12; extra == 'cli'
38
+ Provides-Extra: dev
39
+ Requires-Dist: mypy>=1.10; extra == 'dev'
40
+ Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
41
+ Requires-Dist: pytest-cov>=5; extra == 'dev'
42
+ Requires-Dist: pytest>=8; extra == 'dev'
43
+ Requires-Dist: respx>=0.21; extra == 'dev'
44
+ Requires-Dist: rich>=13; extra == 'dev'
45
+ Requires-Dist: ruff>=0.5; extra == 'dev'
46
+ Requires-Dist: typer>=0.12; extra == 'dev'
47
+ Provides-Extra: langchain
48
+ Requires-Dist: langchain-core>=0.2; extra == 'langchain'
49
+ Provides-Extra: openai
50
+ Requires-Dist: openai>=1.0; extra == 'openai'
51
+ Description-Content-Type: text/markdown
52
+
53
+ # evalgate-sdk
54
+
55
+ Build a living golden suite for AI behavior. 🚀
56
+
57
+ No infra. No lock-in. Remove anytime.
58
+
59
+ **EvalGate = the full suite for AI quality in Python.** Discover overlap, cluster failures, build golden datasets, run automated regression gates, and guide optimization before changes reach production.
60
+
61
+ [![PyPI](https://img.shields.io/pypi/v/evalgate-sdk)](https://pypi.org/project/evalgate-sdk/)
62
+ [![Python](https://img.shields.io/pypi/pyversions/evalgate-sdk)](https://pypi.org/project/evalgate-sdk/)
63
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
64
+ [![Typed](https://img.shields.io/badge/typing-typed-blue)](https://peps.python.org/pep-0561/)
65
+ [![Tests](https://img.shields.io/badge/tests-507%20passed-brightgreen.svg)](#)
66
+
67
+ ---
68
+
69
+ ## The Full EvalGate Workflow
70
+
71
+ EvalGate is no longer just a pass/fail gate at the end of CI. The current workflow is a full loop:
72
+
73
+ ```text
74
+ discover -> cluster -> label/analyze -> synthesize -> gate/auto
75
+ ```
76
+
77
+ - **Discover overlap before adding more tests** with `evalgate discover --manifest`
78
+ - **Cluster failures by pattern** with `evalgate cluster --run .evalgate/runs/latest.json`
79
+ - **Build a labeled golden dataset** with `evalgate label` and `evalgate analyze`
80
+ - **Draft broader golden cases** with `evalgate synthesize --dataset .evalgate/golden/labeled.jsonl --output .evalgate/golden/synthetic.jsonl`
81
+ - **Block regressions or run guided optimization** with `evalgate gate`, `evalgate ci`, and `evalgate auto`
82
+
83
+ The Python SDK ships the same closed-loop workflow primitives as the platform: assertions, spec execution, tracing, clustering, golden-dataset analysis, synthesis, replay decision, and guided auto iterations.
84
+
85
+ ---
86
+
87
+ ## Install
88
+
89
+ ```bash
90
+ pip install evalgate-sdk # Core
91
+ pip install "evalgate-sdk[openai]" # + OpenAI tracing and async assertions
92
+ pip install "evalgate-sdk[anthropic]" # + Anthropic tracing and async assertions
93
+ pip install "evalgate-sdk[all]" # Everything
94
+ ```
95
+
96
+ ---
97
+
98
+ ## Quickstart
99
+
100
+ No API key needed for local assertions:
101
+
102
+ ```python
103
+ from evalgate_sdk import AIEvalClient, expect
104
+ from evalgate_sdk.types import CreateTraceParams
105
+
106
+ # Local assertions — no API key needed
107
+ result = expect("The capital of France is Paris.").to_contain("Paris")
108
+ print(result.passed) # True
109
+
110
+ # Platform: trace and evaluate with an API key (run inside an async function — note the `await`)
111
+ client = AIEvalClient(api_key="sk-...")
112
+ trace = await client.traces.create(CreateTraceParams(name="chat-quality"))
113
+ ```
114
+
115
+ Same CI gate, same quality checks. Python supports the same core loop as TypeScript: assertions, test suites, OpenAI/Anthropic tracing, LangChain/CrewAI/AutoGen integrations, golden dataset workflow commands, and regression gates.
116
+
117
+ **Python CLI:** `pip install "evalgate-sdk[cli]"` → `evalgate init`, `evalgate run`, `evalgate check`, `evalgate gate`, `evalgate ci`, `evalgate discover`, `evalgate cluster`, `evalgate label`, `evalgate analyze`, `evalgate synthesize`, `evalgate replay-decision`, `evalgate explain`, `evalgate doctor`, `evalgate auto`.
118
+
119
+ Context helpers are importable from the package root:
120
+
121
+ ```python
122
+ from evalgate_sdk import ContextMetadata, create_context
123
+
124
+ ctx: ContextMetadata = {"run_id": "test-run"}
125
+ token = create_context(ctx)
126
+ ```
127
+
128
+ ---
129
+
130
+ ## Why EvalGate?
131
+
132
+ LLMs don't fail like traditional software — they drift silently. EvalGate turns evaluations into CI gates so regressions never reach production.
133
+
134
+ | What you get | How it works |
135
+ |--------------|--------------|
136
+ | **30+ assertions** | `expect(output).to_contain("Paris")`, `.to_not_contain_pii()`, `.to_have_no_profanity()` |
137
+ | **DSL spec system** | `define_eval("name", executor)` with `.skip` and `.only` support |
138
+ | **Test suites** | Define cases with retries, seed, strict mode, and stop-on-failure |
139
+ | **Workflow tracing** | Multi-agent handoffs, decisions, costs — with offline mode |
140
+ | **OpenAI / Anthropic** | Drop-in tracing wrappers + LangChain, CrewAI, AutoGen |
141
+ | **Regression gates** | Block deploys when eval scores drop, with baseline tamper detection |
142
+ | **Snapshot testing** | Save, compare, and diff outputs over time |
143
+ | **Impact analysis** | `evalgate discover` → manifest → impact analysis → run only what changed |
144
+ | **CLI** | `evalgate run`, `evalgate check`, `evalgate gate`, `evalgate ci`, `evalgate discover`, `evalgate cluster`, `evalgate label`, `evalgate analyze`, `evalgate synthesize`, `evalgate replay-decision`, `evalgate explain`, `evalgate doctor`, `evalgate auto` |
145
+
146
+ ---
147
+
148
+ ## Assertions
149
+
150
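+ 30+ built-in checks for LLM output quality, safety, and structure. Fluent and sync standalone assertions return `AssertionResult` with `.passed`, `.message`, `.expected`, `.actual`; a few compatibility, async, and score-style helpers return booleans or dictionaries instead.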
151
+
152
+ ### Fluent API (`expect`)
153
+
154
+ ```python
155
+ from evalgate_sdk import expect
156
+
157
+ # Content
158
+ expect("The capital of France is Paris.").to_contain("Paris")
159
+ expect("draft output").not_.to_contain("final answer")
160
+ expect("Hello World").to_not_contain_pii()
161
+ expect("Thank you for your help.").to_be_professional()
162
+ expect("Clean output").to_have_no_profanity()
163
+
164
+ # Sentiment
165
+ expect("Great product!").to_have_sentiment("positive")
166
+
167
+ # Structure
168
+ expect('{"name": "Alice"}').to_be_valid_json()
169
+ expect('{"name": "Alice"}').to_match_json({"type": "object"})
170
+ expect('payload={"name": "Alice"}').to_match_json({"required": ["name"]})
171
+ expect(0.95).to_be_between(0.0, 1.0)
172
+ expect("Hello world").to_have_length(min=5, max=100)
173
+ expect(output).to_contain_keywords(["gravity", "force"])
174
+
175
+ # Comparison
176
+ expect(42).to_be_greater_than(10)
177
+ expect(42).to_be_less_than(100)
178
+ expect(True).to_be_truthy()
179
+
180
+ # Code
181
+ expect("def hello(): pass").to_contain_code()
182
+
183
+ # Hallucination
184
+ expect(output).to_not_hallucinate(["Paris is the capital of France"])
185
+ ```
186
+
187
+ ### Standalone Functions
188
+
189
+ ```python
190
+ from evalgate_sdk import (
191
+ contains_keywords, has_no_toxicity, has_sentiment, similar_to,
192
+ contains_json, has_readability_score, has_factual_accuracy,
193
+ has_valid_code_syntax, has_sentiment_with_score, matches_pattern,
194
+ matches_schema, responded_within_duration, responded_within_time_since,
195
+ run_assertions,
196
+ )
197
+
198
+ # Sync standalone assertion helpers return AssertionResult
199
+ result = has_no_toxicity("Thank you for your help.")
200
+ print(result.passed, result.message)
201
+
202
+ result = has_valid_code_syntax("def hello():\n return 'hi'", "python")
203
+ print(result.passed) # True — uses ast.parse for Python
204
+
205
+ result = matches_schema('payload={"status": "ok"}', {"required": ["status"]})
206
+ print(result.passed, result.actual)
207
+
208
+ # Batch assertions
209
+ results = run_assertions([
210
+ lambda: expect(output).to_contain("Paris"),
211
+ lambda: expect(output).to_have_sentiment("positive"),
212
+ lambda: expect(output).to_have_length(min=10),
213
+ lambda: True, # legacy bools are coerced into AssertionResult
214
+ ])
215
+ all_passed = all(r.passed for r in results)
216
+ ```
217
+
218
+ Compatibility helpers such as `has_pii()`, async semantic checks like `has_sentiment_async()`, and score-style utilities such as `has_consistency()` still return booleans or dictionaries where documented.
219
+
220
+ ### LLM-Backed Assertions (Async)
221
+
222
+ For context-aware checking beyond heuristics. Install the matching optional extra first, for example `pip install "evalgate-sdk[openai]"` when using the default OpenAI provider.
223
+
224
+ ```python
225
+ from evalgate_sdk import configure_assertions
226
+ from evalgate_sdk import has_sentiment_async, has_no_toxicity_async
227
+
228
+ configure_assertions(
229
+ provider="openai", # or "anthropic"
230
+ api_key="sk-...",
231
+ model="gpt-4o-mini",
232
+ timeout_ms=30_000, # 30s default, prevents hung calls
233
+ )
234
+
235
+ matches = await has_sentiment_async("subtle irony...", "negative")
236
+ is_safe = await has_no_toxicity_async("borderline text")
237
+ ```
238
+
239
+ You can also keep using `configure_assertions(AssertionLLMConfig(...))` when you prefer an explicit config object.
240
+
241
+ ---
242
+
243
+ ## DSL Spec System
244
+
245
+ Define evaluation specs with the `define_eval` DSL — the same API as the TypeScript SDK:
246
+
247
+ ```python
248
+ from evalgate_sdk import define_eval, create_result
249
+
250
+ define_eval("Math Operations", async_executor)
251
+
252
+ # Object form with metadata
253
+ define_eval({
254
+ "name": "String check",
255
+ "tags": ["basic"],
256
+ "executor": async_executor,
257
+ })
258
+
259
+ # Skip / Only (matches TS defineEval.skip / defineEval.only)
260
+ define_eval.skip("Skipped spec", async_executor)
261
+ define_eval.only("Focus spec", async_executor)
262
+ ```
263
+
264
+ ---
265
+
266
+ ## Test Suites
267
+
268
+ ```python
269
+ from evalgate_sdk import create_test_suite
270
+ from evalgate_sdk.types import TestSuiteCase, TestSuiteConfig
271
+
272
+ suite = create_test_suite("safety-checks", TestSuiteConfig(
273
+ evaluator=my_llm_function,
274
+ test_cases=[
275
+ TestSuiteCase(name="greeting", input="Hello", expected_output="Hi there!"),
276
+ TestSuiteCase(name="pii-check", input="Describe yourself",
277
+ assertions=[{"type": "not_contains_pii"}]),
278
+ ],
279
+ retries=3, # Retry failed cases (default: 0)
280
+ retry_delay_ms=1000, # Delay between retries
281
+ retry_jitter=True, # Add jitter to retry delay
282
+ seed=42, # Deterministic ordering
283
+ strict=True, # Fail on warnings
284
+ stop_on_failure=True, # Abort on first failure
285
+ ))
286
+
287
+ result = await suite.run()
288
+ print(f"{result.passed_count}/{result.total} passed")
289
+ ```
290
+
291
+ ---
292
+
293
+ ## OpenAI Integration
294
+
295
+ ```python
296
+ from openai import AsyncOpenAI
297
+ from evalgate_sdk import AIEvalClient
298
+ from evalgate_sdk.integrations.openai import trace_openai
299
+
300
+ traced = trace_openai(AsyncOpenAI(), AIEvalClient.init())
301
+ response = await traced.chat.completions.create(
302
+ model="gpt-4",
303
+ messages=[{"role": "user", "content": "Explain gravity"}]
304
+ )
305
+ # Automatically traced with latency, tokens, and output
306
+ ```
307
+
308
+ Batch eval with built-in assertions:
309
+
310
+ ```python
311
+ from evalgate_sdk import openai_chat_eval, OpenAIChatEvalCase
312
+
313
+ result = await openai_chat_eval(
314
+ name="chat-quality",
315
+ model="gpt-4",
316
+ cases=[
317
+ OpenAIChatEvalCase(
318
+ input="Explain gravity in one sentence.",
319
+ assertions=[{"type": "contains_keywords", "value": ["gravity", "force"]}],
320
+ ),
321
+ ],
322
+ )
323
+ print(f"{result.passed_count}/{result.total} passed — score: {result.score:.2f}")
324
+ ```
325
+
326
+ ---
327
+
328
+ ## Anthropic Integration
329
+
330
+ ```python
331
+ from anthropic import AsyncAnthropic
332
+ from evalgate_sdk import AIEvalClient
333
+ from evalgate_sdk.integrations.anthropic import trace_anthropic
334
+
335
+ traced = trace_anthropic(AsyncAnthropic(), AIEvalClient.init())
336
+ response = await traced.messages.create(
337
+ model="claude-sonnet-4-20250514",
338
+ max_tokens=1024,
339
+ messages=[{"role": "user", "content": "Explain gravity"}]
340
+ )
341
+ ```
342
+
343
+ Also available: `trace_langchain`, `trace_crewai`, `trace_autogen`.
344
+
345
+ ---
346
+
347
+ ## Workflow Tracing
348
+
349
+ Track multi-agent systems end-to-end — handoffs, decisions, and cost:
350
+
351
+ ```python
352
+ from evalgate_sdk import AIEvalClient, WorkflowTracer
353
+ from evalgate_sdk.types import HandoffType, CostCategory, RecordCostParams
354
+
355
+ client = AIEvalClient.init()
356
+ tracer = WorkflowTracer(client, name="research-pipeline")
357
+
358
+ ctx = await tracer.start_workflow()
359
+ span = await tracer.start_agent_span("researcher", {"query": "AI trends"})
360
+ await tracer.end_agent_span(span, {"findings": "..."})
361
+
362
+ await tracer.record_handoff("researcher", "writer", handoff_type=HandoffType.DELEGATION)
363
+ await tracer.record_cost(RecordCostParams(
364
+ agent_name="researcher", category=CostCategory.LLM_INPUT, amount=0.05, tokens=1500
365
+ ))
366
+
367
+ await tracer.end_workflow()
368
+ print(f"Total cost: ${tracer.get_total_cost():.2f}")
369
+ ```
370
+
371
+ ### Offline Mode
372
+
373
+ Run workflow tracing locally without an API connection:
374
+
375
+ ```python
376
+ tracer = WorkflowTracer(None, name="local-test", offline=True)
377
+ ctx = await tracer.start_workflow() # No API calls, no crash
378
+ ```
379
+
380
+ You can also omit the client entirely when you want local-only workflow tracing:
381
+
382
+ ```python
383
+ from evalgate_sdk import create_workflow_tracer
384
+
385
+ tracer = create_workflow_tracer(name="local-test")
386
+ ctx = await tracer.start_workflow()
387
+ assert ctx.trace_id is None
388
+ ```
389
+
390
+ ## Batch Processing
391
+
392
+ `batch_process(items, processor, concurrency=...)` expects an async callable for `processor` and returns results in input order.
393
+
394
+ ```python
395
+ from evalgate_sdk import batch_process
396
+
397
+ async def double(value: int) -> int:
398
+ return value * 2
399
+
400
+ results = await batch_process([1, 2, 3], double, concurrency=2)
401
+ ```
402
+
403
+ If you pass a synchronous function, the SDK raises `TypeError` immediately instead of failing later with a generic await error.
404
+
405
+ ## Snapshot Testing
406
+
407
+ Snapshots are stored in `.snapshots` by default, relative to the current working directory.
408
+
409
+ ```python
410
+ from evalgate_sdk import compare_with_snapshot, snapshot
411
+
412
+ snapshot("Hello there", "support-reply")
413
+ comparison = compare_with_snapshot("support-reply", "Hello there")
414
+ print(comparison.matches)
415
+ ```
416
+
417
+ Override the directory when you want snapshots under a project-specific path:
418
+
419
+ ```python
420
+ snapshot("Hello there", "support-reply", directory=".evalgate/snapshots")
421
+ ```
422
+
423
+ Add `.snapshots/` to your `.gitignore` unless you intentionally want snapshot files committed.
424
+
425
+ ---
426
+
427
+ ## Regression Gates
428
+
429
+ Block deployments when eval scores drop:
430
+
431
+ ```python
432
+ from evalgate_sdk import evaluate_regression, to_pass_gate
433
+
434
+ report = evaluate_regression(current_results, baseline)
435
+ assert to_pass_gate(report), f"Regression detected: {report.summary}"
436
+ ```
437
+
438
+ ### Baseline Tamper Detection
439
+
440
+ ```python
441
+ from evalgate_sdk import compute_baseline_checksum, verify_baseline_checksum, Baseline
442
+
443
+ baseline = Baseline(scores={"chat-quality": 0.95, "safety": 0.99})
444
+ checksum = compute_baseline_checksum(baseline)
445
+
446
+ # Later — verify integrity before gating
447
+ assert verify_baseline_checksum(baseline, checksum), "Baseline tampered!"
448
+ ```
449
+
450
+ ---
451
+
452
+ ## CLI
453
+
454
+ ```bash
455
+ evalgate init # Scaffold eval config
456
+ evalgate discover # Find eval spec files
457
+ evalgate discover --manifest # Generate stable manifest
458
+ evalgate run --write-results # Run with artifact retention
459
+ evalgate gate # Regression gate
460
+ evalgate ci # Run + gate (CI mode)
461
+ evalgate ci --base main --format github # CI with PR summary
462
+ evalgate cluster --run .evalgate/runs/latest.json
463
+ evalgate label --run .evalgate/runs/latest.json
464
+ evalgate analyze --dataset .evalgate/golden/labeled.jsonl
465
+ evalgate synthesize --dataset .evalgate/golden/labeled.jsonl --output .evalgate/golden/synthetic.jsonl
466
+ evalgate replay-decision --previous .evalgate/runs/run-prev.json --current .evalgate/runs/run-latest.json
467
+ evalgate auto run --objective "reduce hallucination" --baseline-run previous.json --candidate-run current.json
468
+ evalgate auto daemon --objective "reduce hallucination" --cycles 3
469
+ evalgate compare --base a.json --head b.json # Side-by-side diff
470
+ evalgate doctor # Preflight checklist
471
+ evalgate explain # Root cause analysis on last failure
472
+ evalgate impact-analysis --base main # Run only impacted specs
473
+ ```
474
+
475
+ ### Exit Codes
476
+
477
+ | Code | Meaning |
478
+ |------|---------|
479
+ | 0 | Pass — no regression |
480
+ | 1 | Regression detected |
481
+ | 2 | Infra error (baseline missing, tests crashed) |
482
+
483
+ ---
484
+
485
+ ## Data Export & Import
486
+
487
+ ```python
488
+ from evalgate_sdk import export_data, import_data, ExportOptions, export_to_file
489
+
490
+ # Export
491
+ data = await export_data(client, ExportOptions(format="json"))
492
+ export_to_file(data, "backup.json")
493
+
494
+ # Import (2-arg API — client is optional keyword arg)
495
+ from evalgate_sdk import import_from_file
496
+ data = import_from_file("backup.json")
497
+ result = await import_data(data, client=client)
498
+
499
+ # LangSmith migration
500
+ from evalgate_sdk import import_from_langsmith
501
+ data = import_from_langsmith(langsmith_export)
502
+ ```
503
+
504
+ ---
505
+
506
+ ## Reliability
507
+
508
+ | Feature | Detail |
509
+ |---|---|
510
+ | **Python** | 3.10, 3.11, 3.12, 3.13 |
511
+ | **Dependencies** | Only `httpx` + `pydantic` |
512
+ | **Async** | Native `async/await` throughout; sync wrappers available |
513
+ | **Type hints** | Full `py.typed` — works with mypy and Pyright |
514
+ | **Errors** | Structured: `RateLimitError`, `AuthenticationError`, `NetworkError`, `ValidationError` — all have `.message` |
515
+ | **Rate handling** | Built-in `RateLimiter` with configurable tiers |
516
+ | **Batching** | `batch_process()` with concurrency control |
517
+ | **Pagination** | Async `PaginatedIterator` with cursor support |
518
+ | **Timeouts** | 30s default on all HTTP clients and LLM assertion calls |
519
+ | **Offline** | `WorkflowTracer(offline=True)`, `LocalStorage` for file-based dev |
520
+
521
+ ---
522
+
523
+ ## API Reference
524
+
525
+ | Module | Methods |
526
+ |---|---|
527
+ | `client.traces` | `create`, `list`, `get`, `update`, `delete`, `create_span`, `list_spans` |
528
+ | `client.evaluations` | `create`, `get`, `list`, `update`, `delete`, `create_test_case`, `list_test_cases`, `create_run`, `list_runs`, `get_run` |
529
+ | `client.llm_judge` | `evaluate`, `create_config`, `list_configs`, `list_results`, `get_alignment` |
530
+ | `client.annotations` | `create`, `list`, `tasks.create`, `tasks.list`, `tasks.get`, `tasks.items.create`, `tasks.items.list` |
531
+ | `client.developer` | `get_usage`, `get_usage_summary`, `api_keys.*`, `webhooks.*` |
532
+
533
+ ---
534
+
535
+ ## Release Notes
536
+
537
+ ### v3.2.x
538
+
539
+ #### Highlights
540
+
541
+ 1. **Full EvalGate loop**: discover → cluster → label/analyze → synthesize → gate/auto
542
+ 2. **Golden dataset workflow**: canonical labeled dataset, analysis summaries, synthetic case generation, and replay decision helpers
543
+ 3. **Guided optimization**: `evalgate auto run`, `evalgate auto daemon`, and auto history/report helpers
544
+ 4. **CLI parity improvements**: Python CLI covers clustering, labeling, analysis, synthesis, replay-decision, and bounded auto workflows
545
+ 5. **Tracing + workflow integrations**: OpenAI, Anthropic, LangChain, CrewAI, and AutoGen remain first-class Python surfaces
546
+
547
+ #### Changelog
548
+
549
+ 1. **Correctness fixes (parity with TypeScript SDK)**:
550
+ * **Assertion return types**: sync helpers now normalize to `AssertionResult`, including `contains_keywords`, `has_sentiment`, `has_readability_score`, `similar_to`, `contains_json`, `has_no_toxicity`, `matches_schema`, `has_valid_code_syntax`, `follows_instructions`, and `contains_all_required_fields`
551
+ * **Toxicity blocklist**: expanded from 9 → 95 terms across 8 categories; uses `\b` word-boundary regex (no substring false positives)
552
+ * **`has_valid_code_syntax`**: Python uses `ast.parse` (real syntax validation); other languages use structural regex
553
+ * **`has_factual_accuracy`**: entity-aware word-overlap check instead of raw substring matching
554
+ * **Expectation parity**: `expect(...).not_` now inverts fluent assertions and `to_match_json()` accepts JSON strings or embedded JSON snippets
555
+ * **Batch compatibility**: `run_assertions()` now coerces legacy boolean and mapping results into `AssertionResult`
556
+ * **`has_sentiment_with_score`**: confidence gradient scales with margin × magnitude; single-word inputs no longer return 1.0
557
+ * **`WorkflowTracer`**: accepts `name` and `offline` kwargs; offline mode skips all API calls
558
+ * **`import_data`**: 2-arg `(data, options)` signature matching TypeScript; client is keyword-only
559
+ * **`Logger.child`**: uses `:` separator matching TypeScript (was `.`)
560
+ * **`define_eval.skip` / `.only`**: attached as methods on `define_eval`
561
+ * **Error `.message`**: every error class, including `ValidationError`, now exposes a `.message` property
562
+ * **`AssertionLLMConfig.timeout_ms`**: 30s default, enforced via `asyncio.wait_for`
563
+ * **`compute_baseline_checksum` / `verify_baseline_checksum`**: SHA-256 tamper detection
564
+ * **`TestSuiteConfig`**: added `retries`, `retry_delay_ms`, `retry_jitter`, `seed`, `strict`, `stop_on_failure`
565
+ * **`to_have_no_profanity`**: new method on `Expectation` matching TypeScript `toHaveNoProfanity`
566
+ * **`RequestCache`**: removed from public exports (internal only)
567
+ 2. **Production hardening**:
568
+ * 30s default timeout on all `httpx.AsyncClient` calls
569
+ * API key validation before sending requests
570
+ * URL-encoded query params in `fetch_quality_latest`
571
+ * Graceful error handling in `report_trace` and OTel exporter (no more crashes on network errors)
572
+ * `run_report` correctly sets `success=False` on test failures
573
+ * GitHub Actions formatter uses `GITHUB_OUTPUT` (deprecated `::set-output` removed)
574
+ * Config parse errors logged as warnings instead of silently swallowed
575
+ * `save_trace` / `save_evaluation` no longer mutate caller's dict
576
+ * Subprocess timeout handling in regression gate
577
+
578
+ 507 tests passing.
579
+
580
+ ---
581
+
582
+ ## Examples
583
+
584
+ See [`examples/python/`](https://github.com/evalgate/ai-evaluation-platform/tree/main/examples/python):
585
+
586
+ - **[OpenAI Eval](https://github.com/evalgate/ai-evaluation-platform/blob/main/examples/python/openai_eval.ipynb)** — Trace and evaluate OpenAI chat completions
587
+ - **[RAG Eval](https://github.com/evalgate/ai-evaluation-platform/blob/main/examples/python/rag_eval.ipynb)** — Evaluate retrieval-augmented generation pipelines
588
+ - **[Agent Eval](https://github.com/evalgate/ai-evaluation-platform/blob/main/examples/python/agent_eval.ipynb)** — Test and trace multi-agent workflows
589
+
590
+ ---
591
+
592
+ ## No Lock-in
593
+
594
+ ```bash
595
+ rm .evalgate/config.json
596
+ ```
597
+
598
+ Your local assertions keep working. No account cancellation. No data export required.
599
+
600
+ ---
601
+
602
+ ## Links
603
+
604
+ [Platform](https://evalgate.com) · [GitHub](https://github.com/evalgate/ai-evaluation-platform) · [TypeScript SDK](https://www.npmjs.com/package/@evalgate/sdk)
605
+
606
+ ## License
607
+
608
+ MIT
@@ -0,0 +1,80 @@
1
+ evalgate_sdk/__init__.py,sha256=1t6havKtAQCGUZDGsx7OHywPWS371vjMN3969tvw-I4,17915
2
+ evalgate_sdk/_version.py,sha256=FGscfcIpTNSl8_v-ekzH-qo7xflizQx-2ULm5j2v4nQ,71
3
+ evalgate_sdk/assertions.py,sha256=z8lOi7_OEXToZe5lTfLzb4wI7SyXryYpcS_RuK4PtKY,46432
4
+ evalgate_sdk/auto.py,sha256=_6R18_6AOKKg_aj-cy4ykFl7Lxw4fbengM_D1oKa_zU,9254
5
+ evalgate_sdk/batch.py,sha256=xFIKX55X1Ad3bljPHLzckG2MTcL556YrJDuEU_Wtq08,5601
6
+ evalgate_sdk/cache.py,sha256=_asiOV3s4OjjhHISonYwrGOdjlJkMnzFtCclWC9hyfU,3259
7
+ evalgate_sdk/ci_context.py,sha256=86YP27I6KWvmlF3p0L5aXMPHazFJSAsgfZ0spW0yCso,3892
8
+ evalgate_sdk/client.py,sha256=zO59_I0BGPfEWDr_yPKTWvTQA7geMXI7mRAdMC8S0co,24660
9
+ evalgate_sdk/cluster.py,sha256=BRP0PHliFtGx4nZdBZ-8DEW36SSIxoLncWoUqZiK9eQ,11698
10
+ evalgate_sdk/collector.py,sha256=JUdd-hXxy_o0CKzNl0ljrtiVQAz2dZ3OiB7kEE8z5C8,4747
11
+ evalgate_sdk/constants.py,sha256=ZimYIgG9vgLwe7rRfSf2TRRevfQgfgWTpNg6MziQzBQ,121
12
+ evalgate_sdk/context.py,sha256=8ftm9QJWM9Ak2vi5H9PcjIbpWqLlKeeGEZXEum0kjp8,4399
13
+ evalgate_sdk/errors.py,sha256=qjoib8ECiTuFa2te8z6SCWtwALTOqPgs2wGvwGG0lQc,8410
14
+ evalgate_sdk/export.py,sha256=DOVcTEEKZ1PGt9FTb6S4d5vo5RTQb1hVKolqpZQRTKc,8097
15
+ evalgate_sdk/golden.py,sha256=20DLfjihgQPBVvdSGLd9r19LqqV9GDK2JLzhocXPryk,17560
16
+ evalgate_sdk/local.py,sha256=UAtNetqm8elsRiqfjflUkn37RVBs87MVsyoXNlALIsc,5685
17
+ evalgate_sdk/logger.py,sha256=cCGlvzcF4bxVss6aQN2xiWpl6z_1ZefbQAcurXd9K5Q,4067
18
+ evalgate_sdk/matchers.py,sha256=h8jVD8W2luquaay7nwGNSaTERYInaaoMZXLgKnF2RpE,1898
19
+ evalgate_sdk/otel.py,sha256=t5389FLYxM0_ht1p0xu2BgLC8Z7crNlEEnUB-3WrJ5U,8851
20
+ evalgate_sdk/pagination.py,sha256=FEK68fYx3JUTyY3oFlr3Cx1w_7yYmWp1apK_2t3B17g,4088
21
+ evalgate_sdk/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
+ evalgate_sdk/pytest_plugin.py,sha256=Pvg3oyLRw6p4D87sI-078w5qsE0A8b5PhCFtTPUagYM,3427
23
+ evalgate_sdk/reason_codes.py,sha256=ylD7VwFZKeFI7_i8iTI1NewkGDPoUIB5FPyXGD98CHw,3550
24
+ evalgate_sdk/regression.py,sha256=hb5C10nJeILCwfDwAGIMif15NLuNZMVR_OmKwIdBt2g,6162
25
+ evalgate_sdk/replay_decision.py,sha256=AesoGm6n_nyfEZ1XiJUqpfCEKkozp_cslAXRx76eJtw,4388
26
+ evalgate_sdk/snapshot.py,sha256=XLgpaXovwDpSdtVBxrm-8H_vFv8DAu_iODdaxC1fMPc,6753
27
+ evalgate_sdk/streaming.py,sha256=zFRyuu7xJwdISCQmEAcmyNek8-4jadQGio4CLasSxvg,3701
28
+ evalgate_sdk/synthesize.py,sha256=2W_L8CAdD79MC9zkkECDzHTttyvG3mXaSXHgdoEBLeY,8542
29
+ evalgate_sdk/testing.py,sha256=99pYWKP75VsxI0Dgj8W_9J6Qk2o3iABoorBki7I5EMI,4306
30
+ evalgate_sdk/types.py,sha256=w93uk8JhBepEfH9xGgyHXe8zM4OO69WaWLWESl27PB4,18260
31
+ evalgate_sdk/workflows.py,sha256=hbdCbLdoxFV4UpRGPG9Xhp8vPaQqWBbki7Zhtp7YaYM,9092
32
+ evalgate_sdk/cli/__init__.py,sha256=udeI-aBSxgMMA6jNunKwnjUxJu_tev2qGnEbfd5KGtA,2881
33
+ evalgate_sdk/cli/api.py,sha256=_QuRXt-5ye-pHycauyq9LtPIWezLWXkx_ES70foQUDM,8178
34
+ evalgate_sdk/cli/cli_constants.py,sha256=wIIEgHhVIw73wOnYQTyIhbz5HbcfbV9CAMo1hKY-dok,353
35
+ evalgate_sdk/cli/commands.py,sha256=lP65uUcDC3I0avxK9v8TIpIpWcvC7f14JraSqEQ1YLI,40664
36
+ evalgate_sdk/cli/config.py,sha256=NEM68veuR4z-eGcwneL1GOh4WjCoU_XVlIgFJ73oNlI,7362
37
+ evalgate_sdk/cli/env.py,sha256=lFwBQnxTxLgBsEwcySdNEPuVef4XP8fgwlHqL1N5syo,1086
38
+ evalgate_sdk/cli/golden_commands.py,sha256=pIGNqpYJ9-Sd_65PuvyXJgsaT3VX03m4OxtEmTh6rXE,14718
39
+ evalgate_sdk/cli/manifest.py,sha256=QDaAVS1Hcqlrq0bBwe85fTEJzuO_8f--gMpBF3fagfI,9631
40
+ evalgate_sdk/cli/new_commands.py,sha256=4GH6Y1Pv0wQBy7UPuX_88gsE3CdbcRiaImFkZIE5IK8,16698
41
+ evalgate_sdk/cli/policy_packs.py,sha256=ZsPPTRnidEZKdxe9KruCy2EsQ_rbUecZctrd4Rf-jZo,3103
42
+ evalgate_sdk/cli/profiles.py,sha256=DVKyWtiavBO-1ybCn0tA74rJ0RXDd13aEf8_ORWmTLs,459
43
+ evalgate_sdk/cli/regression_gate.py,sha256=_TPkqEkXapvSAZpoQB7T8hsQjGUxVzGcGQEBt7Xl2zg,9700
44
+ evalgate_sdk/cli/traces.py,sha256=sjlKi5QjVisyMg--7qkz3qyhnqmSWtwdOvFdke7C81g,6172
45
+ evalgate_sdk/cli/workspace.py,sha256=oQclyoOQaw53VcsDRBlWJunce-mCR1R5oAP0S9wFAnw,1783
46
+ evalgate_sdk/cli/formatters/types.py,sha256=VcyLbRP1kYmT24I0MieaA2cldt0pQzrGIHJtHNVd60U,3573
47
+ evalgate_sdk/cli/render/__init__.py,sha256=yfgGT5xI8EYeSsjIRU4hqt1348u9JhomjVxQTWbjmYs,39
48
+ evalgate_sdk/cli/render/snippet.py,sha256=mKf0DNtJ_t_k2PlJOoqlf9_fRU2CbGe6f_m2z8GsZXc,438
49
+ evalgate_sdk/cli/render/sort.py,sha256=B_0BjUYz8C8ne03yyJr_6E80ajjG8LAeKzpl2SmRP6I,754
50
+ evalgate_sdk/cli/report/__init__.py,sha256=UMg_X33KNx52oI00FAUE4t8ucaUwcheGFmOxnRQtmxU,41
51
+ evalgate_sdk/cli/report/build_check_report.py,sha256=wkGtC2yutSbVTHM6lje5EGJzPVENYaHDIozTSOlSidI,7839
52
+ evalgate_sdk/formatters/__init__.py,sha256=OeO9n0NptwneXKG6ky4AWwaArX97UM7E4ew_w5PnfWg,433
53
+ evalgate_sdk/formatters/github.py,sha256=z88s1sWbLCteokQ8u-JdYybV9nGEwGCB7zwh4BuJ_mA,1842
54
+ evalgate_sdk/formatters/human.py,sha256=IaTLV4h-0rqrWet3sJMW1Vv4xo15RHsbYcP-icOg-YU,2480
55
+ evalgate_sdk/formatters/json_fmt.py,sha256=fD4KOFrtVHe6we9hxd04O9VlzD-Pqk0Tbg70KEaj71A,283
56
+ evalgate_sdk/formatters/pr_comment.py,sha256=ngOgpcOmLKtSLlTl9-2i__qqbfFpWANX3LVHHRkP3RA,2823
57
+ evalgate_sdk/integrations/__init__.py,sha256=NAhgW3m42rDPxNZz4yx9S67no4vhBqkWDY5jiAMosD8,74
58
+ evalgate_sdk/integrations/anthropic.py,sha256=cYpfmLMPEsRiWUmp5MOdjVh_0r1dUtdSvruXarx3FXg,3074
59
+ evalgate_sdk/integrations/autogen.py,sha256=7jATsDiM4GvzWlxy9L6x6b59f82SLZsY4IqyJdV9zy0,2384
60
+ evalgate_sdk/integrations/crewai.py,sha256=L2lUJ_3kXBQqQF-l5DQ20mZ1KRe3YcGioylqlRmooaM,2174
61
+ evalgate_sdk/integrations/langchain.py,sha256=GOLC8ivFqS25jhzTD3LIGHPAdDqDKLVOxMg8xyUWJ0U,3748
62
+ evalgate_sdk/integrations/openai.py,sha256=oqnNfSHqt5-axNODB24voQJoDziggEVf7BCwdPocaAY,4909
63
+ evalgate_sdk/integrations/openai_eval.py,sha256=4QcOAx40esdUpmOqcwJ2hxuj2rCkBMXui_cTFysMeEM,7098
64
+ evalgate_sdk/runtime/__init__.py,sha256=S6qaWxlyB7QS2Q7On8Pb-8cz8yrnBtbjGtu-Fm5DLx8,1241
65
+ evalgate_sdk/runtime/context.py,sha256=98IkuFZJ1Yxcbg57NfTNsky8gwUR080Dz_GPwVaPbIY,2190
66
+ evalgate_sdk/runtime/eval.py,sha256=NRYbP4mNOtpmquuSZgi5ZhBuk9rInW2DRxCXQ2lWHpc,9714
67
+ evalgate_sdk/runtime/execution_mode.py,sha256=uMA1_NZFDNQqoMIKTbmHCOgSGFTvlO43DMble6UOqBA,5080
68
+ evalgate_sdk/runtime/executor.py,sha256=qyLs9t1p-ljmwscRefM2c3itv8WrbKmUqs5pmLFW_Wk,3223
69
+ evalgate_sdk/runtime/registry.py,sha256=1FmmVtUma4y610a-ProXg3cCYbF8bccAS63Yu4CRlg4,3529
70
+ evalgate_sdk/runtime/run_report.py,sha256=ESrEZW3qUwdOwwAdfa7GCWUJsaXQlzWx7IajTRao6KE,7613
71
+ evalgate_sdk/runtime/types.py,sha256=xflPZlbQhYOSCdW8T3jDEuMqidoqaiSjC-52pl8_o3M,3639
72
+ evalgate_sdk/runtime/adapters/__init__.py,sha256=fT2RSPBf-Lc6L911KQ-TFtFLa3ersKVfF9D1T_tDQI8,53
73
+ evalgate_sdk/runtime/adapters/config_to_dsl.py,sha256=wRO5fZdfyFZE2WRdjiU_-7-TPFlKxpACi1W4MjM5EDY,9070
74
+ evalgate_sdk/runtime/adapters/testsuite_to_dsl.py,sha256=X-lCV2aICGEjppBPO67IiL5etVW3TRytITdG5xiPQ6Y,6642
75
+ evalgate_sdk/utils/__init__.py,sha256=qp_u7iwL20xqoqZ4_bt0E9CiP9OBCjoHQUZFq1vzxho,30
76
+ evalgate_sdk/utils/input_hash.py,sha256=AqY9lPKijJWgmpBTmkRzsjMw-FW-VocQ_2TCYA1BKwo,1225
77
+ evalgate_sdk-3.3.1.dist-info/METADATA,sha256=O84327rWl6A4XCc3uZE0zoNF-HXI9-vTIrb3ehx2k64,23472
78
+ evalgate_sdk-3.3.1.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
79
+ evalgate_sdk-3.3.1.dist-info/entry_points.txt,sha256=r04Fx9iLP6UAXOzTB5aAOUp4BewuMWa7gfC6K6NZZpU,51
80
+ evalgate_sdk-3.3.1.dist-info/RECORD,,