PyPI - evalgate-sdk - Versions diffs - 3.3.1__py3-none-any.whl - Mend

evalgate-sdk 3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (80) hide show

evalgate_sdk/__init__.py +707 -0
evalgate_sdk/_version.py +3 -0
evalgate_sdk/assertions.py +1362 -0
evalgate_sdk/auto.py +247 -0
evalgate_sdk/batch.py +174 -0
evalgate_sdk/cache.py +111 -0
evalgate_sdk/ci_context.py +123 -0
evalgate_sdk/cli/__init__.py +111 -0
evalgate_sdk/cli/api.py +261 -0
evalgate_sdk/cli/cli_constants.py +20 -0
evalgate_sdk/cli/commands.py +1041 -0
evalgate_sdk/cli/config.py +228 -0
evalgate_sdk/cli/env.py +43 -0
evalgate_sdk/cli/formatters/types.py +132 -0
evalgate_sdk/cli/golden_commands.py +322 -0
evalgate_sdk/cli/manifest.py +301 -0
evalgate_sdk/cli/new_commands.py +435 -0
evalgate_sdk/cli/policy_packs.py +103 -0
evalgate_sdk/cli/profiles.py +12 -0
evalgate_sdk/cli/regression_gate.py +312 -0
evalgate_sdk/cli/render/__init__.py +1 -0
evalgate_sdk/cli/render/snippet.py +18 -0
evalgate_sdk/cli/render/sort.py +29 -0
evalgate_sdk/cli/report/__init__.py +1 -0
evalgate_sdk/cli/report/build_check_report.py +209 -0
evalgate_sdk/cli/traces.py +186 -0
evalgate_sdk/cli/workspace.py +63 -0
evalgate_sdk/client.py +609 -0
evalgate_sdk/cluster.py +359 -0
evalgate_sdk/collector.py +161 -0
evalgate_sdk/constants.py +6 -0
evalgate_sdk/context.py +151 -0
evalgate_sdk/errors.py +236 -0
evalgate_sdk/export.py +238 -0
evalgate_sdk/formatters/__init__.py +11 -0
evalgate_sdk/formatters/github.py +51 -0
evalgate_sdk/formatters/human.py +68 -0
evalgate_sdk/formatters/json_fmt.py +11 -0
evalgate_sdk/formatters/pr_comment.py +80 -0
evalgate_sdk/golden.py +426 -0
evalgate_sdk/integrations/__init__.py +1 -0
evalgate_sdk/integrations/anthropic.py +99 -0
evalgate_sdk/integrations/autogen.py +62 -0
evalgate_sdk/integrations/crewai.py +61 -0
evalgate_sdk/integrations/langchain.py +100 -0
evalgate_sdk/integrations/openai.py +155 -0
evalgate_sdk/integrations/openai_eval.py +221 -0
evalgate_sdk/local.py +144 -0
evalgate_sdk/logger.py +123 -0
evalgate_sdk/matchers.py +62 -0
evalgate_sdk/otel.py +256 -0
evalgate_sdk/pagination.py +145 -0
evalgate_sdk/py.typed +0 -0
evalgate_sdk/pytest_plugin.py +96 -0
evalgate_sdk/reason_codes.py +103 -0
evalgate_sdk/regression.py +196 -0
evalgate_sdk/replay_decision.py +115 -0
evalgate_sdk/runtime/__init__.py +50 -0
evalgate_sdk/runtime/adapters/__init__.py +1 -0
evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
evalgate_sdk/runtime/context.py +68 -0
evalgate_sdk/runtime/eval.py +318 -0
evalgate_sdk/runtime/execution_mode.py +170 -0
evalgate_sdk/runtime/executor.py +92 -0
evalgate_sdk/runtime/registry.py +125 -0
evalgate_sdk/runtime/run_report.py +249 -0
evalgate_sdk/runtime/types.py +143 -0
evalgate_sdk/snapshot.py +219 -0
evalgate_sdk/streaming.py +124 -0
evalgate_sdk/synthesize.py +226 -0
evalgate_sdk/testing.py +128 -0
evalgate_sdk/types.py +666 -0
evalgate_sdk/utils/__init__.py +1 -0
evalgate_sdk/utils/input_hash.py +42 -0
evalgate_sdk/workflows.py +264 -0
evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0

evalgate_sdk-3.3.1.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,608 @@
+Metadata-Version: 2.4
+Name: evalgate-sdk
+Version: 3.3.1
+Summary: EvalGate Python SDK — CI for AI behavior. Traces, evaluations, assertions, and regression gates for LLM apps.
+Project-URL: Homepage, https://evalgate.com
+Project-URL: Documentation, https://github.com/evalgate/ai-evaluation-platform#readme
+Project-URL: Repository, https://github.com/evalgate/ai-evaluation-platform
+Project-URL: Issues, https://github.com/evalgate/ai-evaluation-platform/issues
+Project-URL: Changelog, https://github.com/evalgate/ai-evaluation-platform/blob/main/src/packages/sdk-python/CHANGELOG.md
+Author-email: EvalGate <team@evalgate.com>
+License-Expression: MIT
+Keywords: ai,anthropic,assertions,ci,evaluation,llm,monitoring,observability,openai,regression,testing,tracing,workflow
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Software Development :: Testing
+Classifier: Typing :: Typed
+Requires-Python: >=3.10
+Requires-Dist: httpx<1,>=0.27
+Requires-Dist: pydantic<3,>=2.0
+Provides-Extra: all
+Requires-Dist: anthropic>=0.20; extra == 'all'
+Requires-Dist: langchain-core>=0.2; extra == 'all'
+Requires-Dist: openai>=1.0; extra == 'all'
+Requires-Dist: rich>=13; extra == 'all'
+Requires-Dist: typer>=0.12; extra == 'all'
+Provides-Extra: anthropic
+Requires-Dist: anthropic>=0.20; extra == 'anthropic'
+Provides-Extra: cli
+Requires-Dist: rich>=13; extra == 'cli'
+Requires-Dist: typer>=0.12; extra == 'cli'
+Provides-Extra: dev
+Requires-Dist: mypy>=1.10; extra == 'dev'
+Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
+Requires-Dist: pytest-cov>=5; extra == 'dev'
+Requires-Dist: pytest>=8; extra == 'dev'
+Requires-Dist: respx>=0.21; extra == 'dev'
+Requires-Dist: rich>=13; extra == 'dev'
+Requires-Dist: ruff>=0.5; extra == 'dev'
+Requires-Dist: typer>=0.12; extra == 'dev'
+Provides-Extra: langchain
+Requires-Dist: langchain-core>=0.2; extra == 'langchain'
+Provides-Extra: openai
+Requires-Dist: openai>=1.0; extra == 'openai'
+Description-Content-Type: text/markdown
+# evalgate-sdk
+ Build a living golden suite for AI behavior. 🚀
+ No infra. No lock-in. Remove anytime.
+ **EvalGate = the full suite for AI quality in Python.** Discover overlap, cluster failures, build golden datasets, run automated regression gates, and guide optimization before changes reach production.
+[![PyPI](https://img.shields.io/pypi/v/evalgate-sdk)](https://pypi.org/project/evalgate-sdk/)
+[![Python](https://img.shields.io/pypi/pyversions/evalgate-sdk)](https://pypi.org/project/evalgate-sdk/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
+ [![Typed](https://img.shields.io/badge/typing-typed-blue)](https://peps.python.org/pep-0561/)
+ [![Tests](https://img.shields.io/badge/tests-507%20passed-brightgreen.svg)](#)
+ ---
+ ## The Full EvalGate Workflow
+ EvalGate is no longer just a pass/fail gate at the end of CI. The current workflow is a full loop:
+ ```text
+ discover -> cluster -> label/analyze -> synthesize -> gate/auto
+ ```
+ - **Discover overlap before adding more tests** with `evalgate discover --manifest`
+ - **Cluster failures by pattern** with `evalgate cluster --run .evalgate/runs/latest.json`
+ - **Build a labeled golden dataset** with `evalgate label` and `evalgate analyze`
+ - **Draft broader golden cases** with `evalgate synthesize --dataset .evalgate/golden/labeled.jsonl --output .evalgate/golden/synthetic.jsonl`
+ - **Block regressions or run guided optimization** with `evalgate gate`, `evalgate ci`, and `evalgate auto`
+ The Python SDK ships the same closed-loop workflow primitives as the platform: assertions, spec execution, tracing, clustering, golden-dataset analysis, synthesis, replay decision, and guided auto iterations.
+ ---
+## Install
+```bash
+pip install evalgate-sdk                        # Core
+pip install "evalgate-sdk[openai]"              # + OpenAI tracing and async assertions
+pip install "evalgate-sdk[anthropic]"           # + Anthropic tracing and async assertions
+pip install "evalgate-sdk[all]"                 # Everything
+```
+ ---
+## Quickstart
+ No API key needed for local assertions:
+ ```python
+ from evalgate_sdk import AIEvalClient, expect
+ from evalgate_sdk.types import CreateTraceParams
+ # Local assertions — no API key needed
+ result = expect("The capital of France is Paris.").to_contain("Paris")
+ print(result.passed)  # True
+ # Platform: trace and evaluate with API key
+ client = AIEvalClient(api_key="sk-...")
+ trace = await client.traces.create(CreateTraceParams(name="chat-quality"))
+ ```
+ Same CI gate, same quality checks. Python supports the same core loop as TypeScript: assertions, test suites, OpenAI/Anthropic tracing, LangChain/CrewAI/AutoGen integrations, golden dataset workflow commands, and regression gates.
+**Python CLI:** `pip install "evalgate-sdk[cli]"` → `evalgate init`, `evalgate run`, `evalgate check`, `evalgate gate`, `evalgate ci`, `evalgate discover`, `evalgate cluster`, `evalgate label`, `evalgate analyze`, `evalgate synthesize`, `evalgate replay-decision`, `evalgate explain`, `evalgate doctor`, `evalgate auto`.
+Context helpers are importable from the package root:
+```python
+from evalgate_sdk import ContextMetadata, create_context
+ctx: ContextMetadata = {"run_id": "test-run"}
+token = create_context(ctx)
+```
+ ---
+## Why EvalGate?
+LLMs don't fail like traditional software — they drift silently. EvalGate turns evaluations into CI gates so regressions never reach production.
+| What you get | How it works |
+|--------------|--------------|
+| **30+ assertions** | `expect(output).to_contain("Paris")`, `.to_not_contain_pii()`, `.to_have_no_profanity()` |
+| **DSL spec system** | `define_eval("name", executor)` with `.skip` and `.only` support |
+| **Test suites** | Define cases with retries, seed, strict mode, and stop-on-failure |
+| **Workflow tracing** | Multi-agent handoffs, decisions, costs — with offline mode |
+| **OpenAI / Anthropic** | Drop-in tracing wrappers + LangChain, CrewAI, AutoGen |
+| **Regression gates** | Block deploys when eval scores drop, with baseline tamper detection |
+| **Snapshot testing** | Save, compare, and diff outputs over time |
+| **Impact analysis** | `evalgate discover` → manifest → impact analysis → run only what changed |
+| **CLI** | `evalgate run`, `evalgate check`, `evalgate gate`, `evalgate ci`, `evalgate discover`, `evalgate cluster`, `evalgate label`, `evalgate analyze`, `evalgate synthesize`, `evalgate replay-decision`, `evalgate explain`, `evalgate doctor`, `evalgate auto` |
+ ---
+## Assertions
+30+ built-in checks for LLM output quality, safety, and structure. All return `AssertionResult` with `.passed`, `.message`, `.expected`, `.actual`.
+### Fluent API (`expect`)
+```python
+from evalgate_sdk import expect
+# Content
+expect("The capital of France is Paris.").to_contain("Paris")
+expect("draft output").not_.to_contain("final answer")
+expect("Hello World").to_not_contain_pii()
+expect("Thank you for your help.").to_be_professional()
+expect("Clean output").to_have_no_profanity()
+# Sentiment
+expect("Great product!").to_have_sentiment("positive")
+# Structure
+expect('{"name": "Alice"}').to_be_valid_json()
+expect('{"name": "Alice"}').to_match_json({"type": "object"})
+expect('payload={"name": "Alice"}').to_match_json({"required": ["name"]})
+expect(0.95).to_be_between(0.0, 1.0)
+expect("Hello world").to_have_length(min=5, max=100)
+expect(output).to_contain_keywords(["gravity", "force"])
+# Comparison
+expect(42).to_be_greater_than(10)
+expect(42).to_be_less_than(100)
+expect(True).to_be_truthy()
+# Code
+expect("def hello(): pass").to_contain_code()
+# Hallucination
+expect(output).to_not_hallucinate(["Paris is the capital of France"])
+```
+### Standalone Functions
+```python
+from evalgate_sdk import (
+    contains_keywords, has_no_toxicity, has_sentiment, similar_to,
+    contains_json, has_readability_score, has_factual_accuracy,
+    has_valid_code_syntax, has_sentiment_with_score, matches_pattern,
+    matches_schema, responded_within_duration, responded_within_time_since,
+    run_assertions,
+)
+# Sync standalone assertion helpers return AssertionResult
+result = has_no_toxicity("Thank you for your help.")
+print(result.passed, result.message)
+result = has_valid_code_syntax("def hello():\n    return 'hi'", "python")
+print(result.passed)  # True — uses ast.parse for Python
+result = matches_schema('payload={"status": "ok"}', {"required": ["status"]})
+print(result.passed, result.actual)
+# Batch assertions
+results = run_assertions([
+    lambda: expect(output).to_contain("Paris"),
+    lambda: expect(output).to_have_sentiment("positive"),
+    lambda: expect(output).to_have_length(min=10),
+    lambda: True,  # legacy bools are coerced into AssertionResult
+])
+all_passed = all(r.passed for r in results)
+```
+Compatibility helpers such as `has_pii()`, async semantic checks like `has_sentiment_async()`, and score-style utilities such as `has_consistency()` still return booleans or dictionaries where documented.
+### LLM-Backed Assertions (Async)
+Use the async, LLM-backed assertions when you need context-aware checking beyond heuristics. Install the matching optional extra first, for example `pip install "evalgate-sdk[openai]"` when using the default OpenAI provider.
+```python
+from evalgate_sdk import configure_assertions
+from evalgate_sdk import has_sentiment_async, has_no_toxicity_async
+configure_assertions(
+    provider="openai",             # or "anthropic"
+    api_key="sk-...",
+    model="gpt-4o-mini",
+    timeout_ms=30_000,              # 30s default, prevents hung calls
+)
+matches = await has_sentiment_async("subtle irony...", "negative")
+is_safe = await has_no_toxicity_async("borderline text")
+```
+You can also keep using `configure_assertions(AssertionLLMConfig(...))` when you prefer an explicit config object.
+ ---
+## DSL Spec System
+Define evaluation specs with the `define_eval` DSL — the same API as the TypeScript SDK:
+```python
+from evalgate_sdk import define_eval, create_result
+define_eval("Math Operations", async_executor)
+# Object form with metadata
+define_eval({
+    "name": "String check",
+    "tags": ["basic"],
+    "executor": async_executor,
+})
+# Skip / Only (matches TS defineEval.skip / defineEval.only)
+define_eval.skip("Skipped spec", async_executor)
+define_eval.only("Focus spec", async_executor)
+```
+ ---
+## Test Suites
+```python
+from evalgate_sdk import create_test_suite
+from evalgate_sdk.types import TestSuiteCase, TestSuiteConfig
+suite = create_test_suite("safety-checks", TestSuiteConfig(
+    evaluator=my_llm_function,
+    test_cases=[
+        TestSuiteCase(name="greeting", input="Hello", expected_output="Hi there!"),
+        TestSuiteCase(name="pii-check", input="Describe yourself",
+                      assertions=[{"type": "not_contains_pii"}]),
+    ],
+    retries=3,                # Retry failed cases (default: 0)
+    retry_delay_ms=1000,      # Delay between retries
+    retry_jitter=True,        # Add jitter to retry delay
+    seed=42,                  # Deterministic ordering
+    strict=True,              # Fail on warnings
+    stop_on_failure=True,     # Abort on first failure
+))
+result = await suite.run()
+print(f"{result.passed_count}/{result.total} passed")
+```
+ ---
+## OpenAI Integration
+```python
+from openai import AsyncOpenAI
+from evalgate_sdk import AIEvalClient
+from evalgate_sdk.integrations.openai import trace_openai
+traced = trace_openai(AsyncOpenAI(), AIEvalClient.init())
+response = await traced.chat.completions.create(
+    model="gpt-4",
+    messages=[{"role": "user", "content": "Explain gravity"}]
+)
+# Automatically traced with latency, tokens, and output
+```
+Batch eval with built-in assertions:
+```python
+from evalgate_sdk import openai_chat_eval, OpenAIChatEvalCase
+result = await openai_chat_eval(
+    name="chat-quality",
+    model="gpt-4",
+    cases=[
+        OpenAIChatEvalCase(
+            input="Explain gravity in one sentence.",
+            assertions=[{"type": "contains_keywords", "value": ["gravity", "force"]}],
+        ),
+    ],
+)
+print(f"{result.passed_count}/{result.total} passed — score: {result.score:.2f}")
+```
+ ---
+## Anthropic Integration
+```python
+from anthropic import AsyncAnthropic
+from evalgate_sdk import AIEvalClient
+from evalgate_sdk.integrations.anthropic import trace_anthropic
+traced = trace_anthropic(AsyncAnthropic(), AIEvalClient.init())
+response = await traced.messages.create(
+    model="claude-sonnet-4-20250514",
+    max_tokens=1024,
+    messages=[{"role": "user", "content": "Explain gravity"}]
+)
+```
+Also available: `trace_langchain`, `trace_crewai`, `trace_autogen`.
+ ---
+## Workflow Tracing
+Track multi-agent systems end-to-end — handoffs, decisions, and cost:
+```python
+from evalgate_sdk import AIEvalClient, WorkflowTracer
+from evalgate_sdk.types import HandoffType, CostCategory, RecordCostParams
+client = AIEvalClient.init()
+tracer = WorkflowTracer(client, name="research-pipeline")
+ctx = await tracer.start_workflow()
+span = await tracer.start_agent_span("researcher", {"query": "AI trends"})
+await tracer.end_agent_span(span, {"findings": "..."})
+await tracer.record_handoff("researcher", "writer", handoff_type=HandoffType.DELEGATION)
+await tracer.record_cost(RecordCostParams(
+    agent_name="researcher", category=CostCategory.LLM_INPUT, amount=0.05, tokens=1500
+))
+await tracer.end_workflow()
+print(f"Total cost: ${tracer.get_total_cost():.2f}")
+```
+### Offline Mode
+Run workflow tracing locally without an API connection:
+```python
+tracer = WorkflowTracer(None, name="local-test", offline=True)
+ctx = await tracer.start_workflow()  # No API calls, no crash
+```
+You can also omit the client entirely when you want local-only workflow tracing:
+```python
+from evalgate_sdk import create_workflow_tracer
+tracer = create_workflow_tracer(name="local-test")
+ctx = await tracer.start_workflow()
+assert ctx.trace_id is None
+```
+## Batch Processing
+`batch_process(items, processor, concurrency=...)` expects an async callable for `processor` and returns results in input order.
+```python
+from evalgate_sdk import batch_process
+async def double(value: int) -> int:
+    return value * 2
+results = await batch_process([1, 2, 3], double, concurrency=2)
+```
+If you pass a synchronous function, the SDK raises `TypeError` immediately instead of failing later with a generic await error.
+## Snapshot Testing
+Snapshots are stored in `.snapshots` by default, relative to the current working directory.
+```python
+from evalgate_sdk import compare_with_snapshot, snapshot
+snapshot("Hello there", "support-reply")
+comparison = compare_with_snapshot("support-reply", "Hello there")
+print(comparison.matches)
+```
+Override the directory when you want snapshots under a project-specific path:
+```python
+snapshot("Hello there", "support-reply", directory=".evalgate/snapshots")
+```
+Add `.snapshots/` to your `.gitignore` unless you intentionally want snapshot files committed.
+ ---
+## Regression Gates
+Block deployments when eval scores drop:
+```python
+from evalgate_sdk import evaluate_regression, to_pass_gate
+report = evaluate_regression(current_results, baseline)
+assert to_pass_gate(report), f"Regression detected: {report.summary}"
+```
+### Baseline Tamper Detection
+```python
+from evalgate_sdk import compute_baseline_checksum, verify_baseline_checksum, Baseline
+baseline = Baseline(scores={"chat-quality": 0.95, "safety": 0.99})
+checksum = compute_baseline_checksum(baseline)
+# Later — verify integrity before gating
+assert verify_baseline_checksum(baseline, checksum), "Baseline tampered!"
+```
+ ---
+## CLI
+```bash
+evalgate init                          # Scaffold eval config
+evalgate discover                      # Find eval spec files
+evalgate discover --manifest           # Generate stable manifest
+evalgate run --write-results           # Run with artifact retention
+evalgate gate                          # Regression gate
+evalgate ci                            # Run + gate (CI mode)
+evalgate ci --base main --format github # CI with PR summary
+evalgate cluster --run .evalgate/runs/latest.json
+evalgate label --run .evalgate/runs/latest.json
+evalgate analyze --dataset .evalgate/golden/labeled.jsonl
+evalgate synthesize --dataset .evalgate/golden/labeled.jsonl --output .evalgate/golden/synthetic.jsonl
+evalgate replay-decision --previous .evalgate/runs/run-prev.json --current .evalgate/runs/run-latest.json
+evalgate auto run --objective "reduce hallucination" --baseline-run previous.json --candidate-run current.json
+evalgate auto daemon --objective "reduce hallucination" --cycles 3
+evalgate compare --base a.json --head b.json  # Side-by-side diff
+evalgate doctor                        # Preflight checklist
+evalgate explain                       # Root cause analysis on last failure
+evalgate impact-analysis --base main   # Run only impacted specs
+```
+### Exit Codes
+| Code | Meaning |
+|------|---------|
+| 0 | Pass — no regression |
+| 1 | Regression detected |
+| 2 | Infra error (baseline missing, tests crashed) |
+ ---
+## Data Export & Import
+```python
+from evalgate_sdk import export_data, import_data, ExportOptions, export_to_file
+# Export
+data = await export_data(client, ExportOptions(format="json"))
+export_to_file(data, "backup.json")
+# Import (2-arg API — client is optional keyword arg)
+from evalgate_sdk import import_from_file
+data = import_from_file("backup.json")
+result = await import_data(data, client=client)
+# LangSmith migration
+from evalgate_sdk import import_from_langsmith
+data = import_from_langsmith(langsmith_export)
+```
+ ---
+## Reliability
+| Feature | Detail |
+|---|---|
+| **Python** | 3.10, 3.11, 3.12, 3.13 |
+| **Dependencies** | Only `httpx` + `pydantic` |
+| **Async** | Native `async/await` throughout; sync wrappers available |
+| **Type hints** | Full `py.typed` — works with mypy and Pyright |
+| **Errors** | Structured: `RateLimitError`, `AuthenticationError`, `NetworkError`, `ValidationError` — all have `.message` |
+| **Rate handling** | Built-in `RateLimiter` with configurable tiers |
+| **Batching** | `batch_process()` with concurrency control |
+| **Pagination** | Async `PaginatedIterator` with cursor support |
+| **Timeouts** | 30s default on all HTTP clients and LLM assertion calls |
+| **Offline** | `WorkflowTracer(offline=True)`, `LocalStorage` for file-based dev |
+ ---
+## API Reference
+| Module | Methods |
+|---|---|
+| `client.traces` | `create`, `list`, `get`, `update`, `delete`, `create_span`, `list_spans` |
+| `client.evaluations` | `create`, `get`, `list`, `update`, `delete`, `create_test_case`, `list_test_cases`, `create_run`, `list_runs`, `get_run` |
+| `client.llm_judge` | `evaluate`, `create_config`, `list_configs`, `list_results`, `get_alignment` |
+| `client.annotations` | `create`, `list`, `tasks.create`, `tasks.list`, `tasks.get`, `tasks.items.create`, `tasks.items.list` |
+| `client.developer` | `get_usage`, `get_usage_summary`, `api_keys.*`, `webhooks.*` |
+ ---
+## Release Notes
+### v3.2.x
+#### Highlights
+1. **Full EvalGate loop**: discover → cluster → label/analyze → synthesize → gate/auto
+2. **Golden dataset workflow**: canonical labeled dataset, analysis summaries, synthetic case generation, and replay decision helpers
+3. **Guided optimization**: `evalgate auto run`, `evalgate auto daemon`, and auto history/report helpers
+4. **CLI parity improvements**: Python CLI covers clustering, labeling, analysis, synthesis, replay-decision, and bounded auto workflows
+5. **Tracing + workflow integrations**: OpenAI, Anthropic, LangChain, CrewAI, and AutoGen remain first-class Python surfaces
+#### Changelog
+1. **Correctness fixes (parity with TypeScript SDK)**:
+	* **Assertion return types**: sync helpers now normalize to `AssertionResult`, including `contains_keywords`, `has_sentiment`, `has_readability_score`, `similar_to`, `contains_json`, `has_no_toxicity`, `matches_schema`, `has_valid_code_syntax`, `follows_instructions`, and `contains_all_required_fields`
+	* **Toxicity blocklist**: expanded from 9 → 95 terms across 8 categories; uses `\b` word-boundary regex (no substring false positives)
+	* **`has_valid_code_syntax`**: Python uses `ast.parse` (real syntax validation); other languages use structural regex
+	* **`has_factual_accuracy`**: entity-aware word-overlap check instead of raw substring matching
+	* **Expectation parity**: `expect(...).not_` now inverts fluent assertions and `to_match_json()` accepts JSON strings or embedded JSON snippets
+	* **Batch compatibility**: `run_assertions()` now coerces legacy boolean and mapping results into `AssertionResult`
+	* **`has_sentiment_with_score`**: confidence gradient scales with margin × magnitude; single-word inputs no longer return 1.0
+	* **`WorkflowTracer`**: accepts `name` and `offline` kwargs; offline mode skips all API calls
+	* **`import_data`**: 2-arg `(data, options)` signature matching TypeScript; client is keyword-only
+	* **`Logger.child`**: uses `:` separator matching TypeScript (was `.`)
+	* **`define_eval.skip` / `.only`**: attached as methods on `define_eval`
+	* **`ValidationError.message`**: `.message` property on all error classes
+	* **`AssertionLLMConfig.timeout_ms`**: 30s default, enforced via `asyncio.wait_for`
+	* **`compute_baseline_checksum` / `verify_baseline_checksum`**: SHA-256 tamper detection
+	* **`TestSuiteConfig`**: added `retries`, `retry_delay_ms`, `retry_jitter`, `seed`, `strict`, `stop_on_failure`
+	* **`to_have_no_profanity`**: new method on `Expectation` matching TypeScript `toHaveNoProfanity`
+	* **`RequestCache`**: removed from public exports (internal only)
+2. **Production hardening**:
+	* 30s default timeout on all `httpx.AsyncClient` calls
+	* API key validation before sending requests
+	* URL-encoded query params in `fetch_quality_latest`
+	* Graceful error handling in `report_trace` and OTel exporter (no more crashes on network errors)
+	* `run_report` correctly sets `success=False` on test failures
+	* GitHub Actions formatter uses `GITHUB_OUTPUT` (deprecated `::set-output` removed)
+	* Config parse errors logged as warnings instead of silently swallowed
+	* `save_trace` / `save_evaluation` no longer mutate caller's dict
+	* Subprocess timeout handling in regression gate
+507 tests passing.
+ ---
+## Examples
+See [`examples/python/`](https://github.com/evalgate/ai-evaluation-platform/tree/main/examples/python):
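+- **[OpenAI Eval](https://github.com/evalgate/ai-evaluation-platform/blob/main/examples/python/openai_eval.ipynb)** — Trace and evaluate OpenAI chat completions
+- **[RAG Eval](https://github.com/evalgate/ai-evaluation-platform/blob/main/examples/python/rag_eval.ipynb)** — Evaluate retrieval-augmented generation pipelines
+- **[Agent Eval](https://github.com/evalgate/ai-evaluation-platform/blob/main/examples/python/agent_eval.ipynb)** — Test and trace multi-agent workflows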
+---
+## No Lock-in
+```bash
+rm .evalgate/config.json
+```
+Your local assertions keep working. No account cancellation. No data export required.
+---
+## Links
+[Platform](https://evalgate.com) · [GitHub](https://github.com/evalgate/ai-evaluation-platform) · [TypeScript SDK](https://www.npmjs.com/package/@evalgate/sdk)
+## License
+MIT

evalgate_sdk-3.3.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,80 @@
+evalgate_sdk/__init__.py,sha256=1t6havKtAQCGUZDGsx7OHywPWS371vjMN3969tvw-I4,17915
+evalgate_sdk/_version.py,sha256=FGscfcIpTNSl8_v-ekzH-qo7xflizQx-2ULm5j2v4nQ,71
+evalgate_sdk/assertions.py,sha256=z8lOi7_OEXToZe5lTfLzb4wI7SyXryYpcS_RuK4PtKY,46432
+evalgate_sdk/auto.py,sha256=_6R18_6AOKKg_aj-cy4ykFl7Lxw4fbengM_D1oKa_zU,9254
+evalgate_sdk/batch.py,sha256=xFIKX55X1Ad3bljPHLzckG2MTcL556YrJDuEU_Wtq08,5601
+evalgate_sdk/cache.py,sha256=_asiOV3s4OjjhHISonYwrGOdjlJkMnzFtCclWC9hyfU,3259
+evalgate_sdk/ci_context.py,sha256=86YP27I6KWvmlF3p0L5aXMPHazFJSAsgfZ0spW0yCso,3892
+evalgate_sdk/client.py,sha256=zO59_I0BGPfEWDr_yPKTWvTQA7geMXI7mRAdMC8S0co,24660
+evalgate_sdk/cluster.py,sha256=BRP0PHliFtGx4nZdBZ-8DEW36SSIxoLncWoUqZiK9eQ,11698
+evalgate_sdk/collector.py,sha256=JUdd-hXxy_o0CKzNl0ljrtiVQAz2dZ3OiB7kEE8z5C8,4747
+evalgate_sdk/constants.py,sha256=ZimYIgG9vgLwe7rRfSf2TRRevfQgfgWTpNg6MziQzBQ,121
+evalgate_sdk/context.py,sha256=8ftm9QJWM9Ak2vi5H9PcjIbpWqLlKeeGEZXEum0kjp8,4399
+evalgate_sdk/errors.py,sha256=qjoib8ECiTuFa2te8z6SCWtwALTOqPgs2wGvwGG0lQc,8410
+evalgate_sdk/export.py,sha256=DOVcTEEKZ1PGt9FTb6S4d5vo5RTQb1hVKolqpZQRTKc,8097
+evalgate_sdk/golden.py,sha256=20DLfjihgQPBVvdSGLd9r19LqqV9GDK2JLzhocXPryk,17560
+evalgate_sdk/local.py,sha256=UAtNetqm8elsRiqfjflUkn37RVBs87MVsyoXNlALIsc,5685
+evalgate_sdk/logger.py,sha256=cCGlvzcF4bxVss6aQN2xiWpl6z_1ZefbQAcurXd9K5Q,4067
+evalgate_sdk/matchers.py,sha256=h8jVD8W2luquaay7nwGNSaTERYInaaoMZXLgKnF2RpE,1898
+evalgate_sdk/otel.py,sha256=t5389FLYxM0_ht1p0xu2BgLC8Z7crNlEEnUB-3WrJ5U,8851
+evalgate_sdk/pagination.py,sha256=FEK68fYx3JUTyY3oFlr3Cx1w_7yYmWp1apK_2t3B17g,4088
+evalgate_sdk/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalgate_sdk/pytest_plugin.py,sha256=Pvg3oyLRw6p4D87sI-078w5qsE0A8b5PhCFtTPUagYM,3427
+evalgate_sdk/reason_codes.py,sha256=ylD7VwFZKeFI7_i8iTI1NewkGDPoUIB5FPyXGD98CHw,3550
+evalgate_sdk/regression.py,sha256=hb5C10nJeILCwfDwAGIMif15NLuNZMVR_OmKwIdBt2g,6162
+evalgate_sdk/replay_decision.py,sha256=AesoGm6n_nyfEZ1XiJUqpfCEKkozp_cslAXRx76eJtw,4388
+evalgate_sdk/snapshot.py,sha256=XLgpaXovwDpSdtVBxrm-8H_vFv8DAu_iODdaxC1fMPc,6753
+evalgate_sdk/streaming.py,sha256=zFRyuu7xJwdISCQmEAcmyNek8-4jadQGio4CLasSxvg,3701
+evalgate_sdk/synthesize.py,sha256=2W_L8CAdD79MC9zkkECDzHTttyvG3mXaSXHgdoEBLeY,8542
+evalgate_sdk/testing.py,sha256=99pYWKP75VsxI0Dgj8W_9J6Qk2o3iABoorBki7I5EMI,4306
+evalgate_sdk/types.py,sha256=w93uk8JhBepEfH9xGgyHXe8zM4OO69WaWLWESl27PB4,18260
+evalgate_sdk/workflows.py,sha256=hbdCbLdoxFV4UpRGPG9Xhp8vPaQqWBbki7Zhtp7YaYM,9092
+evalgate_sdk/cli/__init__.py,sha256=udeI-aBSxgMMA6jNunKwnjUxJu_tev2qGnEbfd5KGtA,2881
+evalgate_sdk/cli/api.py,sha256=_QuRXt-5ye-pHycauyq9LtPIWezLWXkx_ES70foQUDM,8178
+evalgate_sdk/cli/cli_constants.py,sha256=wIIEgHhVIw73wOnYQTyIhbz5HbcfbV9CAMo1hKY-dok,353
+evalgate_sdk/cli/commands.py,sha256=lP65uUcDC3I0avxK9v8TIpIpWcvC7f14JraSqEQ1YLI,40664
+evalgate_sdk/cli/config.py,sha256=NEM68veuR4z-eGcwneL1GOh4WjCoU_XVlIgFJ73oNlI,7362
+evalgate_sdk/cli/env.py,sha256=lFwBQnxTxLgBsEwcySdNEPuVef4XP8fgwlHqL1N5syo,1086
+evalgate_sdk/cli/golden_commands.py,sha256=pIGNqpYJ9-Sd_65PuvyXJgsaT3VX03m4OxtEmTh6rXE,14718
+evalgate_sdk/cli/manifest.py,sha256=QDaAVS1Hcqlrq0bBwe85fTEJzuO_8f--gMpBF3fagfI,9631
+evalgate_sdk/cli/new_commands.py,sha256=4GH6Y1Pv0wQBy7UPuX_88gsE3CdbcRiaImFkZIE5IK8,16698
+evalgate_sdk/cli/policy_packs.py,sha256=ZsPPTRnidEZKdxe9KruCy2EsQ_rbUecZctrd4Rf-jZo,3103
+evalgate_sdk/cli/profiles.py,sha256=DVKyWtiavBO-1ybCn0tA74rJ0RXDd13aEf8_ORWmTLs,459
+evalgate_sdk/cli/regression_gate.py,sha256=_TPkqEkXapvSAZpoQB7T8hsQjGUxVzGcGQEBt7Xl2zg,9700
+evalgate_sdk/cli/traces.py,sha256=sjlKi5QjVisyMg--7qkz3qyhnqmSWtwdOvFdke7C81g,6172
+evalgate_sdk/cli/workspace.py,sha256=oQclyoOQaw53VcsDRBlWJunce-mCR1R5oAP0S9wFAnw,1783
+evalgate_sdk/cli/formatters/types.py,sha256=VcyLbRP1kYmT24I0MieaA2cldt0pQzrGIHJtHNVd60U,3573
+evalgate_sdk/cli/render/__init__.py,sha256=yfgGT5xI8EYeSsjIRU4hqt1348u9JhomjVxQTWbjmYs,39
+evalgate_sdk/cli/render/snippet.py,sha256=mKf0DNtJ_t_k2PlJOoqlf9_fRU2CbGe6f_m2z8GsZXc,438
+evalgate_sdk/cli/render/sort.py,sha256=B_0BjUYz8C8ne03yyJr_6E80ajjG8LAeKzpl2SmRP6I,754
+evalgate_sdk/cli/report/__init__.py,sha256=UMg_X33KNx52oI00FAUE4t8ucaUwcheGFmOxnRQtmxU,41
+evalgate_sdk/cli/report/build_check_report.py,sha256=wkGtC2yutSbVTHM6lje5EGJzPVENYaHDIozTSOlSidI,7839
+evalgate_sdk/formatters/__init__.py,sha256=OeO9n0NptwneXKG6ky4AWwaArX97UM7E4ew_w5PnfWg,433
+evalgate_sdk/formatters/github.py,sha256=z88s1sWbLCteokQ8u-JdYybV9nGEwGCB7zwh4BuJ_mA,1842
+evalgate_sdk/formatters/human.py,sha256=IaTLV4h-0rqrWet3sJMW1Vv4xo15RHsbYcP-icOg-YU,2480
+evalgate_sdk/formatters/json_fmt.py,sha256=fD4KOFrtVHe6we9hxd04O9VlzD-Pqk0Tbg70KEaj71A,283
+evalgate_sdk/formatters/pr_comment.py,sha256=ngOgpcOmLKtSLlTl9-2i__qqbfFpWANX3LVHHRkP3RA,2823
+evalgate_sdk/integrations/__init__.py,sha256=NAhgW3m42rDPxNZz4yx9S67no4vhBqkWDY5jiAMosD8,74
+evalgate_sdk/integrations/anthropic.py,sha256=cYpfmLMPEsRiWUmp5MOdjVh_0r1dUtdSvruXarx3FXg,3074
+evalgate_sdk/integrations/autogen.py,sha256=7jATsDiM4GvzWlxy9L6x6b59f82SLZsY4IqyJdV9zy0,2384
+evalgate_sdk/integrations/crewai.py,sha256=L2lUJ_3kXBQqQF-l5DQ20mZ1KRe3YcGioylqlRmooaM,2174
+evalgate_sdk/integrations/langchain.py,sha256=GOLC8ivFqS25jhzTD3LIGHPAdDqDKLVOxMg8xyUWJ0U,3748
+evalgate_sdk/integrations/openai.py,sha256=oqnNfSHqt5-axNODB24voQJoDziggEVf7BCwdPocaAY,4909
+evalgate_sdk/integrations/openai_eval.py,sha256=4QcOAx40esdUpmOqcwJ2hxuj2rCkBMXui_cTFysMeEM,7098
+evalgate_sdk/runtime/__init__.py,sha256=S6qaWxlyB7QS2Q7On8Pb-8cz8yrnBtbjGtu-Fm5DLx8,1241
+evalgate_sdk/runtime/context.py,sha256=98IkuFZJ1Yxcbg57NfTNsky8gwUR080Dz_GPwVaPbIY,2190
+evalgate_sdk/runtime/eval.py,sha256=NRYbP4mNOtpmquuSZgi5ZhBuk9rInW2DRxCXQ2lWHpc,9714
+evalgate_sdk/runtime/execution_mode.py,sha256=uMA1_NZFDNQqoMIKTbmHCOgSGFTvlO43DMble6UOqBA,5080
+evalgate_sdk/runtime/executor.py,sha256=qyLs9t1p-ljmwscRefM2c3itv8WrbKmUqs5pmLFW_Wk,3223
+evalgate_sdk/runtime/registry.py,sha256=1FmmVtUma4y610a-ProXg3cCYbF8bccAS63Yu4CRlg4,3529
+evalgate_sdk/runtime/run_report.py,sha256=ESrEZW3qUwdOwwAdfa7GCWUJsaXQlzWx7IajTRao6KE,7613
+evalgate_sdk/runtime/types.py,sha256=xflPZlbQhYOSCdW8T3jDEuMqidoqaiSjC-52pl8_o3M,3639
+evalgate_sdk/runtime/adapters/__init__.py,sha256=fT2RSPBf-Lc6L911KQ-TFtFLa3ersKVfF9D1T_tDQI8,53
+evalgate_sdk/runtime/adapters/config_to_dsl.py,sha256=wRO5fZdfyFZE2WRdjiU_-7-TPFlKxpACi1W4MjM5EDY,9070
+evalgate_sdk/runtime/adapters/testsuite_to_dsl.py,sha256=X-lCV2aICGEjppBPO67IiL5etVW3TRytITdG5xiPQ6Y,6642
+evalgate_sdk/utils/__init__.py,sha256=qp_u7iwL20xqoqZ4_bt0E9CiP9OBCjoHQUZFq1vzxho,30
+evalgate_sdk/utils/input_hash.py,sha256=AqY9lPKijJWgmpBTmkRzsjMw-FW-VocQ_2TCYA1BKwo,1225
+evalgate_sdk-3.3.1.dist-info/METADATA,sha256=O84327rWl6A4XCc3uZE0zoNF-HXI9-vTIrb3ehx2k64,23472
+evalgate_sdk-3.3.1.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
+evalgate_sdk-3.3.1.dist-info/entry_points.txt,sha256=r04Fx9iLP6UAXOzTB5aAOUp4BewuMWa7gfC6K6NZZpU,51
+evalgate_sdk-3.3.1.dist-info/RECORD,,