evalgate-sdk 3.3.1__tar.gz → 3.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/CHANGELOG.md +22 -0
- evalgate_sdk-3.5.0/PKG-INFO +220 -0
- evalgate_sdk-3.5.0/README.md +168 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/pyproject.toml +3 -3
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/__init__.py +137 -7
- evalgate_sdk-3.5.0/src/evalgate_sdk/_version.py +3 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/assertions.py +79 -10
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/auto.py +9 -4
- evalgate_sdk-3.5.0/src/evalgate_sdk/auto_experiment_runner.py +214 -0
- evalgate_sdk-3.5.0/src/evalgate_sdk/auto_failure_analyzer.py +211 -0
- evalgate_sdk-3.5.0/src/evalgate_sdk/auto_mutation_engine.py +354 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/batch.py +2 -1
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/__init__.py +9 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/api.py +184 -1
- evalgate_sdk-3.5.0/src/evalgate_sdk/cli/api_command_config.py +83 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/commands.py +73 -37
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/config.py +14 -16
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/golden_commands.py +22 -8
- evalgate_sdk-3.5.0/src/evalgate_sdk/cli/judge_commands.py +436 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/report/build_check_report.py +6 -4
- evalgate_sdk-3.5.0/src/evalgate_sdk/cli/workflow_commands.py +615 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/workspace.py +2 -23
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/client.py +405 -52
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cluster.py +126 -7
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/collector.py +41 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/context.py +5 -8
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/errors.py +5 -5
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/export.py +3 -3
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/golden.py +60 -19
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/integrations/anthropic.py +1 -1
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/integrations/autogen.py +3 -2
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/integrations/crewai.py +3 -2
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/integrations/langchain.py +1 -1
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/integrations/openai.py +1 -1
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/integrations/openai_eval.py +35 -8
- evalgate_sdk-3.5.0/src/evalgate_sdk/knowledge.py +43 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/local.py +3 -2
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/logger.py +1 -1
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/pagination.py +1 -1
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/replay_decision.py +3 -1
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/adapters/config_to_dsl.py +6 -14
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/eval.py +9 -10
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/execution_mode.py +2 -6
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/executor.py +5 -3
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/synthesize.py +18 -5
- evalgate_sdk-3.5.0/src/evalgate_sdk/types.py +1211 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/workflows.py +86 -13
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_assertions_async.py +26 -0
- evalgate_sdk-3.5.0/tests/test_auto_mutation_engine_parity.py +229 -0
- evalgate_sdk-3.5.0/tests/test_auto_session_methods.py +103 -0
- evalgate_sdk-3.5.0/tests/test_batch_collector.py +73 -0
- evalgate_sdk-3.5.0/tests/test_cli_golden_commands.py +553 -0
- evalgate_sdk-3.5.0/tests/test_client.py +479 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_decorators_and_integrations.py +9 -24
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_execution_mode.py +5 -5
- evalgate_sdk-3.5.0/tests/test_knowledge_methods.py +85 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_new_modules.py +1 -1
- evalgate_sdk-3.5.0/tests/test_openapi_contracts.py +259 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_parity_gap_modules.py +37 -5
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_parity_gaps.py +4 -6
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_prod_hardening.py +3 -3
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_runtime_dsl.py +6 -0
- evalgate_sdk-3.5.0/tests/test_sdk_contract_scaffold.py +42 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_testing.py +1 -1
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_types.py +53 -0
- evalgate_sdk-3.5.0/tests/test_workflows_parity.py +97 -0
- evalgate_sdk-3.3.1/PKG-INFO +0 -608
- evalgate_sdk-3.3.1/README.md +0 -556
- evalgate_sdk-3.3.1/src/evalgate_sdk/_version.py +0 -3
- evalgate_sdk-3.3.1/src/evalgate_sdk/types.py +0 -666
- evalgate_sdk-3.3.1/tests/test_cli_golden_commands.py +0 -232
- evalgate_sdk-3.3.1/tests/test_client.py +0 -164
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/.gitignore +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cache.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/ci_context.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/cli_constants.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/env.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/formatters/types.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/manifest.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/new_commands.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/policy_packs.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/profiles.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/regression_gate.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/render/__init__.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/render/snippet.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/render/sort.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/report/__init__.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/traces.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/constants.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/formatters/__init__.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/formatters/github.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/formatters/human.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/formatters/json_fmt.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/formatters/pr_comment.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/integrations/__init__.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/matchers.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/otel.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/py.typed +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/pytest_plugin.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/reason_codes.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/regression.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/__init__.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/adapters/__init__.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/context.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/registry.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/run_report.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/types.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/snapshot.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/streaming.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/testing.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/utils/__init__.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/utils/input_hash.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/__init__.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_assertions.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_ci_context.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_cli_new_commands.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_collector.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_contract_payloads.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_errors.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_formatters.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_local_storage.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_otel.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_pytest_plugin.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_run_report.py +0 -0
- {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_workflows.py +0 -0
|
@@ -9,6 +9,28 @@ Version numbering is aligned with the TypeScript SDK (`@evalgate/sdk`) and the p
|
|
|
9
9
|
|
|
10
10
|
**Version history note:** The Python SDK jumped from 1.0.0 → 1.9.x → 2.0.0 to stay in sync with the TypeScript SDK. The TypeScript SDK had many releases (1.1–1.9) before the Python SDK existed. We now align both SDKs on the same major.minor version.
|
|
11
11
|
|
|
12
|
+
## [3.5.0] - 2026-04-09
|
|
13
|
+
|
|
14
|
+
### Added
|
|
15
|
+
- **Knowledge API parity** — new `client.knowledge.search()`, `upload_document()`, and `get_grounding_context()` methods matching the TypeScript SDK surface.
|
|
16
|
+
- **Auto-session SDK methods** — `evaluations.create_auto_session()`, `list_auto_sessions()`, `get_auto_session()`, `get_quality_score()`, and `synthesize_test_cases()` for autonomous loop management.
|
|
17
|
+
- **Batch trace ingestion** — `report_traces()` function for sending up to 100 traces per request.
|
|
18
|
+
- **Offline trace persistence** — `WorkflowTracer` supports JSON file persistence when disconnected.
|
|
19
|
+
|
|
20
|
+
### Changed
|
|
21
|
+
- **Version alignment** — bumped Python SDK package metadata and spec constants to `3.5.0`.
|
|
22
|
+
|
|
23
|
+
## [3.4.0] - 2026-03-29
|
|
24
|
+
|
|
25
|
+
### Added
|
|
26
|
+
- **Judge orchestration parity** — added Python client and CLI support for the normalized judge registry, presets, run/case judge inspection, and multi-judge evidence payloads.
|
|
27
|
+
- **Workflow and controls parity** — expanded Python operator surfaces for workflow execution, system controls, and workflow-aware parity with the TypeScript SDK.
|
|
28
|
+
- **Provider/PII safeguards** — outbound judge/assertion flows now enforce shared provider policy and pre-send PII protection semantics in the Python SDK path too.
|
|
29
|
+
|
|
30
|
+
### Changed
|
|
31
|
+
- **Canonical contract alignment** — updated Python types and CLI copy to match the workflow-native and judge-orchestration model used across the platform.
|
|
32
|
+
- **Version alignment** — bumped Python SDK package metadata and spec constants to `3.4.0`.
|
|
33
|
+
|
|
12
34
|
## [3.3.1] - 2026-03-24
|
|
13
35
|
|
|
14
36
|
### Changed
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: evalgate-sdk
|
|
3
|
+
Version: 3.5.0
|
|
4
|
+
Summary: EvalGate Python SDK — CI for AI behavior. Traces, evaluations, assertions, and regression gates for LLM apps.
|
|
5
|
+
Project-URL: Homepage, https://evalgate.com
|
|
6
|
+
Project-URL: Documentation, https://github.com/evalgate/ai-evaluation-platform#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/evalgate/ai-evaluation-platform
|
|
8
|
+
Project-URL: Issues, https://github.com/evalgate/ai-evaluation-platform/issues
|
|
9
|
+
Project-URL: Changelog, https://github.com/evalgate/ai-evaluation-platform/blob/main/src/packages/sdk-python/CHANGELOG.md
|
|
10
|
+
Author-email: EvalGate <team@evalgate.com>
|
|
11
|
+
License-Expression: MIT
|
|
12
|
+
Keywords: ai,anthropic,assertions,ci,evaluation,llm,monitoring,observability,openai,regression,testing,tracing,workflow
|
|
13
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Topic :: Software Development :: Testing
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Requires-Dist: httpx<1,>=0.27
|
|
26
|
+
Requires-Dist: pydantic<3,>=2.0
|
|
27
|
+
Provides-Extra: all
|
|
28
|
+
Requires-Dist: anthropic>=0.20; extra == 'all'
|
|
29
|
+
Requires-Dist: langchain-core>=0.2; extra == 'all'
|
|
30
|
+
Requires-Dist: openai>=1.0; extra == 'all'
|
|
31
|
+
Requires-Dist: rich>=13; extra == 'all'
|
|
32
|
+
Requires-Dist: typer>=0.12; extra == 'all'
|
|
33
|
+
Provides-Extra: anthropic
|
|
34
|
+
Requires-Dist: anthropic>=0.20; extra == 'anthropic'
|
|
35
|
+
Provides-Extra: cli
|
|
36
|
+
Requires-Dist: rich>=13; extra == 'cli'
|
|
37
|
+
Requires-Dist: typer>=0.12; extra == 'cli'
|
|
38
|
+
Provides-Extra: dev
|
|
39
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
40
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
|
|
41
|
+
Requires-Dist: pytest-cov>=5; extra == 'dev'
|
|
42
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
43
|
+
Requires-Dist: respx>=0.21; extra == 'dev'
|
|
44
|
+
Requires-Dist: rich>=13; extra == 'dev'
|
|
45
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
46
|
+
Requires-Dist: typer>=0.12; extra == 'dev'
|
|
47
|
+
Provides-Extra: langchain
|
|
48
|
+
Requires-Dist: langchain-core>=0.2; extra == 'langchain'
|
|
49
|
+
Provides-Extra: openai
|
|
50
|
+
Requires-Dist: openai>=1.0; extra == 'openai'
|
|
51
|
+
Description-Content-Type: text/markdown
|
|
52
|
+
|
|
53
|
+
# evalgate-sdk
|
|
54
|
+
|
|
55
|
+
[PyPI version](https://pypi.org/project/evalgate-sdk/)
|
|
56
|
+
[Python versions](https://pypi.org/project/evalgate-sdk/)
|
|
57
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
58
|
+
[Typed: PEP 561](https://peps.python.org/pep-0561/)
|
|
59
|
+
|
|
60
|
+
`evalgate-sdk` brings EvalGate's evaluation control plane to Python.
|
|
61
|
+
|
|
62
|
+
Use it to run assertions, trace agent workflows, inspect platform runs, orchestrate judges, and participate in the same closed-loop evaluation workflow as the TypeScript SDK and the web app.
|
|
63
|
+
|
|
64
|
+
## Install
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pip install evalgate-sdk
|
|
68
|
+
pip install "evalgate-sdk[cli]"
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Optional extras:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
pip install "evalgate-sdk[openai]"
|
|
75
|
+
pip install "evalgate-sdk[anthropic]"
|
|
76
|
+
pip install "evalgate-sdk[langchain]"
|
|
77
|
+
pip install "evalgate-sdk[all]"
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Quick Start
|
|
81
|
+
|
|
82
|
+
### Local Assertions
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from evalgate_sdk import expect
|
|
86
|
+
|
|
87
|
+
result = expect("The capital of France is Paris.").to_contain("Paris")
|
|
88
|
+
print(result.passed)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Platform Client
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from evalgate_sdk import AIEvalClient
|
|
95
|
+
from evalgate_sdk.types import CreateTraceParams
|
|
96
|
+
|
|
97
|
+
client = AIEvalClient(api_key="sk-...")
|
|
98
|
+
|
|
99
|
+
trace = await client.traces.create(
|
|
100
|
+
CreateTraceParams(
|
|
101
|
+
name="support-run",
|
|
102
|
+
input="Cancel my subscription",
|
|
103
|
+
output="I've canceled your plan effective today.",
|
|
104
|
+
)
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
print(trace.id)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## Closed-Loop Workflow
|
|
111
|
+
|
|
112
|
+
The Python package participates in the same EvalGate loop:
|
|
113
|
+
|
|
114
|
+
```text
|
|
115
|
+
trace -> cluster -> synthesize -> gate -> review -> auto -> ship
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Representative CLI commands:
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
evalgate gate
|
|
122
|
+
evalgate cluster --run .evalgate/runs/latest.json
|
|
123
|
+
evalgate synthesize --dataset .evalgate/golden/labeled.jsonl
|
|
124
|
+
evalgate judge registry
|
|
125
|
+
evalgate judge presets
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Judge Orchestration
|
|
129
|
+
|
|
130
|
+
The Python SDK supports the same core judge model as the platform:
|
|
131
|
+
|
|
132
|
+
- registry-backed judge selection
|
|
133
|
+
- saved judge configs
|
|
134
|
+
- multi-judge aggregation
|
|
135
|
+
- per-run and per-case judge inspection
|
|
136
|
+
- structured reasoning and signal breakdowns
|
|
137
|
+
|
|
138
|
+
### CLI
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
evalgate judge registry
|
|
142
|
+
evalgate judge presets
|
|
143
|
+
|
|
144
|
+
evalgate judge test \
|
|
145
|
+
--provider openai \
|
|
146
|
+
--model gpt-5.2-chat-latest \
|
|
147
|
+
--judge openai:gpt-5.2-chat-latest \
|
|
148
|
+
--judge anthropic:claude-sonnet-4-20250514 \
|
|
149
|
+
--aggregation weighted \
|
|
150
|
+
--prompt-template "Return strict JSON with score, passed, reasoning, and signals." \
|
|
151
|
+
--input "Cancel my subscription" \
|
|
152
|
+
--output "I've canceled your plan effective today."
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### Client
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
from evalgate_sdk.types import TestLLMJudgeConfigParams
|
|
159
|
+
|
|
160
|
+
registry = await client.llm_judge.list_registry()
|
|
161
|
+
presets = await client.llm_judge.list_presets()
|
|
162
|
+
|
|
163
|
+
result = await client.llm_judge.test_config(
|
|
164
|
+
TestLLMJudgeConfigParams(
|
|
165
|
+
provider="openai",
|
|
166
|
+
model="gpt-5.2-chat-latest",
|
|
167
|
+
prompt_template="Return strict JSON with score, passed, reasoning, and signals.",
|
|
168
|
+
judges=[
|
|
169
|
+
{
|
|
170
|
+
"id": "primary",
|
|
171
|
+
"type": "llm",
|
|
172
|
+
"provider": "openai",
|
|
173
|
+
"model": "gpt-5.2-chat-latest",
|
|
174
|
+
"weight": 0.6,
|
|
175
|
+
},
|
|
176
|
+
{
|
|
177
|
+
"id": "backup",
|
|
178
|
+
"type": "llm",
|
|
179
|
+
"provider": "anthropic",
|
|
180
|
+
"model": "claude-sonnet-4-20250514",
|
|
181
|
+
"weight": 0.4,
|
|
182
|
+
},
|
|
183
|
+
],
|
|
184
|
+
aggregation="weighted",
|
|
185
|
+
input="Cancel my subscription",
|
|
186
|
+
output="I've canceled your plan effective today.",
|
|
187
|
+
behavior="tool_use",
|
|
188
|
+
task_type="support",
|
|
189
|
+
)
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
print(result.result.score, result.result.reasoning)
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Enterprise Controls
|
|
196
|
+
|
|
197
|
+
The Python SDK respects the same platform guardrails:
|
|
198
|
+
|
|
199
|
+
- provider allowlists
|
|
200
|
+
- pre-send PII checks for outbound judge/provider calls
|
|
201
|
+
- org-scoped policy
|
|
202
|
+
- shared config precedence with the API and web app
|
|
203
|
+
|
|
204
|
+
## Surface Parity
|
|
205
|
+
|
|
206
|
+
The Python SDK now covers the primary EvalGate workflows used in the product:
|
|
207
|
+
|
|
208
|
+
- traces and runs
|
|
209
|
+
- clustering and synthesis
|
|
210
|
+
- gate/check/review loops
|
|
211
|
+
- judge registry, presets, configs, results, and comparisons
|
|
212
|
+
- workflow and control-plane inspection commands
|
|
213
|
+
|
|
214
|
+
TypeScript still has a few extra convenience wrappers, but the core control-plane model is shared.
|
|
215
|
+
|
|
216
|
+
For broader product context, see:
|
|
217
|
+
|
|
218
|
+
- [Root README](../../../README.md)
|
|
219
|
+
- [Integration reference](../../app/docs/integration/page.tsx)
|
|
220
|
+
- [SDK parity guide](../../app/docs/sdk-parity/page.tsx)
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# evalgate-sdk
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://pypi.org/project/evalgate-sdk/)
|
|
4
|
+
[Python versions](https://pypi.org/project/evalgate-sdk/)
|
|
5
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
6
|
+
[Typed: PEP 561](https://peps.python.org/pep-0561/)
|
|
7
|
+
|
|
8
|
+
`evalgate-sdk` brings EvalGate's evaluation control plane to Python.
|
|
9
|
+
|
|
10
|
+
Use it to run assertions, trace agent workflows, inspect platform runs, orchestrate judges, and participate in the same closed-loop evaluation workflow as the TypeScript SDK and the web app.
|
|
11
|
+
|
|
12
|
+
## Install
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install evalgate-sdk
|
|
16
|
+
pip install "evalgate-sdk[cli]"
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Optional extras:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install "evalgate-sdk[openai]"
|
|
23
|
+
pip install "evalgate-sdk[anthropic]"
|
|
24
|
+
pip install "evalgate-sdk[langchain]"
|
|
25
|
+
pip install "evalgate-sdk[all]"
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Quick Start
|
|
29
|
+
|
|
30
|
+
### Local Assertions
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
from evalgate_sdk import expect
|
|
34
|
+
|
|
35
|
+
result = expect("The capital of France is Paris.").to_contain("Paris")
|
|
36
|
+
print(result.passed)
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### Platform Client
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from evalgate_sdk import AIEvalClient
|
|
43
|
+
from evalgate_sdk.types import CreateTraceParams
|
|
44
|
+
|
|
45
|
+
client = AIEvalClient(api_key="sk-...")
|
|
46
|
+
|
|
47
|
+
trace = await client.traces.create(
|
|
48
|
+
CreateTraceParams(
|
|
49
|
+
name="support-run",
|
|
50
|
+
input="Cancel my subscription",
|
|
51
|
+
output="I've canceled your plan effective today.",
|
|
52
|
+
)
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
print(trace.id)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Closed-Loop Workflow
|
|
59
|
+
|
|
60
|
+
The Python package participates in the same EvalGate loop:
|
|
61
|
+
|
|
62
|
+
```text
|
|
63
|
+
trace -> cluster -> synthesize -> gate -> review -> auto -> ship
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Representative CLI commands:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
evalgate gate
|
|
70
|
+
evalgate cluster --run .evalgate/runs/latest.json
|
|
71
|
+
evalgate synthesize --dataset .evalgate/golden/labeled.jsonl
|
|
72
|
+
evalgate judge registry
|
|
73
|
+
evalgate judge presets
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Judge Orchestration
|
|
77
|
+
|
|
78
|
+
The Python SDK supports the same core judge model as the platform:
|
|
79
|
+
|
|
80
|
+
- registry-backed judge selection
|
|
81
|
+
- saved judge configs
|
|
82
|
+
- multi-judge aggregation
|
|
83
|
+
- per-run and per-case judge inspection
|
|
84
|
+
- structured reasoning and signal breakdowns
|
|
85
|
+
|
|
86
|
+
### CLI
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
evalgate judge registry
|
|
90
|
+
evalgate judge presets
|
|
91
|
+
|
|
92
|
+
evalgate judge test \
|
|
93
|
+
--provider openai \
|
|
94
|
+
--model gpt-5.2-chat-latest \
|
|
95
|
+
--judge openai:gpt-5.2-chat-latest \
|
|
96
|
+
--judge anthropic:claude-sonnet-4-20250514 \
|
|
97
|
+
--aggregation weighted \
|
|
98
|
+
--prompt-template "Return strict JSON with score, passed, reasoning, and signals." \
|
|
99
|
+
--input "Cancel my subscription" \
|
|
100
|
+
--output "I've canceled your plan effective today."
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Client
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from evalgate_sdk.types import TestLLMJudgeConfigParams
|
|
107
|
+
|
|
108
|
+
registry = await client.llm_judge.list_registry()
|
|
109
|
+
presets = await client.llm_judge.list_presets()
|
|
110
|
+
|
|
111
|
+
result = await client.llm_judge.test_config(
|
|
112
|
+
TestLLMJudgeConfigParams(
|
|
113
|
+
provider="openai",
|
|
114
|
+
model="gpt-5.2-chat-latest",
|
|
115
|
+
prompt_template="Return strict JSON with score, passed, reasoning, and signals.",
|
|
116
|
+
judges=[
|
|
117
|
+
{
|
|
118
|
+
"id": "primary",
|
|
119
|
+
"type": "llm",
|
|
120
|
+
"provider": "openai",
|
|
121
|
+
"model": "gpt-5.2-chat-latest",
|
|
122
|
+
"weight": 0.6,
|
|
123
|
+
},
|
|
124
|
+
{
|
|
125
|
+
"id": "backup",
|
|
126
|
+
"type": "llm",
|
|
127
|
+
"provider": "anthropic",
|
|
128
|
+
"model": "claude-sonnet-4-20250514",
|
|
129
|
+
"weight": 0.4,
|
|
130
|
+
},
|
|
131
|
+
],
|
|
132
|
+
aggregation="weighted",
|
|
133
|
+
input="Cancel my subscription",
|
|
134
|
+
output="I've canceled your plan effective today.",
|
|
135
|
+
behavior="tool_use",
|
|
136
|
+
task_type="support",
|
|
137
|
+
)
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
print(result.result.score, result.result.reasoning)
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## Enterprise Controls
|
|
144
|
+
|
|
145
|
+
The Python SDK respects the same platform guardrails:
|
|
146
|
+
|
|
147
|
+
- provider allowlists
|
|
148
|
+
- pre-send PII checks for outbound judge/provider calls
|
|
149
|
+
- org-scoped policy
|
|
150
|
+
- shared config precedence with the API and web app
|
|
151
|
+
|
|
152
|
+
## Surface Parity
|
|
153
|
+
|
|
154
|
+
The Python SDK now covers the primary EvalGate workflows used in the product:
|
|
155
|
+
|
|
156
|
+
- traces and runs
|
|
157
|
+
- clustering and synthesis
|
|
158
|
+
- gate/check/review loops
|
|
159
|
+
- judge registry, presets, configs, results, and comparisons
|
|
160
|
+
- workflow and control-plane inspection commands
|
|
161
|
+
|
|
162
|
+
TypeScript still has a few extra convenience wrappers, but the core control-plane model is shared.
|
|
163
|
+
|
|
164
|
+
For broader product context, see:
|
|
165
|
+
|
|
166
|
+
- [Root README](../../../README.md)
|
|
167
|
+
- [Integration reference](../../app/docs/integration/page.tsx)
|
|
168
|
+
- [SDK parity guide](../../app/docs/sdk-parity/page.tsx)
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "evalgate-sdk"
|
|
7
|
-
version = "3.3.1"
|
|
7
|
+
version = "3.5.0"
|
|
8
8
|
description = "EvalGate Python SDK — CI for AI behavior. Traces, evaluations, assertions, and regression gates for LLM apps."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -76,5 +76,5 @@ select = ["E", "F", "I", "UP", "B", "SIM"]
|
|
|
76
76
|
|
|
77
77
|
[tool.mypy]
|
|
78
78
|
python_version = "3.10"
|
|
79
|
-
strict = true
|
|
80
|
-
warn_return_any = true
|
|
79
|
+
strict = false
|
|
80
|
+
warn_return_any = false
|
|
@@ -59,25 +59,57 @@ from evalgate_sdk.auto import (
|
|
|
59
59
|
run_auto_daemon,
|
|
60
60
|
write_auto_report,
|
|
61
61
|
)
|
|
62
|
+
from evalgate_sdk.auto_experiment_runner import (
|
|
63
|
+
Experiment,
|
|
64
|
+
ExperimentChangeSet,
|
|
65
|
+
ExperimentDecision,
|
|
66
|
+
ExperimentMetrics,
|
|
67
|
+
ExperimentRunner,
|
|
68
|
+
KeepDiscardPolicy,
|
|
69
|
+
KeepDiscardResult,
|
|
70
|
+
evaluate_experiment,
|
|
71
|
+
select_best_experiment,
|
|
72
|
+
)
|
|
73
|
+
from evalgate_sdk.auto_failure_analyzer import (
|
|
74
|
+
AnalyzeInput,
|
|
75
|
+
AutoLLMConfig,
|
|
76
|
+
FailureAnalysis,
|
|
77
|
+
FailureAnalyzer,
|
|
78
|
+
FailureCase,
|
|
79
|
+
FailurePattern,
|
|
80
|
+
TasteContext,
|
|
81
|
+
)
|
|
82
|
+
from evalgate_sdk.auto_mutation_engine import (
|
|
83
|
+
GeneratedMutation,
|
|
84
|
+
GenerateInput,
|
|
85
|
+
KnowledgeHint,
|
|
86
|
+
ModelSettingsOverride,
|
|
87
|
+
MutationAttempt,
|
|
88
|
+
MutationEngine,
|
|
89
|
+
MutationStrategy,
|
|
90
|
+
TrajectorySummary,
|
|
91
|
+
)
|
|
62
92
|
from evalgate_sdk.batch import RequestBatcher, batch_process, can_batch
|
|
63
93
|
from evalgate_sdk.cache import CacheTTL, RequestCache, get_ttl, should_cache
|
|
64
94
|
from evalgate_sdk.ci_context import CIContext, detect_ci_context
|
|
65
|
-
from evalgate_sdk.cluster import ClusterCase, ClusterSample, ClusterSummary, TraceCluster, cluster_run_result, format_cluster_human
|
|
66
95
|
from evalgate_sdk.cli.api import (
|
|
67
96
|
FetchOptions,
|
|
68
97
|
PublishShareResult,
|
|
69
98
|
QualityLatestData,
|
|
70
99
|
RunDetailsData,
|
|
71
100
|
fetch_api,
|
|
101
|
+
fetch_auto_plan_preview,
|
|
102
|
+
fetch_integrity,
|
|
103
|
+
fetch_knowledge,
|
|
72
104
|
fetch_quality_latest,
|
|
73
105
|
fetch_run_details,
|
|
74
106
|
fetch_run_export,
|
|
75
107
|
import_run_on_fail,
|
|
76
108
|
publish_share,
|
|
109
|
+
refresh_integrity,
|
|
77
110
|
)
|
|
78
111
|
from evalgate_sdk.cli.cli_constants import EXIT
|
|
79
112
|
from evalgate_sdk.cli.config import (
|
|
80
|
-
EvalAIConfig,
|
|
81
113
|
EvalGateConfig,
|
|
82
114
|
find_config_path,
|
|
83
115
|
load_config,
|
|
@@ -130,6 +162,14 @@ from evalgate_sdk.cli.traces import (
|
|
|
130
162
|
)
|
|
131
163
|
from evalgate_sdk.cli.workspace import EvalWorkspace, resolve_eval_workspace
|
|
132
164
|
from evalgate_sdk.client import AIEvalClient
|
|
165
|
+
from evalgate_sdk.cluster import (
|
|
166
|
+
ClusterCase,
|
|
167
|
+
ClusterSample,
|
|
168
|
+
ClusterSummary,
|
|
169
|
+
TraceCluster,
|
|
170
|
+
cluster_run_result,
|
|
171
|
+
format_cluster_human,
|
|
172
|
+
)
|
|
133
173
|
from evalgate_sdk.collector import (
|
|
134
174
|
CollectorFeedbackInput,
|
|
135
175
|
CollectorSpanInput,
|
|
@@ -137,6 +177,7 @@ from evalgate_sdk.collector import (
|
|
|
137
177
|
ReportTraceOptions,
|
|
138
178
|
ReportTraceResult,
|
|
139
179
|
report_trace,
|
|
180
|
+
report_traces,
|
|
140
181
|
)
|
|
141
182
|
from evalgate_sdk.constants import DEFAULT_BASE_URL
|
|
142
183
|
from evalgate_sdk.context import (
|
|
@@ -202,6 +243,7 @@ from evalgate_sdk.integrations.openai_eval import (
|
|
|
202
243
|
OpenAIChatEvalResult,
|
|
203
244
|
openai_chat_eval,
|
|
204
245
|
)
|
|
246
|
+
from evalgate_sdk.knowledge import KnowledgeAPI
|
|
205
247
|
from evalgate_sdk.local import LocalStorage, LocalStorageStats
|
|
206
248
|
from evalgate_sdk.logger import Logger, RequestLogger, create_logger, get_logger, set_logger
|
|
207
249
|
from evalgate_sdk.matchers import GateAssertionError, assert_passes_gate, to_pass_gate
|
|
@@ -223,7 +265,6 @@ from evalgate_sdk.pytest_plugin import (
|
|
|
223
265
|
assert_score_between,
|
|
224
266
|
)
|
|
225
267
|
from evalgate_sdk.reason_codes import REASON_CODES, get_reason_info, is_blocking
|
|
226
|
-
from evalgate_sdk.replay_decision import NormalizedBudgetConfig, ReplayDecision, determine_comparison_basis, evaluate_replay_outcome
|
|
227
268
|
from evalgate_sdk.regression import (
|
|
228
269
|
ARTIFACTS,
|
|
229
270
|
GATE_CATEGORY,
|
|
@@ -237,6 +278,12 @@ from evalgate_sdk.regression import (
|
|
|
237
278
|
evaluate_regression,
|
|
238
279
|
verify_baseline_checksum,
|
|
239
280
|
)
|
|
281
|
+
from evalgate_sdk.replay_decision import (
|
|
282
|
+
NormalizedBudgetConfig,
|
|
283
|
+
ReplayDecision,
|
|
284
|
+
determine_comparison_basis,
|
|
285
|
+
evaluate_replay_outcome,
|
|
286
|
+
)
|
|
240
287
|
from evalgate_sdk.runtime import (
|
|
241
288
|
EvalExecutionError,
|
|
242
289
|
EvalRuntimeError,
|
|
@@ -270,7 +317,6 @@ from evalgate_sdk.runtime.eval import (
|
|
|
270
317
|
define_eval_only,
|
|
271
318
|
define_eval_skip,
|
|
272
319
|
define_suite,
|
|
273
|
-
evalai,
|
|
274
320
|
from_dataset,
|
|
275
321
|
get_filtered_specs,
|
|
276
322
|
)
|
|
@@ -334,7 +380,25 @@ from evalgate_sdk.synthesize import (
|
|
|
334
380
|
synthesize_labeled_dataset,
|
|
335
381
|
)
|
|
336
382
|
from evalgate_sdk.testing import TestSuite, create_test_suite
|
|
337
|
-
from evalgate_sdk.types import
|
|
383
|
+
from evalgate_sdk.types import (
|
|
384
|
+
AgentApprovalMode,
|
|
385
|
+
AgentAutonomyLevel,
|
|
386
|
+
AgentMemoryMode,
|
|
387
|
+
AgentWorkflowClass,
|
|
388
|
+
AgentWorkflowExperimentRecipe,
|
|
389
|
+
AgentWorkflowHookPhase,
|
|
390
|
+
AgentWorkflowSpec,
|
|
391
|
+
AutoPlanPreviewParams,
|
|
392
|
+
AutoPlanPreviewResponse,
|
|
393
|
+
AutoSessionWorkflowProfile,
|
|
394
|
+
CamelModel,
|
|
395
|
+
EvaluationIntegritySnapshot,
|
|
396
|
+
EvaluationRunDetail,
|
|
397
|
+
KnowledgeHintSummary,
|
|
398
|
+
KnowledgeResponse,
|
|
399
|
+
QualityBreakdown,
|
|
400
|
+
QualityScore,
|
|
401
|
+
)
|
|
338
402
|
from evalgate_sdk.utils.input_hash import normalize_input, sha256_input
|
|
339
403
|
from evalgate_sdk.workflows import WorkflowTracer, create_workflow_tracer, trace_workflow_step
|
|
340
404
|
|
|
@@ -474,13 +538,74 @@ __all__ = [
|
|
|
474
538
|
"AutoDiffSnapshot",
|
|
475
539
|
"AutoIterationResult",
|
|
476
540
|
"AutoReport",
|
|
541
|
+
"Experiment",
|
|
542
|
+
"ExperimentChangeSet",
|
|
543
|
+
"ExperimentDecision",
|
|
544
|
+
"ExperimentMetrics",
|
|
545
|
+
"ExperimentRunner",
|
|
546
|
+
"KeepDiscardPolicy",
|
|
547
|
+
"KeepDiscardResult",
|
|
548
|
+
"AnalyzeInput",
|
|
549
|
+
"AutoLLMConfig",
|
|
550
|
+
"FailureAnalysis",
|
|
551
|
+
"FailureAnalyzer",
|
|
552
|
+
"FailureCase",
|
|
553
|
+
"FailurePattern",
|
|
554
|
+
"KnowledgeHint",
|
|
555
|
+
"GenerateInput",
|
|
556
|
+
"GeneratedMutation",
|
|
557
|
+
"ModelSettingsOverride",
|
|
558
|
+
"MutationAttempt",
|
|
559
|
+
"MutationEngine",
|
|
560
|
+
"MutationStrategy",
|
|
561
|
+
"TasteContext",
|
|
562
|
+
"TrajectorySummary",
|
|
477
563
|
"build_auto_plan",
|
|
478
564
|
"decide_auto_experiment",
|
|
479
565
|
"build_auto_report",
|
|
566
|
+
"evaluate_experiment",
|
|
480
567
|
"format_auto_human",
|
|
481
568
|
"append_auto_history",
|
|
569
|
+
"select_best_experiment",
|
|
482
570
|
"write_auto_report",
|
|
483
571
|
"run_auto_daemon",
|
|
572
|
+
"Experiment",
|
|
573
|
+
"ExperimentChangeSet",
|
|
574
|
+
"ExperimentDecision",
|
|
575
|
+
"ExperimentMetrics",
|
|
576
|
+
"ExperimentRunner",
|
|
577
|
+
"KeepDiscardPolicy",
|
|
578
|
+
"KeepDiscardResult",
|
|
579
|
+
"evaluate_experiment",
|
|
580
|
+
"select_best_experiment",
|
|
581
|
+
"AnalyzeInput",
|
|
582
|
+
"AutoLLMConfig",
|
|
583
|
+
"FailureAnalysis",
|
|
584
|
+
"FailureAnalyzer",
|
|
585
|
+
"FailureCase",
|
|
586
|
+
"FailurePattern",
|
|
587
|
+
"TasteContext",
|
|
588
|
+
"GeneratedMutation",
|
|
589
|
+
"GenerateInput",
|
|
590
|
+
"KnowledgeHint",
|
|
591
|
+
"ModelSettingsOverride",
|
|
592
|
+
"MutationAttempt",
|
|
593
|
+
"MutationEngine",
|
|
594
|
+
"MutationStrategy",
|
|
595
|
+
"AgentApprovalMode",
|
|
596
|
+
"AgentAutonomyLevel",
|
|
597
|
+
"AgentMemoryMode",
|
|
598
|
+
"AgentWorkflowClass",
|
|
599
|
+
"AgentWorkflowExperimentRecipe",
|
|
600
|
+
"AgentWorkflowHookPhase",
|
|
601
|
+
"AgentWorkflowSpec",
|
|
602
|
+
"AutoPlanPreviewParams",
|
|
603
|
+
"AutoPlanPreviewResponse",
|
|
604
|
+
"AutoSessionWorkflowProfile",
|
|
605
|
+
"EvaluationIntegritySnapshot",
|
|
606
|
+
"EvaluationRunDetail",
|
|
607
|
+
"KnowledgeHintSummary",
|
|
608
|
+
"KnowledgeResponse",
|
|
484
609
|
# Streaming
|
|
485
610
|
"RateLimiter",
|
|
486
611
|
"BatchProgress",
|
|
@@ -546,7 +671,6 @@ __all__ = [
|
|
|
546
671
|
"define_eval_only",
|
|
547
672
|
"define_suite",
|
|
548
673
|
"create_result",
|
|
549
|
-
"evalai",
|
|
550
674
|
"from_dataset",
|
|
551
675
|
"get_filtered_specs",
|
|
552
676
|
# Runtime management
|
|
@@ -576,8 +700,11 @@ __all__ = [
|
|
|
576
700
|
"CamelModel",
|
|
577
701
|
"QualityScore",
|
|
578
702
|
"QualityBreakdown",
|
|
703
|
+
# Knowledge API
|
|
704
|
+
"KnowledgeAPI",
|
|
579
705
|
# Collector (T2)
|
|
580
706
|
"report_trace",
|
|
707
|
+
"report_traces",
|
|
581
708
|
"ReportTraceInput",
|
|
582
709
|
"ReportTraceOptions",
|
|
583
710
|
"ReportTraceResult",
|
|
@@ -647,13 +774,16 @@ __all__ = [
|
|
|
647
774
|
"is_git_ref",
|
|
648
775
|
"get_github_step_summary_path",
|
|
649
776
|
# CLI config (T14)
|
|
650
|
-
"EvalAIConfig",
|
|
651
777
|
"EvalGateConfig",
|
|
652
778
|
"find_config_path",
|
|
653
779
|
"load_config",
|
|
654
780
|
"merge_config_with_args",
|
|
655
781
|
# CLI API (T14)
|
|
656
782
|
"fetch_api",
|
|
783
|
+
"fetch_integrity",
|
|
784
|
+
"refresh_integrity",
|
|
785
|
+
"fetch_knowledge",
|
|
786
|
+
"fetch_auto_plan_preview",
|
|
657
787
|
"fetch_quality_latest",
|
|
658
788
|
"fetch_run_details",
|
|
659
789
|
"fetch_run_export",
|