evalgate-sdk 3.3.1__tar.gz → 3.5.0__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/CHANGELOG.md +22 -0
  2. evalgate_sdk-3.5.0/PKG-INFO +220 -0
  3. evalgate_sdk-3.5.0/README.md +168 -0
  4. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/pyproject.toml +3 -3
  5. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/__init__.py +137 -7
  6. evalgate_sdk-3.5.0/src/evalgate_sdk/_version.py +3 -0
  7. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/assertions.py +79 -10
  8. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/auto.py +9 -4
  9. evalgate_sdk-3.5.0/src/evalgate_sdk/auto_experiment_runner.py +214 -0
  10. evalgate_sdk-3.5.0/src/evalgate_sdk/auto_failure_analyzer.py +211 -0
  11. evalgate_sdk-3.5.0/src/evalgate_sdk/auto_mutation_engine.py +354 -0
  12. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/batch.py +2 -1
  13. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/__init__.py +9 -0
  14. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/api.py +184 -1
  15. evalgate_sdk-3.5.0/src/evalgate_sdk/cli/api_command_config.py +83 -0
  16. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/commands.py +73 -37
  17. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/config.py +14 -16
  18. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/golden_commands.py +22 -8
  19. evalgate_sdk-3.5.0/src/evalgate_sdk/cli/judge_commands.py +436 -0
  20. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/report/build_check_report.py +6 -4
  21. evalgate_sdk-3.5.0/src/evalgate_sdk/cli/workflow_commands.py +615 -0
  22. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/workspace.py +2 -23
  23. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/client.py +405 -52
  24. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cluster.py +126 -7
  25. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/collector.py +41 -0
  26. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/context.py +5 -8
  27. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/errors.py +5 -5
  28. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/export.py +3 -3
  29. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/golden.py +60 -19
  30. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/integrations/anthropic.py +1 -1
  31. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/integrations/autogen.py +3 -2
  32. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/integrations/crewai.py +3 -2
  33. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/integrations/langchain.py +1 -1
  34. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/integrations/openai.py +1 -1
  35. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/integrations/openai_eval.py +35 -8
  36. evalgate_sdk-3.5.0/src/evalgate_sdk/knowledge.py +43 -0
  37. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/local.py +3 -2
  38. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/logger.py +1 -1
  39. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/pagination.py +1 -1
  40. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/replay_decision.py +3 -1
  41. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/adapters/config_to_dsl.py +6 -14
  42. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/eval.py +9 -10
  43. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/execution_mode.py +2 -6
  44. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/executor.py +5 -3
  45. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/synthesize.py +18 -5
  46. evalgate_sdk-3.5.0/src/evalgate_sdk/types.py +1211 -0
  47. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/workflows.py +86 -13
  48. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_assertions_async.py +26 -0
  49. evalgate_sdk-3.5.0/tests/test_auto_mutation_engine_parity.py +229 -0
  50. evalgate_sdk-3.5.0/tests/test_auto_session_methods.py +103 -0
  51. evalgate_sdk-3.5.0/tests/test_batch_collector.py +73 -0
  52. evalgate_sdk-3.5.0/tests/test_cli_golden_commands.py +553 -0
  53. evalgate_sdk-3.5.0/tests/test_client.py +479 -0
  54. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_decorators_and_integrations.py +9 -24
  55. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_execution_mode.py +5 -5
  56. evalgate_sdk-3.5.0/tests/test_knowledge_methods.py +85 -0
  57. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_new_modules.py +1 -1
  58. evalgate_sdk-3.5.0/tests/test_openapi_contracts.py +259 -0
  59. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_parity_gap_modules.py +37 -5
  60. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_parity_gaps.py +4 -6
  61. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_prod_hardening.py +3 -3
  62. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_runtime_dsl.py +6 -0
  63. evalgate_sdk-3.5.0/tests/test_sdk_contract_scaffold.py +42 -0
  64. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_testing.py +1 -1
  65. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_types.py +53 -0
  66. evalgate_sdk-3.5.0/tests/test_workflows_parity.py +97 -0
  67. evalgate_sdk-3.3.1/PKG-INFO +0 -608
  68. evalgate_sdk-3.3.1/README.md +0 -556
  69. evalgate_sdk-3.3.1/src/evalgate_sdk/_version.py +0 -3
  70. evalgate_sdk-3.3.1/src/evalgate_sdk/types.py +0 -666
  71. evalgate_sdk-3.3.1/tests/test_cli_golden_commands.py +0 -232
  72. evalgate_sdk-3.3.1/tests/test_client.py +0 -164
  73. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/.gitignore +0 -0
  74. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cache.py +0 -0
  75. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/ci_context.py +0 -0
  76. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/cli_constants.py +0 -0
  77. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/env.py +0 -0
  78. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/formatters/types.py +0 -0
  79. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/manifest.py +0 -0
  80. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/new_commands.py +0 -0
  81. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/policy_packs.py +0 -0
  82. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/profiles.py +0 -0
  83. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/regression_gate.py +0 -0
  84. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/render/__init__.py +0 -0
  85. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/render/snippet.py +0 -0
  86. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/render/sort.py +0 -0
  87. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/report/__init__.py +0 -0
  88. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/cli/traces.py +0 -0
  89. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/constants.py +0 -0
  90. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/formatters/__init__.py +0 -0
  91. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/formatters/github.py +0 -0
  92. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/formatters/human.py +0 -0
  93. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/formatters/json_fmt.py +0 -0
  94. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/formatters/pr_comment.py +0 -0
  95. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/integrations/__init__.py +0 -0
  96. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/matchers.py +0 -0
  97. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/otel.py +0 -0
  98. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/py.typed +0 -0
  99. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/pytest_plugin.py +0 -0
  100. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/reason_codes.py +0 -0
  101. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/regression.py +0 -0
  102. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/__init__.py +0 -0
  103. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/adapters/__init__.py +0 -0
  104. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +0 -0
  105. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/context.py +0 -0
  106. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/registry.py +0 -0
  107. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/run_report.py +0 -0
  108. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/runtime/types.py +0 -0
  109. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/snapshot.py +0 -0
  110. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/streaming.py +0 -0
  111. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/testing.py +0 -0
  112. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/utils/__init__.py +0 -0
  113. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/src/evalgate_sdk/utils/input_hash.py +0 -0
  114. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/__init__.py +0 -0
  115. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_assertions.py +0 -0
  116. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_ci_context.py +0 -0
  117. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_cli_new_commands.py +0 -0
  118. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_collector.py +0 -0
  119. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_contract_payloads.py +0 -0
  120. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_errors.py +0 -0
  121. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_formatters.py +0 -0
  122. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_local_storage.py +0 -0
  123. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_otel.py +0 -0
  124. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_pytest_plugin.py +0 -0
  125. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_run_report.py +0 -0
  126. {evalgate_sdk-3.3.1 → evalgate_sdk-3.5.0}/tests/test_workflows.py +0 -0
@@ -9,6 +9,28 @@ Version numbering is aligned with the TypeScript SDK (`@evalgate/sdk`) and the p
9
9
 
10
10
  **Version history note:** The Python SDK jumped from 1.0.0 → 1.9.x → 2.0.0 to stay in sync with the TypeScript SDK. The TypeScript SDK had many releases (1.1–1.9) before the Python SDK existed. We now align both SDKs on the same major.minor version.
11
11
 
12
+ ## [3.5.0] - 2026-04-09
13
+
14
+ ### Added
15
+ - **Knowledge API parity** — new `client.knowledge.search()`, `upload_document()`, and `get_grounding_context()` methods matching the TypeScript SDK surface.
16
+ - **Auto-session SDK methods** — `evaluations.create_auto_session()`, `list_auto_sessions()`, `get_auto_session()`, `get_quality_score()`, and `synthesize_test_cases()` for autonomous loop management.
17
+ - **Batch trace ingestion** — `report_traces()` function for sending up to 100 traces per request.
18
+ - **Offline trace persistence** — `WorkflowTracer` supports JSON file persistence when disconnected.
19
+
20
+ ### Changed
21
+ - **Version alignment** — bumped Python SDK package metadata and spec constants to `3.5.0`.
22
+
23
+ ## [3.4.0] - 2026-03-29
24
+
25
+ ### Added
26
+ - **Judge orchestration parity** — added Python client and CLI support for the normalized judge registry, presets, run/case judge inspection, and multi-judge evidence payloads.
27
+ - **Workflow and controls parity** — expanded Python operator surfaces for workflow execution, system controls, and workflow-aware parity with the TypeScript SDK.
28
+ - **Provider/PII safeguards** — outbound judge/assertion flows now enforce shared provider policy and pre-send PII protection semantics in the Python SDK path too.
29
+
30
+ ### Changed
31
+ - **Canonical contract alignment** — updated Python types and CLI copy to match the workflow-native and judge-orchestration model used across the platform.
32
+ - **Version alignment** — bumped Python SDK package metadata and spec constants to `3.4.0`.
33
+
12
34
  ## [3.3.1] - 2026-03-24
13
35
 
14
36
  ### Changed
@@ -0,0 +1,220 @@
1
+ Metadata-Version: 2.4
2
+ Name: evalgate-sdk
3
+ Version: 3.5.0
4
+ Summary: EvalGate Python SDK — CI for AI behavior. Traces, evaluations, assertions, and regression gates for LLM apps.
5
+ Project-URL: Homepage, https://evalgate.com
6
+ Project-URL: Documentation, https://github.com/evalgate/ai-evaluation-platform#readme
7
+ Project-URL: Repository, https://github.com/evalgate/ai-evaluation-platform
8
+ Project-URL: Issues, https://github.com/evalgate/ai-evaluation-platform/issues
9
+ Project-URL: Changelog, https://github.com/evalgate/ai-evaluation-platform/blob/main/src/packages/sdk-python/CHANGELOG.md
10
+ Author-email: EvalGate <team@evalgate.com>
11
+ License-Expression: MIT
12
+ Keywords: ai,anthropic,assertions,ci,evaluation,llm,monitoring,observability,openai,regression,testing,tracing,workflow
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Topic :: Software Development :: Testing
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.10
25
+ Requires-Dist: httpx<1,>=0.27
26
+ Requires-Dist: pydantic<3,>=2.0
27
+ Provides-Extra: all
28
+ Requires-Dist: anthropic>=0.20; extra == 'all'
29
+ Requires-Dist: langchain-core>=0.2; extra == 'all'
30
+ Requires-Dist: openai>=1.0; extra == 'all'
31
+ Requires-Dist: rich>=13; extra == 'all'
32
+ Requires-Dist: typer>=0.12; extra == 'all'
33
+ Provides-Extra: anthropic
34
+ Requires-Dist: anthropic>=0.20; extra == 'anthropic'
35
+ Provides-Extra: cli
36
+ Requires-Dist: rich>=13; extra == 'cli'
37
+ Requires-Dist: typer>=0.12; extra == 'cli'
38
+ Provides-Extra: dev
39
+ Requires-Dist: mypy>=1.10; extra == 'dev'
40
+ Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
41
+ Requires-Dist: pytest-cov>=5; extra == 'dev'
42
+ Requires-Dist: pytest>=8; extra == 'dev'
43
+ Requires-Dist: respx>=0.21; extra == 'dev'
44
+ Requires-Dist: rich>=13; extra == 'dev'
45
+ Requires-Dist: ruff>=0.5; extra == 'dev'
46
+ Requires-Dist: typer>=0.12; extra == 'dev'
47
+ Provides-Extra: langchain
48
+ Requires-Dist: langchain-core>=0.2; extra == 'langchain'
49
+ Provides-Extra: openai
50
+ Requires-Dist: openai>=1.0; extra == 'openai'
51
+ Description-Content-Type: text/markdown
52
+
53
+ # evalgate-sdk
54
+
55
+ [![PyPI](https://img.shields.io/pypi/v/evalgate-sdk)](https://pypi.org/project/evalgate-sdk/)
56
+ [![Python](https://img.shields.io/pypi/pyversions/evalgate-sdk)](https://pypi.org/project/evalgate-sdk/)
57
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
58
+ [![Typed](https://img.shields.io/badge/typing-typed-blue)](https://peps.python.org/pep-0561/)
59
+
60
+ `evalgate-sdk` brings EvalGate's evaluation control plane to Python.
61
+
62
+ Use it to run assertions, trace agent workflows, inspect platform runs, orchestrate judges, and participate in the same closed-loop evaluation workflow as the TypeScript SDK and the web app.
63
+
64
+ ## Install
65
+
66
+ ```bash
67
+ pip install evalgate-sdk
68
+ pip install "evalgate-sdk[cli]"
69
+ ```
70
+
71
+ Optional extras:
72
+
73
+ ```bash
74
+ pip install "evalgate-sdk[openai]"
75
+ pip install "evalgate-sdk[anthropic]"
76
+ pip install "evalgate-sdk[langchain]"
77
+ pip install "evalgate-sdk[all]"
78
+ ```
79
+
80
+ ## Quick Start
81
+
82
+ ### Local Assertions
83
+
84
+ ```python
85
+ from evalgate_sdk import expect
86
+
87
+ result = expect("The capital of France is Paris.").to_contain("Paris")
88
+ print(result.passed)
89
+ ```
90
+
91
+ ### Platform Client
92
+
93
+ ```python
94
+ from evalgate_sdk import AIEvalClient
95
+ from evalgate_sdk.types import CreateTraceParams
96
+
97
+ client = AIEvalClient(api_key="sk-...")
98
+
99
+ trace = await client.traces.create(
100
+ CreateTraceParams(
101
+ name="support-run",
102
+ input="Cancel my subscription",
103
+ output="I've canceled your plan effective today.",
104
+ )
105
+ )
106
+
107
+ print(trace.id)
108
+ ```
109
+
110
+ ## Closed-Loop Workflow
111
+
112
+ The Python package participates in the same EvalGate loop:
113
+
114
+ ```text
115
+ trace -> cluster -> synthesize -> gate -> review -> auto -> ship
116
+ ```
117
+
118
+ Representative CLI commands:
119
+
120
+ ```bash
121
+ evalgate gate
122
+ evalgate cluster --run .evalgate/runs/latest.json
123
+ evalgate synthesize --dataset .evalgate/golden/labeled.jsonl
124
+ evalgate judge registry
125
+ evalgate judge presets
126
+ ```
127
+
128
+ ## Judge Orchestration
129
+
130
+ The Python SDK supports the same core judge model as the platform:
131
+
132
+ - registry-backed judge selection
133
+ - saved judge configs
134
+ - multi-judge aggregation
135
+ - per-run and per-case judge inspection
136
+ - structured reasoning and signal breakdowns
137
+
138
+ ### CLI
139
+
140
+ ```bash
141
+ evalgate judge registry
142
+ evalgate judge presets
143
+
144
+ evalgate judge test \
145
+ --provider openai \
146
+ --model gpt-5.2-chat-latest \
147
+ --judge openai:gpt-5.2-chat-latest \
148
+ --judge anthropic:claude-sonnet-4-20250514 \
149
+ --aggregation weighted \
150
+ --prompt-template "Return strict JSON with score, passed, reasoning, and signals." \
151
+ --input "Cancel my subscription" \
152
+ --output "I've canceled your plan effective today."
153
+ ```
154
+
155
+ ### Client
156
+
157
+ ```python
158
+ from evalgate_sdk.types import TestLLMJudgeConfigParams
159
+
160
+ registry = await client.llm_judge.list_registry()
161
+ presets = await client.llm_judge.list_presets()
162
+
163
+ result = await client.llm_judge.test_config(
164
+ TestLLMJudgeConfigParams(
165
+ provider="openai",
166
+ model="gpt-5.2-chat-latest",
167
+ prompt_template="Return strict JSON with score, passed, reasoning, and signals.",
168
+ judges=[
169
+ {
170
+ "id": "primary",
171
+ "type": "llm",
172
+ "provider": "openai",
173
+ "model": "gpt-5.2-chat-latest",
174
+ "weight": 0.6,
175
+ },
176
+ {
177
+ "id": "backup",
178
+ "type": "llm",
179
+ "provider": "anthropic",
180
+ "model": "claude-sonnet-4-20250514",
181
+ "weight": 0.4,
182
+ },
183
+ ],
184
+ aggregation="weighted",
185
+ input="Cancel my subscription",
186
+ output="I've canceled your plan effective today.",
187
+ behavior="tool_use",
188
+ task_type="support",
189
+ )
190
+ )
191
+
192
+ print(result.result.score, result.result.reasoning)
193
+ ```
194
+
195
+ ## Enterprise Controls
196
+
197
+ The Python SDK respects the same platform guardrails:
198
+
199
+ - provider allowlists
200
+ - pre-send PII checks for outbound judge/provider calls
201
+ - org-scoped policy
202
+ - shared config precedence with the API and web app
203
+
204
+ ## Surface Parity
205
+
206
+ The Python SDK now covers the primary EvalGate workflows used in the product:
207
+
208
+ - traces and runs
209
+ - clustering and synthesis
210
+ - gate/check/review loops
211
+ - judge registry, presets, configs, results, and comparisons
212
+ - workflow and control-plane inspection commands
213
+
214
+ TypeScript still has a few extra convenience wrappers, but the core control-plane model is shared.
215
+
216
+ For broader product context, see:
217
+
218
+ - [Root README](../../../README.md)
219
+ - [Integration reference](../../app/docs/integration/page.tsx)
220
+ - [SDK parity guide](../../app/docs/sdk-parity/page.tsx)
@@ -0,0 +1,168 @@
1
+ # evalgate-sdk
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/evalgate-sdk)](https://pypi.org/project/evalgate-sdk/)
4
+ [![Python](https://img.shields.io/pypi/pyversions/evalgate-sdk)](https://pypi.org/project/evalgate-sdk/)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
6
+ [![Typed](https://img.shields.io/badge/typing-typed-blue)](https://peps.python.org/pep-0561/)
7
+
8
+ `evalgate-sdk` brings EvalGate's evaluation control plane to Python.
9
+
10
+ Use it to run assertions, trace agent workflows, inspect platform runs, orchestrate judges, and participate in the same closed-loop evaluation workflow as the TypeScript SDK and the web app.
11
+
12
+ ## Install
13
+
14
+ ```bash
15
+ pip install evalgate-sdk
16
+ pip install "evalgate-sdk[cli]"
17
+ ```
18
+
19
+ Optional extras:
20
+
21
+ ```bash
22
+ pip install "evalgate-sdk[openai]"
23
+ pip install "evalgate-sdk[anthropic]"
24
+ pip install "evalgate-sdk[langchain]"
25
+ pip install "evalgate-sdk[all]"
26
+ ```
27
+
28
+ ## Quick Start
29
+
30
+ ### Local Assertions
31
+
32
+ ```python
33
+ from evalgate_sdk import expect
34
+
35
+ result = expect("The capital of France is Paris.").to_contain("Paris")
36
+ print(result.passed)
37
+ ```
38
+
39
+ ### Platform Client
40
+
41
+ ```python
42
+ from evalgate_sdk import AIEvalClient
43
+ from evalgate_sdk.types import CreateTraceParams
44
+
45
+ client = AIEvalClient(api_key="sk-...")
46
+
47
+ trace = await client.traces.create(
48
+ CreateTraceParams(
49
+ name="support-run",
50
+ input="Cancel my subscription",
51
+ output="I've canceled your plan effective today.",
52
+ )
53
+ )
54
+
55
+ print(trace.id)
56
+ ```
57
+
58
+ ## Closed-Loop Workflow
59
+
60
+ The Python package participates in the same EvalGate loop:
61
+
62
+ ```text
63
+ trace -> cluster -> synthesize -> gate -> review -> auto -> ship
64
+ ```
65
+
66
+ Representative CLI commands:
67
+
68
+ ```bash
69
+ evalgate gate
70
+ evalgate cluster --run .evalgate/runs/latest.json
71
+ evalgate synthesize --dataset .evalgate/golden/labeled.jsonl
72
+ evalgate judge registry
73
+ evalgate judge presets
74
+ ```
75
+
76
+ ## Judge Orchestration
77
+
78
+ The Python SDK supports the same core judge model as the platform:
79
+
80
+ - registry-backed judge selection
81
+ - saved judge configs
82
+ - multi-judge aggregation
83
+ - per-run and per-case judge inspection
84
+ - structured reasoning and signal breakdowns
85
+
86
+ ### CLI
87
+
88
+ ```bash
89
+ evalgate judge registry
90
+ evalgate judge presets
91
+
92
+ evalgate judge test \
93
+ --provider openai \
94
+ --model gpt-5.2-chat-latest \
95
+ --judge openai:gpt-5.2-chat-latest \
96
+ --judge anthropic:claude-sonnet-4-20250514 \
97
+ --aggregation weighted \
98
+ --prompt-template "Return strict JSON with score, passed, reasoning, and signals." \
99
+ --input "Cancel my subscription" \
100
+ --output "I've canceled your plan effective today."
101
+ ```
102
+
103
+ ### Client
104
+
105
+ ```python
106
+ from evalgate_sdk.types import TestLLMJudgeConfigParams
107
+
108
+ registry = await client.llm_judge.list_registry()
109
+ presets = await client.llm_judge.list_presets()
110
+
111
+ result = await client.llm_judge.test_config(
112
+ TestLLMJudgeConfigParams(
113
+ provider="openai",
114
+ model="gpt-5.2-chat-latest",
115
+ prompt_template="Return strict JSON with score, passed, reasoning, and signals.",
116
+ judges=[
117
+ {
118
+ "id": "primary",
119
+ "type": "llm",
120
+ "provider": "openai",
121
+ "model": "gpt-5.2-chat-latest",
122
+ "weight": 0.6,
123
+ },
124
+ {
125
+ "id": "backup",
126
+ "type": "llm",
127
+ "provider": "anthropic",
128
+ "model": "claude-sonnet-4-20250514",
129
+ "weight": 0.4,
130
+ },
131
+ ],
132
+ aggregation="weighted",
133
+ input="Cancel my subscription",
134
+ output="I've canceled your plan effective today.",
135
+ behavior="tool_use",
136
+ task_type="support",
137
+ )
138
+ )
139
+
140
+ print(result.result.score, result.result.reasoning)
141
+ ```
142
+
143
+ ## Enterprise Controls
144
+
145
+ The Python SDK respects the same platform guardrails:
146
+
147
+ - provider allowlists
148
+ - pre-send PII checks for outbound judge/provider calls
149
+ - org-scoped policy
150
+ - shared config precedence with the API and web app
151
+
152
+ ## Surface Parity
153
+
154
+ The Python SDK now covers the primary EvalGate workflows used in the product:
155
+
156
+ - traces and runs
157
+ - clustering and synthesis
158
+ - gate/check/review loops
159
+ - judge registry, presets, configs, results, and comparisons
160
+ - workflow and control-plane inspection commands
161
+
162
+ TypeScript still has a few extra convenience wrappers, but the core control-plane model is shared.
163
+
164
+ For broader product context, see:
165
+
166
+ - [Root README](../../../README.md)
167
+ - [Integration reference](../../app/docs/integration/page.tsx)
168
+ - [SDK parity guide](../../app/docs/sdk-parity/page.tsx)
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "evalgate-sdk"
7
- version = "3.3.1"
7
+ version = "3.5.0"
8
8
  description = "EvalGate Python SDK — CI for AI behavior. Traces, evaluations, assertions, and regression gates for LLM apps."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -76,5 +76,5 @@ select = ["E", "F", "I", "UP", "B", "SIM"]
76
76
 
77
77
  [tool.mypy]
78
78
  python_version = "3.10"
79
- strict = true
80
- warn_return_any = true
79
+ strict = false
80
+ warn_return_any = false
@@ -59,25 +59,57 @@ from evalgate_sdk.auto import (
59
59
  run_auto_daemon,
60
60
  write_auto_report,
61
61
  )
62
+ from evalgate_sdk.auto_experiment_runner import (
63
+ Experiment,
64
+ ExperimentChangeSet,
65
+ ExperimentDecision,
66
+ ExperimentMetrics,
67
+ ExperimentRunner,
68
+ KeepDiscardPolicy,
69
+ KeepDiscardResult,
70
+ evaluate_experiment,
71
+ select_best_experiment,
72
+ )
73
+ from evalgate_sdk.auto_failure_analyzer import (
74
+ AnalyzeInput,
75
+ AutoLLMConfig,
76
+ FailureAnalysis,
77
+ FailureAnalyzer,
78
+ FailureCase,
79
+ FailurePattern,
80
+ TasteContext,
81
+ )
82
+ from evalgate_sdk.auto_mutation_engine import (
83
+ GeneratedMutation,
84
+ GenerateInput,
85
+ KnowledgeHint,
86
+ ModelSettingsOverride,
87
+ MutationAttempt,
88
+ MutationEngine,
89
+ MutationStrategy,
90
+ TrajectorySummary,
91
+ )
62
92
  from evalgate_sdk.batch import RequestBatcher, batch_process, can_batch
63
93
  from evalgate_sdk.cache import CacheTTL, RequestCache, get_ttl, should_cache
64
94
  from evalgate_sdk.ci_context import CIContext, detect_ci_context
65
- from evalgate_sdk.cluster import ClusterCase, ClusterSample, ClusterSummary, TraceCluster, cluster_run_result, format_cluster_human
66
95
  from evalgate_sdk.cli.api import (
67
96
  FetchOptions,
68
97
  PublishShareResult,
69
98
  QualityLatestData,
70
99
  RunDetailsData,
71
100
  fetch_api,
101
+ fetch_auto_plan_preview,
102
+ fetch_integrity,
103
+ fetch_knowledge,
72
104
  fetch_quality_latest,
73
105
  fetch_run_details,
74
106
  fetch_run_export,
75
107
  import_run_on_fail,
76
108
  publish_share,
109
+ refresh_integrity,
77
110
  )
78
111
  from evalgate_sdk.cli.cli_constants import EXIT
79
112
  from evalgate_sdk.cli.config import (
80
- EvalAIConfig,
81
113
  EvalGateConfig,
82
114
  find_config_path,
83
115
  load_config,
@@ -130,6 +162,14 @@ from evalgate_sdk.cli.traces import (
130
162
  )
131
163
  from evalgate_sdk.cli.workspace import EvalWorkspace, resolve_eval_workspace
132
164
  from evalgate_sdk.client import AIEvalClient
165
+ from evalgate_sdk.cluster import (
166
+ ClusterCase,
167
+ ClusterSample,
168
+ ClusterSummary,
169
+ TraceCluster,
170
+ cluster_run_result,
171
+ format_cluster_human,
172
+ )
133
173
  from evalgate_sdk.collector import (
134
174
  CollectorFeedbackInput,
135
175
  CollectorSpanInput,
@@ -137,6 +177,7 @@ from evalgate_sdk.collector import (
137
177
  ReportTraceOptions,
138
178
  ReportTraceResult,
139
179
  report_trace,
180
+ report_traces,
140
181
  )
141
182
  from evalgate_sdk.constants import DEFAULT_BASE_URL
142
183
  from evalgate_sdk.context import (
@@ -202,6 +243,7 @@ from evalgate_sdk.integrations.openai_eval import (
202
243
  OpenAIChatEvalResult,
203
244
  openai_chat_eval,
204
245
  )
246
+ from evalgate_sdk.knowledge import KnowledgeAPI
205
247
  from evalgate_sdk.local import LocalStorage, LocalStorageStats
206
248
  from evalgate_sdk.logger import Logger, RequestLogger, create_logger, get_logger, set_logger
207
249
  from evalgate_sdk.matchers import GateAssertionError, assert_passes_gate, to_pass_gate
@@ -223,7 +265,6 @@ from evalgate_sdk.pytest_plugin import (
223
265
  assert_score_between,
224
266
  )
225
267
  from evalgate_sdk.reason_codes import REASON_CODES, get_reason_info, is_blocking
226
- from evalgate_sdk.replay_decision import NormalizedBudgetConfig, ReplayDecision, determine_comparison_basis, evaluate_replay_outcome
227
268
  from evalgate_sdk.regression import (
228
269
  ARTIFACTS,
229
270
  GATE_CATEGORY,
@@ -237,6 +278,12 @@ from evalgate_sdk.regression import (
237
278
  evaluate_regression,
238
279
  verify_baseline_checksum,
239
280
  )
281
+ from evalgate_sdk.replay_decision import (
282
+ NormalizedBudgetConfig,
283
+ ReplayDecision,
284
+ determine_comparison_basis,
285
+ evaluate_replay_outcome,
286
+ )
240
287
  from evalgate_sdk.runtime import (
241
288
  EvalExecutionError,
242
289
  EvalRuntimeError,
@@ -270,7 +317,6 @@ from evalgate_sdk.runtime.eval import (
270
317
  define_eval_only,
271
318
  define_eval_skip,
272
319
  define_suite,
273
- evalai,
274
320
  from_dataset,
275
321
  get_filtered_specs,
276
322
  )
@@ -334,7 +380,25 @@ from evalgate_sdk.synthesize import (
334
380
  synthesize_labeled_dataset,
335
381
  )
336
382
  from evalgate_sdk.testing import TestSuite, create_test_suite
337
- from evalgate_sdk.types import CamelModel, QualityBreakdown, QualityScore
383
+ from evalgate_sdk.types import (
384
+ AgentApprovalMode,
385
+ AgentAutonomyLevel,
386
+ AgentMemoryMode,
387
+ AgentWorkflowClass,
388
+ AgentWorkflowExperimentRecipe,
389
+ AgentWorkflowHookPhase,
390
+ AgentWorkflowSpec,
391
+ AutoPlanPreviewParams,
392
+ AutoPlanPreviewResponse,
393
+ AutoSessionWorkflowProfile,
394
+ CamelModel,
395
+ EvaluationIntegritySnapshot,
396
+ EvaluationRunDetail,
397
+ KnowledgeHintSummary,
398
+ KnowledgeResponse,
399
+ QualityBreakdown,
400
+ QualityScore,
401
+ )
338
402
  from evalgate_sdk.utils.input_hash import normalize_input, sha256_input
339
403
  from evalgate_sdk.workflows import WorkflowTracer, create_workflow_tracer, trace_workflow_step
340
404
 
@@ -474,13 +538,74 @@ __all__ = [
474
538
  "AutoDiffSnapshot",
475
539
  "AutoIterationResult",
476
540
  "AutoReport",
541
+ "Experiment",
542
+ "ExperimentChangeSet",
543
+ "ExperimentDecision",
544
+ "ExperimentMetrics",
545
+ "ExperimentRunner",
546
+ "KeepDiscardPolicy",
547
+ "KeepDiscardResult",
548
+ "AnalyzeInput",
549
+ "AutoLLMConfig",
550
+ "FailureAnalysis",
551
+ "FailureAnalyzer",
552
+ "FailureCase",
553
+ "FailurePattern",
554
+ "KnowledgeHint",
555
+ "GenerateInput",
556
+ "GeneratedMutation",
557
+ "ModelSettingsOverride",
558
+ "MutationAttempt",
559
+ "MutationEngine",
560
+ "MutationStrategy",
561
+ "TasteContext",
562
+ "TrajectorySummary",
477
563
  "build_auto_plan",
478
564
  "decide_auto_experiment",
479
565
  "build_auto_report",
566
+ "evaluate_experiment",
480
567
  "format_auto_human",
481
568
  "append_auto_history",
569
+ "select_best_experiment",
482
570
  "write_auto_report",
483
571
  "run_auto_daemon",
572
+ "Experiment",
573
+ "ExperimentChangeSet",
574
+ "ExperimentDecision",
575
+ "ExperimentMetrics",
576
+ "ExperimentRunner",
577
+ "KeepDiscardPolicy",
578
+ "KeepDiscardResult",
579
+ "evaluate_experiment",
580
+ "select_best_experiment",
581
+ "AnalyzeInput",
582
+ "AutoLLMConfig",
583
+ "FailureAnalysis",
584
+ "FailureAnalyzer",
585
+ "FailureCase",
586
+ "FailurePattern",
587
+ "TasteContext",
588
+ "GeneratedMutation",
589
+ "GenerateInput",
590
+ "KnowledgeHint",
591
+ "ModelSettingsOverride",
592
+ "MutationAttempt",
593
+ "MutationEngine",
594
+ "MutationStrategy",
595
+ "AgentApprovalMode",
596
+ "AgentAutonomyLevel",
597
+ "AgentMemoryMode",
598
+ "AgentWorkflowClass",
599
+ "AgentWorkflowExperimentRecipe",
600
+ "AgentWorkflowHookPhase",
601
+ "AgentWorkflowSpec",
602
+ "AutoPlanPreviewParams",
603
+ "AutoPlanPreviewResponse",
604
+ "AutoSessionWorkflowProfile",
605
+ "EvaluationIntegritySnapshot",
606
+ "EvaluationRunDetail",
607
+ "KnowledgeHintSummary",
608
+ "KnowledgeResponse",
484
609
  # Streaming
485
610
  "RateLimiter",
486
611
  "BatchProgress",
@@ -546,7 +671,6 @@ __all__ = [
546
671
  "define_eval_only",
547
672
  "define_suite",
548
673
  "create_result",
549
- "evalai",
550
674
  "from_dataset",
551
675
  "get_filtered_specs",
552
676
  # Runtime management
@@ -576,8 +700,11 @@ __all__ = [
576
700
  "CamelModel",
577
701
  "QualityScore",
578
702
  "QualityBreakdown",
703
+ # Knowledge API
704
+ "KnowledgeAPI",
579
705
  # Collector (T2)
580
706
  "report_trace",
707
+ "report_traces",
581
708
  "ReportTraceInput",
582
709
  "ReportTraceOptions",
583
710
  "ReportTraceResult",
@@ -647,13 +774,16 @@ __all__ = [
647
774
  "is_git_ref",
648
775
  "get_github_step_summary_path",
649
776
  # CLI config (T14)
650
- "EvalAIConfig",
651
777
  "EvalGateConfig",
652
778
  "find_config_path",
653
779
  "load_config",
654
780
  "merge_config_with_args",
655
781
  # CLI API (T14)
656
782
  "fetch_api",
783
+ "fetch_integrity",
784
+ "refresh_integrity",
785
+ "fetch_knowledge",
786
+ "fetch_auto_plan_preview",
657
787
  "fetch_quality_latest",
658
788
  "fetch_run_details",
659
789
  "fetch_run_export",
@@ -0,0 +1,3 @@
1
+ __version__ = "3.5.0"
2
+ SDK_VERSION = __version__
3
+ SPEC_VERSION = "3.5.0"