evalgate-sdk 3.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalgate_sdk/__init__.py +707 -0
- evalgate_sdk/_version.py +3 -0
- evalgate_sdk/assertions.py +1362 -0
- evalgate_sdk/auto.py +247 -0
- evalgate_sdk/batch.py +174 -0
- evalgate_sdk/cache.py +111 -0
- evalgate_sdk/ci_context.py +123 -0
- evalgate_sdk/cli/__init__.py +111 -0
- evalgate_sdk/cli/api.py +261 -0
- evalgate_sdk/cli/cli_constants.py +20 -0
- evalgate_sdk/cli/commands.py +1041 -0
- evalgate_sdk/cli/config.py +228 -0
- evalgate_sdk/cli/env.py +43 -0
- evalgate_sdk/cli/formatters/types.py +132 -0
- evalgate_sdk/cli/golden_commands.py +322 -0
- evalgate_sdk/cli/manifest.py +301 -0
- evalgate_sdk/cli/new_commands.py +435 -0
- evalgate_sdk/cli/policy_packs.py +103 -0
- evalgate_sdk/cli/profiles.py +12 -0
- evalgate_sdk/cli/regression_gate.py +312 -0
- evalgate_sdk/cli/render/__init__.py +1 -0
- evalgate_sdk/cli/render/snippet.py +18 -0
- evalgate_sdk/cli/render/sort.py +29 -0
- evalgate_sdk/cli/report/__init__.py +1 -0
- evalgate_sdk/cli/report/build_check_report.py +209 -0
- evalgate_sdk/cli/traces.py +186 -0
- evalgate_sdk/cli/workspace.py +63 -0
- evalgate_sdk/client.py +609 -0
- evalgate_sdk/cluster.py +359 -0
- evalgate_sdk/collector.py +161 -0
- evalgate_sdk/constants.py +6 -0
- evalgate_sdk/context.py +151 -0
- evalgate_sdk/errors.py +236 -0
- evalgate_sdk/export.py +238 -0
- evalgate_sdk/formatters/__init__.py +11 -0
- evalgate_sdk/formatters/github.py +51 -0
- evalgate_sdk/formatters/human.py +68 -0
- evalgate_sdk/formatters/json_fmt.py +11 -0
- evalgate_sdk/formatters/pr_comment.py +80 -0
- evalgate_sdk/golden.py +426 -0
- evalgate_sdk/integrations/__init__.py +1 -0
- evalgate_sdk/integrations/anthropic.py +99 -0
- evalgate_sdk/integrations/autogen.py +62 -0
- evalgate_sdk/integrations/crewai.py +61 -0
- evalgate_sdk/integrations/langchain.py +100 -0
- evalgate_sdk/integrations/openai.py +155 -0
- evalgate_sdk/integrations/openai_eval.py +221 -0
- evalgate_sdk/local.py +144 -0
- evalgate_sdk/logger.py +123 -0
- evalgate_sdk/matchers.py +62 -0
- evalgate_sdk/otel.py +256 -0
- evalgate_sdk/pagination.py +145 -0
- evalgate_sdk/py.typed +0 -0
- evalgate_sdk/pytest_plugin.py +96 -0
- evalgate_sdk/reason_codes.py +103 -0
- evalgate_sdk/regression.py +196 -0
- evalgate_sdk/replay_decision.py +115 -0
- evalgate_sdk/runtime/__init__.py +50 -0
- evalgate_sdk/runtime/adapters/__init__.py +1 -0
- evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
- evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
- evalgate_sdk/runtime/context.py +68 -0
- evalgate_sdk/runtime/eval.py +318 -0
- evalgate_sdk/runtime/execution_mode.py +170 -0
- evalgate_sdk/runtime/executor.py +92 -0
- evalgate_sdk/runtime/registry.py +125 -0
- evalgate_sdk/runtime/run_report.py +249 -0
- evalgate_sdk/runtime/types.py +143 -0
- evalgate_sdk/snapshot.py +219 -0
- evalgate_sdk/streaming.py +124 -0
- evalgate_sdk/synthesize.py +226 -0
- evalgate_sdk/testing.py +128 -0
- evalgate_sdk/types.py +666 -0
- evalgate_sdk/utils/__init__.py +1 -0
- evalgate_sdk/utils/input_hash.py +42 -0
- evalgate_sdk/workflows.py +264 -0
- evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
- evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
- evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
- evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
"""Config → DSL Adapter — LAYER 2 Compatibility Bridge.
|
|
2
|
+
|
|
3
|
+
Migrates existing evalgate.config.json and TestSuite configurations
|
|
4
|
+
to the new define_eval() DSL without breaking user workflows.
|
|
5
|
+
|
|
6
|
+
Port of ``runtime/adapters/config-to-dsl.ts``.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import os
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from datetime import datetime, timezone
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class MigrationResult:
    """Outcome of one migration run.

    Carries the overall success flag, how many specs were produced, the
    collected error and warning messages, and where output was written.
    """

    # Set to False by the migrate_* functions when a step fails.
    success: bool = True
    # Count of DSL specs generated so far.
    specs_generated: int = 0
    # Failure messages; a fresh list per instance.
    errors: list[str] = field(default_factory=list)
    # Non-fatal notes for the user; a fresh list per instance.
    warnings: list[str] = field(default_factory=list)
    # Destination file or directory of the generated output.
    output_path: str = ""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class EvalAIConfig:
    """In-memory shape of an existing evalgate.config.json file."""

    # Value of the "evaluationId" key, when present.
    evaluation_id: str | None = None
    # Value of the "gate" key, when present.
    gate: dict[str, str] | None = None
    # Value of the "packages" key, when present.
    packages: dict[str, Any] | None = None
    # Additional settings; defaults to a fresh empty dict per instance.
    extra: dict[str, Any] = field(default_factory=dict)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def migrate_config_to_dsl(
    config_path: str,
    output_path: str,
) -> MigrationResult:
    """Turn an evalgate.config.json file into a generated DSL module.

    Parameters
    ----------
    config_path:
        Path of the JSON configuration file to read.
    output_path:
        Path of the Python file to write; parent directories are created.

    Returns
    -------
    MigrationResult
        ``success`` is False (with a message in ``errors``) when the config
        file is missing or any step raises; otherwise one spec is reported
        along with a warning that manual completion is required.
    """
    outcome = MigrationResult(output_path=output_path)

    # Guard clause: nothing to do without a config file.
    if not os.path.exists(config_path):
        outcome.errors.append(f"Configuration file not found: {config_path}")
        outcome.success = False
        return outcome

    try:
        with open(config_path, encoding="utf-8") as handle:
            raw = json.load(handle)

        parsed = EvalAIConfig(
            evaluation_id=raw.get("evaluationId"),
            gate=raw.get("gate"),
            packages=raw.get("packages"),
        )

        source = _generate_dsl_from_config(parsed)
        target = Path(output_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(source, encoding="utf-8")
    except Exception as exc:
        # Failures are reported through the result object, never raised.
        outcome.success = False
        outcome.errors.append(f"Config migration failed: {exc}")
    else:
        outcome.specs_generated = 1
        outcome.warnings.append("Generated basic DSL structure from evalgate.config.json. Manual completion required.")

    return outcome
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def migrate_testsuite_to_dsl(
    suite_data: dict[str, Any],
    output_path: str,
) -> MigrationResult:
    """Write a define_eval() DSL module generated from TestSuite data.

    Parameters
    ----------
    suite_data:
        Mapping with an optional ``"name"`` (default ``"test-suite"``) and an
        optional ``"cases"`` list (default empty).
    output_path:
        Path of the Python file to write; parent directories are created.

    Returns
    -------
    MigrationResult
        Reports one generated spec per case, or ``success=False`` with an
        error message when any step raises.
    """
    outcome = MigrationResult(output_path=output_path)

    try:
        suite_name = suite_data.get("name", "test-suite")
        suite_cases = suite_data.get("cases", [])

        source = _generate_dsl_from_suite(suite_name, suite_cases)
        target = Path(output_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(source, encoding="utf-8")

        case_count = len(suite_cases)
        outcome.specs_generated = case_count
        outcome.warnings.append(f"Migrated {case_count} test cases from TestSuite to define_eval() DSL")
    except Exception as exc:
        # Failures are reported through the result object, never raised.
        outcome.success = False
        outcome.errors.append(f"Migration failed: {exc}")

    return outcome
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def migrate_project_to_dsl(
    project_root: str,
    output_dir: str | None = None,
    dry_run: bool = False,
) -> MigrationResult:
    """Discover and migrate all configurations in a project.

    Parameters
    ----------
    project_root:
        Directory to search for a config file and for Python files that
        mention TestSuite.
    output_dir:
        Where generated files go; defaults to ``<project_root>/.evalgate/migrated``.
    dry_run:
        When True nothing is written; each planned action is recorded as a
        ``"Would migrate ..."`` warning instead.

    Returns
    -------
    MigrationResult
        Aggregated counts, warnings and errors. ``success`` is False when the
        config sub-migration fails or when any step raises.
    """
    out_dir = output_dir or os.path.join(project_root, ".evalgate", "migrated")
    result = MigrationResult(output_path=out_dir)

    try:
        # The first existing candidate wins.
        config_path: str | None = None
        for name in ("evalgate.config.json", "evalai.config.json"):
            candidate = os.path.join(project_root, name)
            if os.path.exists(candidate):
                config_path = candidate
                break

        if config_path:
            output_path = os.path.join(out_dir, "evalgate.config.migrated.py")
            if dry_run:
                result.warnings.append(f"Would migrate {config_path} to {output_path}")
            else:
                sub = migrate_config_to_dsl(config_path, output_path)
                result.specs_generated += sub.specs_generated
                result.errors.extend(sub.errors)
                result.warnings.extend(sub.warnings)
                # A failed sub-migration must fail the aggregate result too;
                # otherwise callers see success=True alongside errors.
                if not sub.success:
                    result.success = False

        # Python files that mention TestSuite get a placeholder module each.
        # NOTE(review): the output name is derived from the file stem only, so
        # two source files with the same stem write to the same placeholder.
        test_files = _find_testsuite_files(project_root)
        for test_file in test_files:
            output_path = os.path.join(
                out_dir,
                Path(test_file).stem + ".migrated.py",
            )
            if dry_run:
                result.warnings.append(f"Would migrate {test_file} to {output_path}")
                continue
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
            content = _generate_placeholder_dsl(test_file)
            Path(output_path).write_text(content, encoding="utf-8")
            result.specs_generated += 1
            result.warnings.append(f"Created migration placeholder for {test_file}")

        # Warn only when nothing was discovered. Checking specs_generated
        # would misfire on every dry run, where the count stays at zero.
        if config_path is None and not test_files:
            result.warnings.append("No TestSuite configurations found to migrate")
    except Exception as exc:
        result.success = False
        result.errors.append(f"Project migration failed: {exc}")

    return result
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _find_testsuite_files(project_root: str) -> list[str]:
|
|
157
|
+
"""Find files that might contain TestSuite usage."""
|
|
158
|
+
found: list[str] = []
|
|
159
|
+
for root, dirs, files in os.walk(project_root):
|
|
160
|
+
# Skip hidden dirs and common non-source dirs
|
|
161
|
+
dirs[:] = [
|
|
162
|
+
d for d in dirs if not d.startswith(".") and d not in ("node_modules", "__pycache__", ".git", "venv")
|
|
163
|
+
]
|
|
164
|
+
for fname in files:
|
|
165
|
+
if not fname.endswith(".py"):
|
|
166
|
+
continue
|
|
167
|
+
full = os.path.join(root, fname)
|
|
168
|
+
try:
|
|
169
|
+
text = Path(full).read_text(encoding="utf-8", errors="ignore")
|
|
170
|
+
if "create_test_suite" in text or "TestSuite" in text:
|
|
171
|
+
found.append(full)
|
|
172
|
+
except OSError:
|
|
173
|
+
pass
|
|
174
|
+
return found
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _generate_dsl_from_config(config: EvalAIConfig) -> str:
|
|
178
|
+
"""Generate DSL code from configuration."""
|
|
179
|
+
now = datetime.now(timezone.utc).isoformat()
|
|
180
|
+
eval_id = json.dumps(config.evaluation_id) if config.evaluation_id else "None"
|
|
181
|
+
return f"""# Auto-generated from evalgate.config.json
|
|
182
|
+
# Generated at: {now}
|
|
183
|
+
# This is a basic DSL structure — complete with your actual evaluations
|
|
184
|
+
|
|
185
|
+
from evalgate_sdk import define_eval, create_result
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
define_eval("basic-evaluation", _basic_eval)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
async def _basic_eval(ctx):
|
|
192
|
+
input_text = ctx.input
|
|
193
|
+
|
|
194
|
+
# TODO: Replace with your actual agent/LLM call
|
|
195
|
+
output = f"Agent response to: {{input_text}}"
|
|
196
|
+
|
|
197
|
+
passed = len(output) > 0
|
|
198
|
+
score = 100 if passed else 0
|
|
199
|
+
|
|
200
|
+
return create_result(
|
|
201
|
+
passed=passed,
|
|
202
|
+
score=score,
|
|
203
|
+
output=output,
|
|
204
|
+
metadata={{
|
|
205
|
+
"evaluation_id": {eval_id},
|
|
206
|
+
"input": input_text,
|
|
207
|
+
}},
|
|
208
|
+
)
|
|
209
|
+
"""
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _generate_dsl_from_suite(name: str, cases: list[dict[str, Any]]) -> str:
|
|
213
|
+
"""Generate DSL code from TestSuite data."""
|
|
214
|
+
now = datetime.now(timezone.utc).isoformat()
|
|
215
|
+
lines = [
|
|
216
|
+
f"# Auto-generated from TestSuite: {name}",
|
|
217
|
+
f"# Generated at: {now}",
|
|
218
|
+
"# This file replaces the old TestSuite configuration",
|
|
219
|
+
"",
|
|
220
|
+
"from evalgate_sdk import define_eval, create_result",
|
|
221
|
+
"",
|
|
222
|
+
]
|
|
223
|
+
|
|
224
|
+
for i, case in enumerate(cases):
|
|
225
|
+
case_id = case.get("id", f"{name}-case-{i + 1}")
|
|
226
|
+
case_input = json.dumps(case.get("input", ""))
|
|
227
|
+
case_expected = json.dumps(case.get("expected"))
|
|
228
|
+
lines.extend(
|
|
229
|
+
[
|
|
230
|
+
f'define_eval("{case_id}", lambda ctx: _eval_{i}(ctx))',
|
|
231
|
+
"",
|
|
232
|
+
f"async def _eval_{i}(ctx):",
|
|
233
|
+
f" # Original input: {case_input}",
|
|
234
|
+
f" # Original expected: {case_expected}",
|
|
235
|
+
" input_text = ctx.input",
|
|
236
|
+
" # TODO: Replace with your actual agent/LLM call",
|
|
237
|
+
' output = f"Agent response to: {input_text}"',
|
|
238
|
+
" passed = len(output) > 0",
|
|
239
|
+
" return create_result(passed=passed, score=100 if passed else 0, output=output)",
|
|
240
|
+
"",
|
|
241
|
+
]
|
|
242
|
+
)
|
|
243
|
+
|
|
244
|
+
return "\n".join(lines)
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _generate_placeholder_dsl(original_file: str) -> str:
|
|
248
|
+
"""Generate placeholder DSL for files that need manual migration."""
|
|
249
|
+
now = datetime.now(timezone.utc).isoformat()
|
|
250
|
+
basename = Path(original_file).stem
|
|
251
|
+
return f"""# Migration placeholder for: {original_file}
|
|
252
|
+
# Generated at: {now}
|
|
253
|
+
# This file contains TestSuite usage that needs manual migration
|
|
254
|
+
|
|
255
|
+
from evalgate_sdk import define_eval, create_result
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
define_eval("placeholder-from-{basename}", _placeholder_eval)
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
async def _placeholder_eval(ctx):
|
|
262
|
+
# TODO: Manually migrate TestSuite from {original_file}
|
|
263
|
+
input_text = ctx.input
|
|
264
|
+
output = f"Response to: {{input_text}}"
|
|
265
|
+
return create_result(
|
|
266
|
+
passed=len(output) > 0,
|
|
267
|
+
score=100 if len(output) > 0 else 0,
|
|
268
|
+
metadata={{"migrated_from": {json.dumps(original_file)}}},
|
|
269
|
+
)
|
|
270
|
+
"""
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
"""COMPAT-202: Legacy TestSuite → define_eval adapter.
|
|
2
|
+
|
|
3
|
+
Converts legacy TestSuite instances to define_eval specifications
|
|
4
|
+
without forcing migration. Enables lossless where possible.
|
|
5
|
+
|
|
6
|
+
Port of ``runtime/adapters/testsuite-to-dsl.ts``.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import base64
|
|
12
|
+
from collections.abc import Callable
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from datetime import datetime, timezone
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
from evalgate_sdk.runtime.eval import create_result
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class TestSuiteAdapterOptions:
    """Switches that control how a legacy TestSuite is adapted."""

    # When True, provenance keys (source, suite name, test id, original
    # input/expected) are merged into each spec's metadata.
    include_provenance: bool = True
    # When True, a test's own id is reused (sanitised) as the spec id.
    preserve_ids: bool = True
    # Forwarded to the executor factory.
    generate_helpers: bool = True
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class TestDefinition:
    """One test case taken from a legacy TestSuite."""

    # Identifier of the test within its suite.
    id: str = ""
    # Input given to the system under test.
    input: str = ""
    # Expected output, or None when the test has no expectation.
    expected: str | None = None
    # Free-form metadata; a fresh dict per instance.
    metadata: dict[str, Any] = field(default_factory=dict)
    # Assertions attached to the test; a fresh list per instance.
    assertions: list[Any] = field(default_factory=list)

    @property
    def assertion_count(self) -> int:
        """Number of assertions attached to this test."""
        return len(self.assertions)

    @property
    def has_assertions(self) -> bool:
        """Whether at least one assertion is attached."""
        return self.assertion_count > 0
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class AdaptedSpec:
    """Spec record produced by adapting one legacy TestSuite test."""

    # Identifier of the spec.
    id: str = ""
    # Display name.
    name: str = ""
    # Pseudo-path marking the spec as coming from a legacy suite.
    file_path: str = "legacy://testsuite"
    # Source position; defaults to line 1, column 1.
    position: dict[str, int] = field(default_factory=lambda: dict(line=1, column=1))
    # Human-readable description.
    description: str = ""
    # Tags; every instance gets its own ["legacy", "migrated"] list.
    tags: list[str] = field(default_factory=lambda: list(("legacy", "migrated")))
    # Free-form metadata; a fresh dict per instance.
    metadata: dict[str, Any] = field(default_factory=dict)
    # Free-form configuration; a fresh dict per instance.
    config: dict[str, Any] = field(default_factory=dict)
    # Callable that runs the spec, if one has been attached.
    executor: Callable[..., Any] | None = None
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def adapt_test_suite(
    tests: list[TestDefinition],
    suite_name: str = "legacy-suite",
    options: TestSuiteAdapterOptions | None = None,
) -> list[AdaptedSpec]:
    """Build one AdaptedSpec per legacy test definition.

    Parameters
    ----------
    tests:
        Test definitions extracted from a TestSuite.
    suite_name:
        Name of the suite the tests came from.
    options:
        Adapter switches; defaults to ``TestSuiteAdapterOptions()``.

    Returns
    -------
    list[AdaptedSpec]
        Specs in the same order as ``tests``.
    """
    opts = options or TestSuiteAdapterOptions()
    adapted: list[AdaptedSpec] = []

    for test in tests:
        identifier = _generate_spec_id(test, suite_name, opts.preserve_ids)

        # Work on a copy so the caller's metadata dict is never mutated.
        meta: dict[str, Any] = dict(test.metadata)
        if opts.include_provenance:
            meta["source"] = "legacy"
            meta["legacy_suite_name"] = suite_name
            meta["legacy_test_id"] = test.id
            meta["original_input"] = test.input
            meta["original_expected"] = test.expected

        runner = _create_executor_from_test(test, opts.generate_helpers)

        adapted.append(
            AdaptedSpec(
                id=identifier,
                name=test.id,
                description=f"Legacy test: {test.id}",
                tags=["legacy", "migrated"],
                metadata=meta,
                executor=runner,
            )
        )

    return adapted
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def generate_define_eval_code(
    tests: list[TestDefinition],
    suite_name: str = "legacy-suite",
    options: TestSuiteAdapterOptions | None = None,
) -> str:
    """Generate define_eval() Python code from TestSuite data.

    For every test a stub ``_eval_<i>`` coroutine is emitted, followed by the
    ``define_eval`` call that registers it. The definition comes first so
    that importing the generated module does not hit a NameError.

    Parameters
    ----------
    tests:
        Test definitions extracted from a TestSuite.
    suite_name:
        Name of the original suite; appears in the header comment.
    options:
        Adapter switches, forwarded to ``adapt_test_suite``.

    Returns
    -------
    str
        Generated Python source code.
    """
    tests = list(tests)  # iterated twice below
    specs = adapt_test_suite(tests, suite_name, options)
    now = datetime.now(timezone.utc).isoformat()
    # Collapse line breaks so the suite name cannot escape the header comment.
    header_name = " ".join(str(suite_name).splitlines())

    lines = [
        f"# Auto-generated from TestSuite: {header_name}",
        f"# Generated at: {now}",
        "# This file replaces the legacy TestSuite with define_eval() specifications",
        "",
        "from evalgate_sdk import define_eval, create_result",
        "",
    ]

    for i, (test, spec) in enumerate(zip(tests, specs)):
        # Read the originals from the test itself: spec.metadata only carries
        # them when provenance is enabled, and the generated comparison must
        # not silently lose the expected value.
        original_input = repr(test.input)
        original_expected = repr(test.expected)
        # repr() produces properly escaped literals for the name and options.
        spec_name = repr(spec.name)
        spec_options = repr({"description": spec.description, "tags": spec.tags})

        lines.extend(
            [
                f"async def _eval_{i}(ctx):",
                f"    # Original input: {original_input}",
                f"    # Original expected: {original_expected}",
                "    input_text = ctx.input",
                "",
                "    # TODO: Replace with your actual agent/LLM call",
                '    output = f"Response to: {input_text}"',
                "",
                "    # Legacy evaluation logic",
                f"    expected = {original_expected}",
                "    if expected is not None:",
                "        passed = output == expected",
                "    else:",
                "        passed = len(output) > 0",
                "    return create_result(passed=passed, score=100 if passed else 0, output=output)",
                "",
                "",
                f"define_eval({spec_name}, _eval_{i}, options={spec_options})",
                "",
            ]
        )

    return "\n".join(lines)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _generate_spec_id(
|
|
165
|
+
test: TestDefinition,
|
|
166
|
+
suite_name: str,
|
|
167
|
+
preserve_ids: bool,
|
|
168
|
+
) -> str:
|
|
169
|
+
"""Generate specification ID for legacy test."""
|
|
170
|
+
import re
|
|
171
|
+
|
|
172
|
+
if preserve_ids and test.id and not test.id.startswith("case-"):
|
|
173
|
+
cleaned = re.sub(r"[^a-zA-Z0-9_-]", "_", test.id)[:20]
|
|
174
|
+
return cleaned
|
|
175
|
+
|
|
176
|
+
content = f"{suite_name}|{test.id}|{test.input}|{test.expected or ''}"
|
|
177
|
+
raw = base64.b64encode(content.encode("utf-8")).decode("ascii")
|
|
178
|
+
cleaned = raw.replace("+", "").replace("/", "").replace("=", "")[:20].lower()
|
|
179
|
+
return cleaned
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _create_executor_from_test(
    test: TestDefinition,
    generate_helpers: bool,
) -> Callable[..., Any]:
    """Build the async executor for one legacy test case.

    The returned coroutine function takes a context, reads its ``input``
    attribute (falling back to ``str(ctx)``), and returns a ``create_result``
    value. Without an expected value on the test the result is a failure with
    an explanatory error; otherwise the input is echoed as output and compared
    to the expected value for exact equality.

    ``generate_helpers`` is accepted but not consulted here.
    """

    async def executor(ctx: Any) -> Any:
        input_text = getattr(ctx, "input", str(ctx))

        # Nothing to compare against: report a failure.
        if test.expected is None:
            return create_result(
                passed=False,
                score=0,
                error="No executor or expected output available for legacy test",
            )

        # Default stub: echo the input. A real migration replaces this with
        # the caller's agent call.
        output = input_text
        exact_match = output == test.expected
        provenance = {
            "test_case_id": test.id,
            "original_input": test.input,
            "original_expected": test.expected,
        }
        return create_result(
            passed=exact_match,
            score=100 if exact_match else 0,
            output=output,
            metadata=provenance,
        )

    return executor
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""EvalGate Runtime Context — Layer 1 Foundation.
|
|
2
|
+
|
|
3
|
+
Execution context management for specifications.
|
|
4
|
+
Port of ``runtime/context.ts``.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def create_runtime_context(
    input: str,
    metadata: dict[str, Any] | None = None,
    options: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build a fresh execution context dict.

    The context has exactly three keys: ``"input"``, ``"metadata"`` (an empty
    dict when none or an empty one is given) and ``"options"`` (stored as
    passed, possibly None).
    """
    context: dict[str, Any] = {"input": input}
    context["metadata"] = metadata if metadata else {}
    context["options"] = options
    return context
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def merge_runtime_contexts(
    base: dict[str, Any],
    *overrides: dict[str, Any],
) -> dict[str, Any]:
    """Fold override contexts onto a base context, later ones winning.

    For each override: a truthy ``input`` replaces the current one, metadata
    dicts are merged key by key, and options are merged only when the
    override supplies a non-empty options dict.

    Raises
    ------
    ValueError
        If ``base`` has no truthy ``"input"`` value.
    """
    if not base.get("input"):
        raise ValueError("Base context must have a valid input")

    merged = dict(base)
    for override in overrides:
        current_options = merged.get("options")
        if override.get("options"):
            next_options = {**(current_options or {}), **(override.get("options") or {})}
        else:
            # No usable override options: carry the existing value through.
            next_options = current_options

        next_metadata = {**(merged.get("metadata") or {}), **(override.get("metadata") or {})}
        next_input = override.get("input") or merged.get("input")

        merged = {
            "input": next_input,
            "metadata": next_metadata,
            "options": next_options,
        }
    return merged
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def clone_runtime_context(context: dict[str, Any]) -> dict[str, Any]:
    """Clone a context for safe modification.

    Returns a new dict with the ``"input"`` value plus shallow copies of the
    ``"metadata"`` and ``"options"`` dicts, so adding or removing keys on the
    clone leaves the source untouched. Nested values are shared, not copied.

    Missing or None metadata becomes an empty dict. Options stay None only
    when they are missing or None; an empty options dict is cloned as an
    empty dict so the clone compares equal to its source.
    """
    options = context.get("options")
    return {
        "input": context.get("input"),
        "metadata": dict(context.get("metadata") or {}),
        # Test against None, not truthiness: {} is a valid options value.
        "options": dict(options) if options is not None else None,
    }
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def validate_runtime_context(context: Any) -> None:
    """Check that a context has the expected structure.

    Raises
    ------
    TypeError
        If ``context`` is not a dict, its ``"input"`` is not a string, or a
        non-None ``"metadata"`` / ``"options"`` value is not a dict.
    """
    if not isinstance(context, dict):
        raise TypeError("Context must be a dict")

    if not isinstance(context.get("input"), str):
        raise TypeError("Context input must be a string")

    # Optional sections: absent or None is fine, anything else must be a dict.
    optional_sections = (
        ("metadata", "Context metadata must be a dict"),
        ("options", "Context options must be a dict"),
    )
    for key, message in optional_sections:
        value = context.get(key)
        if value is None or isinstance(value, dict):
            continue
        raise TypeError(message)
|