evalgate-sdk 3.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalgate_sdk/__init__.py +707 -0
- evalgate_sdk/_version.py +3 -0
- evalgate_sdk/assertions.py +1362 -0
- evalgate_sdk/auto.py +247 -0
- evalgate_sdk/batch.py +174 -0
- evalgate_sdk/cache.py +111 -0
- evalgate_sdk/ci_context.py +123 -0
- evalgate_sdk/cli/__init__.py +111 -0
- evalgate_sdk/cli/api.py +261 -0
- evalgate_sdk/cli/cli_constants.py +20 -0
- evalgate_sdk/cli/commands.py +1041 -0
- evalgate_sdk/cli/config.py +228 -0
- evalgate_sdk/cli/env.py +43 -0
- evalgate_sdk/cli/formatters/types.py +132 -0
- evalgate_sdk/cli/golden_commands.py +322 -0
- evalgate_sdk/cli/manifest.py +301 -0
- evalgate_sdk/cli/new_commands.py +435 -0
- evalgate_sdk/cli/policy_packs.py +103 -0
- evalgate_sdk/cli/profiles.py +12 -0
- evalgate_sdk/cli/regression_gate.py +312 -0
- evalgate_sdk/cli/render/__init__.py +1 -0
- evalgate_sdk/cli/render/snippet.py +18 -0
- evalgate_sdk/cli/render/sort.py +29 -0
- evalgate_sdk/cli/report/__init__.py +1 -0
- evalgate_sdk/cli/report/build_check_report.py +209 -0
- evalgate_sdk/cli/traces.py +186 -0
- evalgate_sdk/cli/workspace.py +63 -0
- evalgate_sdk/client.py +609 -0
- evalgate_sdk/cluster.py +359 -0
- evalgate_sdk/collector.py +161 -0
- evalgate_sdk/constants.py +6 -0
- evalgate_sdk/context.py +151 -0
- evalgate_sdk/errors.py +236 -0
- evalgate_sdk/export.py +238 -0
- evalgate_sdk/formatters/__init__.py +11 -0
- evalgate_sdk/formatters/github.py +51 -0
- evalgate_sdk/formatters/human.py +68 -0
- evalgate_sdk/formatters/json_fmt.py +11 -0
- evalgate_sdk/formatters/pr_comment.py +80 -0
- evalgate_sdk/golden.py +426 -0
- evalgate_sdk/integrations/__init__.py +1 -0
- evalgate_sdk/integrations/anthropic.py +99 -0
- evalgate_sdk/integrations/autogen.py +62 -0
- evalgate_sdk/integrations/crewai.py +61 -0
- evalgate_sdk/integrations/langchain.py +100 -0
- evalgate_sdk/integrations/openai.py +155 -0
- evalgate_sdk/integrations/openai_eval.py +221 -0
- evalgate_sdk/local.py +144 -0
- evalgate_sdk/logger.py +123 -0
- evalgate_sdk/matchers.py +62 -0
- evalgate_sdk/otel.py +256 -0
- evalgate_sdk/pagination.py +145 -0
- evalgate_sdk/py.typed +0 -0
- evalgate_sdk/pytest_plugin.py +96 -0
- evalgate_sdk/reason_codes.py +103 -0
- evalgate_sdk/regression.py +196 -0
- evalgate_sdk/replay_decision.py +115 -0
- evalgate_sdk/runtime/__init__.py +50 -0
- evalgate_sdk/runtime/adapters/__init__.py +1 -0
- evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
- evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
- evalgate_sdk/runtime/context.py +68 -0
- evalgate_sdk/runtime/eval.py +318 -0
- evalgate_sdk/runtime/execution_mode.py +170 -0
- evalgate_sdk/runtime/executor.py +92 -0
- evalgate_sdk/runtime/registry.py +125 -0
- evalgate_sdk/runtime/run_report.py +249 -0
- evalgate_sdk/runtime/types.py +143 -0
- evalgate_sdk/snapshot.py +219 -0
- evalgate_sdk/streaming.py +124 -0
- evalgate_sdk/synthesize.py +226 -0
- evalgate_sdk/testing.py +128 -0
- evalgate_sdk/types.py +666 -0
- evalgate_sdk/utils/__init__.py +1 -0
- evalgate_sdk/utils/input_hash.py +42 -0
- evalgate_sdk/workflows.py +264 -0
- evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
- evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
- evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
- evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
"""defineEval DSL — the primary API for declaring evaluation specs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
import hashlib
|
|
7
|
+
import inspect
|
|
8
|
+
import io
|
|
9
|
+
import json
|
|
10
|
+
import os
|
|
11
|
+
import re
|
|
12
|
+
from collections.abc import Callable
|
|
13
|
+
from typing import Any, Literal
|
|
14
|
+
|
|
15
|
+
from evalgate_sdk.runtime.registry import get_active_runtime
|
|
16
|
+
from evalgate_sdk.runtime.types import (
|
|
17
|
+
EvalContext,
|
|
18
|
+
EvalResult,
|
|
19
|
+
EvalSpec,
|
|
20
|
+
SpecConfig,
|
|
21
|
+
SpecOptions,
|
|
22
|
+
SpecRegistrationError,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
_NAME_PATTERN = re.compile(r"^[\w\s\-]{1,100}$")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _generate_spec_id(name: str, file_path: str | None = None) -> str:
|
|
29
|
+
"""Generate a content-addressable spec ID."""
|
|
30
|
+
source = name
|
|
31
|
+
if file_path is None:
|
|
32
|
+
frame = inspect.stack()
|
|
33
|
+
for f in frame[1:]:
|
|
34
|
+
if "evalgate_sdk" not in f.filename:
|
|
35
|
+
file_path = f"{f.filename}:{f.lineno}"
|
|
36
|
+
break
|
|
37
|
+
if file_path:
|
|
38
|
+
source = f"{file_path}:{name}"
|
|
39
|
+
return hashlib.sha256(source.encode()).hexdigest()[:16]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _validate_name(name: str) -> None:
    """Reject spec names that do not satisfy ``_NAME_PATTERN``.

    Args:
        name: Candidate spec name.

    Raises:
        SpecRegistrationError: If ``name`` is not 1-100 characters drawn from
            the character class accepted by ``_NAME_PATTERN``.
    """
    # fullmatch() is required here: with match(), the pattern's trailing "$"
    # also matches just before a final newline, so a 100-character name
    # followed by "\n" (101 characters) would slip past the length limit.
    if not _NAME_PATTERN.fullmatch(name):
        raise SpecRegistrationError(
            f"Invalid spec name '{name}': must be 1-100 chars, alphanumeric/hyphens/underscores"
        )
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def define_eval(
    name_or_config: Any = None,
    executor: Callable[..., Any] | None = None,
    *,
    name: str | None = None,
    options: SpecOptions | None = None,
    description: str | None = None,
    suite: str | None = None,
    tags: list[str] | None = None,
    timeout_ms: int = 30_000,
) -> EvalSpec | None:
    """Register an eval spec with the active runtime.

    Can be called as::

        # Positional style
        define_eval("my-test", my_executor)

        # Config style
        define_eval(SpecConfig(name="my-test", executor=my_executor))

        # Decorator style
        @define_eval(name="my-test")
        async def my_test(ctx):
            ...

        @define_eval("my-test")
        async def my_test(ctx):
            ...

        @define_eval
        async def my_test(ctx):
            ...

    Args:
        name_or_config: A spec name, a ``SpecConfig``, or (bare decorator
            form) the executor function itself.
        executor: The callable to run for this spec. When it is omitted for a
            named spec, a decorator is returned instead of registering a
            spec without an executor.
        name: Spec name (keyword form). Defaults to the decorated function's
            ``__name__`` in decorator mode.
        options: Explicit ``SpecOptions``; when given, ``tags`` and
            ``timeout_ms`` are not used.
        description: Optional description stored on the spec.
        suite: Optional suite name stored on the spec.
        tags: Tags used to build ``SpecOptions`` when ``options`` is not given.
        timeout_ms: Timeout used to build ``SpecOptions`` when ``options`` is
            not given.

    Returns:
        The created ``EvalSpec`` for direct calls. In decorator mode, either
        the decorator (called with arguments) or the original function (bare
        ``@define_eval``).

    Raises:
        SpecRegistrationError: If the spec name is invalid.
    """
    if isinstance(name_or_config, SpecConfig):
        cfg = name_or_config
        spec_name = cfg.name
        spec_executor = cfg.executor
        spec_options = cfg.options
        spec_desc = cfg.description
        spec_suite = cfg.suite
    else:
        if isinstance(name_or_config, str):
            spec_name, spec_executor = name_or_config, executor
        elif name_or_config is None:
            spec_name, spec_executor = name, executor
        elif callable(name_or_config):
            spec_name, spec_executor = name, name_or_config
        else:
            spec_name, spec_executor = name, None

        if spec_name is None or spec_executor is None:
            # Decorator mode. A named call without an executor, such as
            # ``@define_eval(name="x")`` or ``@define_eval("x")``, must land
            # here; registering immediately would create a spec whose
            # executor is None and hand back an EvalSpec where a decorator
            # is expected.
            def decorator(fn: Callable[..., Any]) -> Callable[..., Any]:
                define_eval(
                    fn,
                    name=spec_name or fn.__name__,
                    options=options,
                    description=description,
                    suite=suite,
                    tags=tags,
                    timeout_ms=timeout_ms,
                )
                return fn

            if callable(name_or_config):
                # Bare ``@define_eval``: register right away, return the function.
                return decorator(name_or_config)  # type: ignore[return-value]
            return decorator  # type: ignore[return-value]

        spec_options = options or SpecOptions(timeout_ms=timeout_ms, tags=tags or [])
        spec_desc = description
        spec_suite = suite

    _validate_name(spec_name)
    spec_id = _generate_spec_id(spec_name)

    spec = EvalSpec(
        id=spec_id,
        name=spec_name,
        executor=spec_executor,
        options=spec_options,
        suite=spec_suite,
        description=spec_desc,
    )

    # Registration is skipped when no runtime is active; the spec is still returned.
    runtime = get_active_runtime()
    if runtime is not None:
        runtime.register(spec)

    return spec
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _define_eval_with_mode(
    mode: Literal["normal", "skip", "only"],
    name_or_config: Any = None,
    executor: Callable[..., Any] | None = None,
    *,
    name: str | None = None,
    options: SpecOptions | None = None,
    description: str | None = None,
    suite: str | None = None,
    tags: list[str] | None = None,
    timeout_ms: int = 30_000,
) -> EvalSpec | None:
    """Internal: register a spec via ``define_eval`` and stamp ``mode`` on it.

    Decorator forms are handled here rather than delegated, because in those
    forms ``define_eval`` returns a function instead of the ``EvalSpec`` and
    the mode would otherwise be silently dropped.

    Args:
        mode: Value assigned to ``spec.mode`` after registration.
        name_or_config: Same meaning as in ``define_eval``.
        executor: Same meaning as in ``define_eval``.
        name, options, description, suite, tags, timeout_ms: Forwarded to
            ``define_eval`` unchanged.

    Returns:
        The ``EvalSpec`` for direct calls; a decorator or the decorated
        function for decorator usage.
    """
    shared: dict[str, Any] = {
        "options": options,
        "description": description,
        "suite": suite,
        "tags": tags,
        "timeout_ms": timeout_ms,
    }

    def register(fn: Callable[..., Any], spec_name: str) -> Callable[..., Any]:
        # A callable plus an explicit name always takes define_eval's direct
        # registration path, so an EvalSpec comes back and can be tagged.
        created = define_eval(fn, name=spec_name, **shared)
        if isinstance(created, EvalSpec):
            created.mode = mode
        return fn

    if not isinstance(name_or_config, SpecConfig):
        if callable(name_or_config) and name is None:
            # Bare decorator form: ``@define_eval.skip``.
            return register(name_or_config, name_or_config.__name__)  # type: ignore[return-value]

        named_without_executor = isinstance(name_or_config, str) and executor is None
        keyword_decorator = name_or_config is None and (name is None or executor is None)
        if named_without_executor or keyword_decorator:
            pending_name = name_or_config if isinstance(name_or_config, str) else name

            def decorator(fn: Callable[..., Any]) -> Callable[..., Any]:
                return register(fn, pending_name or fn.__name__)

            return decorator  # type: ignore[return-value]

    spec = define_eval(name_or_config, executor, name=name, **shared)
    if isinstance(spec, EvalSpec):
        spec.mode = mode
    return spec
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def define_eval_skip(
    name_or_config: Any = None,
    executor: Callable[..., Any] | None = None,
    **kwargs: Any,
) -> EvalSpec | None:
    """Register a spec marked with mode ``"skip"`` (vitest/jest ``.skip`` convention).

    Positional and keyword arguments are forwarded unchanged to
    ``_define_eval_with_mode``.
    """
    skipped = _define_eval_with_mode("skip", name_or_config, executor, **kwargs)
    return skipped
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def define_eval_only(
    name_or_config: Any = None,
    executor: Callable[..., Any] | None = None,
    **kwargs: Any,
) -> EvalSpec | None:
    """Register a spec marked with mode ``"only"`` (vitest/jest ``.only`` convention).

    Positional and keyword arguments are forwarded unchanged to
    ``_define_eval_with_mode``.
    """
    exclusive = _define_eval_with_mode("only", name_or_config, executor, **kwargs)
    return exclusive
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
class _EvalAI:
    """Small namespace object exposing ``define_eval`` under the name ``test``."""

    # staticmethod keeps ``evalai.test(...)`` from receiving the instance.
    test = staticmethod(define_eval)


# Module-level instance, used as ``evalai.test(...)``.
evalai = _EvalAI()

# Hang the skip/only variants off define_eval itself so call sites can write
# define_eval.skip(...) / define_eval.only(...), mirroring the TS SDK's
# defineEval.skip() / defineEval.only().
define_eval.skip = define_eval_skip  # type: ignore[attr-defined]
define_eval.only = define_eval_only  # type: ignore[attr-defined]
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
# ── skip/only filtering ──────────────────────────────────────────────
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def get_filtered_specs(specs: list[EvalSpec]) -> list[EvalSpec]:
    """Select the specs that should run, honouring skip/only markers.

    Args:
        specs: Candidate specs; each exposes a ``mode`` attribute.

    Returns:
        Every spec whose mode is ``"only"`` when at least one exists;
        otherwise every spec whose mode is not ``"skip"``. Input order is kept.
    """
    exclusive = list(filter(lambda spec: spec.mode == "only", specs))
    if not exclusive:
        return [spec for spec in specs if not spec.mode == "skip"]
    return exclusive
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
# ── from_dataset ─────────────────────────────────────────────────────
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _parse_jsonl(content: str) -> list[dict[str, Any]]:
|
|
215
|
+
rows: list[dict[str, Any]] = []
|
|
216
|
+
for i, line in enumerate(content.splitlines()):
|
|
217
|
+
line = line.strip()
|
|
218
|
+
if not line:
|
|
219
|
+
continue
|
|
220
|
+
try:
|
|
221
|
+
rows.append(json.loads(line))
|
|
222
|
+
except json.JSONDecodeError as exc:
|
|
223
|
+
raise SpecRegistrationError(f"Invalid JSON on line {i + 1} of dataset: {exc}") from exc
|
|
224
|
+
return rows
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _parse_csv(content: str) -> list[dict[str, Any]]:
|
|
228
|
+
reader = csv.DictReader(io.StringIO(content))
|
|
229
|
+
return [dict(row) for row in reader]
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def from_dataset(
    name: str,
    dataset_path: str,
    executor: Callable[..., Any],
    options: SpecOptions | None = None,
) -> None:
    """Load a JSONL, CSV, or JSON dataset and register one spec per row.

    Each row is assigned to ``context.input`` before the executor runs. The
    executor may be a coroutine function or a plain function.

    Example::

        from_dataset("rag-accuracy", "./evals/golden.jsonl", my_executor)

    Args:
        name: Base name; each spec is registered as ``"<name> - row <n>"``.
        dataset_path: Path to a ``.jsonl``, ``.ndjson``, ``.csv`` or ``.json``
            file.
        executor: Callable invoked with the eval context for every row.
        options: Optional base options. ``timeout_ms``, ``tags`` and
            ``metadata`` are copied into each row's options.

    Raises:
        SpecRegistrationError: If the file is missing, has an unsupported
            extension, contains an invalid JSONL line, or yields no rows.
    """
    resolved = os.path.abspath(dataset_path)
    if not os.path.isfile(resolved):
        raise SpecRegistrationError(f"Dataset file not found: {resolved}")

    with open(resolved, encoding="utf-8") as f:
        content = f.read()

    ext = os.path.splitext(resolved)[1].lower()
    if ext in (".jsonl", ".ndjson"):
        rows = _parse_jsonl(content)
    elif ext == ".csv":
        rows = _parse_csv(content)
    elif ext == ".json":
        parsed = json.loads(content)
        # A single top-level object is treated as a one-row dataset.
        rows = parsed if isinstance(parsed, list) else [parsed]
    else:
        raise SpecRegistrationError(f"Unsupported dataset format: {ext}. Use .jsonl, .ndjson, .csv, or .json")

    if not rows:
        raise SpecRegistrationError(f"Dataset is empty: {resolved}")

    def _make_wrapper(r: dict[str, Any]) -> Callable[..., Any]:
        # Factory so that each wrapper closes over its own row.
        async def wrapper(ctx: EvalContext) -> EvalResult:
            ctx.input = r
            outcome = executor(ctx)
            # Synchronous executors return their result directly; awaiting
            # that value would raise TypeError, so only await real awaitables.
            if inspect.isawaitable(outcome):
                outcome = await outcome
            return outcome

        return wrapper

    base_timeout = options.timeout_ms if options else 30_000
    base_tags = options.tags if options and options.tags else []
    base_metadata = (options.metadata or {}) if options else {}

    for i, row in enumerate(rows):
        row_name = f"{name} - row {i + 1}"
        # NOTE(review): only timeout_ms, tags and metadata are carried over;
        # any other SpecOptions field (e.g. retries) falls back to its
        # default here. Confirm that is intended.
        row_options = SpecOptions(
            timeout_ms=base_timeout,
            tags=list(base_tags),
            metadata={
                **base_metadata,
                "dataset_path": resolved,
                "dataset_row": i + 1,
            },
        )
        define_eval(row_name, _make_wrapper(row), options=row_options)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def define_suite(name: str, specs: list[Callable[[], None]]) -> None:
    """Group multiple define_eval calls into a named suite.

    Each callable in ``specs`` is invoked in order. Specs that appear in the
    active runtime as a result, and that did not set a suite themselves, are
    assigned ``name`` as their suite. Without an active runtime the callables
    are still invoked but nothing can be tagged.

    Args:
        name: Suite name applied to newly registered specs.
        specs: Zero-argument callables that register specs when called.
    """
    runtime = get_active_runtime()
    # Remember what was registered beforehand so only new specs get tagged.
    known_ids = {spec.id for spec in runtime.list()} if runtime is not None else set()

    for spec_fn in specs:
        spec_fn()

    if runtime is None:
        return
    for spec in runtime.list():
        # An explicit suite chosen by the spec itself is left untouched.
        if spec.id not in known_ids and spec.suite is None:
            spec.suite = name
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def create_result(
    *,
    passed: bool,
    score: float = 0.0,
    assertions: list[Any] | None = None,
    metadata: dict[str, Any] | None = None,
    error: str | None = None,
    output: str | None = None,
    tokens: int | None = None,
    duration_ms: float | None = None,
) -> EvalResult:
    """Create an evaluation result.

    Args:
        passed: Whether the evaluation passed.
        score: Score, clamped into ``[0.0, 100.0]``; NaN is treated as ``0.0``.
        assertions: Assertion records; defaults to an empty list.
        metadata: Extra metadata; defaults to an empty dict.
        error: Optional error message.
        output: Optional output text.
        tokens: Optional token count.
        duration_ms: Duration in milliseconds; ``None`` becomes ``0.0``.

    Returns:
        An ``EvalResult`` whose status is ``"passed"`` when ``passed`` is
        true, otherwise ``"error"`` if ``error`` is set, else ``"failed"``.
    """
    if score != score:
        # NaN is the only value unequal to itself. Every comparison against
        # it is false, so min()/max() would silently turn it into 100.0.
        clamped_score = 0.0
    else:
        clamped_score = max(0.0, min(100.0, score))
    return EvalResult(
        passed=passed,
        score=clamped_score,
        assertions=assertions or [],
        metadata=metadata or {},
        error=error,
        status="passed" if passed else ("error" if error else "failed"),
        output=output,
        tokens=tokens,
        duration_ms=duration_ms or 0.0,
    )
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""Dual-path execution mode detection (T8).
|
|
2
|
+
|
|
3
|
+
Port of the TypeScript SDK's ``execution-mode.ts``.
|
|
4
|
+
Environment flag ``EVALGATE_RUNTIME=legacy|spec|auto``.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Literal
|
|
13
|
+
|
|
14
|
+
# The three runtime selections understood by this module.
ExecutionMode = Literal["legacy", "spec", "auto"]

# Environment variable consulted by get_execution_mode().
ENV_VAR = "EVALGATE_RUNTIME"
# Values of ENV_VAR that are honoured; anything else triggers auto-detection.
POSSIBLE_VALUES = ("legacy", "spec", "auto")
DEFAULT_MODE: ExecutionMode = "auto"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ExecutionModeConfig:
    """Result of execution-mode detection for one project."""

    # Selected mode: "legacy", "spec" or "auto".
    mode: ExecutionMode = "auto"
    # Whether the spec / legacy paths are considered available.
    has_spec_runtime: bool = False
    has_legacy_runtime: bool = False
    # Directory the detection ran against.
    project_root: str = ""
    # Paths of discovered spec files (fresh list per instance).
    spec_files: list[str] = field(default_factory=list)
    # Path of the discovered legacy config file, if any.
    legacy_config: str | None = None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def find_spec_files(project_root: str) -> list[str]:
    """Locate Python files under well-known directories that mention ``define_eval``.

    Searches ``eval/``, ``evals/``, ``src/``, ``tests/`` and ``spec/``
    recursively. Files whose name starts with ``_`` are skipped, as are files
    that cannot be read.

    Args:
        project_root: Directory to search from.

    Returns:
        Paths (as strings) of files whose text contains ``"define_eval"``.
    """
    base = Path(project_root)
    matches: list[str] = []

    for top_dir in ("eval", "evals", "src", "tests", "spec"):
        for candidate in base.glob(f"{top_dir}/**/*.py"):
            if candidate.name.startswith("_"):
                continue
            try:
                text = candidate.read_text(encoding="utf-8", errors="ignore")
            except OSError:
                # Unreadable entry; move on to the next file.
                continue
            if "define_eval" in text:
                matches.append(str(candidate))

    return matches
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def find_legacy_config(project_root: str) -> str | None:
    """Return the path of the first known legacy config file under ``project_root``.

    Candidates are checked in a fixed priority order; ``None`` is returned
    when none of them exists.
    """
    base = Path(project_root)
    ordered_names = (
        "evalai.config.json",
        "evalai.config.py",
        "evalgate.config.json",
        ".evalgaterc",
        ".evalgaterc.json",
        ".evalgate/config.json",
        ".evalai/config.json",
    )
    locations = (base / candidate for candidate in ordered_names)
    return next((str(location) for location in locations if location.exists()), None)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def get_execution_mode(project_root: str | None = None) -> ExecutionModeConfig:
    """Resolve the execution mode from ``EVALGATE_RUNTIME`` or by inspecting the project.

    Args:
        project_root: Directory to inspect; defaults to the current working
            directory.

    Returns:
        An ``ExecutionModeConfig``. When the environment variable holds one of
        ``POSSIBLE_VALUES`` (case-insensitive), that value is used as the
        mode and the runtime flags are derived from it. Otherwise the mode is
        ``"spec"`` if spec files were found, else ``"legacy"`` if a legacy
        config was found, else ``"auto"``.
    """
    root = project_root or os.getcwd()
    requested = os.environ.get(ENV_VAR, "").lower()

    if requested not in POSSIBLE_VALUES:
        # Auto-detect from what exists on disk; spec files take precedence.
        detected_specs = find_spec_files(root)
        detected_config = find_legacy_config(root)
        spec_available = len(detected_specs) > 0
        legacy_available = detected_config is not None

        resolved: ExecutionMode
        if spec_available:
            resolved = "spec"
        elif legacy_available:
            resolved = "legacy"
        else:
            resolved = "auto"

        return ExecutionModeConfig(
            mode=resolved,
            has_spec_runtime=spec_available,
            has_legacy_runtime=legacy_available,
            project_root=root,
            spec_files=detected_specs,
            legacy_config=detected_config,
        )

    # Explicit selection: only scan for the side(s) the chosen mode can use.
    wants_spec = requested != "legacy"
    wants_legacy = requested != "spec"
    return ExecutionModeConfig(
        mode=requested,  # type: ignore[arg-type]
        has_spec_runtime=wants_spec,
        has_legacy_runtime=wants_legacy,
        project_root=root,
        spec_files=find_spec_files(root) if wants_spec else [],
        legacy_config=find_legacy_config(root) if wants_legacy else None,
    )
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def can_run_spec_mode(config: ExecutionModeConfig) -> bool:
    """Tell whether spec mode is usable: the spec runtime flag is set and spec files exist."""
    runtime_flag = config.has_spec_runtime
    if not runtime_flag:
        return runtime_flag
    return len(config.spec_files) > 0
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def can_run_legacy_mode(config: ExecutionModeConfig) -> bool:
    """Tell whether legacy mode is usable: the legacy runtime flag is set and a config file was found."""
    runtime_flag = config.has_legacy_runtime
    if not runtime_flag:
        return runtime_flag
    return config.legacy_config is not None
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def get_recommended_mode(config: ExecutionModeConfig) -> ExecutionMode:
    """Pick the mode to run with.

    An explicit (non-``"auto"``) mode is returned as-is. For ``"auto"``, spec
    mode is preferred when runnable, then legacy mode; ``"auto"`` is returned
    when neither is runnable.
    """
    selected = config.mode
    if selected == "auto":
        if can_run_spec_mode(config):
            selected = "spec"
        elif can_run_legacy_mode(config):
            selected = "legacy"
    return selected
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def validate_execution_mode(config: ExecutionModeConfig) -> dict[str, list[str] | bool]:
    """Check a detected configuration for problems.

    Returns:
        A dict with ``"valid"`` (true when there are no errors),
        ``"warnings"`` and ``"errors"`` (lists of messages).
    """
    spec_flag = config.has_spec_runtime
    legacy_flag = config.has_legacy_runtime

    notices: list[str] = []
    if spec_flag and legacy_flag:
        notices.append(
            "Project contains both spec files and legacy config. Consider migrating legacy tests to spec format."
        )
    if not (spec_flag or legacy_flag):
        notices.append("No tests found. Use 'evalgate init' to create a new project.")

    problems: list[str] = []
    # An explicitly requested mode must actually be runnable.
    if config.mode == "spec" and not can_run_spec_mode(config):
        problems.append(
            "Spec mode requested but no spec files found. Create spec files with define_eval() or use legacy mode."
        )
    if config.mode == "legacy" and not can_run_legacy_mode(config):
        problems.append("Legacy mode requested but no config file found. Create a config file or use spec mode.")

    return {"valid": len(problems) == 0, "warnings": notices, "errors": problems}
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def has_execution_mode_env() -> bool:
    """Report whether the ``ENV_VAR`` environment variable is set, whatever its value."""
    is_set = ENV_VAR in os.environ
    return is_set
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def get_execution_mode_env() -> str | None:
    """Return the raw value of the ``ENV_VAR`` environment variable, or ``None`` when unset."""
    raw_value = os.environ.get(ENV_VAR)
    return raw_value
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def set_execution_mode_env(mode: ExecutionMode) -> None:
    """Store ``mode`` in the ``ENV_VAR`` environment variable of the current process."""
    os.environ[ENV_VAR] = mode
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def clear_execution_mode_env() -> None:
    """Remove the ``ENV_VAR`` environment variable; a no-op when it is not set."""
    os.environ.pop(ENV_VAR, None)
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Local executor — runs eval specs with timeout protection."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import time
|
|
7
|
+
|
|
8
|
+
from evalgate_sdk.runtime.types import (
|
|
9
|
+
EvalContext,
|
|
10
|
+
EvalResult,
|
|
11
|
+
EvalSpec,
|
|
12
|
+
ExecutorCapabilities,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class LocalExecutor:
    """Runs eval specs in-process, applying the spec's timeout and retry options."""

    def __init__(self) -> None:
        self.capabilities = ExecutorCapabilities(
            supports_async=True,
            supports_timeout=True,
            supports_retries=True,
            supports_parallel=False,
        )

    @staticmethod
    def _to_eval_result(raw: object, duration: float) -> EvalResult:
        """Normalise an executor's return value into an ``EvalResult``.

        ``EvalResult`` instances get their ``duration_ms`` overwritten; dicts
        are read via ``passed``/``pass``, ``score``, ``assertions``,
        ``metadata`` and ``error``; anything else is judged by truthiness.
        """
        if isinstance(raw, EvalResult):
            raw.duration_ms = duration
            return raw

        if isinstance(raw, dict):
            # "passed" wins over the "pass" alias; absent both, the run failed.
            verdict = raw.get("passed", raw.get("pass", False))
            return EvalResult(
                passed=verdict,
                score=raw.get("score", 1.0 if verdict else 0.0),
                assertions=raw.get("assertions", []),
                metadata=raw.get("metadata", {}),
                error=raw.get("error"),
                duration_ms=duration,
                status="passed" if verdict else "failed",
            )

        ok = bool(raw)
        return EvalResult(
            passed=ok,
            score=1.0 if ok else 0.0,
            duration_ms=duration,
            status="passed" if ok else "failed",
        )

    async def execute(self, spec: EvalSpec, context: EvalContext) -> EvalResult:
        """Run ``spec.executor(context)`` and return its normalised result.

        Awaitable results are awaited under ``spec.options.timeout_ms``. A
        timeout or exception triggers another attempt while retries remain;
        the last failure is reported with status ``"timeout"`` or ``"error"``.
        The reported duration is measured from the start of the first attempt.
        """
        timeout_s = spec.options.timeout_ms / 1000
        started = time.monotonic()

        for attempt in range(1 + spec.options.retries):
            try:
                outcome = spec.executor(context)
                if asyncio.iscoroutine(outcome) or asyncio.isfuture(outcome) or hasattr(outcome, "__await__"):
                    outcome = await asyncio.wait_for(outcome, timeout=timeout_s)

                elapsed = (time.monotonic() - started) * 1000
                return self._to_eval_result(outcome, elapsed)

            except asyncio.TimeoutError:
                elapsed = (time.monotonic() - started) * 1000
                if attempt >= spec.options.retries:
                    return EvalResult(
                        passed=False,
                        score=0.0,
                        duration_ms=elapsed,
                        error=f"Timeout after {spec.options.timeout_ms}ms",
                        status="timeout",
                    )
            except Exception as exc:
                elapsed = (time.monotonic() - started) * 1000
                if attempt >= spec.options.retries:
                    return EvalResult(
                        passed=False,
                        score=0.0,
                        duration_ms=elapsed,
                        error=str(exc),
                        status="error",
                    )

        # Only reachable when the loop body never ran.
        return EvalResult(passed=False, score=0.0, error="Max retries exceeded", status="error")
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def create_local_executor() -> LocalExecutor:
    """Build and return a new ``LocalExecutor`` instance."""
    fresh_executor = LocalExecutor()
    return fresh_executor


# Shared module-level instance, created at import time.
default_local_executor = LocalExecutor()
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Runtime registry — manages eval spec lifecycle and lookup."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import builtins
|
|
6
|
+
import hashlib
|
|
7
|
+
import time
|
|
8
|
+
from collections.abc import Callable
|
|
9
|
+
from typing import Any, TypeVar
|
|
10
|
+
|
|
11
|
+
from evalgate_sdk.runtime.types import (
|
|
12
|
+
EvalSpec,
|
|
13
|
+
RuntimeHealth,
|
|
14
|
+
SpecRegistrationError,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
T = TypeVar("T")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class EvalRuntime:
    """In-memory store of eval specs, keyed by spec ID."""

    def __init__(self, namespace: str = "default") -> None:
        self._namespace = namespace
        self._specs: dict[str, EvalSpec] = {}
        # Monotonic start time, used for the uptime reported by get_health().
        self._started_at = time.monotonic()

    @property
    def namespace(self) -> str:
        """Namespace label given at construction."""
        return self._namespace

    def register(self, spec: EvalSpec) -> None:
        """Add ``spec`` to the registry.

        Raises:
            SpecRegistrationError: If a spec with the same ID is already present.
        """
        key = spec.id
        if key in self._specs:
            raise SpecRegistrationError(f"Spec '{key}' already registered")
        self._specs[key] = spec

    def get(self, spec_id: str) -> EvalSpec | None:
        """Look up a spec by ID; ``None`` when unknown."""
        return self._specs.get(spec_id)

    def list(self, *, suite: str | None = None, tags: builtins.list[str] | None = None) -> builtins.list[EvalSpec]:
        """Return registered specs, optionally filtered.

        Args:
            suite: When not ``None``, keep only specs whose suite equals it.
            tags: When non-empty, keep only specs carrying every listed tag.
        """
        required = set(tags) if tags else None
        selected: builtins.list[EvalSpec] = []
        for spec in self._specs.values():
            if suite is not None and not spec.suite == suite:
                continue
            if required is not None and not required.issubset(set(spec.options.tags)):
                continue
            selected.append(spec)
        return selected

    def find(self, pattern: str) -> builtins.list[EvalSpec]:
        """Return specs whose name or ID contains ``pattern`` as a substring."""
        hits: builtins.list[EvalSpec] = []
        for spec in self._specs.values():
            if pattern in spec.name or pattern in spec.id:
                hits.append(spec)
        return hits

    def clear(self) -> None:
        """Remove every registered spec."""
        self._specs.clear()

    def get_health(self) -> RuntimeHealth:
        """Report spec count, a rough memory estimate (0.001 MB per spec) and uptime."""
        count = len(self._specs)
        uptime = (time.monotonic() - self._started_at) * 1000
        return RuntimeHealth(
            status="healthy",
            spec_count=count,
            memory_estimate_mb=count * 0.001,
            uptime_ms=uptime,
        )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class RuntimeHandle:
    """Thin wrapper around an ``EvalRuntime`` offering register/dispose/snapshot."""

    def __init__(self, runtime: EvalRuntime) -> None:
        self.runtime = runtime

    def define_eval(self, spec: EvalSpec) -> None:
        """Register ``spec`` with the wrapped runtime."""
        self.runtime.register(spec)

    def dispose(self) -> None:
        """Clear every spec from the wrapped runtime."""
        self.runtime.clear()

    def snapshot(self) -> dict[str, Any]:
        """Return the namespace plus id/name/suite/tags for each registered spec."""
        summaries = []
        for spec in self.runtime.list():
            summaries.append(
                {"id": spec.id, "name": spec.name, "suite": spec.suite, "tags": spec.options.tags}
            )
        return {"namespace": self.runtime.namespace, "specs": summaries}

    def load(self, data: dict[str, Any]) -> None:
        """Accept snapshot data; currently does nothing with it."""
        return None
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# Module-wide runtime returned by get_active_runtime(); None until one is
# created or set.
_active_runtime: EvalRuntime | None = None


def create_eval_runtime(project_root: str | None = None) -> RuntimeHandle:
    """Create a runtime, install it as the active one, and return a handle to it.

    Args:
        project_root: When given (and non-empty), the namespace is the first
            12 hex characters of its SHA-256 digest; otherwise ``"default"``.
    """
    global _active_runtime

    if project_root:
        namespace = hashlib.sha256(project_root.encode()).hexdigest()[:12]
    else:
        namespace = "default"

    created = EvalRuntime(namespace=namespace)
    _active_runtime = created
    return RuntimeHandle(created)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def get_active_runtime() -> EvalRuntime | None:
    """Return the currently active runtime, or ``None`` when none is installed."""
    current = _active_runtime
    return current
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def set_active_runtime(runtime: EvalRuntime) -> None:
    """Install ``runtime`` as the active runtime, replacing any previous one."""
    global _active_runtime
    _active_runtime = runtime
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def dispose_active_runtime() -> None:
    """Clear the active runtime's specs and uninstall it; a no-op when none is active."""
    global _active_runtime
    current = _active_runtime
    if current:
        current.clear()
        _active_runtime = None
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
async def with_runtime(project_root: str, fn: Callable[[RuntimeHandle], Any]) -> Any:
    """Run ``fn`` against a freshly created runtime and clean up afterwards.

    A new runtime is created for ``project_root`` and installed as the active
    runtime while ``fn`` runs. Afterwards its specs are cleared and the
    runtime that was active before the call is reinstated, so nested or
    sequential uses do not leave a disposed runtime active.

    Args:
        project_root: Project root used to derive the runtime namespace.
        fn: Callable receiving the ``RuntimeHandle``; it may return a plain
            value or an awaitable, which is awaited.

    Returns:
        Whatever ``fn`` returns (after awaiting, if awaitable).
    """
    global _active_runtime
    previous = _active_runtime
    handle = create_eval_runtime(project_root)
    try:
        result = fn(handle)
        if hasattr(result, "__await__"):
            return await result
        return result
    finally:
        handle.dispose()
        # Only restore if nobody swapped in a different runtime meanwhile.
        if _active_runtime is handle.runtime:
            _active_runtime = previous
|