evalgate-sdk 3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. evalgate_sdk/__init__.py +707 -0
  2. evalgate_sdk/_version.py +3 -0
  3. evalgate_sdk/assertions.py +1362 -0
  4. evalgate_sdk/auto.py +247 -0
  5. evalgate_sdk/batch.py +174 -0
  6. evalgate_sdk/cache.py +111 -0
  7. evalgate_sdk/ci_context.py +123 -0
  8. evalgate_sdk/cli/__init__.py +111 -0
  9. evalgate_sdk/cli/api.py +261 -0
  10. evalgate_sdk/cli/cli_constants.py +20 -0
  11. evalgate_sdk/cli/commands.py +1041 -0
  12. evalgate_sdk/cli/config.py +228 -0
  13. evalgate_sdk/cli/env.py +43 -0
  14. evalgate_sdk/cli/formatters/types.py +132 -0
  15. evalgate_sdk/cli/golden_commands.py +322 -0
  16. evalgate_sdk/cli/manifest.py +301 -0
  17. evalgate_sdk/cli/new_commands.py +435 -0
  18. evalgate_sdk/cli/policy_packs.py +103 -0
  19. evalgate_sdk/cli/profiles.py +12 -0
  20. evalgate_sdk/cli/regression_gate.py +312 -0
  21. evalgate_sdk/cli/render/__init__.py +1 -0
  22. evalgate_sdk/cli/render/snippet.py +18 -0
  23. evalgate_sdk/cli/render/sort.py +29 -0
  24. evalgate_sdk/cli/report/__init__.py +1 -0
  25. evalgate_sdk/cli/report/build_check_report.py +209 -0
  26. evalgate_sdk/cli/traces.py +186 -0
  27. evalgate_sdk/cli/workspace.py +63 -0
  28. evalgate_sdk/client.py +609 -0
  29. evalgate_sdk/cluster.py +359 -0
  30. evalgate_sdk/collector.py +161 -0
  31. evalgate_sdk/constants.py +6 -0
  32. evalgate_sdk/context.py +151 -0
  33. evalgate_sdk/errors.py +236 -0
  34. evalgate_sdk/export.py +238 -0
  35. evalgate_sdk/formatters/__init__.py +11 -0
  36. evalgate_sdk/formatters/github.py +51 -0
  37. evalgate_sdk/formatters/human.py +68 -0
  38. evalgate_sdk/formatters/json_fmt.py +11 -0
  39. evalgate_sdk/formatters/pr_comment.py +80 -0
  40. evalgate_sdk/golden.py +426 -0
  41. evalgate_sdk/integrations/__init__.py +1 -0
  42. evalgate_sdk/integrations/anthropic.py +99 -0
  43. evalgate_sdk/integrations/autogen.py +62 -0
  44. evalgate_sdk/integrations/crewai.py +61 -0
  45. evalgate_sdk/integrations/langchain.py +100 -0
  46. evalgate_sdk/integrations/openai.py +155 -0
  47. evalgate_sdk/integrations/openai_eval.py +221 -0
  48. evalgate_sdk/local.py +144 -0
  49. evalgate_sdk/logger.py +123 -0
  50. evalgate_sdk/matchers.py +62 -0
  51. evalgate_sdk/otel.py +256 -0
  52. evalgate_sdk/pagination.py +145 -0
  53. evalgate_sdk/py.typed +0 -0
  54. evalgate_sdk/pytest_plugin.py +96 -0
  55. evalgate_sdk/reason_codes.py +103 -0
  56. evalgate_sdk/regression.py +196 -0
  57. evalgate_sdk/replay_decision.py +115 -0
  58. evalgate_sdk/runtime/__init__.py +50 -0
  59. evalgate_sdk/runtime/adapters/__init__.py +1 -0
  60. evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
  61. evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
  62. evalgate_sdk/runtime/context.py +68 -0
  63. evalgate_sdk/runtime/eval.py +318 -0
  64. evalgate_sdk/runtime/execution_mode.py +170 -0
  65. evalgate_sdk/runtime/executor.py +92 -0
  66. evalgate_sdk/runtime/registry.py +125 -0
  67. evalgate_sdk/runtime/run_report.py +249 -0
  68. evalgate_sdk/runtime/types.py +143 -0
  69. evalgate_sdk/snapshot.py +219 -0
  70. evalgate_sdk/streaming.py +124 -0
  71. evalgate_sdk/synthesize.py +226 -0
  72. evalgate_sdk/testing.py +128 -0
  73. evalgate_sdk/types.py +666 -0
  74. evalgate_sdk/utils/__init__.py +1 -0
  75. evalgate_sdk/utils/input_hash.py +42 -0
  76. evalgate_sdk/workflows.py +264 -0
  77. evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
  78. evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
  79. evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
  80. evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,318 @@
1
+ """defineEval DSL — the primary API for declaring evaluation specs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import hashlib
7
+ import inspect
8
+ import io
9
+ import json
10
+ import os
11
+ import re
12
+ from collections.abc import Callable
13
+ from typing import Any, Literal
14
+
15
+ from evalgate_sdk.runtime.registry import get_active_runtime
16
+ from evalgate_sdk.runtime.types import (
17
+ EvalContext,
18
+ EvalResult,
19
+ EvalSpec,
20
+ SpecConfig,
21
+ SpecOptions,
22
+ SpecRegistrationError,
23
+ )
24
+
25
+ _NAME_PATTERN = re.compile(r"^[\w\s\-]{1,100}$")
26
+
27
+
28
+ def _generate_spec_id(name: str, file_path: str | None = None) -> str:
29
+ """Generate a content-addressable spec ID."""
30
+ source = name
31
+ if file_path is None:
32
+ frame = inspect.stack()
33
+ for f in frame[1:]:
34
+ if "evalgate_sdk" not in f.filename:
35
+ file_path = f"{f.filename}:{f.lineno}"
36
+ break
37
+ if file_path:
38
+ source = f"{file_path}:{name}"
39
+ return hashlib.sha256(source.encode()).hexdigest()[:16]
40
+
41
+
42
def _validate_name(name: str) -> None:
    """Ensure ``name`` is an acceptable spec name.

    Args:
        name: Candidate spec name; must match ``_NAME_PATTERN`` in full.

    Raises:
        SpecRegistrationError: If the name does not match the pattern.
    """
    # fullmatch, not match: with match() the pattern's trailing "$" may also
    # match just before a final newline, which let a 100-character name plus
    # "\n" (101 characters) through.
    if not _NAME_PATTERN.fullmatch(name):
        raise SpecRegistrationError(
            f"Invalid spec name '{name}': must be 1-100 chars, alphanumeric/spaces/hyphens/underscores"
        )
47
+
48
+
49
def define_eval(
    name_or_config: Any = None,
    executor: Callable[..., Any] | None = None,
    *,
    name: str | None = None,
    options: SpecOptions | None = None,
    description: str | None = None,
    suite: str | None = None,
    tags: list[str] | None = None,
    timeout_ms: int = 30_000,
) -> EvalSpec | None:
    """Register an eval spec with the active runtime.

    Can be called as::

        # Positional style
        define_eval("my-test", my_executor)

        # Config style
        define_eval(SpecConfig(name="my-test", executor=my_executor))

        # Decorator style
        @define_eval(name="my-test")
        async def my_test(ctx):
            ...

        @define_eval("my-test")
        async def my_other_test(ctx):
            ...

        @define_eval            # spec is named after the function
        async def my_third_test(ctx):
            ...

    Args:
        name_or_config: A ``SpecConfig``, a spec name, or the executor itself
            (bare-decorator use).
        executor: Callable run for the spec when a name is given positionally
            or via ``name``.
        name: Spec name (keyword form).
        options: Explicit options. When given, ``tags`` and ``timeout_ms``
            are not used.
        description: Optional description stored on the spec.
        suite: Optional suite name stored on the spec.
        tags: Tags used to build default options.
        timeout_ms: Timeout used to build default options.

    Returns:
        The created ``EvalSpec`` when both a name and an executor are known.
        Otherwise a decorator that registers the decorated function and
        returns it unchanged (or, for bare ``@define_eval``, the function).

    Raises:
        SpecRegistrationError: If the spec name is invalid.
    """
    if isinstance(name_or_config, SpecConfig):
        cfg = name_or_config
        spec_name = cfg.name
        spec_executor = cfg.executor
        spec_options = cfg.options
        spec_desc = cfg.description
        spec_suite = cfg.suite
    else:
        if isinstance(name_or_config, str):
            spec_name, spec_executor = name_or_config, executor
        elif name_or_config is None:
            spec_name, spec_executor = name, executor
        elif callable(name_or_config):
            spec_name, spec_executor = name, name_or_config
        else:
            spec_name, spec_executor = name, None

        # A name without an executor (or the reverse) cannot be registered
        # yet. Previously ``define_eval(name="x")`` registered a spec whose
        # executor was None and returned the EvalSpec, so the documented
        # ``@define_eval(name="x")`` form failed when applied to a function.
        if spec_name is None or spec_executor is None:
            pending_name = spec_name

            def decorator(fn: Callable[..., Any]) -> Callable[..., Any]:
                define_eval(
                    fn,
                    name=pending_name if pending_name is not None else fn.__name__,
                    options=options,
                    description=description,
                    suite=suite,
                    tags=tags,
                    timeout_ms=timeout_ms,
                )
                return fn

            # Bare ``@define_eval``: the function was passed directly.
            if spec_name is None and callable(name_or_config):
                return decorator(name_or_config)  # type: ignore[return-value]
            return decorator  # type: ignore[return-value]

        spec_options = options or SpecOptions(timeout_ms=timeout_ms, tags=tags or [])
        spec_desc = description
        spec_suite = suite

    _validate_name(spec_name)
    spec_id = _generate_spec_id(spec_name)

    spec = EvalSpec(
        id=spec_id,
        name=spec_name,
        executor=spec_executor,
        options=spec_options,
        suite=spec_suite,
        description=spec_desc,
    )

    # Without an active runtime the spec is still returned, just not registered.
    runtime = get_active_runtime()
    if runtime is not None:
        runtime.register(spec)

    return spec
135
+
136
+
137
def _define_eval_with_mode(
    mode: Literal["normal", "skip", "only"],
    name_or_config: Any = None,
    executor: Callable[..., Any] | None = None,
    *,
    name: str | None = None,
    options: SpecOptions | None = None,
    description: str | None = None,
    suite: str | None = None,
    tags: list[str] | None = None,
    timeout_ms: int = 30_000,
) -> EvalSpec | None:
    """Internal: register a spec through ``define_eval`` and set its mode.

    Decorator forms are handled here rather than delegated, because
    ``define_eval`` returns a function (not the ``EvalSpec``) for them and
    the mode would otherwise be lost.

    Args:
        mode: Value stored on ``spec.mode``.
        name_or_config: A ``SpecConfig``, a spec name, or the executor itself.
        executor: Executor used with a positional or keyword name.
        name, options, description, suite, tags, timeout_ms: Forwarded to
            ``define_eval`` unchanged.

    Returns:
        The ``EvalSpec`` for direct registrations; the decorated function for
        bare-decorator use; a decorator when only a name (or nothing) is given.
    """
    shared: dict[str, Any] = {
        "options": options,
        "description": description,
        "suite": suite,
        "tags": tags,
        "timeout_ms": timeout_ms,
    }

    def register(target: Any, target_executor: Callable[..., Any] | None, spec_name: str | None) -> Any:
        registered = define_eval(target, target_executor, name=spec_name, **shared)
        if isinstance(registered, EvalSpec):
            registered.mode = mode
        return registered

    if isinstance(name_or_config, SpecConfig):
        return register(name_or_config, executor, name)

    # Bare decorator: ``@define_eval.skip`` receives the function directly.
    if callable(name_or_config) and name is None:
        register(name_or_config, None, name_or_config.__name__)
        return name_or_config  # type: ignore[return-value]

    positional_is_name = isinstance(name_or_config, str)
    if executor is None and (positional_is_name or name_or_config is None):
        # Decorator factory: ``@define_eval.skip()``, ``@define_eval.skip("x")``
        # or ``@define_eval.skip(name="x")``.
        pending_name = name_or_config if positional_is_name else name

        def decorator(fn: Callable[..., Any]) -> Callable[..., Any]:
            register(fn, None, pending_name if pending_name is not None else fn.__name__)
            return fn

        return decorator  # type: ignore[return-value]

    return register(name_or_config, executor, name)
163
+
164
+
165
def define_eval_skip(
    name_or_config: Any = None,
    executor: Callable[..., Any] | None = None,
    **kwargs: Any,
) -> EvalSpec | None:
    """Register a spec in ``"skip"`` mode (vitest/jest ``.skip`` convention).

    Arguments are forwarded to ``_define_eval_with_mode``; ``kwargs`` carries
    the keyword-only options that function accepts.
    """
    skipped = _define_eval_with_mode("skip", name_or_config, executor, **kwargs)
    return skipped
172
+
173
+
174
def define_eval_only(
    name_or_config: Any = None,
    executor: Callable[..., Any] | None = None,
    **kwargs: Any,
) -> EvalSpec | None:
    """Register a spec in ``"only"`` mode (vitest/jest ``.only`` convention).

    Arguments are forwarded to ``_define_eval_with_mode``; ``kwargs`` carries
    the keyword-only options that function accepts.
    """
    exclusive = _define_eval_with_mode("only", name_or_config, executor, **kwargs)
    return exclusive
181
+
182
+
183
class _EvalAI:
    """Namespace object whose ``test`` attribute is ``define_eval`` itself."""

    # staticmethod prevents the instance from being bound as the first
    # argument, so ``evalai.test(...)`` behaves exactly like ``define_eval(...)``.
    test = staticmethod(define_eval)


evalai = _EvalAI()
190
+
191
# Expose the skip/only variants as attributes of define_eval itself, mirroring
# the TS SDK's defineEval.skip() / defineEval.only().
setattr(define_eval, "skip", define_eval_skip)
setattr(define_eval, "only", define_eval_only)
194
+
195
+
196
+ # ── skip/only filtering ──────────────────────────────────────────────
197
+
198
+
199
def get_filtered_specs(specs: list[EvalSpec]) -> list[EvalSpec]:
    """Select the specs that should run, honouring skip/only modes.

    When at least one spec has ``mode == "only"``, just those specs are
    returned. Otherwise every spec whose mode is not ``"skip"`` is returned.
    Input order is preserved.
    """
    exclusive: list[EvalSpec] = []
    runnable: list[EvalSpec] = []
    for spec in specs:
        if spec.mode == "only":
            exclusive.append(spec)
        elif spec.mode != "skip":
            runnable.append(spec)
    # ``runnable`` is only consulted when no "only" spec exists.
    return exclusive if exclusive else runnable
209
+
210
+
211
+ # ── from_dataset ─────────────────────────────────────────────────────
212
+
213
+
214
+ def _parse_jsonl(content: str) -> list[dict[str, Any]]:
215
+ rows: list[dict[str, Any]] = []
216
+ for i, line in enumerate(content.splitlines()):
217
+ line = line.strip()
218
+ if not line:
219
+ continue
220
+ try:
221
+ rows.append(json.loads(line))
222
+ except json.JSONDecodeError as exc:
223
+ raise SpecRegistrationError(f"Invalid JSON on line {i + 1} of dataset: {exc}") from exc
224
+ return rows
225
+
226
+
227
+ def _parse_csv(content: str) -> list[dict[str, Any]]:
228
+ reader = csv.DictReader(io.StringIO(content))
229
+ return [dict(row) for row in reader]
230
+
231
+
232
def from_dataset(
    name: str,
    dataset_path: str,
    executor: Callable[..., Any],
    options: SpecOptions | None = None,
) -> None:
    """Load a JSONL, CSV, or JSON dataset and register one spec per row.

    Each row is passed as ``context.input`` to the executor. Specs are named
    ``"<name> - row <N>"`` with ``N`` starting at 1.

    Example::

        from_dataset("rag-accuracy", "./evals/golden.jsonl", my_executor)

    Args:
        name: Base name for the generated specs.
        dataset_path: Path to a ``.jsonl``, ``.ndjson``, ``.csv`` or ``.json``
            file. A ``.json`` file holding a single non-list value is treated
            as a one-row dataset.
        executor: Sync or async callable invoked with the eval context.
        options: Base options; ``timeout_ms``, ``tags`` and ``metadata`` are
            copied into every row's options.

    Raises:
        SpecRegistrationError: If the file is missing, has an unsupported
            extension, or contains no rows.
    """
    # NOTE(review): only timeout_ms, tags and metadata are carried over from
    # ``options``; other SpecOptions fields (e.g. retries) are not. Confirm
    # this is intended.
    resolved = os.path.abspath(dataset_path)
    if not os.path.isfile(resolved):
        raise SpecRegistrationError(f"Dataset file not found: {resolved}")

    with open(resolved, encoding="utf-8") as f:
        content = f.read()

    ext = os.path.splitext(resolved)[1].lower()
    if ext in (".jsonl", ".ndjson"):
        rows = _parse_jsonl(content)
    elif ext == ".csv":
        rows = _parse_csv(content)
    elif ext == ".json":
        parsed = json.loads(content)
        rows = parsed if isinstance(parsed, list) else [parsed]
    else:
        raise SpecRegistrationError(f"Unsupported dataset format: {ext}. Use .jsonl, .ndjson, .csv, or .json")

    if not rows:
        raise SpecRegistrationError(f"Dataset is empty: {resolved}")

    def _make_wrapper(r: dict[str, Any]) -> Callable[..., Any]:
        # Factory so each wrapper closes over its own row, not the loop variable.
        async def wrapper(ctx: EvalContext) -> EvalResult:
            ctx.input = r
            outcome = executor(ctx)
            # Sync executors return their result directly; awaiting it
            # unconditionally raised TypeError.
            if inspect.isawaitable(outcome):
                outcome = await outcome
            return outcome

        return wrapper

    for row_number, row in enumerate(rows, start=1):
        row_options = SpecOptions(
            timeout_ms=options.timeout_ms if options else 30_000,
            tags=list(options.tags) if options and options.tags else [],
            metadata={
                **(options.metadata or {} if options else {}),
                "dataset_path": resolved,
                "dataset_row": row_number,
            },
        )
        define_eval(f"{name} - row {row_number}", _make_wrapper(row), options=row_options)
287
+
288
+
289
def define_suite(name: str, specs: list[Callable[[], None]]) -> None:
    """Group multiple define_eval calls into a named suite.

    Each callable in ``specs`` is invoked in order. Specs that were added to
    the active runtime during those calls and have no suite of their own are
    assigned ``name`` as their suite. Previously ``name`` was ignored.

    Args:
        name: Suite name applied to newly registered specs lacking one.
        specs: Zero-argument callables that perform ``define_eval`` calls.
    """
    runtime = get_active_runtime()
    # Remember what was already registered so only new specs are tagged.
    known_ids = {spec.id for spec in runtime.list()} if runtime is not None else set()

    for spec_fn in specs:
        spec_fn()

    if runtime is None:
        return
    for spec in runtime.list():
        # An explicit suite set by the caller wins over the group name.
        if spec.id not in known_ids and spec.suite is None:
            spec.suite = name
293
+
294
+
295
def create_result(
    *,
    passed: bool,
    score: float = 0.0,
    assertions: list[Any] | None = None,
    metadata: dict[str, Any] | None = None,
    error: str | None = None,
    output: str | None = None,
    tokens: int | None = None,
    duration_ms: float | None = None,
) -> EvalResult:
    """Create an evaluation result.

    Args:
        passed: Whether the evaluation passed.
        score: Score, clamped into ``[0.0, 100.0]``; NaN is treated as 0.0.
        assertions: Assertion records; defaults to an empty list.
        metadata: Extra metadata; defaults to an empty dict.
        error: Error message, if any.
        output: Output text, if any.
        tokens: Token count, if known.
        duration_ms: Duration in milliseconds; ``None`` becomes 0.0.

    Returns:
        An ``EvalResult`` whose status is ``"passed"`` when ``passed`` is
        true, otherwise ``"error"`` when ``error`` is non-empty, else
        ``"failed"``.
    """
    import math

    # NaN compares False against everything, so the min/max clamp below would
    # turn it into 100.0 (a perfect score). Treat it as the lowest score.
    if isinstance(score, float) and math.isnan(score):
        clamped_score = 0.0
    else:
        clamped_score = max(0.0, min(100.0, score))

    if passed:
        status = "passed"
    elif error:
        status = "error"
    else:
        status = "failed"

    return EvalResult(
        passed=passed,
        score=clamped_score,
        assertions=assertions or [],
        metadata=metadata or {},
        error=error,
        status=status,
        output=output,
        tokens=tokens,
        duration_ms=duration_ms or 0.0,
    )
@@ -0,0 +1,170 @@
1
+ """Dual-path execution mode detection (T8).
2
+
3
+ Port of the TypeScript SDK's ``execution-mode.ts``.
4
+ Environment flag ``EVALGATE_RUNTIME=legacy|spec|auto``.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ from dataclasses import dataclass, field
11
+ from pathlib import Path
12
+ from typing import Literal
13
+
14
+ ExecutionMode = Literal["legacy", "spec", "auto"]
15
+
16
+ ENV_VAR = "EVALGATE_RUNTIME"
17
+ POSSIBLE_VALUES = ("legacy", "spec", "auto")
18
+ DEFAULT_MODE: ExecutionMode = "auto"
19
+
20
+
21
@dataclass
class ExecutionModeConfig:
    """Resolved execution-mode settings for a project."""

    # One of "legacy", "spec" or "auto".
    mode: ExecutionMode = "auto"
    # Whether the spec runtime is considered available.
    has_spec_runtime: bool = False
    # Whether the legacy runtime is considered available.
    has_legacy_runtime: bool = False
    # Directory the detection was run against.
    project_root: str = ""
    # Paths of discovered spec files (a fresh list per instance).
    spec_files: list[str] = field(default_factory=list)
    # Path of the discovered legacy config file, if any.
    legacy_config: str | None = None
29
+
30
+
31
def find_spec_files(project_root: str) -> list[str]:
    """Search for Python files containing ``define_eval`` calls.

    Looks recursively under ``eval/``, ``evals/``, ``src/``, ``tests/`` and
    ``spec/`` below ``project_root``. Files whose name starts with ``_`` are
    skipped, as are files that cannot be read.

    Args:
        project_root: Directory to search from.

    Returns:
        Paths of files whose text contains ``"define_eval"``, grouped by
        search directory in the order above and sorted within each group,
        without duplicates.
    """
    root = Path(project_root)
    patterns = (
        "eval/**/*.py",
        "evals/**/*.py",
        "src/**/*.py",
        "tests/**/*.py",
        "spec/**/*.py",
    )

    found: list[str] = []
    seen: set[str] = set()
    for pattern in patterns:
        # Path.glob yields entries in directory order, which varies between
        # filesystems; sort so the result is reproducible.
        for candidate in sorted(root.glob(pattern)):
            path_str = str(candidate)
            if candidate.name.startswith("_") or path_str in seen:
                continue
            try:
                content = candidate.read_text(encoding="utf-8", errors="ignore")
            except OSError:
                # Unreadable entry (permissions, a directory named *.py, ...).
                continue
            if "define_eval" in content:
                seen.add(path_str)
                found.append(path_str)
    return found
54
+
55
+
56
def find_legacy_config(project_root: str) -> str | None:
    """Return the first legacy config path that exists under ``project_root``.

    Candidates are checked in a fixed priority order; ``None`` is returned
    when none of them exists.
    """
    candidate_names = (
        "evalai.config.json",
        "evalai.config.py",
        "evalgate.config.json",
        ".evalgaterc",
        ".evalgaterc.json",
        ".evalgate/config.json",
        ".evalai/config.json",
    )
    base = Path(project_root)
    existing = (str(base / relative) for relative in candidate_names if (base / relative).exists())
    return next(existing, None)
73
+
74
+
75
def get_execution_mode(project_root: str | None = None) -> ExecutionModeConfig:
    """Determine execution mode from environment or auto-detection.

    ``EVALGATE_RUNTIME=legacy`` or ``spec`` forces that mode. Any other value
    (unset, ``auto``, or unrecognised) triggers auto-detection: spec mode if
    spec files are found, else legacy mode if a legacy config is found, else
    ``"auto"``.

    Args:
        project_root: Directory to inspect; defaults to the current working
            directory.

    Returns:
        The resolved ``ExecutionModeConfig``.
    """
    root = project_root or os.getcwd()
    # Tolerate stray whitespace/case in the variable (e.g. "Spec ").
    env_mode = os.environ.get(ENV_VAR, "").strip().lower()

    # An explicit "auto" means "detect", same as leaving the variable unset.
    # Previously it skipped detection and reported both runtimes as present.
    if env_mode in POSSIBLE_VALUES and env_mode != "auto":
        forced_spec = env_mode == "spec"
        return ExecutionModeConfig(
            mode=env_mode,  # type: ignore[arg-type]
            has_spec_runtime=forced_spec,
            has_legacy_runtime=not forced_spec,
            project_root=root,
            spec_files=find_spec_files(root) if forced_spec else [],
            legacy_config=None if forced_spec else find_legacy_config(root),
        )

    # Auto-detect
    spec_files = find_spec_files(root)
    legacy_config = find_legacy_config(root)
    has_spec = len(spec_files) > 0
    has_legacy = legacy_config is not None

    mode: ExecutionMode
    if has_spec:
        mode = "spec"
    elif has_legacy:
        mode = "legacy"
    else:
        mode = "auto"

    return ExecutionModeConfig(
        mode=mode,
        has_spec_runtime=has_spec,
        has_legacy_runtime=has_legacy,
        project_root=root,
        spec_files=spec_files,
        legacy_config=legacy_config,
    )
113
+
114
+
115
def can_run_spec_mode(config: ExecutionModeConfig) -> bool:
    """Return True when the spec runtime is available and spec files exist."""
    if not config.has_spec_runtime:
        return False
    return len(config.spec_files) > 0
117
+
118
+
119
def can_run_legacy_mode(config: ExecutionModeConfig) -> bool:
    """Return True when the legacy runtime is available and a config was found."""
    if not config.has_legacy_runtime:
        return False
    return config.legacy_config is not None
121
+
122
+
123
def get_recommended_mode(config: ExecutionModeConfig) -> ExecutionMode:
    """Resolve ``config.mode`` to a concrete mode where possible.

    A non-"auto" mode is returned unchanged. For "auto", spec mode is
    preferred over legacy mode; "auto" is returned when neither can run.
    """
    requested = config.mode
    if requested == "auto":
        if can_run_spec_mode(config):
            return "spec"
        return "legacy" if can_run_legacy_mode(config) else "auto"
    return requested
131
+
132
+
133
def validate_execution_mode(config: ExecutionModeConfig) -> dict[str, list[str] | bool]:
    """Check ``config`` for inconsistencies.

    Returns:
        A dict with ``"valid"`` (True when there are no errors),
        ``"warnings"`` and ``"errors"`` (lists of messages).
    """
    warnings: list[str] = []
    errors: list[str] = []

    spec_enabled = config.has_spec_runtime
    legacy_enabled = config.has_legacy_runtime

    # The two warnings are mutually exclusive: both runtimes, or neither.
    if spec_enabled and legacy_enabled:
        warnings.append(
            "Project contains both spec files and legacy config. Consider migrating legacy tests to spec format."
        )
    elif not spec_enabled and not legacy_enabled:
        warnings.append("No tests found. Use 'evalgate init' to create a new project.")

    requested = config.mode
    if requested == "spec" and not can_run_spec_mode(config):
        errors.append(
            "Spec mode requested but no spec files found. Create spec files with define_eval() or use legacy mode."
        )
    if requested == "legacy" and not can_run_legacy_mode(config):
        errors.append("Legacy mode requested but no config file found. Create a config file or use spec mode.")

    return {"valid": not errors, "warnings": warnings, "errors": errors}
155
+
156
+
157
def has_execution_mode_env() -> bool:
    """Return True when the execution-mode environment variable is set."""
    return os.environ.get(ENV_VAR) is not None
159
+
160
+
161
def get_execution_mode_env() -> str | None:
    """Return the raw value of the execution-mode variable, or None if unset."""
    raw_value = os.environ.get(ENV_VAR)
    return raw_value
163
+
164
+
165
def set_execution_mode_env(mode: ExecutionMode) -> None:
    """Set the execution-mode environment variable for this process."""
    os.environ.update({ENV_VAR: mode})
167
+
168
+
169
def clear_execution_mode_env() -> None:
    """Remove the execution-mode environment variable; a no-op if unset."""
    try:
        del os.environ[ENV_VAR]
    except KeyError:
        pass
@@ -0,0 +1,92 @@
1
+ """Local executor — runs eval specs with timeout protection."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import time
7
+
8
+ from evalgate_sdk.runtime.types import (
9
+ EvalContext,
10
+ EvalResult,
11
+ EvalSpec,
12
+ ExecutorCapabilities,
13
+ )
14
+
15
+
16
class LocalExecutor:
    """Execute eval specs locally with timeout and error handling."""

    def __init__(self) -> None:
        self.capabilities = ExecutorCapabilities(
            supports_async=True,
            supports_timeout=True,
            supports_retries=True,
            supports_parallel=False,
        )

    async def execute(self, spec: EvalSpec, context: EvalContext) -> EvalResult:
        """Run ``spec.executor`` and normalise its outcome to an ``EvalResult``.

        The executor may be sync or async. Only awaitable results are subject
        to ``spec.options.timeout_ms``. Timeouts and exceptions are retried
        up to ``spec.options.retries`` times; a returned (even failing)
        result is never retried. ``duration_ms`` is measured from the start
        of the first attempt.

        Args:
            spec: Spec providing ``executor`` and ``options``.
            context: Context passed to the executor.

        Returns:
            The executor's ``EvalResult`` (with ``duration_ms`` overwritten),
            a result built from a returned dict or truthy value, or a
            ``"timeout"`` / ``"error"`` result once retries are exhausted.
        """
        timeout_s = spec.options.timeout_ms / 1000
        # A negative retry count used to skip execution entirely; run once.
        retries = max(0, spec.options.retries)
        start = time.monotonic()

        for attempt in range(retries + 1):
            try:
                result = spec.executor(context)
                if asyncio.iscoroutine(result) or asyncio.isfuture(result) or hasattr(result, "__await__"):
                    result = await asyncio.wait_for(result, timeout=timeout_s)

                duration = (time.monotonic() - start) * 1000
                # Kept inside the try so a failure while normalising the
                # result is reported as an error result, as before.
                return self._to_eval_result(result, duration)

            except asyncio.TimeoutError:
                duration = (time.monotonic() - start) * 1000
                if attempt < retries:
                    continue
                return EvalResult(
                    passed=False,
                    score=0.0,
                    duration_ms=duration,
                    error=f"Timeout after {spec.options.timeout_ms}ms",
                    status="timeout",
                )
            except Exception as exc:
                duration = (time.monotonic() - start) * 1000
                if attempt < retries:
                    continue
                return EvalResult(
                    passed=False,
                    score=0.0,
                    duration_ms=duration,
                    # Exceptions raised without a message (e.g. a bare
                    # ``assert``) stringify to ""; fall back to the type name
                    # so the error field is never empty.
                    error=str(exc) or type(exc).__name__,
                    status="error",
                )

        # Not reachable: the last attempt always returns. Kept as a fallback.
        return EvalResult(passed=False, score=0.0, error="Max retries exceeded", status="error")

    @staticmethod
    def _to_eval_result(result: object, duration: float) -> EvalResult:
        """Convert an executor's return value into an ``EvalResult``."""
        if isinstance(result, EvalResult):
            result.duration_ms = duration
            return result

        if isinstance(result, dict):
            # Accept either "passed" or "pass"; coerce once so passed, the
            # default score and status always agree.
            passed = bool(result.get("passed", result.get("pass", False)))
            return EvalResult(
                passed=passed,
                score=result.get("score", 1.0 if passed else 0.0),
                assertions=result.get("assertions", []),
                metadata=result.get("metadata", {}),
                error=result.get("error"),
                duration_ms=duration,
                status="passed" if passed else "failed",
            )

        passed = bool(result)
        return EvalResult(
            passed=passed,
            score=1.0 if passed else 0.0,
            duration_ms=duration,
            status="passed" if passed else "failed",
        )
86
+
87
+
88
def create_local_executor() -> LocalExecutor:
    """Build and return a new ``LocalExecutor`` instance."""
    executor = LocalExecutor()
    return executor
90
+
91
+
92
+ default_local_executor = LocalExecutor()
@@ -0,0 +1,125 @@
1
+ """Runtime registry — manages eval spec lifecycle and lookup."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import builtins
6
+ import hashlib
7
+ import time
8
+ from collections.abc import Callable
9
+ from typing import Any, TypeVar
10
+
11
+ from evalgate_sdk.runtime.types import (
12
+ EvalSpec,
13
+ RuntimeHealth,
14
+ SpecRegistrationError,
15
+ )
16
+
17
+ T = TypeVar("T")
18
+
19
+
20
class EvalRuntime:
    """In-process registry of eval specs, keyed by spec ID."""

    def __init__(self, namespace: str = "default") -> None:
        self._specs: dict[str, EvalSpec] = {}
        self._namespace = namespace
        # Monotonic reference point for the uptime reported by get_health().
        self._started_at = time.monotonic()

    @property
    def namespace(self) -> str:
        """Namespace this runtime was created with."""
        return self._namespace

    def register(self, spec: EvalSpec) -> None:
        """Add ``spec``; raise ``SpecRegistrationError`` if its ID is taken."""
        if spec.id not in self._specs:
            self._specs[spec.id] = spec
            return
        raise SpecRegistrationError(f"Spec '{spec.id}' already registered")

    def get(self, spec_id: str) -> EvalSpec | None:
        """Return the spec registered under ``spec_id``, or None."""
        return self._specs.get(spec_id)

    def list(self, *, suite: str | None = None, tags: builtins.list[str] | None = None) -> builtins.list[EvalSpec]:
        """Return registered specs in registration order, optionally filtered.

        ``suite`` keeps specs whose suite equals it; ``tags`` keeps specs
        whose options carry every listed tag. An empty ``tags`` list does
        not filter.
        """
        required_tags = set(tags) if tags else None
        matches = []
        for spec in self._specs.values():
            if suite is not None and spec.suite != suite:
                continue
            if required_tags is not None and not required_tags.issubset(set(spec.options.tags)):
                continue
            matches.append(spec)
        return matches

    def find(self, pattern: str) -> builtins.list[EvalSpec]:
        """Return specs whose name or ID contains ``pattern`` as a substring."""
        hits = []
        for spec in self._specs.values():
            if pattern in spec.name or pattern in spec.id:
                hits.append(spec)
        return hits

    def clear(self) -> None:
        """Remove every registered spec."""
        self._specs.clear()

    def get_health(self) -> RuntimeHealth:
        """Report spec count, a rough memory estimate and uptime."""
        spec_count = len(self._specs)
        return RuntimeHealth(
            status="healthy",
            spec_count=spec_count,
            memory_estimate_mb=spec_count * 0.001,
            uptime_ms=(time.monotonic() - self._started_at) * 1000,
        )
62
+
63
+
64
class RuntimeHandle:
    """Thin handle around an ``EvalRuntime`` with dispose/snapshot helpers."""

    def __init__(self, runtime: EvalRuntime) -> None:
        self.runtime = runtime

    def define_eval(self, spec: EvalSpec) -> None:
        """Register ``spec`` with the wrapped runtime."""
        self.runtime.register(spec)

    def dispose(self) -> None:
        """Clear every spec from the wrapped runtime."""
        self.runtime.clear()

    def snapshot(self) -> dict[str, Any]:
        """Return the namespace plus id/name/suite/tags for each spec."""
        namespace = self.runtime.namespace
        summaries = []
        for spec in self.runtime.list():
            summaries.append(
                {"id": spec.id, "name": spec.name, "suite": spec.suite, "tags": spec.options.tags}
            )
        return {"namespace": namespace, "specs": summaries}

    def load(self, data: dict[str, Any]) -> None:
        """Currently a no-op: ``data`` is accepted and ignored."""
        return None
86
+
87
+
88
+ _active_runtime: EvalRuntime | None = None
89
+
90
+
91
def create_eval_runtime(project_root: str | None = None) -> RuntimeHandle:
    """Create a runtime, make it the active one, and return a handle to it.

    The namespace is the first 12 hex digits of the SHA-256 of
    ``project_root``, or ``"default"`` when no root is given.
    """
    global _active_runtime
    if project_root:
        namespace = hashlib.sha256(project_root.encode()).hexdigest()[:12]
    else:
        namespace = "default"
    _active_runtime = EvalRuntime(namespace=namespace)
    return RuntimeHandle(_active_runtime)
99
+
100
+
101
def get_active_runtime() -> EvalRuntime | None:
    """Return the module-wide active runtime, or None when none is set."""
    return _active_runtime
103
+
104
+
105
def set_active_runtime(runtime: EvalRuntime) -> None:
    """Make ``runtime`` the module-wide active runtime."""
    global _active_runtime
    _active_runtime = runtime
108
+
109
+
110
def dispose_active_runtime() -> None:
    """Clear the active runtime's specs (if one is set) and unset it."""
    global _active_runtime
    current = _active_runtime
    if current:
        current.clear()
    _active_runtime = None
115
+
116
+
117
async def with_runtime(project_root: str, fn: Callable[[RuntimeHandle], Any]) -> Any:
    """Run ``fn`` against a temporary runtime scoped to this call.

    A new runtime is created and made active, ``fn(handle)`` is called (and
    awaited if it returns an awaitable), and the runtime is disposed
    afterwards. The runtime that was active before the call is then
    reinstated.

    Args:
        project_root: Root used to derive the runtime namespace.
        fn: Sync or async callable receiving the ``RuntimeHandle``.

    Returns:
        Whatever ``fn`` returns (the awaited value for async callables).
    """
    global _active_runtime
    previous = _active_runtime
    handle = create_eval_runtime(project_root)
    try:
        result = fn(handle)
        if hasattr(result, "__await__"):
            return await result
        return result
    finally:
        handle.dispose()
        # Without this the disposed (now empty) runtime stayed active, so
        # later registrations landed in a dead scope. Only restore if ``fn``
        # did not install a different runtime itself.
        if _active_runtime is handle.runtime:
            _active_runtime = previous