evalgate-sdk 3.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalgate_sdk/__init__.py +707 -0
- evalgate_sdk/_version.py +3 -0
- evalgate_sdk/assertions.py +1362 -0
- evalgate_sdk/auto.py +247 -0
- evalgate_sdk/batch.py +174 -0
- evalgate_sdk/cache.py +111 -0
- evalgate_sdk/ci_context.py +123 -0
- evalgate_sdk/cli/__init__.py +111 -0
- evalgate_sdk/cli/api.py +261 -0
- evalgate_sdk/cli/cli_constants.py +20 -0
- evalgate_sdk/cli/commands.py +1041 -0
- evalgate_sdk/cli/config.py +228 -0
- evalgate_sdk/cli/env.py +43 -0
- evalgate_sdk/cli/formatters/types.py +132 -0
- evalgate_sdk/cli/golden_commands.py +322 -0
- evalgate_sdk/cli/manifest.py +301 -0
- evalgate_sdk/cli/new_commands.py +435 -0
- evalgate_sdk/cli/policy_packs.py +103 -0
- evalgate_sdk/cli/profiles.py +12 -0
- evalgate_sdk/cli/regression_gate.py +312 -0
- evalgate_sdk/cli/render/__init__.py +1 -0
- evalgate_sdk/cli/render/snippet.py +18 -0
- evalgate_sdk/cli/render/sort.py +29 -0
- evalgate_sdk/cli/report/__init__.py +1 -0
- evalgate_sdk/cli/report/build_check_report.py +209 -0
- evalgate_sdk/cli/traces.py +186 -0
- evalgate_sdk/cli/workspace.py +63 -0
- evalgate_sdk/client.py +609 -0
- evalgate_sdk/cluster.py +359 -0
- evalgate_sdk/collector.py +161 -0
- evalgate_sdk/constants.py +6 -0
- evalgate_sdk/context.py +151 -0
- evalgate_sdk/errors.py +236 -0
- evalgate_sdk/export.py +238 -0
- evalgate_sdk/formatters/__init__.py +11 -0
- evalgate_sdk/formatters/github.py +51 -0
- evalgate_sdk/formatters/human.py +68 -0
- evalgate_sdk/formatters/json_fmt.py +11 -0
- evalgate_sdk/formatters/pr_comment.py +80 -0
- evalgate_sdk/golden.py +426 -0
- evalgate_sdk/integrations/__init__.py +1 -0
- evalgate_sdk/integrations/anthropic.py +99 -0
- evalgate_sdk/integrations/autogen.py +62 -0
- evalgate_sdk/integrations/crewai.py +61 -0
- evalgate_sdk/integrations/langchain.py +100 -0
- evalgate_sdk/integrations/openai.py +155 -0
- evalgate_sdk/integrations/openai_eval.py +221 -0
- evalgate_sdk/local.py +144 -0
- evalgate_sdk/logger.py +123 -0
- evalgate_sdk/matchers.py +62 -0
- evalgate_sdk/otel.py +256 -0
- evalgate_sdk/pagination.py +145 -0
- evalgate_sdk/py.typed +0 -0
- evalgate_sdk/pytest_plugin.py +96 -0
- evalgate_sdk/reason_codes.py +103 -0
- evalgate_sdk/regression.py +196 -0
- evalgate_sdk/replay_decision.py +115 -0
- evalgate_sdk/runtime/__init__.py +50 -0
- evalgate_sdk/runtime/adapters/__init__.py +1 -0
- evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
- evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
- evalgate_sdk/runtime/context.py +68 -0
- evalgate_sdk/runtime/eval.py +318 -0
- evalgate_sdk/runtime/execution_mode.py +170 -0
- evalgate_sdk/runtime/executor.py +92 -0
- evalgate_sdk/runtime/registry.py +125 -0
- evalgate_sdk/runtime/run_report.py +249 -0
- evalgate_sdk/runtime/types.py +143 -0
- evalgate_sdk/snapshot.py +219 -0
- evalgate_sdk/streaming.py +124 -0
- evalgate_sdk/synthesize.py +226 -0
- evalgate_sdk/testing.py +128 -0
- evalgate_sdk/types.py +666 -0
- evalgate_sdk/utils/__init__.py +1 -0
- evalgate_sdk/utils/input_hash.py +42 -0
- evalgate_sdk/workflows.py +264 -0
- evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
- evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
- evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
- evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""Streaming evaluation, batch reading, and rate-limiting utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import time
|
|
7
|
+
from collections.abc import AsyncIterator, Callable, Coroutine
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from typing import Any, Generic, TypeVar
|
|
10
|
+
|
|
11
|
+
T = TypeVar("T")
|
|
12
|
+
R = TypeVar("R")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class BatchProgress:
    """Running counters describing how far a batch has progressed.

    ``stream_evaluation`` keeps one instance, updates it in place after each
    finished item, and hands that same object to its ``on_progress`` callback.
    """

    # Items finished so far, whether they succeeded or failed.
    completed: int = 0
    # Number of items in the batch.
    total: int = 0
    # Items that finished with an error.
    failed: int = 0
    # Milliseconds elapsed since the batch started.
    elapsed_ms: float = 0
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class BatchResult(Generic[R]):
    """Aggregate outcome of a batch operation producing values of type ``R``.

    Every field has a default, and each instance gets its own fresh
    ``results`` and ``errors`` lists.
    """

    # Values produced by the batch.
    results: list[R] = field(default_factory=list)
    # One free-form dict per recorded error.
    errors: list[dict[str, Any]] = field(default_factory=list)
    # Item counters.
    total: int = 0
    succeeded: int = 0
    failed: int = 0
    # Duration of the batch in milliseconds.
    duration_ms: float = 0
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class RateLimiter:
    """Minimum-interval rate limiter for coroutine calls.

    Consecutive calls are started at least ``1 / requests_per_second`` seconds
    apart. No burst allowance is accumulated while the limiter is idle.

    Usage::

        limiter = RateLimiter(requests_per_second=10)
        result = await limiter.throttle(my_async_fn)
    """

    def __init__(self, requests_per_second: float = 10) -> None:
        """Create a limiter.

        Args:
            requests_per_second: Maximum number of call starts per second.

        Raises:
            ValueError: If ``requests_per_second`` is not greater than zero.
        """
        # Reject zero (division by zero), negatives (a negative interval would
        # silently disable throttling) and NaN (never compares greater than 0).
        if not requests_per_second > 0:
            raise ValueError("requests_per_second must be greater than 0")
        self._interval = 1.0 / requests_per_second
        self._last_call = 0.0
        self._lock = asyncio.Lock()

    async def throttle(self, fn: Callable[[], Coroutine[Any, Any, T]]) -> T:
        """Wait for the next free slot, then await ``fn()`` and return its result.

        Args:
            fn: Zero-argument callable returning the coroutine to run.

        Returns:
            Whatever ``fn()`` resolves to; exceptions from ``fn`` propagate.
        """
        # The lock serialises only the pacing bookkeeping, so concurrent
        # callers queue up for start slots one at a time.
        async with self._lock:
            wait = self._interval - (time.monotonic() - self._last_call)
            if wait > 0:
                await asyncio.sleep(wait)
            self._last_call = time.monotonic()
        # Run the call after releasing the lock so throttled calls may overlap.
        return await fn()
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def chunk(items: list[T], size: int) -> list[list[T]]:
    """Split a list into consecutive chunks of at most *size* items.

    Args:
        items: The list to split; it is sliced, not modified.
        size: Maximum chunk length; must be at least 1.

    Returns:
        The chunks in order. Only the last chunk may be shorter than *size*.
        An empty input yields an empty list.

    Raises:
        ValueError: If *size* is less than 1.
    """
    # A negative step would make range() empty and silently drop every item.
    if size < 1:
        raise ValueError("size must be a positive integer")
    return [items[start : start + size] for start in range(0, len(items), size)]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
async def stream_evaluation(
    evaluator: Callable[[str], Coroutine[Any, Any, str]],
    inputs: list[str],
    *,
    concurrency: int = 3,
    on_progress: Callable[[BatchProgress], None] | None = None,
) -> AsyncIterator[dict[str, Any]]:
    """Stream evaluation results as they complete.

    Each yielded dict has ``index``, ``input`` and ``duration_ms``. It also has
    ``output`` when the evaluator succeeded, or ``error`` (the exception
    message) when it raised. Results arrive in completion order, not input
    order.

    Args:
        evaluator: Async function mapping one input string to its output.
        inputs: Inputs to evaluate.
        concurrency: Maximum number of evaluator calls in flight at once.
        on_progress: Optional callback invoked after each finished item with
            the shared, mutated-in-place ``BatchProgress``.

    Raises:
        ValueError: If ``concurrency`` is less than 1. Because this is an
            async generator, the error surfaces on the first iteration.
    """
    # Semaphore(0) would block every task forever, so fail fast instead.
    if concurrency < 1:
        raise ValueError("concurrency must be at least 1")

    semaphore = asyncio.Semaphore(concurrency)
    progress = BatchProgress(total=len(inputs))
    start = time.monotonic()

    async def _run(index: int, input_text: str) -> dict[str, Any]:
        async with semaphore:
            t0 = time.monotonic()
            try:
                output = await evaluator(input_text)
            except Exception as exc:
                # A failing item is reported as a result, not raised.
                elapsed = (time.monotonic() - t0) * 1000
                return {"index": index, "input": input_text, "error": str(exc), "duration_ms": elapsed}
            elapsed = (time.monotonic() - t0) * 1000
            return {"index": index, "input": input_text, "output": output, "duration_ms": elapsed}

    tasks = [asyncio.create_task(_run(i, inp)) for i, inp in enumerate(inputs)]

    try:
        for next_done in asyncio.as_completed(tasks):
            result = await next_done
            progress.completed += 1
            if "error" in result:
                progress.failed += 1
            progress.elapsed_ms = (time.monotonic() - start) * 1000
            if on_progress:
                on_progress(progress)
            yield result
    finally:
        # If the consumer stops early or on_progress raises, do not leave
        # evaluator calls running in the background.
        for task in tasks:
            if not task.done():
                task.cancel()
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
async def batch_read(
    fetcher: Callable[[int, int], Coroutine[Any, Any, list[T]]],
    *,
    limit: int = 100,
    max_pages: int = 100,
) -> list[T]:
    """Collect every page of an offset/limit paginated source.

    Pages are requested one after another until a page is empty, a page is
    shorter than ``limit``, or ``max_pages`` pages have been fetched.

    Args:
        fetcher: Async function(offset, limit) -> list of items.
        limit: Page size passed to every fetch.
        max_pages: Safety cap on the number of fetches.

    Returns:
        All fetched items, in fetch order.
    """
    collected: list[T] = []
    for page_number in range(max_pages):
        # Offsets advance by exactly one page size per request.
        batch = await fetcher(page_number * limit, limit)
        if not batch:
            return collected
        collected.extend(batch)
        if len(batch) < limit:
            # A short page means the source is exhausted.
            return collected
    return collected
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from evalgate_sdk.golden import DEFAULT_SYNTHETIC_DATASET_PATH, LabeledGoldenCase, SyntheticGoldenCase
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(slots=True)
|
|
11
|
+
class SynthesizeSummary:
|
|
12
|
+
source_cases: int
|
|
13
|
+
source_failures: int
|
|
14
|
+
selected_failure_modes: list[str]
|
|
15
|
+
dimension_names: list[str]
|
|
16
|
+
dimension_combination_count: int
|
|
17
|
+
generated: int
|
|
18
|
+
mode_counts: list[dict[str, Any]]
|
|
19
|
+
output_path: str
|
|
20
|
+
cases: list[SyntheticGoldenCase]
|
|
21
|
+
|
|
22
|
+
def to_dict(self) -> dict[str, Any]:
|
|
23
|
+
return {
|
|
24
|
+
"sourceCases": self.source_cases,
|
|
25
|
+
"sourceFailures": self.source_failures,
|
|
26
|
+
"selectedFailureModes": list(self.selected_failure_modes),
|
|
27
|
+
"dimensionNames": list(self.dimension_names),
|
|
28
|
+
"dimensionCombinationCount": self.dimension_combination_count,
|
|
29
|
+
"generated": self.generated,
|
|
30
|
+
"modeCounts": list(self.mode_counts),
|
|
31
|
+
"outputPath": self.output_path,
|
|
32
|
+
"cases": [item.to_dict() for item in self.cases],
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass(slots=True)
|
|
37
|
+
class DimensionMatrix:
|
|
38
|
+
dimensions: dict[str, list[str]]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def parse_dimension_matrix(content: str) -> DimensionMatrix:
    """Parse a JSON dimension matrix into a ``DimensionMatrix``.

    The document must be a JSON object. If it has a ``"dimensions"`` key whose
    value is an object, that object is used; otherwise the top-level object
    itself is treated as the name -> values mapping. Values are stripped of
    surrounding whitespace and blank entries are dropped.

    Raises:
        ValueError: If the text is not valid JSON, is not an object, or a
            dimension is not a list of strings with at least one non-blank
            value.
    """
    try:
        document = json.loads(content)
    except json.JSONDecodeError as exc:
        raise ValueError("Dimension matrix must be valid JSON") from exc
    if not isinstance(document, dict):
        raise ValueError("Dimension matrix must be a JSON object")

    # Accept both {"dimensions": {...}} and a bare {...} mapping.
    nested = document.get("dimensions")
    raw_dimensions = nested if isinstance(nested, dict) else document

    parsed_dimensions: dict[str, list[str]] = {}
    for name, values in raw_dimensions.items():
        if not isinstance(values, list):
            raise ValueError(f"Dimension '{name}' must be an array of strings")
        if any(not isinstance(entry, str) for entry in values):
            raise ValueError(f"Dimension '{name}' must contain only strings")
        cleaned = [text for text in (entry.strip() for entry in values) if text]
        if not cleaned:
            raise ValueError(f"Dimension '{name}' must contain at least one value")
        parsed_dimensions[str(name)] = cleaned
    return DimensionMatrix(dimensions=parsed_dimensions)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _cartesian_dimensions(dimensions: dict[str, list[str]]) -> list[dict[str, str]]:
|
|
69
|
+
entries = list(dimensions.items())
|
|
70
|
+
if not entries:
|
|
71
|
+
return [{}]
|
|
72
|
+
combinations: list[dict[str, str]] = [{}]
|
|
73
|
+
for name, values in entries:
|
|
74
|
+
next_items: list[dict[str, str]] = []
|
|
75
|
+
for combination in combinations:
|
|
76
|
+
for value in values:
|
|
77
|
+
candidate = dict(combination)
|
|
78
|
+
candidate[name] = value
|
|
79
|
+
next_items.append(candidate)
|
|
80
|
+
combinations = next_items
|
|
81
|
+
return combinations
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _slugify(value: str) -> str:
|
|
85
|
+
result = []
|
|
86
|
+
last_dash = False
|
|
87
|
+
for char in value.lower():
|
|
88
|
+
if char.isalnum():
|
|
89
|
+
result.append(char)
|
|
90
|
+
last_dash = False
|
|
91
|
+
elif not last_dash:
|
|
92
|
+
result.append("-")
|
|
93
|
+
last_dash = True
|
|
94
|
+
return "".join(result).strip("-")[:48]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _dimension_label(dimensions: dict[str, str]) -> str:
|
|
98
|
+
pairs = [f"{name}={value}" for name, value in dimensions.items()]
|
|
99
|
+
return ", ".join(pairs) if pairs else "base"
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _build_synthetic_case(
    prototype: LabeledGoldenCase,
    failure_mode: str,
    dimensions: dict[str, str],
    sequence: int,
) -> SyntheticGoldenCase:
    """Derive one synthetic failing case from a labeled prototype.

    The prototype's stripped input, expected and actual texts are reused. When
    *dimensions* is non-empty, a line naming the dimensions is added to each
    of them. The case id combines the slugified failure mode, the slugified
    dimension label and the 1-based, zero-padded *sequence*.
    """
    from datetime import datetime, timezone

    # One timestamp serves as both the labeling and the synthesis time.
    now = datetime.now(timezone.utc).isoformat()
    label = _dimension_label(dimensions)
    dimension_slug = _slugify(label) or "base"
    mode_slug = _slugify(failure_mode) or "failure-mode"

    input_lines = [prototype.input.strip()]
    expected_lines = [prototype.expected.strip()]
    actual_lines = [f"Representative {failure_mode} failure draft."]
    original_actual = prototype.actual.strip()
    if label != "base":
        input_lines.append(f"Synthetic dimensions: {label}")
        expected_lines.append(f"Target dimensions: {label}")
        # The scenario line sits between the headline and the original actual text.
        actual_lines.append(f"Scenario dimensions: {label}")
    actual_lines.append(original_actual)

    ordinal = str(sequence + 1).zfill(3)
    return SyntheticGoldenCase(
        case_id=f"synthetic-{mode_slug}-{dimension_slug}-{ordinal}",
        # Blank parts are dropped so empty prototype fields add no empty lines.
        input="\n".join(filter(None, input_lines)),
        expected="\n".join(filter(None, expected_lines)),
        actual="\n".join(filter(None, actual_lines)),
        label="fail",
        failure_mode=failure_mode,
        labeled_at=now,
        synthetic=True,
        synthesized_at=now,
        source_case_ids=[prototype.case_id],
        dimensions=dict(dimensions),
        cluster_id=prototype.cluster_id,
        cluster_label=prototype.cluster_label,
    )
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def synthesize_labeled_dataset(
    rows: list[LabeledGoldenCase],
    *,
    dimensions: dict[str, list[str]] | None = None,
    count: int | None = None,
    failure_modes: list[str] | None = None,
    output_path: str = DEFAULT_SYNTHETIC_DATASET_PATH,
) -> SynthesizeSummary:
    """Generate synthetic failing cases from labeled failures.

    Rows labeled ``"fail"`` with a non-blank failure mode are grouped by mode.
    Each selected mode is paired with every dimension combination to form a
    plan, which is walked (cycling if needed) until the target count is met.

    Args:
        rows: Labeled source cases.
        dimensions: Optional name -> values matrix to cross with each mode.
        count: Number of cases to generate; negative values count as zero.
            Defaults to one case per plan entry.
        failure_modes: Modes to use, in this order; names with no failing rows
            are ignored. Defaults to all modes, most frequent first, then by name.
        output_path: Recorded on the summary; nothing is written here.

    Returns:
        A ``SynthesizeSummary`` holding the statistics and generated cases.
    """
    failures = [
        row
        for row in rows
        if row.label == "fail" and isinstance(row.failure_mode, str) and row.failure_mode.strip()
    ]
    by_mode: dict[str, list[LabeledGoldenCase]] = {}
    for row in failures:
        by_mode.setdefault(row.failure_mode.strip(), []).append(row)

    wanted = [name.strip() for name in (failure_modes or []) if name.strip()]
    if wanted:
        selected = [name for name in wanted if name in by_mode]
    else:
        # Largest groups first; ties broken alphabetically.
        selected = sorted(by_mode, key=lambda name: (-len(by_mode[name]), name))

    matrix = dimensions or {}
    combos = _cartesian_dimensions(matrix)
    plan: list[tuple[str, dict[str, str]]] = [(mode, combo) for mode in selected for combo in combos]

    if count is not None:
        target = max(0, count)
    elif plan:
        target = len(plan)
    else:
        target = len(selected)

    generated: list[SyntheticGoldenCase] = []
    if plan:
        for seq in range(target):
            # Each full pass over the plan moves on to the next prototype of a mode.
            lap, slot = divmod(seq, len(plan))
            mode, combo = plan[slot]
            pool = by_mode[mode]
            generated.append(_build_synthetic_case(pool[lap % len(pool)], mode, combo, seq))

    tally: dict[str, int] = {}
    for case in generated:
        key = case.failure_mode or "unknown"
        tally[key] = tally.get(key, 0) + 1
    ranked = sorted(tally.items(), key=lambda pair: (-pair[1], pair[0]))

    return SynthesizeSummary(
        source_cases=len(rows),
        source_failures=len(failures),
        selected_failure_modes=selected,
        dimension_names=list(matrix.keys()),
        dimension_combination_count=len(combos),
        generated=len(generated),
        mode_counts=[{"failureMode": mode, "count": total} for mode, total in ranked],
        output_path=output_path,
        cases=generated,
    )
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def format_synthesize_human(summary: SynthesizeSummary) -> str:
    """Render *summary* as a newline-separated, human-readable report.

    Six headline lines are always present. Lines for modes, dimensions, mode
    counts and up to three sample cases are added only when the matching data
    is non-empty.
    """
    modes = summary.selected_failure_modes
    report = [
        "Synthesize phase",
        f"Source cases: {summary.source_cases}",
        f"Source failures: {summary.source_failures}",
        f"Failure modes used: {len(modes)}",
        f"Dimension combinations: {summary.dimension_combination_count}",
        f"Generated synthetic cases: {summary.generated}",
    ]
    if modes:
        report.append(f"Modes: {', '.join(modes)}")
    if summary.dimension_names:
        report.append(f"Dimensions: {', '.join(summary.dimension_names)}")
    if summary.mode_counts:
        per_mode = [f"{entry['failureMode']} ×{entry['count']}" for entry in summary.mode_counts]
        report.append(f"Mode counts: {', '.join(per_mode)}")
    if summary.cases:
        # Show only the first three cases as samples.
        previews = [f"{case.case_id} ({_dimension_label(case.dimensions)})" for case in summary.cases[:3]]
        report.append(f"Samples: {', '.join(previews)}")
    return "\n".join(report)
|
evalgate_sdk/testing.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""Test suite builder for running structured evaluations against LLM outputs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from evalgate_sdk.assertions import AssertionResult, expect
|
|
9
|
+
from evalgate_sdk.types import (
|
|
10
|
+
TestSuiteCase,
|
|
11
|
+
TestSuiteCaseResult,
|
|
12
|
+
TestSuiteConfig,
|
|
13
|
+
TestSuiteResult,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TestSuite:
    """A collection of test cases that can be executed against an evaluator function.

    Usage::

        suite = create_test_suite("My Suite", config=TestSuiteConfig(
            test_cases=[
                TestSuiteCase(name="greet", input="Hello", expected_output="Hi"),
            ],
            evaluator=my_llm_call,
        ))
        result = await suite.run()
    """

    def __init__(self, name: str, config: TestSuiteConfig) -> None:
        """Store the name and config and take a private copy of the config's cases."""
        self._name = name
        self._config = config
        # Copied so add_case() never mutates the caller's config.test_cases.
        self._cases: list[TestSuiteCase] = list(config.test_cases)

    @property
    def name(self) -> str:
        """The suite name given at construction."""
        return self._name

    def add_case(self, case: TestSuiteCase) -> None:
        """Append *case* to the suite's own case list."""
        self._cases.append(case)

    def get_config(self) -> TestSuiteConfig:
        """Return the config object passed at construction."""
        return self._config

    def get_cases(self) -> list[TestSuiteCase]:
        """Return a copy of the current case list."""
        return list(self._cases)

    @staticmethod
    async def _resolve_output(raw: Any) -> str:
        """Turn an evaluator's return value into the output string.

        Awaitables are awaited. Async iterators (for example a streaming
        evaluator written as an async generator) are drained and their chunks
        concatenated. Anything else is passed through ``str()``.
        """
        if hasattr(raw, "__await__"):
            return str(await raw)
        if hasattr(raw, "__anext__") or hasattr(raw, "__aiter__"):
            # Async iterators are not awaitable; they must be consumed with
            # `async for` rather than `await`.
            return "".join([str(piece) async for piece in raw])
        return str(raw)

    async def run(self) -> TestSuiteResult:
        """Run every case sequentially against the configured evaluator.

        For each case the evaluator is called with ``case.input``. A case
        passes when the evaluator did not raise and every recorded assertion
        passed. Declared assertions of type ``contains``, ``equals`` and
        ``not_contains_pii`` are applied; other types are ignored. If no
        assertion was recorded and the case has an ``expected_output``, the
        output is compared to it for equality.

        Returns:
            A ``TestSuiteResult`` with per-case results and aggregate counts.

        Raises:
            ValueError: If the config has no evaluator.
        """
        evaluator = self._config.evaluator
        if evaluator is None:
            raise ValueError("No evaluator provided in TestSuiteConfig")

        results: list[TestSuiteCaseResult] = []
        suite_start = time.monotonic()

        for case in self._cases:
            case_start = time.monotonic()
            output: str | None = None
            error: str | None = None
            assertions: list[AssertionResult] = []

            try:
                output = await self._resolve_output(evaluator(case.input))
            except Exception as exc:
                # Deliberately broad: an evaluator failure fails this case
                # without aborting the rest of the suite.
                error = str(exc)

            if output is not None and case.assertions:
                for assertion_def in case.assertions:
                    a_type = assertion_def.get("type", "")
                    a_value = assertion_def.get("value")
                    exp = expect(output)
                    if a_type == "contains" and isinstance(a_value, str):
                        assertions.append(exp.to_contain(a_value))
                    elif a_type == "equals" and a_value is not None:
                        assertions.append(exp.to_equal(a_value))
                    elif a_type == "not_contains_pii":
                        assertions.append(exp.to_not_contain_pii())

            # Fall back to an exact match only when nothing else was checked.
            if output is not None and case.expected_output is not None and not assertions:
                assertions.append(expect(output).to_equal(case.expected_output))

            case_passed = error is None and all(a.passed for a in assertions)
            duration = int((time.monotonic() - case_start) * 1000)

            results.append(
                TestSuiteCaseResult(
                    name=case.name,
                    passed=case_passed,
                    duration_ms=duration,
                    input=case.input,
                    output=output,
                    expected_output=case.expected_output,
                    assertions=assertions,
                    error=error,
                )
            )

        total_duration = int((time.monotonic() - suite_start) * 1000)
        passed_count = sum(1 for r in results if r.passed)

        return TestSuiteResult(
            suite_name=self._name,
            passed=all(r.passed for r in results),
            total=len(results),
            passed_count=passed_count,
            failed_count=len(results) - passed_count,
            duration_ms=total_duration,
            results=results,
        )

    def to_dict(self) -> dict[str, Any]:
        """Return the suite name, the dumped config and the dumped current cases."""
        return {
            "name": self._name,
            "config": self._config.model_dump(),
            "cases": [c.model_dump() for c in self._cases],
        }
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def create_test_suite(name: str, config: TestSuiteConfig) -> TestSuite:
    """Build a ``TestSuite`` from a name and a configuration.

    Args:
        name: Human-readable suite name.
        config: Suite configuration including test cases and evaluator.

    Returns:
        The newly constructed suite.
    """
    suite = TestSuite(name, config)
    return suite
|