evalgate-sdk 3.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalgate_sdk/__init__.py +707 -0
- evalgate_sdk/_version.py +3 -0
- evalgate_sdk/assertions.py +1362 -0
- evalgate_sdk/auto.py +247 -0
- evalgate_sdk/batch.py +174 -0
- evalgate_sdk/cache.py +111 -0
- evalgate_sdk/ci_context.py +123 -0
- evalgate_sdk/cli/__init__.py +111 -0
- evalgate_sdk/cli/api.py +261 -0
- evalgate_sdk/cli/cli_constants.py +20 -0
- evalgate_sdk/cli/commands.py +1041 -0
- evalgate_sdk/cli/config.py +228 -0
- evalgate_sdk/cli/env.py +43 -0
- evalgate_sdk/cli/formatters/types.py +132 -0
- evalgate_sdk/cli/golden_commands.py +322 -0
- evalgate_sdk/cli/manifest.py +301 -0
- evalgate_sdk/cli/new_commands.py +435 -0
- evalgate_sdk/cli/policy_packs.py +103 -0
- evalgate_sdk/cli/profiles.py +12 -0
- evalgate_sdk/cli/regression_gate.py +312 -0
- evalgate_sdk/cli/render/__init__.py +1 -0
- evalgate_sdk/cli/render/snippet.py +18 -0
- evalgate_sdk/cli/render/sort.py +29 -0
- evalgate_sdk/cli/report/__init__.py +1 -0
- evalgate_sdk/cli/report/build_check_report.py +209 -0
- evalgate_sdk/cli/traces.py +186 -0
- evalgate_sdk/cli/workspace.py +63 -0
- evalgate_sdk/client.py +609 -0
- evalgate_sdk/cluster.py +359 -0
- evalgate_sdk/collector.py +161 -0
- evalgate_sdk/constants.py +6 -0
- evalgate_sdk/context.py +151 -0
- evalgate_sdk/errors.py +236 -0
- evalgate_sdk/export.py +238 -0
- evalgate_sdk/formatters/__init__.py +11 -0
- evalgate_sdk/formatters/github.py +51 -0
- evalgate_sdk/formatters/human.py +68 -0
- evalgate_sdk/formatters/json_fmt.py +11 -0
- evalgate_sdk/formatters/pr_comment.py +80 -0
- evalgate_sdk/golden.py +426 -0
- evalgate_sdk/integrations/__init__.py +1 -0
- evalgate_sdk/integrations/anthropic.py +99 -0
- evalgate_sdk/integrations/autogen.py +62 -0
- evalgate_sdk/integrations/crewai.py +61 -0
- evalgate_sdk/integrations/langchain.py +100 -0
- evalgate_sdk/integrations/openai.py +155 -0
- evalgate_sdk/integrations/openai_eval.py +221 -0
- evalgate_sdk/local.py +144 -0
- evalgate_sdk/logger.py +123 -0
- evalgate_sdk/matchers.py +62 -0
- evalgate_sdk/otel.py +256 -0
- evalgate_sdk/pagination.py +145 -0
- evalgate_sdk/py.typed +0 -0
- evalgate_sdk/pytest_plugin.py +96 -0
- evalgate_sdk/reason_codes.py +103 -0
- evalgate_sdk/regression.py +196 -0
- evalgate_sdk/replay_decision.py +115 -0
- evalgate_sdk/runtime/__init__.py +50 -0
- evalgate_sdk/runtime/adapters/__init__.py +1 -0
- evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
- evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
- evalgate_sdk/runtime/context.py +68 -0
- evalgate_sdk/runtime/eval.py +318 -0
- evalgate_sdk/runtime/execution_mode.py +170 -0
- evalgate_sdk/runtime/executor.py +92 -0
- evalgate_sdk/runtime/registry.py +125 -0
- evalgate_sdk/runtime/run_report.py +249 -0
- evalgate_sdk/runtime/types.py +143 -0
- evalgate_sdk/snapshot.py +219 -0
- evalgate_sdk/streaming.py +124 -0
- evalgate_sdk/synthesize.py +226 -0
- evalgate_sdk/testing.py +128 -0
- evalgate_sdk/types.py +666 -0
- evalgate_sdk/utils/__init__.py +1 -0
- evalgate_sdk/utils/input_hash.py +42 -0
- evalgate_sdk/workflows.py +264 -0
- evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
- evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
- evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
- evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
|
|
11
|
+
from evalgate_sdk.auto import (
|
|
12
|
+
DEFAULT_AUTO_HISTORY_PATH,
|
|
13
|
+
DEFAULT_AUTO_REPORT_PATH,
|
|
14
|
+
AutoOptions,
|
|
15
|
+
build_auto_report,
|
|
16
|
+
format_auto_human,
|
|
17
|
+
run_auto_daemon,
|
|
18
|
+
write_auto_report,
|
|
19
|
+
)
|
|
20
|
+
from evalgate_sdk.cluster import cluster_run_result, format_cluster_human
|
|
21
|
+
from evalgate_sdk.golden import (
|
|
22
|
+
DEFAULT_LABELED_DATASET_PATH,
|
|
23
|
+
DEFAULT_SYNTHETIC_DATASET_PATH,
|
|
24
|
+
LabeledGoldenCase,
|
|
25
|
+
analyze_labeled_dataset,
|
|
26
|
+
format_analyze_human,
|
|
27
|
+
normalize_run_artifact,
|
|
28
|
+
parse_labeled_dataset,
|
|
29
|
+
write_jsonl,
|
|
30
|
+
)
|
|
31
|
+
from evalgate_sdk.replay_decision import NormalizedBudgetConfig, evaluate_replay_outcome
|
|
32
|
+
from evalgate_sdk.synthesize import format_synthesize_human, parse_dimension_matrix, synthesize_labeled_dataset
|
|
33
|
+
|
|
34
|
+
# Shared Rich console; every command in this module prints through it.
console = Console()
|
|
35
|
+
|
|
36
|
+
# Typer sub-application that the `run`, `daemon`, `history` and `init` commands below register on.
auto_app = typer.Typer(help="Autonomous prompt-improvement workflow commands.", no_args_is_help=True)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _read_json(file_path: str) -> dict[str, Any]:
    """Load ``file_path`` as UTF-8 JSON and return its top-level object.

    Args:
        file_path: Path of the JSON document to read.

    Returns:
        The decoded top-level JSON object.

    Raises:
        typer.BadParameter: If the file does not exist, is not valid JSON,
            or its top-level value is not a JSON object.
    """
    source = Path(file_path)
    try:
        raw_text = source.read_text(encoding="utf-8")
        payload = json.loads(raw_text)
    except FileNotFoundError as exc:
        raise typer.BadParameter(f"File not found: {file_path}") from exc
    except json.JSONDecodeError as exc:
        raise typer.BadParameter(f"Invalid JSON: {file_path}") from exc
    if isinstance(payload, dict):
        return payload
    raise typer.BadParameter(f"Expected JSON object in {file_path}")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _build_labeled_rows_from_cluster_summary(
    cluster_summary: dict[str, Any],
    *,
    default_label: str,
    failure_mode: str | None,
    include_passed: bool,
) -> list[LabeledGoldenCase]:
    """Flatten a cluster report into labeled golden cases.

    Args:
        cluster_summary: Decoded cluster report. Its ``"clusters"`` entry is
            read as a list of objects, each with an optional ``"cases"`` list.
        default_label: Kept for interface compatibility. The label is derived
            from each case's ``status`` alone, so this value does not change
            the result.
        failure_mode: Failure mode applied to every failed row. When ``None``
            the cluster's ``"suggestedFailureMode"`` is used if it is a
            string, otherwise the case status.
        include_passed: When false, cases whose status is ``"passed"`` are
            skipped.

    Returns:
        One ``LabeledGoldenCase`` per retained case, all sharing one UTC
        timestamp.
    """

    def _as_sequence(value: Any) -> Any:
        # A JSON ``null`` (or any other non-list value) must not crash the
        # iteration below; treat it as "no entries".
        return value if isinstance(value, (list, tuple)) else []

    rows: list[LabeledGoldenCase] = []
    timestamp = datetime.now(timezone.utc).isoformat()

    for cluster in _as_sequence(cluster_summary.get("clusters")):
        if not isinstance(cluster, dict):
            continue

        raw_id = cluster.get("id")
        cluster_id = raw_id if isinstance(raw_id, str) else None
        raw_label = cluster.get("clusterLabel")
        cluster_label = raw_label if isinstance(raw_label, str) else None

        # An explicit --failure-mode wins over the cluster's own suggestion.
        cluster_failure_mode = failure_mode
        suggested = cluster.get("suggestedFailureMode")
        if cluster_failure_mode is None and isinstance(suggested, str):
            cluster_failure_mode = suggested

        for case in _as_sequence(cluster.get("cases")):
            if not isinstance(case, dict):
                continue

            raw_status = case.get("status")
            # A missing or non-string status is treated as a failure.
            status = raw_status if isinstance(raw_status, str) else "failed"
            passed = status == "passed"
            if passed and not include_passed:
                continue

            label_value = "pass" if passed else "fail"
            rows.append(
                LabeledGoldenCase(
                    case_id=str(case.get("caseId") or case.get("id") or case.get("name") or "case"),
                    input=str(case.get("input") or ""),
                    expected=str(case.get("expected") or ""),
                    actual=str(case.get("actual") or ""),
                    label=label_value,
                    failure_mode=None if passed else (cluster_failure_mode or status),
                    labeled_at=timestamp,
                    cluster_id=cluster_id,
                    cluster_label=cluster_label,
                )
            )
    return rows
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@auto_app.command("run")
def auto_run(
    objective: str = typer.Option(..., "--objective", help="Target failure mode or objective"),
    prompt: str | None = typer.Option(None, "--prompt", help="Prompt file being optimized"),
    hypothesis: str | None = typer.Option(None, "--hypothesis", help="Experiment hypothesis"),
    baseline_run: str | None = typer.Option(None, "--baseline-run", help="Baseline run artifact"),
    candidate_run: str | None = typer.Option(None, "--candidate-run", help="Candidate run artifact"),
    budget: int = typer.Option(1, "--budget", help="Per-iteration budget"),
    budget_mode: str = typer.Option("traces", "--budget-mode", help="Budget mode: traces or cost"),
    autonomous: bool = typer.Option(False, "--autonomous", help="Enable autonomous bounded mode"),
    dry_run: bool = typer.Option(False, "--dry-run", help="Plan only"),
    fmt: str = typer.Option("human", "--format", help="Output format: human or json"),
    report: str = typer.Option(DEFAULT_AUTO_REPORT_PATH, "--report", help="Auto report output path"),
) -> None:
    # `auto run`: build one auto report from the CLI options, write it to
    # `report`, then print it as JSON or as human-readable text.
    # (Comments are used instead of a docstring so the Typer help output is unchanged.)
    wants_json = fmt == "json"

    # Any --budget-mode other than "cost" falls back to "traces"; any
    # --format other than "json" falls back to "human".
    run_options = AutoOptions(
        objective=objective,
        hypothesis=hypothesis,
        prompt_path=prompt,
        baseline_run_path=baseline_run,
        candidate_run_path=candidate_run,
        budget=budget,
        budget_mode="cost" if budget_mode == "cost" else "traces",
        autonomous=autonomous,
        dry_run=dry_run,
        format="json" if wants_json else "human",
        report_path=report,
    )
    auto_report = build_auto_report(run_options)
    write_auto_report(auto_report, report)

    if wants_json:
        console.print_json(json.dumps(auto_report.to_dict()))
        return

    console.print(format_auto_human(auto_report))
    # NOTE(review): the reviewed listing had lost its indentation; the
    # "Saved" line is assumed to belong to the human branch — confirm.
    console.print(f"\nSaved → {Path(report)}")
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@auto_app.command("daemon")
def auto_daemon(
    objective: str = typer.Option(..., "--objective", help="Target failure mode or objective"),
    prompt: str | None = typer.Option(None, "--prompt", help="Prompt file being optimized"),
    hypothesis: str | None = typer.Option(None, "--hypothesis", help="Experiment hypothesis"),
    baseline_run: str | None = typer.Option(None, "--baseline-run", help="Baseline run artifact"),
    candidate_run: str | None = typer.Option(None, "--candidate-run", help="Candidate run artifact"),
    budget: int = typer.Option(1, "--budget", help="Per-cycle budget"),
    cycles: int = typer.Option(1, "--cycles", help="Number of bounded cycles to run"),
    interval_ms: int = typer.Option(0, "--interval-ms", help="Delay between cycles"),
    fmt: str = typer.Option("human", "--format", help="Output format: human or json"),
    history: str = typer.Option(DEFAULT_AUTO_HISTORY_PATH, "--history", help="History JSONL path"),
) -> None:
    # `auto daemon`: hand the options to run_auto_daemon for `cycles` cycles
    # and print every returned report, as one JSON document or as text.
    # (Comments are used instead of a docstring so the Typer help output is unchanged.)
    as_json = fmt == "json"

    daemon_options = AutoOptions(
        objective=objective,
        hypothesis=hypothesis,
        prompt_path=prompt,
        baseline_run_path=baseline_run,
        candidate_run_path=candidate_run,
        budget=budget,
        format="json" if as_json else "human",
    )
    cycle_reports = run_auto_daemon(
        daemon_options,
        cycles=cycles,
        interval_ms=interval_ms,
        history_path=history,
    )
    total = len(cycle_reports)

    if as_json:
        payload = {
            "cycles": total,
            "history": history,
            "reports": [item.to_dict() for item in cycle_reports],
        }
        console.print_json(json.dumps(payload))
        return

    for position, cycle_report in enumerate(cycle_reports, start=1):
        console.print(f"EvalGate auto daemon cycle {position}/{total}")
        console.print(format_auto_human(cycle_report))
        console.print()
    # NOTE(review): the reviewed listing had lost its indentation; this
    # summary line is assumed to belong to the human branch — confirm.
    console.print(f"completed {total} cycle(s) successfully")
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@auto_app.command("history")
def auto_history(
    history: str = typer.Option(DEFAULT_AUTO_HISTORY_PATH, "--history", help="History JSONL path"),
    fmt: str = typer.Option("human", "--format", help="Output format: human or json"),
) -> None:
    # `auto history`: read the JSONL history file and print either every row
    # as JSON or a summary of the last ten rows as text.
    # Exits with status 0 and prints nothing when the file does not exist.
    # (Comments are used instead of a docstring so the Typer help output is unchanged.)
    path = Path(history)
    if not path.exists():
        raise typer.Exit(0)

    rows = []
    for line_number, raw_line in enumerate(path.read_text(encoding="utf-8").splitlines(), start=1):
        entry = raw_line.strip()
        if not entry:
            continue
        try:
            rows.append(json.loads(entry))
        except json.JSONDecodeError as exc:
            # Report the offending line the same way _read_json reports bad
            # input, instead of leaking a raw traceback to the user.
            raise typer.BadParameter(f"Invalid JSON on line {line_number} of {history}") from exc

    if fmt == "json":
        console.print_json(json.dumps({"history": rows}))
        return

    console.print(f"Auto history ({len(rows)})")
    for row in rows[-10:]:
        # A row that is valid JSON but not an object has no fields to show;
        # render it with placeholders rather than failing on `.get`.
        fields = row if isinstance(row, dict) else {}
        console.print(
            f"- {fields.get('generatedAt', '?')} — {fields.get('objective', '?')} ({fields.get('executionMode', '?')})"
        )
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
@auto_app.command("init")
def auto_init(
    report: str = typer.Option(DEFAULT_AUTO_REPORT_PATH, "--report", help="Initial report path"),
    history: str = typer.Option(DEFAULT_AUTO_HISTORY_PATH, "--history", help="History JSONL path"),
) -> None:
    # `auto init`: create the parent directories of the report and history
    # paths and make sure the history file exists (existing content is kept).
    # (Comments are used instead of a docstring so the Typer help output is unchanged.)
    report_dir = Path(report).parent
    history_file = Path(history)

    report_dir.mkdir(parents=True, exist_ok=True)
    history_file.parent.mkdir(parents=True, exist_ok=True)
    history_file.touch(exist_ok=True)

    console.print(f"[green]✓[/green] Prepared {report_dir}")
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def cluster(
    run: str = typer.Option(..., "--run", help="Run artifact to cluster"),
    output: str | None = typer.Option(None, "--output", help="Cluster report output path"),
    clusters: int | None = typer.Option(None, "--clusters", help="Requested cluster count"),
    include_passed: bool = typer.Option(False, "--include-passed", help="Include passed cases"),
    fmt: str = typer.Option("human", "--format", help="Output format: human or json"),
) -> None:
    # Cluster the cases of a run artifact, optionally write the cluster
    # report to `output`, and print it as JSON or as text.
    # (Comments are used instead of a docstring so any CLI help text is unchanged.)
    normalized_run = normalize_run_artifact(_read_json(run))
    summary = cluster_run_result(normalized_run, clusters=clusters, include_passed=include_passed)

    if output:
        destination = Path(output)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(json.dumps(summary.to_dict(), indent=2), encoding="utf-8")

    if fmt != "json":
        console.print(format_cluster_human(summary))
        return
    console.print_json(json.dumps(summary.to_dict()))
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def analyze(
    dataset: str = typer.Option(DEFAULT_LABELED_DATASET_PATH, "--dataset", help="Labeled dataset JSONL path"),
    top: int = typer.Option(5, "--top", help="Top N failure modes"),
    fmt: str = typer.Option("human", "--format", help="Output format: human or json"),
) -> None:
    # Parse the labeled dataset, analyze it for the `top` failure modes, and
    # print the summary as JSON or as text.
    # Raises typer.BadParameter when the dataset file does not exist.
    # (Comments are used instead of a docstring so any CLI help text is unchanged.)
    try:
        dataset_text = Path(dataset).read_text(encoding="utf-8")
    except FileNotFoundError as exc:
        # Same user-facing error style as _read_json, rather than a raw
        # traceback when the (possibly default) dataset path is missing.
        raise typer.BadParameter(f"File not found: {dataset}") from exc

    rows = parse_labeled_dataset(dataset_text)
    summary = analyze_labeled_dataset(rows, top=top)
    if fmt == "json":
        console.print_json(json.dumps(summary.to_dict()))
    else:
        console.print(format_analyze_human(summary))
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def label(
    run: str | None = typer.Option(None, "--run", help="Run artifact to convert into labeled JSONL"),
    cluster: str | None = typer.Option(None, "--cluster", help="Cluster report to convert into labeled JSONL"),
    output: str = typer.Option(DEFAULT_LABELED_DATASET_PATH, "--output", help="Output JSONL path"),
    default_label: str = typer.Option("fail", "--default-label", help="Default label: pass or fail"),
    failure_mode: str | None = typer.Option(None, "--failure-mode", help="Failure mode for failed rows"),
    include_passed: bool = typer.Option(False, "--include-passed", help="Include passed cases too"),
    fmt: str = typer.Option("human", "--format", help="Output format: human or json"),
) -> None:
    # Convert exactly one of a run artifact (--run) or a cluster report
    # (--cluster) into labeled golden cases and write them to `output` as JSONL.
    # Each row is labeled "pass" when its status is "passed", else "fail";
    # `default_label` is forwarded for cluster input but does not alter the
    # label chosen here.
    # (Comments are used instead of a docstring so any CLI help text is unchanged.)
    if bool(run) == bool(cluster):
        raise typer.BadParameter("Use exactly one of --run or --cluster")

    rows: list[LabeledGoldenCase]
    if cluster:
        rows = _build_labeled_rows_from_cluster_summary(
            _read_json(cluster),
            default_label=default_label,
            failure_mode=failure_mode,
            include_passed=include_passed,
        )
    else:
        rows = []
        normalized = normalize_run_artifact(_read_json(run or ""))
        timestamp = datetime.now(timezone.utc).isoformat()
        for case in normalized.cases:
            passed = case.status == "passed"
            if passed and not include_passed:
                continue
            rows.append(
                LabeledGoldenCase(
                    case_id=case.case_id,
                    input=case.input,
                    expected=case.expected,
                    actual=case.actual,
                    label="pass" if passed else "fail",
                    # Failed rows take the explicit --failure-mode, else their status.
                    failure_mode=None if passed else (failure_mode or case.status),
                    labeled_at=timestamp,
                )
            )

    write_jsonl(output, rows)

    if fmt == "json":
        console.print_json(json.dumps({"total": len(rows), "output": output}))
        return
    console.print(f"Wrote {len(rows)} labeled case(s) → {output}")
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def synthesize(
    dataset: str = typer.Option(DEFAULT_LABELED_DATASET_PATH, "--dataset", help="Labeled dataset JSONL path"),
    dimensions: str | None = typer.Option(None, "--dimensions", help="Dimension matrix JSON path"),
    count: int | None = typer.Option(None, "--count", help="Synthetic case count"),
    failure_mode: list[str] | None = typer.Option(None, "--failure-mode", help="Requested failure modes"),
    output: str = typer.Option(DEFAULT_SYNTHETIC_DATASET_PATH, "--output", help="Synthetic dataset output path"),
    fmt: str = typer.Option("human", "--format", help="Output format: human or json"),
) -> None:
    # Synthesize cases from a labeled dataset (optionally with a dimension
    # matrix), write them to `output` as JSONL, and print the summary.
    # Exits with status 2, writing nothing, when no failure modes were selected.
    # Raises typer.BadParameter when --dataset or --dimensions does not exist.
    # (Comments are used instead of a docstring so any CLI help text is unchanged.)
    try:
        dataset_text = Path(dataset).read_text(encoding="utf-8")
    except FileNotFoundError as exc:
        # Same user-facing error style as _read_json, rather than a raw traceback.
        raise typer.BadParameter(f"File not found: {dataset}") from exc
    rows = parse_labeled_dataset(dataset_text)

    dimension_values: dict[str, list[str]] = {}
    if dimensions:
        try:
            dimensions_text = Path(dimensions).read_text(encoding="utf-8")
        except FileNotFoundError as exc:
            raise typer.BadParameter(f"File not found: {dimensions}") from exc
        dimension_values = parse_dimension_matrix(dimensions_text).dimensions

    summary = synthesize_labeled_dataset(
        rows,
        dimensions=dimension_values,
        count=count,
        failure_modes=failure_mode,
        output_path=output,
    )
    if not summary.selected_failure_modes:
        raise typer.Exit(2)

    write_jsonl(output, summary.cases)

    if fmt == "json":
        console.print_json(json.dumps(summary.to_dict()))
    else:
        console.print(format_synthesize_human(summary))
        # NOTE(review): the reviewed listing had lost its indentation; the
        # "Saved" line is assumed to belong to the human branch — confirm.
        console.print(f"\nSaved → {output}")
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def replay_decision(
    previous: str = typer.Option(..., "--previous", help="Previous run artifact"),
    current: str = typer.Option(..., "--current", help="Current run artifact"),
    budget_mode: str = typer.Option("traces", "--budget-mode", help="Budget mode: traces or cost"),
    max_traces: int = typer.Option(100, "--max-traces", help="Trace budget when using traces mode"),
    max_cost_usd: float = typer.Option(1.0, "--max-cost-usd", help="Cost budget when using cost mode"),
    fmt: str = typer.Option("human", "--format", help="Output format: human or json"),
) -> None:
    # Load two run artifacts, evaluate the replay outcome under the given
    # budget configuration, and print the decision as JSON or as text.
    # (Comments are used instead of a docstring so any CLI help text is unchanged.)
    previous_run = _read_json(previous)
    current_run = _read_json(current)

    # Any --budget-mode other than "cost" is treated as "traces".
    selected_mode = "cost" if budget_mode == "cost" else "traces"
    budget_config = NormalizedBudgetConfig(
        mode=selected_mode,
        max_traces=max_traces,
        max_cost_usd=max_cost_usd,
    )
    decision = evaluate_replay_outcome(previous_run, current_run, budget_config)

    if fmt == "json":
        console.print_json(json.dumps(decision.to_dict()))
        return

    console.print(f"Decision: {decision.action}")
    console.print(f"Reason: {decision.reason}")
    console.print(
        f"Comparison basis: {decision.comparison_basis} ({decision.previous_pass_rate:.4f} → {decision.new_pass_rate:.4f})"
    )
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
"""Evaluation Manifest Generation.
|
|
2
|
+
|
|
3
|
+
Turn discovery output into a stable, versioned, machine-consumable artifact
|
|
4
|
+
that becomes the input to run / impact / diff.
|
|
5
|
+
|
|
6
|
+
Port of ``cli/manifest.ts``.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import hashlib
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
import re
|
|
15
|
+
import time
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
from evalgate_sdk._version import SDK_VERSION
|
|
21
|
+
|
|
22
|
+
# Schema version stamped into each generated manifest as "schemaVersion".
MANIFEST_SCHEMA_VERSION = 1
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class SpecFile:
    """Manifest entry describing one file that contains specs."""

    # Path of the spec file as stored in the manifest.
    file_path: str = ""
    # Content hash string recorded for the file.
    file_hash: str = ""
    # Number of specs discovered in the file.
    spec_count: int = 0
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _default_spec_position() -> dict[str, int]:
    """Return the fallback source position (first line, first column)."""
    return {"line": 1, "column": 1}


def _default_spec_dependencies() -> dict[str, list[str]]:
    """Return an empty dependency map with one list per dependency kind."""
    return {
        "prompts": [],
        "datasets": [],
        "tools": [],
        "code": [],
    }


@dataclass
class Spec:
    """Manifest entry describing a single specification."""

    id: str = ""
    name: str = ""
    # Suite grouping for the spec, as a list of path segments.
    suite_path: list[str] = field(default_factory=list)
    file_path: str = ""
    # 1-based "line"/"column" of the spec inside its file.
    position: dict[str, int] = field(default_factory=_default_spec_position)
    tags: list[str] = field(default_factory=list)
    # Dependency names grouped under "prompts", "datasets", "tools", "code".
    depends_on: dict[str, list[str]] = field(default_factory=_default_spec_dependencies)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class EvaluationManifest:
    """In-memory form of the evaluation manifest document."""

    schema_version: int = MANIFEST_SCHEMA_VERSION
    # Generation timestamp stored as an integer.
    generated_at: int = 0
    project: dict[str, str] = field(default_factory=dict)
    runtime: dict[str, str] = field(default_factory=dict)
    spec_files: list[SpecFile] = field(default_factory=list)
    specs: list[Spec] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the manifest as a plain dict with camelCase keys."""
        spec_file_rows = []
        for entry in self.spec_files:
            spec_file_rows.append(
                {
                    "filePath": entry.file_path,
                    "fileHash": entry.file_hash,
                    "specCount": entry.spec_count,
                }
            )

        spec_rows = []
        for item in self.specs:
            spec_rows.append(
                {
                    "id": item.id,
                    "name": item.name,
                    "suitePath": item.suite_path,
                    "filePath": item.file_path,
                    "position": item.position,
                    "tags": item.tags,
                    "dependsOn": item.depends_on,
                }
            )

        return {
            "schemaVersion": self.schema_version,
            "generatedAt": self.generated_at,
            "project": self.project,
            "runtime": self.runtime,
            "specFiles": spec_file_rows,
            "specs": spec_rows,
        }
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@dataclass
class ManifestLock:
    """Contents of the manifest lock file."""

    # Generation timestamp copied from the manifest the lock belongs to.
    generated_at: int = 0
    # Maps each spec file path to its recorded hash string.
    file_hashes: dict[str, str] = field(default_factory=dict)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@dataclass
class SpecAnalysis:
    """One spec as reported by discovery, before manifest processing."""

    id: str = ""
    name: str = ""
    # Path of the file the spec was found in.
    file: str = ""
    tags: list[str] = field(default_factory=list)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def generate_manifest(
    specs: list[SpecAnalysis],
    project_root: str,
    project_name: str,
    execution_mode: Any,
) -> EvaluationManifest:
    """Generate an evaluation manifest from discovery results.

    Args:
        specs: Discovered specs; they are grouped by their normalized file path.
        project_root: Root directory that spec paths are made relative to.
        project_name: Name stored under ``project["name"]``.
        execution_mode: Object whose ``mode`` attribute is stored as the
            runtime mode. A falsy value, or an object without ``mode``,
            yields ``"spec"``.

    Returns:
        A manifest with one ``SpecFile`` per distinct file and one ``Spec``
        per discovered spec, stamped with the current time in whole seconds.
    """
    generated_at = int(time.time())
    namespace = _generate_namespace(project_root)

    # Group specs by project-relative file path, keeping first-seen order.
    specs_by_file: dict[str, list[SpecAnalysis]] = {}
    for spec in specs:
        norm = _normalize_path(spec.file, project_root)
        specs_by_file.setdefault(norm, []).append(spec)

    spec_files: list[SpecFile] = []
    processed_specs: list[Spec] = []

    for file_path, file_specs in specs_by_file.items():
        abs_path = os.path.join(project_root, file_path)
        file_hash = _hash_file(abs_path)
        spec_files.append(SpecFile(file_path=file_path, file_hash=file_hash, spec_count=len(file_specs)))

        # Read the file once per file rather than once per spec.
        content = _read_file_safe(abs_path)

        for sa in file_specs:
            processed_specs.append(
                Spec(
                    id=sa.id,
                    name=sa.name,
                    suite_path=_generate_suite_path(sa.tags, file_path),
                    # The grouping key already is the normalized path of sa.file.
                    file_path=file_path,
                    position=_extract_position(content, sa.name),
                    tags=sa.tags,
                    # Extracted per spec so each Spec owns its own dependency dict.
                    depends_on=_extract_dependencies(content),
                )
            )

    mode_str = getattr(execution_mode, "mode", "spec") if execution_mode else "spec"

    return EvaluationManifest(
        schema_version=MANIFEST_SCHEMA_VERSION,
        generated_at=generated_at,
        project={"name": project_name, "root": ".", "namespace": namespace},
        runtime={"mode": mode_str, "sdkVersion": SDK_VERSION},
        spec_files=spec_files,
        specs=processed_specs,
    )
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def write_manifest(manifest: EvaluationManifest, project_root: str) -> None:
    """Persist the manifest and its lock file under ``<project_root>/.evalgate``.

    Writes ``manifest.json`` (the manifest's ``to_dict()`` output) and
    ``manifest.lock.json`` (generation timestamp plus a path-to-hash map),
    both as UTF-8 JSON indented by two spaces. The directory is created if
    it does not exist.
    """
    target_dir = Path(project_root) / ".evalgate"
    target_dir.mkdir(parents=True, exist_ok=True)

    manifest_json = json.dumps(manifest.to_dict(), indent=2)
    (target_dir / "manifest.json").write_text(manifest_json, encoding="utf-8")

    lock_payload = {
        "generatedAt": manifest.generated_at,
        "fileHashes": {entry.file_path: entry.file_hash for entry in manifest.spec_files},
    }
    lock_json = json.dumps(lock_payload, indent=2)
    (target_dir / "manifest.lock.json").write_text(lock_json, encoding="utf-8")
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def read_manifest(project_root: str) -> EvaluationManifest | None:
    """Read ``<project_root>/.evalgate/manifest.json`` if it is usable.

    Returns:
        The parsed manifest, or ``None`` when the file is missing or
        unreadable, is not valid UTF-8 JSON, is not a JSON object, or lacks
        required fields / has entries of the wrong shape.
    """
    manifest_path = os.path.join(project_root, ".evalgate", "manifest.json")
    try:
        data = json.loads(Path(manifest_path).read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return None
    if not isinstance(data, dict):
        return None

    try:
        m = EvaluationManifest(
            schema_version=data.get("schemaVersion", 1),
            generated_at=data.get("generatedAt", 0),
            project=data.get("project", {}),
            runtime=data.get("runtime", {}),
        )
        for sf in data.get("specFiles", []):
            m.spec_files.append(
                SpecFile(file_path=sf["filePath"], file_hash=sf["fileHash"], spec_count=sf["specCount"])
            )
        for s in data.get("specs", []):
            m.specs.append(
                Spec(
                    id=s["id"],
                    name=s["name"],
                    suite_path=s.get("suitePath", []),
                    file_path=s["filePath"],
                    position=s.get("position", {"line": 1, "column": 1}),
                    tags=s.get("tags", []),
                    depends_on=s.get("dependsOn", {"prompts": [], "datasets": [], "tools": [], "code": []}),
                )
            )
        return m
    except (KeyError, TypeError, AttributeError):
        # Missing required keys, or entries that are not JSON objects
        # (e.g. a null "specFiles" or a string inside "specs").
        return None
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def read_lock(project_root: str) -> ManifestLock | None:
    """Read ``<project_root>/.evalgate/manifest.lock.json`` if it is usable.

    Returns:
        The parsed lock, or ``None`` when the file is missing or unreadable,
        is not valid UTF-8 JSON, is not a JSON object, or has no
        ``"generatedAt"`` entry.
    """
    lock_path = os.path.join(project_root, ".evalgate", "manifest.lock.json")
    try:
        data = json.loads(Path(lock_path).read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return None
    if not isinstance(data, dict):
        return None

    try:
        generated_at = data["generatedAt"]
    except KeyError:
        return None
    return ManifestLock(generated_at=generated_at, file_hashes=data.get("fileHashes", {}))
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
# ── Internal helpers ──────────────────────────────────────────────────
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _normalize_path(file_path: str, project_root: str) -> str:
    """Return ``file_path`` relative to ``project_root`` with ``/`` separators."""
    relative = os.path.relpath(file_path, project_root)
    # Every backslash becomes a forward slash.
    return "/".join(relative.split("\\"))
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _generate_namespace(project_root: str) -> str:
    """Return the first eight hex digits of the SHA-256 of ``project_root``."""
    root_bytes = project_root.encode("utf-8")
    full_digest = hashlib.sha256(root_bytes).hexdigest()
    return full_digest[:8]
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _hash_file(file_path: str) -> str:
    """Return ``"sha256:<hex>"`` for the file's bytes, or ``"sha256:0"`` on error.

    Any ``OSError`` while reading the file (missing file, directory,
    permission problem) produces the ``"sha256:0"`` placeholder.
    """
    digest = hashlib.sha256()
    try:
        with open(file_path, "rb") as handle:
            digest.update(handle.read())
    except OSError:
        return "sha256:0"
    return f"sha256:{digest.hexdigest()}"
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _read_file_safe(path: str) -> str:
    """Return the file's UTF-8 text, or ``""`` when it cannot be read.

    Both I/O failures (``OSError``) and content that is not valid UTF-8
    (``UnicodeDecodeError``) yield the empty string, so callers never have
    to handle an exception from this helper.
    """
    try:
        return Path(path).read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # UnicodeDecodeError is a ValueError, not an OSError, so it has to be
        # listed explicitly for this helper to be safe on binary files.
        return ""
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def _extract_position(content: str, spec_name: str) -> dict[str, int]:
    """Locate the first ``define_eval("<spec_name>"`` call in ``content``.

    Returns:
        1-based ``{"line", "column"}`` of the first line-local match, or
        ``{"line": 1, "column": 1}`` when no line matches.
    """
    call_re = re.compile(r'define_eval\s*\(\s*["\']' + re.escape(spec_name) + r'["\']')
    for line_no, text in enumerate(content.splitlines(), start=1):
        hit = call_re.search(text)
        if hit is not None:
            return {"line": line_no, "column": hit.start() + 1}
    return {"line": 1, "column": 1}
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _extract_dependencies(content: str) -> dict[str, list[str]]:
    """Collect dependency names from spec source text.

    A ``depends_on = {...}`` block whose braces enclose valid JSON is used
    as-is (missing kinds default to empty lists). Otherwise the text is
    scanned with regexes: quoted ``*.md`` strings become prompts, quoted
    ``*.json`` strings become datasets, and import statements become code
    dependencies. The regex scan never fills ``"tools"``.
    """
    declared = re.search(r"depends_on\s*=\s*\{([^}]+)\}", content, re.DOTALL)
    if declared is not None:
        try:
            explicit = json.loads("{" + declared.group(1) + "}")
            return {kind: explicit.get(kind, []) for kind in ("prompts", "datasets", "tools", "code")}
        except (json.JSONDecodeError, TypeError):
            # Not JSON (e.g. a Python literal) — fall back to the regex scan.
            pass

    found: dict[str, list[str]] = {"prompts": [], "datasets": [], "tools": [], "code": []}
    scanners = (
        ("prompts", re.compile(r'["\']([^"\']*\.md)["\']')),
        ("datasets", re.compile(r'["\']([^"\']*\.json)["\']')),
        ("code", re.compile(r"from\s+([^\s]+)\s+import|import\s+([^\s]+)")),
    )
    for kind, scanner in scanners:
        for hit in scanner.finditer(content):
            # The import pattern has two alternative groups; take whichever matched.
            captured = next((group for group in hit.groups() if group), None)
            if captured:
                found[kind].append(captured)
    return found
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def _generate_suite_path(tags: list[str], file_path: str) -> list[str]:
    """Return a one-element suite path for a spec.

    The first tag wins; otherwise the first ``/``-separated segment of
    ``file_path`` is used when the path has one; otherwise ``"general"``.
    """
    if tags:
        return [tags[0]]
    leading_segment, separator, _ = file_path.partition("/")
    return [leading_segment] if separator else ["general"]
|