evalgate-sdk 3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. evalgate_sdk/__init__.py +707 -0
  2. evalgate_sdk/_version.py +3 -0
  3. evalgate_sdk/assertions.py +1362 -0
  4. evalgate_sdk/auto.py +247 -0
  5. evalgate_sdk/batch.py +174 -0
  6. evalgate_sdk/cache.py +111 -0
  7. evalgate_sdk/ci_context.py +123 -0
  8. evalgate_sdk/cli/__init__.py +111 -0
  9. evalgate_sdk/cli/api.py +261 -0
  10. evalgate_sdk/cli/cli_constants.py +20 -0
  11. evalgate_sdk/cli/commands.py +1041 -0
  12. evalgate_sdk/cli/config.py +228 -0
  13. evalgate_sdk/cli/env.py +43 -0
  14. evalgate_sdk/cli/formatters/types.py +132 -0
  15. evalgate_sdk/cli/golden_commands.py +322 -0
  16. evalgate_sdk/cli/manifest.py +301 -0
  17. evalgate_sdk/cli/new_commands.py +435 -0
  18. evalgate_sdk/cli/policy_packs.py +103 -0
  19. evalgate_sdk/cli/profiles.py +12 -0
  20. evalgate_sdk/cli/regression_gate.py +312 -0
  21. evalgate_sdk/cli/render/__init__.py +1 -0
  22. evalgate_sdk/cli/render/snippet.py +18 -0
  23. evalgate_sdk/cli/render/sort.py +29 -0
  24. evalgate_sdk/cli/report/__init__.py +1 -0
  25. evalgate_sdk/cli/report/build_check_report.py +209 -0
  26. evalgate_sdk/cli/traces.py +186 -0
  27. evalgate_sdk/cli/workspace.py +63 -0
  28. evalgate_sdk/client.py +609 -0
  29. evalgate_sdk/cluster.py +359 -0
  30. evalgate_sdk/collector.py +161 -0
  31. evalgate_sdk/constants.py +6 -0
  32. evalgate_sdk/context.py +151 -0
  33. evalgate_sdk/errors.py +236 -0
  34. evalgate_sdk/export.py +238 -0
  35. evalgate_sdk/formatters/__init__.py +11 -0
  36. evalgate_sdk/formatters/github.py +51 -0
  37. evalgate_sdk/formatters/human.py +68 -0
  38. evalgate_sdk/formatters/json_fmt.py +11 -0
  39. evalgate_sdk/formatters/pr_comment.py +80 -0
  40. evalgate_sdk/golden.py +426 -0
  41. evalgate_sdk/integrations/__init__.py +1 -0
  42. evalgate_sdk/integrations/anthropic.py +99 -0
  43. evalgate_sdk/integrations/autogen.py +62 -0
  44. evalgate_sdk/integrations/crewai.py +61 -0
  45. evalgate_sdk/integrations/langchain.py +100 -0
  46. evalgate_sdk/integrations/openai.py +155 -0
  47. evalgate_sdk/integrations/openai_eval.py +221 -0
  48. evalgate_sdk/local.py +144 -0
  49. evalgate_sdk/logger.py +123 -0
  50. evalgate_sdk/matchers.py +62 -0
  51. evalgate_sdk/otel.py +256 -0
  52. evalgate_sdk/pagination.py +145 -0
  53. evalgate_sdk/py.typed +0 -0
  54. evalgate_sdk/pytest_plugin.py +96 -0
  55. evalgate_sdk/reason_codes.py +103 -0
  56. evalgate_sdk/regression.py +196 -0
  57. evalgate_sdk/replay_decision.py +115 -0
  58. evalgate_sdk/runtime/__init__.py +50 -0
  59. evalgate_sdk/runtime/adapters/__init__.py +1 -0
  60. evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
  61. evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
  62. evalgate_sdk/runtime/context.py +68 -0
  63. evalgate_sdk/runtime/eval.py +318 -0
  64. evalgate_sdk/runtime/execution_mode.py +170 -0
  65. evalgate_sdk/runtime/executor.py +92 -0
  66. evalgate_sdk/runtime/registry.py +125 -0
  67. evalgate_sdk/runtime/run_report.py +249 -0
  68. evalgate_sdk/runtime/types.py +143 -0
  69. evalgate_sdk/snapshot.py +219 -0
  70. evalgate_sdk/streaming.py +124 -0
  71. evalgate_sdk/synthesize.py +226 -0
  72. evalgate_sdk/testing.py +128 -0
  73. evalgate_sdk/types.py +666 -0
  74. evalgate_sdk/utils/__init__.py +1 -0
  75. evalgate_sdk/utils/input_hash.py +42 -0
  76. evalgate_sdk/workflows.py +264 -0
  77. evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
  78. evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
  79. evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
  80. evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,322 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from datetime import datetime, timezone
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import typer
9
+ from rich.console import Console
10
+
11
+ from evalgate_sdk.auto import (
12
+ DEFAULT_AUTO_HISTORY_PATH,
13
+ DEFAULT_AUTO_REPORT_PATH,
14
+ AutoOptions,
15
+ build_auto_report,
16
+ format_auto_human,
17
+ run_auto_daemon,
18
+ write_auto_report,
19
+ )
20
+ from evalgate_sdk.cluster import cluster_run_result, format_cluster_human
21
+ from evalgate_sdk.golden import (
22
+ DEFAULT_LABELED_DATASET_PATH,
23
+ DEFAULT_SYNTHETIC_DATASET_PATH,
24
+ LabeledGoldenCase,
25
+ analyze_labeled_dataset,
26
+ format_analyze_human,
27
+ normalize_run_artifact,
28
+ parse_labeled_dataset,
29
+ write_jsonl,
30
+ )
31
+ from evalgate_sdk.replay_decision import NormalizedBudgetConfig, evaluate_replay_outcome
32
+ from evalgate_sdk.synthesize import format_synthesize_human, parse_dimension_matrix, synthesize_labeled_dataset
33
+
34
+ console = Console()
35
+
36
+ auto_app = typer.Typer(help="Autonomous prompt-improvement workflow commands.", no_args_is_help=True)
37
+
38
+
39
+ def _read_json(file_path: str) -> dict[str, Any]:
40
+ try:
41
+ data = json.loads(Path(file_path).read_text(encoding="utf-8"))
42
+ except FileNotFoundError as exc:
43
+ raise typer.BadParameter(f"File not found: {file_path}") from exc
44
+ except json.JSONDecodeError as exc:
45
+ raise typer.BadParameter(f"Invalid JSON: {file_path}") from exc
46
+ if not isinstance(data, dict):
47
+ raise typer.BadParameter(f"Expected JSON object in {file_path}")
48
+ return data
49
+
50
+
51
+ def _build_labeled_rows_from_cluster_summary(
52
+ cluster_summary: dict[str, Any],
53
+ *,
54
+ default_label: str,
55
+ failure_mode: str | None,
56
+ include_passed: bool,
57
+ ) -> list[LabeledGoldenCase]:
58
+ rows: list[LabeledGoldenCase] = []
59
+ timestamp = datetime.now(timezone.utc).isoformat()
60
+ for cluster in cluster_summary.get("clusters", []):
61
+ if not isinstance(cluster, dict):
62
+ continue
63
+ cluster_id = cluster.get("id") if isinstance(cluster.get("id"), str) else None
64
+ cluster_label = cluster.get("clusterLabel") if isinstance(cluster.get("clusterLabel"), str) else None
65
+ cluster_failure_mode = failure_mode
66
+ if cluster_failure_mode is None and isinstance(cluster.get("suggestedFailureMode"), str):
67
+ cluster_failure_mode = cluster["suggestedFailureMode"]
68
+ for case in cluster.get("cases", []):
69
+ if not isinstance(case, dict):
70
+ continue
71
+ status = case.get("status") if isinstance(case.get("status"), str) else "failed"
72
+ if status == "passed" and not include_passed:
73
+ continue
74
+ label_value = "pass" if status == "passed" and default_label != "fail" else ("pass" if status == "passed" else "fail")
75
+ rows.append(
76
+ LabeledGoldenCase(
77
+ case_id=str(case.get("caseId") or case.get("id") or case.get("name") or "case"),
78
+ input=str(case.get("input") or ""),
79
+ expected=str(case.get("expected") or ""),
80
+ actual=str(case.get("actual") or ""),
81
+ label=label_value,
82
+ failure_mode=None if label_value == "pass" else (cluster_failure_mode or status),
83
+ labeled_at=timestamp,
84
+ cluster_id=cluster_id,
85
+ cluster_label=cluster_label,
86
+ )
87
+ )
88
+ return rows
89
+
90
+
91
@auto_app.command("run")
def auto_run(
    objective: str = typer.Option(..., "--objective", help="Target failure mode or objective"),
    prompt: str | None = typer.Option(None, "--prompt", help="Prompt file being optimized"),
    hypothesis: str | None = typer.Option(None, "--hypothesis", help="Experiment hypothesis"),
    baseline_run: str | None = typer.Option(None, "--baseline-run", help="Baseline run artifact"),
    candidate_run: str | None = typer.Option(None, "--candidate-run", help="Candidate run artifact"),
    budget: int = typer.Option(1, "--budget", help="Per-iteration budget"),
    budget_mode: str = typer.Option("traces", "--budget-mode", help="Budget mode: traces or cost"),
    autonomous: bool = typer.Option(False, "--autonomous", help="Enable autonomous bounded mode"),
    dry_run: bool = typer.Option(False, "--dry-run", help="Plan only"),
    fmt: str = typer.Option("human", "--format", help="Output format: human or json"),
    report: str = typer.Option(DEFAULT_AUTO_REPORT_PATH, "--report", help="Auto report output path"),
) -> None:
    # Build one auto report from the CLI options, persist it to `report`, then
    # print it. (Kept as comments: a docstring would become Typer help text.)
    as_json = fmt == "json"
    # Any value other than "cost" / "json" silently falls back to the default.
    normalized_budget_mode = "cost" if budget_mode == "cost" else "traces"
    normalized_format = "json" if as_json else "human"

    options = AutoOptions(
        objective=objective,
        hypothesis=hypothesis,
        prompt_path=prompt,
        baseline_run_path=baseline_run,
        candidate_run_path=candidate_run,
        budget=budget,
        budget_mode=normalized_budget_mode,
        autonomous=autonomous,
        dry_run=dry_run,
        format=normalized_format,
        report_path=report,
    )
    auto_report = build_auto_report(options)
    write_auto_report(auto_report, report)

    if as_json:
        console.print_json(json.dumps(auto_report.to_dict()))
        return
    console.print(format_auto_human(auto_report))
    # NOTE(review): assumed to belong to the human-format branch so JSON output stays clean.
    console.print(f"\nSaved → {Path(report)}")
125
+
126
+
127
@auto_app.command("daemon")
def auto_daemon(
    objective: str = typer.Option(..., "--objective", help="Target failure mode or objective"),
    prompt: str | None = typer.Option(None, "--prompt", help="Prompt file being optimized"),
    hypothesis: str | None = typer.Option(None, "--hypothesis", help="Experiment hypothesis"),
    baseline_run: str | None = typer.Option(None, "--baseline-run", help="Baseline run artifact"),
    candidate_run: str | None = typer.Option(None, "--candidate-run", help="Candidate run artifact"),
    budget: int = typer.Option(1, "--budget", help="Per-cycle budget"),
    cycles: int = typer.Option(1, "--cycles", help="Number of bounded cycles to run"),
    interval_ms: int = typer.Option(0, "--interval-ms", help="Delay between cycles"),
    fmt: str = typer.Option("human", "--format", help="Output format: human or json"),
    history: str = typer.Option(DEFAULT_AUTO_HISTORY_PATH, "--history", help="History JSONL path"),
) -> None:
    # Run `cycles` auto cycles through run_auto_daemon and print every report.
    # (Kept as comments: a docstring would become Typer help text.)
    as_json = fmt == "json"
    options = AutoOptions(
        objective=objective,
        hypothesis=hypothesis,
        prompt_path=prompt,
        baseline_run_path=baseline_run,
        candidate_run_path=candidate_run,
        budget=budget,
        format="json" if as_json else "human",
    )
    reports = run_auto_daemon(options, cycles=cycles, interval_ms=interval_ms, history_path=history)

    if as_json:
        payload = {
            "cycles": len(reports),
            "history": history,
            "reports": [report.to_dict() for report in reports],
        }
        console.print_json(json.dumps(payload))
        return

    for index, report_item in enumerate(reports, start=1):
        console.print(f"EvalGate auto daemon cycle {index}/{len(reports)}")
        console.print(format_auto_human(report_item))
        console.print()
    # NOTE(review): assumed to belong to the human-format branch so JSON output stays clean.
    console.print(f"completed {len(reports)} cycle(s) successfully")
158
+
159
+
160
@auto_app.command("history")
def auto_history(
    history: str = typer.Option(DEFAULT_AUTO_HISTORY_PATH, "--history", help="History JSONL path"),
    fmt: str = typer.Option("human", "--format", help="Output format: human or json"),
) -> None:
    # Print the entries of the history JSONL file; exits with code 0 and no
    # output when the file does not exist.
    history_file = Path(history)
    if not history_file.exists():
        raise typer.Exit(0)

    # One JSON document per non-blank line.
    stripped_lines = (raw.strip() for raw in history_file.read_text(encoding="utf-8").splitlines())
    rows = [json.loads(entry) for entry in stripped_lines if entry]

    if fmt == "json":
        console.print_json(json.dumps({"history": rows}))
        return

    console.print(f"Auto history ({len(rows)})")
    # Only the ten most recent entries are listed in human mode.
    for row in rows[-10:]:
        console.print(f"- {row.get('generatedAt', '?')} — {row.get('objective', '?')} ({row.get('executionMode', '?')})")
180
+
181
+
182
@auto_app.command("init")
def auto_init(
    report: str = typer.Option(DEFAULT_AUTO_REPORT_PATH, "--report", help="Initial report path"),
    history: str = typer.Option(DEFAULT_AUTO_HISTORY_PATH, "--history", help="History JSONL path"),
) -> None:
    # Create the directories for the report and history paths and make sure the
    # history file exists (an existing file is left as is).
    report_dir = Path(report).parent
    history_file = Path(history)

    report_dir.mkdir(parents=True, exist_ok=True)
    history_file.parent.mkdir(parents=True, exist_ok=True)
    history_file.touch(exist_ok=True)

    console.print(f"[green]✓[/green] Prepared {report_dir}")
191
+
192
+
193
def cluster(
    run: str = typer.Option(..., "--run", help="Run artifact to cluster"),
    output: str | None = typer.Option(None, "--output", help="Cluster report output path"),
    clusters: int | None = typer.Option(None, "--clusters", help="Requested cluster count"),
    include_passed: bool = typer.Option(False, "--include-passed", help="Include passed cases"),
    fmt: str = typer.Option("human", "--format", help="Output format: human or json"),
) -> None:
    # Cluster the cases of a run artifact, optionally save the report as
    # indented JSON, then print it in the requested format.
    normalized_run = normalize_run_artifact(_read_json(run))
    summary = cluster_run_result(normalized_run, clusters=clusters, include_passed=include_passed)

    if output:
        destination = Path(output)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(json.dumps(summary.to_dict(), indent=2), encoding="utf-8")

    if fmt != "json":
        console.print(format_cluster_human(summary))
        return
    console.print_json(json.dumps(summary.to_dict()))
210
+
211
+
212
def analyze(
    dataset: str = typer.Option(DEFAULT_LABELED_DATASET_PATH, "--dataset", help="Labeled dataset JSONL path"),
    top: int = typer.Option(5, "--top", help="Top N failure modes"),
    fmt: str = typer.Option("human", "--format", help="Output format: human or json"),
) -> None:
    # Parse the labeled dataset file and print its analysis summary.
    dataset_text = Path(dataset).read_text(encoding="utf-8")
    summary = analyze_labeled_dataset(parse_labeled_dataset(dataset_text), top=top)

    if fmt != "json":
        console.print(format_analyze_human(summary))
        return
    console.print_json(json.dumps(summary.to_dict()))
223
+
224
+
225
def label(
    run: str | None = typer.Option(None, "--run", help="Run artifact to convert into labeled JSONL"),
    cluster: str | None = typer.Option(None, "--cluster", help="Cluster report to convert into labeled JSONL"),
    output: str = typer.Option(DEFAULT_LABELED_DATASET_PATH, "--output", help="Output JSONL path"),
    default_label: str = typer.Option("fail", "--default-label", help="Default label: pass or fail"),
    failure_mode: str | None = typer.Option(None, "--failure-mode", help="Failure mode for failed rows"),
    include_passed: bool = typer.Option(False, "--include-passed", help="Include passed cases too"),
    fmt: str = typer.Option("human", "--format", help="Output format: human or json"),
) -> None:
    # Turn either a run artifact or a cluster report into labeled JSONL rows.
    # Exactly one of --run / --cluster must be supplied.
    if bool(run) == bool(cluster):
        raise typer.BadParameter("Use exactly one of --run or --cluster")

    rows: list[LabeledGoldenCase]
    if cluster:
        rows = _build_labeled_rows_from_cluster_summary(
            _read_json(cluster),
            default_label=default_label,
            failure_mode=failure_mode,
            include_passed=include_passed,
        )
    else:
        rows = []
        normalized = normalize_run_artifact(_read_json(run or ""))
        # Every row written by one invocation shares the same timestamp.
        timestamp = datetime.now(timezone.utc).isoformat()
        for case in normalized.cases:
            passed = case.status == "passed"
            if passed and not include_passed:
                continue
            # NOTE(review): the original expression yields "pass" for passed cases and
            # "fail" otherwise whatever default_label is; this is its reduced form.
            label_value = "pass" if passed else "fail"
            row_failure_mode = None if passed else (failure_mode or case.status)
            rows.append(
                LabeledGoldenCase(
                    case_id=case.case_id,
                    input=case.input,
                    expected=case.expected,
                    actual=case.actual,
                    label=label_value,
                    failure_mode=row_failure_mode,
                    labeled_at=timestamp,
                )
            )

    write_jsonl(output, rows)

    if fmt != "json":
        console.print(f"Wrote {len(rows)} labeled case(s) → {output}")
        return
    console.print_json(json.dumps({"total": len(rows), "output": output}))
268
+
269
+
270
def synthesize(
    dataset: str = typer.Option(DEFAULT_LABELED_DATASET_PATH, "--dataset", help="Labeled dataset JSONL path"),
    dimensions: str | None = typer.Option(None, "--dimensions", help="Dimension matrix JSON path"),
    count: int | None = typer.Option(None, "--count", help="Synthetic case count"),
    failure_mode: list[str] | None = typer.Option(None, "--failure-mode", help="Requested failure modes"),
    output: str = typer.Option(DEFAULT_SYNTHETIC_DATASET_PATH, "--output", help="Synthetic dataset output path"),
    fmt: str = typer.Option("human", "--format", help="Output format: human or json"),
) -> None:
    # Synthesize cases from a labeled dataset and write them to `output`.
    # Exits with code 2, writing nothing, when no failure mode was selected.
    labeled_rows = parse_labeled_dataset(Path(dataset).read_text(encoding="utf-8"))

    dimension_values: dict[str, list[str]] = {}
    if dimensions:
        matrix_text = Path(dimensions).read_text(encoding="utf-8")
        dimension_values = parse_dimension_matrix(matrix_text).dimensions

    summary = synthesize_labeled_dataset(
        labeled_rows,
        dimensions=dimension_values,
        count=count,
        failure_modes=failure_mode,
        output_path=output,
    )
    if not summary.selected_failure_modes:
        raise typer.Exit(2)

    write_jsonl(output, summary.cases)

    if fmt == "json":
        console.print_json(json.dumps(summary.to_dict()))
        return
    console.print(format_synthesize_human(summary))
    # NOTE(review): assumed to belong to the human-format branch so JSON output stays clean.
    console.print(f"\nSaved → {output}")
297
+
298
+
299
def replay_decision(
    previous: str = typer.Option(..., "--previous", help="Previous run artifact"),
    current: str = typer.Option(..., "--current", help="Current run artifact"),
    budget_mode: str = typer.Option("traces", "--budget-mode", help="Budget mode: traces or cost"),
    max_traces: int = typer.Option(100, "--max-traces", help="Trace budget when using traces mode"),
    max_cost_usd: float = typer.Option(1.0, "--max-cost-usd", help="Cost budget when using cost mode"),
    fmt: str = typer.Option("human", "--format", help="Output format: human or json"),
) -> None:
    # Compare two run artifacts under a budget configuration and print the
    # resulting replay decision.
    previous_run = _read_json(previous)
    current_run = _read_json(current)

    # Any budget mode other than "cost" falls back to "traces".
    mode = "cost" if budget_mode == "cost" else "traces"
    config = NormalizedBudgetConfig(mode=mode, max_traces=max_traces, max_cost_usd=max_cost_usd)
    decision = evaluate_replay_outcome(previous_run, current_run, config)

    if fmt == "json":
        console.print_json(json.dumps(decision.to_dict()))
        return

    console.print(f"Decision: {decision.action}")
    console.print(f"Reason: {decision.reason}")
    console.print(
        f"Comparison basis: {decision.comparison_basis} ({decision.previous_pass_rate:.4f} → {decision.new_pass_rate:.4f})"
    )
@@ -0,0 +1,301 @@
1
+ """Evaluation Manifest Generation.
2
+
3
+ Turn discovery output into a stable, versioned, machine-consumable artifact
4
+ that becomes the input to run / impact / diff.
5
+
6
+ Port of ``cli/manifest.ts``.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import hashlib
12
+ import json
13
+ import os
14
+ import re
15
+ import time
16
+ from dataclasses import dataclass, field
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ from evalgate_sdk._version import SDK_VERSION
21
+
22
+ MANIFEST_SCHEMA_VERSION = 1
23
+
24
+
25
@dataclass
class SpecFile:
    """A spec source file as recorded in the manifest."""

    # Path of the spec file; generate_manifest stores it relative to the
    # project root with "/" separators.
    file_path: str = ""
    # Content digest string; generate_manifest fills it from _hash_file.
    file_hash: str = ""
    # Number of specs discovered in this file.
    spec_count: int = 0
32
+
33
+
34
@dataclass
class Spec:
    """A single specification entry of the manifest."""

    id: str = ""
    name: str = ""
    suite_path: list[str] = field(default_factory=list)
    file_path: str = ""
    # 1-based line/column; defaults to the start of the file.
    position: dict[str, int] = field(default_factory=lambda: dict(line=1, column=1))
    tags: list[str] = field(default_factory=list)
    # Dependency names grouped by kind; every instance gets its own fresh lists.
    depends_on: dict[str, list[str]] = field(
        default_factory=lambda: {kind: [] for kind in ("prompts", "datasets", "tools", "code")}
    )
52
+
53
+
54
@dataclass
class EvaluationManifest:
    """Manifest document: project/runtime metadata plus the spec files and specs."""

    schema_version: int = MANIFEST_SCHEMA_VERSION
    # generate_manifest sets this from int(time.time()).
    generated_at: int = 0
    project: dict[str, str] = field(default_factory=dict)
    runtime: dict[str, str] = field(default_factory=dict)
    spec_files: list[SpecFile] = field(default_factory=list)
    specs: list[Spec] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the manifest as a plain dict using camelCase keys."""
        spec_file_entries = []
        for entry in self.spec_files:
            spec_file_entries.append(
                {"filePath": entry.file_path, "fileHash": entry.file_hash, "specCount": entry.spec_count}
            )

        spec_entries = []
        for spec in self.specs:
            spec_entries.append(
                {
                    "id": spec.id,
                    "name": spec.name,
                    "suitePath": spec.suite_path,
                    "filePath": spec.file_path,
                    "position": spec.position,
                    "tags": spec.tags,
                    "dependsOn": spec.depends_on,
                }
            )

        return {
            "schemaVersion": self.schema_version,
            "generatedAt": self.generated_at,
            "project": self.project,
            "runtime": self.runtime,
            "specFiles": spec_file_entries,
            "specs": spec_entries,
        }
88
+
89
+
90
@dataclass
class ManifestLock:
    """Contents of the manifest lock file."""

    # Copied from the manifest's generated_at when the lock is written.
    generated_at: int = 0
    # Maps each spec file path to its recorded hash string.
    file_hashes: dict[str, str] = field(default_factory=dict)
96
+
97
+
98
@dataclass
class SpecAnalysis:
    """What discovery reports about one spec."""

    id: str = ""
    name: str = ""
    # Path of the file the spec was found in.
    file: str = ""
    tags: list[str] = field(default_factory=list)
106
+
107
+
108
def generate_manifest(
    specs: list[SpecAnalysis],
    project_root: str,
    project_name: str,
    execution_mode: Any,
) -> EvaluationManifest:
    """Generate evaluation manifest from discovery results.

    Args:
        specs: Discovered specs; they are grouped by their file path relative
            to ``project_root``.
        project_root: Directory that spec paths are resolved against.
        project_name: Stored as the manifest's project name.
        execution_mode: Object whose ``mode`` attribute becomes the runtime
            mode; a falsy value or a missing attribute yields ``"spec"``.

    Returns:
        A manifest with one ``SpecFile`` per distinct file and one ``Spec`` per
        discovered spec, in discovery order within each file.
    """
    generated_at = int(time.time())
    namespace = _generate_namespace(project_root)

    specs_by_file: dict[str, list[SpecAnalysis]] = {}
    for spec in specs:
        norm = _normalize_path(spec.file, project_root)
        specs_by_file.setdefault(norm, []).append(spec)

    spec_files: list[SpecFile] = []
    processed_specs: list[Spec] = []

    for file_path, file_specs in specs_by_file.items():
        abs_path = os.path.join(project_root, file_path)
        file_hash = _hash_file(abs_path)
        spec_files.append(SpecFile(file_path=file_path, file_hash=file_hash, spec_count=len(file_specs)))

        # The file content is the same for every spec in this file, so it is
        # read once per file instead of once per spec.
        content = _read_file_safe(abs_path)

        for sa in file_specs:
            processed_specs.append(
                Spec(
                    id=sa.id,
                    name=sa.name,
                    suite_path=_generate_suite_path(sa.tags, file_path),
                    # `file_path` is the grouping key, i.e. already the
                    # normalized form of `sa.file`.
                    file_path=file_path,
                    position=_extract_position(content, sa.name),
                    tags=sa.tags,
                    # Extracted per spec so each Spec owns an independent dict.
                    depends_on=_extract_dependencies(content),
                )
            )

    mode_str = getattr(execution_mode, "mode", "spec") if execution_mode else "spec"

    return EvaluationManifest(
        schema_version=MANIFEST_SCHEMA_VERSION,
        generated_at=generated_at,
        project={"name": project_name, "root": ".", "namespace": namespace},
        runtime={"mode": mode_str, "sdkVersion": SDK_VERSION},
        spec_files=spec_files,
        specs=processed_specs,
    )
160
+
161
+
162
def write_manifest(manifest: EvaluationManifest, project_root: str) -> None:
    """Persist *manifest* and its lock file under ``<project_root>/.evalgate``.

    Writes ``manifest.json`` (the full manifest) and ``manifest.lock.json``
    (generation time plus per-file hashes), creating the directory if needed.
    """
    target_dir = Path(project_root) / ".evalgate"
    target_dir.mkdir(parents=True, exist_ok=True)

    manifest_json = json.dumps(manifest.to_dict(), indent=2)
    (target_dir / "manifest.json").write_text(manifest_json, encoding="utf-8")

    hashes_by_path = {}
    for spec_file in manifest.spec_files:
        hashes_by_path[spec_file.file_path] = spec_file.file_hash
    lock = ManifestLock(generated_at=manifest.generated_at, file_hashes=hashes_by_path)

    lock_payload = {
        "generatedAt": lock.generated_at,
        "fileHashes": lock.file_hashes,
    }
    (target_dir / "manifest.lock.json").write_text(json.dumps(lock_payload, indent=2), encoding="utf-8")
185
+
186
+
187
def read_manifest(project_root: str) -> EvaluationManifest | None:
    """Read existing manifest.

    Returns:
        The manifest stored at ``<project_root>/.evalgate/manifest.json``, or
        ``None`` when the file is missing, unreadable, not valid UTF-8 JSON,
        not a JSON object, or contains malformed spec entries.
    """
    manifest_path = os.path.join(project_root, ".evalgate", "manifest.json")
    try:
        data = json.loads(Path(manifest_path).read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return None

    # A valid JSON document that is not an object (e.g. a list) is not a manifest.
    if not isinstance(data, dict):
        return None

    try:
        spec_files = [
            SpecFile(file_path=sf["filePath"], file_hash=sf["fileHash"], spec_count=sf["specCount"])
            for sf in data.get("specFiles", [])
        ]
        specs = [
            Spec(
                id=s["id"],
                name=s["name"],
                suite_path=s.get("suitePath", []),
                file_path=s["filePath"],
                position=s.get("position", {"line": 1, "column": 1}),
                tags=s.get("tags", []),
                depends_on=s.get("dependsOn", {"prompts": [], "datasets": [], "tools": [], "code": []}),
            )
            for s in data.get("specs", [])
        ]
    except (KeyError, TypeError, AttributeError):
        # Missing required keys, or entries / collections of the wrong JSON type.
        return None

    return EvaluationManifest(
        schema_version=data.get("schemaVersion", 1),
        generated_at=data.get("generatedAt", 0),
        project=data.get("project", {}),
        runtime=data.get("runtime", {}),
        spec_files=spec_files,
        specs=specs,
    )
217
+
218
+
219
def read_lock(project_root: str) -> ManifestLock | None:
    """Read existing lock file.

    Returns:
        The lock stored at ``<project_root>/.evalgate/manifest.lock.json``, or
        ``None`` when the file is missing, unreadable, not valid UTF-8 JSON,
        not a JSON object, or lacks ``generatedAt``.
    """
    lock_path = os.path.join(project_root, ".evalgate", "manifest.lock.json")
    try:
        data = json.loads(Path(lock_path).read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return None

    # Indexing a non-object document with a string key would raise TypeError.
    if not isinstance(data, dict):
        return None

    try:
        generated_at = data["generatedAt"]
    except KeyError:
        return None
    return ManifestLock(generated_at=generated_at, file_hashes=data.get("fileHashes", {}))
227
+
228
+
229
+ # ── Internal helpers ──────────────────────────────────────────────────
230
+
231
+
232
+ def _normalize_path(file_path: str, project_root: str) -> str:
233
+ return os.path.relpath(file_path, project_root).replace("\\", "/")
234
+
235
+
236
+ def _generate_namespace(project_root: str) -> str:
237
+ return hashlib.sha256(project_root.encode("utf-8")).hexdigest()[:8]
238
+
239
+
240
+ def _hash_file(file_path: str) -> str:
241
+ try:
242
+ content = Path(file_path).read_bytes()
243
+ return f"sha256:{hashlib.sha256(content).hexdigest()}"
244
+ except OSError:
245
+ return "sha256:0"
246
+
247
+
248
+ def _read_file_safe(path: str) -> str:
249
+ try:
250
+ return Path(path).read_text(encoding="utf-8")
251
+ except OSError:
252
+ return ""
253
+
254
+
255
+ def _extract_position(content: str, spec_name: str) -> dict[str, int]:
256
+ pattern = re.compile(r'define_eval\s*\(\s*["\']' + re.escape(spec_name) + r'["\']')
257
+ for i, line_content in enumerate(content.splitlines()):
258
+ m = pattern.search(line_content)
259
+ if m:
260
+ return {"line": i + 1, "column": m.start() + 1}
261
+ return {"line": 1, "column": 1}
262
+
263
+
264
+ def _extract_dependencies(content: str) -> dict[str, list[str]]:
265
+ deps: dict[str, list[str]] = {"prompts": [], "datasets": [], "tools": [], "code": []}
266
+
267
+ depends_on_match = re.search(r"depends_on\s*=\s*\{([^}]+)\}", content, re.DOTALL)
268
+ if depends_on_match:
269
+ try:
270
+ raw = "{" + depends_on_match.group(1) + "}"
271
+ parsed = json.loads(raw)
272
+ return {
273
+ "prompts": parsed.get("prompts", []),
274
+ "datasets": parsed.get("datasets", []),
275
+ "tools": parsed.get("tools", []),
276
+ "code": parsed.get("code", []),
277
+ }
278
+ except (json.JSONDecodeError, TypeError):
279
+ pass
280
+
281
+ patterns = {
282
+ "prompts": re.compile(r'["\']([^"\']*\.md)["\']'),
283
+ "datasets": re.compile(r'["\']([^"\']*\.json)["\']'),
284
+ "code": re.compile(r"from\s+([^\s]+)\s+import|import\s+([^\s]+)"),
285
+ }
286
+ for key, pat in patterns.items():
287
+ for m in pat.finditer(content):
288
+ val = m.group(1) or (m.group(2) if m.lastindex and m.lastindex >= 2 else None)
289
+ if val:
290
+ deps[key].append(val)
291
+
292
+ return deps
293
+
294
+
295
+ def _generate_suite_path(tags: list[str], file_path: str) -> list[str]:
296
+ if tags:
297
+ return [tags[0]]
298
+ parts = file_path.split("/")
299
+ if len(parts) > 1:
300
+ return [parts[0]]
301
+ return ["general"]