evalgate-sdk 3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. evalgate_sdk/__init__.py +707 -0
  2. evalgate_sdk/_version.py +3 -0
  3. evalgate_sdk/assertions.py +1362 -0
  4. evalgate_sdk/auto.py +247 -0
  5. evalgate_sdk/batch.py +174 -0
  6. evalgate_sdk/cache.py +111 -0
  7. evalgate_sdk/ci_context.py +123 -0
  8. evalgate_sdk/cli/__init__.py +111 -0
  9. evalgate_sdk/cli/api.py +261 -0
  10. evalgate_sdk/cli/cli_constants.py +20 -0
  11. evalgate_sdk/cli/commands.py +1041 -0
  12. evalgate_sdk/cli/config.py +228 -0
  13. evalgate_sdk/cli/env.py +43 -0
  14. evalgate_sdk/cli/formatters/types.py +132 -0
  15. evalgate_sdk/cli/golden_commands.py +322 -0
  16. evalgate_sdk/cli/manifest.py +301 -0
  17. evalgate_sdk/cli/new_commands.py +435 -0
  18. evalgate_sdk/cli/policy_packs.py +103 -0
  19. evalgate_sdk/cli/profiles.py +12 -0
  20. evalgate_sdk/cli/regression_gate.py +312 -0
  21. evalgate_sdk/cli/render/__init__.py +1 -0
  22. evalgate_sdk/cli/render/snippet.py +18 -0
  23. evalgate_sdk/cli/render/sort.py +29 -0
  24. evalgate_sdk/cli/report/__init__.py +1 -0
  25. evalgate_sdk/cli/report/build_check_report.py +209 -0
  26. evalgate_sdk/cli/traces.py +186 -0
  27. evalgate_sdk/cli/workspace.py +63 -0
  28. evalgate_sdk/client.py +609 -0
  29. evalgate_sdk/cluster.py +359 -0
  30. evalgate_sdk/collector.py +161 -0
  31. evalgate_sdk/constants.py +6 -0
  32. evalgate_sdk/context.py +151 -0
  33. evalgate_sdk/errors.py +236 -0
  34. evalgate_sdk/export.py +238 -0
  35. evalgate_sdk/formatters/__init__.py +11 -0
  36. evalgate_sdk/formatters/github.py +51 -0
  37. evalgate_sdk/formatters/human.py +68 -0
  38. evalgate_sdk/formatters/json_fmt.py +11 -0
  39. evalgate_sdk/formatters/pr_comment.py +80 -0
  40. evalgate_sdk/golden.py +426 -0
  41. evalgate_sdk/integrations/__init__.py +1 -0
  42. evalgate_sdk/integrations/anthropic.py +99 -0
  43. evalgate_sdk/integrations/autogen.py +62 -0
  44. evalgate_sdk/integrations/crewai.py +61 -0
  45. evalgate_sdk/integrations/langchain.py +100 -0
  46. evalgate_sdk/integrations/openai.py +155 -0
  47. evalgate_sdk/integrations/openai_eval.py +221 -0
  48. evalgate_sdk/local.py +144 -0
  49. evalgate_sdk/logger.py +123 -0
  50. evalgate_sdk/matchers.py +62 -0
  51. evalgate_sdk/otel.py +256 -0
  52. evalgate_sdk/pagination.py +145 -0
  53. evalgate_sdk/py.typed +0 -0
  54. evalgate_sdk/pytest_plugin.py +96 -0
  55. evalgate_sdk/reason_codes.py +103 -0
  56. evalgate_sdk/regression.py +196 -0
  57. evalgate_sdk/replay_decision.py +115 -0
  58. evalgate_sdk/runtime/__init__.py +50 -0
  59. evalgate_sdk/runtime/adapters/__init__.py +1 -0
  60. evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
  61. evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
  62. evalgate_sdk/runtime/context.py +68 -0
  63. evalgate_sdk/runtime/eval.py +318 -0
  64. evalgate_sdk/runtime/execution_mode.py +170 -0
  65. evalgate_sdk/runtime/executor.py +92 -0
  66. evalgate_sdk/runtime/registry.py +125 -0
  67. evalgate_sdk/runtime/run_report.py +249 -0
  68. evalgate_sdk/runtime/types.py +143 -0
  69. evalgate_sdk/snapshot.py +219 -0
  70. evalgate_sdk/streaming.py +124 -0
  71. evalgate_sdk/synthesize.py +226 -0
  72. evalgate_sdk/testing.py +128 -0
  73. evalgate_sdk/types.py +666 -0
  74. evalgate_sdk/utils/__init__.py +1 -0
  75. evalgate_sdk/utils/input_hash.py +42 -0
  76. evalgate_sdk/workflows.py +264 -0
  77. evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
  78. evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
  79. evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
  80. evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,80 @@
1
+ """PR comment formatter — Markdown output for pull request comments."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+
8
def format_pr_comment(report: dict[str, Any]) -> str:
    """Format a check/run report as a Markdown PR comment.

    Several keys are accepted in both camelCase and snake_case form
    (for example ``reasonMessage`` / ``reason_message``). A key whose value
    is ``None`` is treated as absent, so an explicit JSON ``null`` neither
    hides the alternate spelling nor crashes number formatting.

    Args:
        report: Report mapping. Keys read: ``verdict``, ``evaluationId`` /
            ``run_id``, ``score``, ``baselineScore`` / ``baseline_score``,
            ``delta``, ``reasonMessage`` / ``reason_message``, ``summary``,
            ``failedCases`` / ``failed_cases``, ``dashboardUrl`` /
            ``dashboard_url``.

    Returns:
        The Markdown comment body as a single newline-joined string.
    """

    def _pick(mapping: dict[str, Any], *keys: str, default: Any = None) -> Any:
        """Return the first value among *keys* that is present and not None."""
        for key in keys:
            value = mapping.get(key)
            if value is not None:
                return value
        return default

    lines: list[str] = []
    verdict = report.get("verdict", "unknown")
    eval_id = _pick(report, "evaluationId", "run_id", default="?")
    score = report.get("score")
    reason = _pick(report, "reasonMessage", "reason_message", default="")

    # Header: anything that is not an explicit pass/warn is rendered as a failure.
    if verdict == "pass":
        lines.append("## ✅ EvalGate: Pass")
    elif verdict == "warn":
        lines.append("## ⚠️ EvalGate: Warning")
    else:
        lines.append("## ❌ EvalGate: Fail")

    lines.append("")

    # Summary table
    summary = report.get("summary") or {}
    if summary:
        pass_rate = _pick(summary, "pass_rate", "passRate", default=0)
        average = _pick(summary, "average_score", "averageScore", default=0)
        lines.append("| Metric | Value |")
        lines.append("|--------|-------|")
        lines.append(f"| Total | {_pick(summary, 'total', default=0)} |")
        lines.append(f"| Passed | {_pick(summary, 'passed', default=0)} |")
        lines.append(f"| Failed | {_pick(summary, 'failed', default=0)} |")
        lines.append(f"| Pass Rate | {pass_rate:.1f}% |")
        lines.append(f"| Avg Score | {average:.1f} |")
        lines.append("")

    # Score + baseline
    if score is not None:
        baseline = _pick(report, "baselineScore", "baseline_score")
        if baseline is not None:
            # Only derive the delta when the report does not carry a usable one;
            # this avoids computing score - baseline needlessly.
            delta = report.get("delta")
            if delta is None:
                delta = score - baseline
            sign = "+" if delta >= 0 else ""
            lines.append(f"**Score:** {score} (baseline: {baseline}, delta: {sign}{delta:.1f})")
        else:
            lines.append(f"**Score:** {score}")
        lines.append("")

    # Reason
    if reason:
        lines.append(f"> {reason}")
        lines.append("")

    # Failed cases: collapsed section, capped at 20 entries to keep the comment short.
    failed_cases = _pick(report, "failedCases", "failed_cases", default=[])
    if failed_cases:
        lines.append("<details>")
        lines.append(f"<summary>Failed cases ({len(failed_cases)})</summary>")
        lines.append("")
        for fc in failed_cases[:20]:
            name = fc.get("name", fc.get("test_name", "?"))
            msg = fc.get("reason", fc.get("message", ""))
            lines.append(f"- **{name}**: {msg}")
        if len(failed_cases) > 20:
            lines.append(f"- ... and {len(failed_cases) - 20} more")
        lines.append("")
        lines.append("</details>")
        lines.append("")

    # Dashboard link
    dashboard = _pick(report, "dashboardUrl", "dashboard_url")
    if dashboard:
        lines.append(f"[View in dashboard]({dashboard})")

    # Footer
    lines.append(f"\n<sub>Evaluation: {eval_id}</sub>")

    return "\n".join(lines)
evalgate_sdk/golden.py ADDED
@@ -0,0 +1,426 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from dataclasses import dataclass, field
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ from typing import Any, Literal
8
+
9
# Default on-disk locations of the golden datasets, relative to the working directory.
DEFAULT_LABELED_DATASET_PATH = str(Path(".evalgate", "golden", "labeled.jsonl"))
DEFAULT_SYNTHETIC_DATASET_PATH = str(Path(".evalgate", "golden", "synthetic.jsonl"))

# Label attached to a golden case.
LabeledOutcome = Literal["pass", "fail"]
# Status values recognised for a single run result.
RunStatus = Literal["passed", "failed", "error", "timeout", "skipped"]
14
+
15
+
16
+ @dataclass(slots=True)
17
+ class LabeledGoldenCase:
18
+ case_id: str
19
+ input: str
20
+ expected: str
21
+ actual: str
22
+ label: LabeledOutcome
23
+ failure_mode: str | None
24
+ labeled_at: str
25
+ cluster_id: str | None = None
26
+ cluster_label: str | None = None
27
+
28
+ def to_dict(self) -> dict[str, Any]:
29
+ data: dict[str, Any] = {
30
+ "caseId": self.case_id,
31
+ "input": self.input,
32
+ "expected": self.expected,
33
+ "actual": self.actual,
34
+ "label": self.label,
35
+ "failureMode": self.failure_mode,
36
+ "labeledAt": self.labeled_at,
37
+ }
38
+ if self.cluster_id is not None:
39
+ data["clusterId"] = self.cluster_id
40
+ if self.cluster_label is not None:
41
+ data["clusterLabel"] = self.cluster_label
42
+ return data
43
+
44
+
45
@dataclass(slots=True)
class SyntheticGoldenCase(LabeledGoldenCase):
    """A golden case produced by synthesis, with provenance fields added."""

    synthetic: bool = True
    synthesized_at: str = ""
    source_case_ids: list[str] = field(default_factory=list)
    dimensions: dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the base record extended with the synthesis fields.

        The ``synthetic`` key is always written as ``True``, regardless of the
        value stored on the instance.
        """
        # Explicit base-class call instead of zero-argument super().
        # NOTE(review): presumably because slots=True rebuilds the class and
        # can break super() on some Python versions; confirm before changing.
        record = LabeledGoldenCase.to_dict(self)
        record["synthetic"] = True
        record["synthesizedAt"] = self.synthesized_at
        # Copies keep later mutation of the record from touching the instance.
        record["sourceCaseIds"] = list(self.source_case_ids)
        record["dimensions"] = dict(self.dimensions)
        return record
63
+
64
+
65
+ @dataclass(slots=True)
66
+ class FailureModeSummary:
67
+ mode: str
68
+ count: int
69
+ frequency: float
70
+
71
+ def to_dict(self) -> dict[str, Any]:
72
+ return {
73
+ "mode": self.mode,
74
+ "count": self.count,
75
+ "frequency": self.frequency,
76
+ }
77
+
78
+
79
+ @dataclass(slots=True)
80
+ class AnalyzeSummary:
81
+ total: int
82
+ failed: int
83
+ pass_rate: float
84
+ failure_modes: list[FailureModeSummary]
85
+
86
+ def to_dict(self) -> dict[str, Any]:
87
+ return {
88
+ "total": self.total,
89
+ "failed": self.failed,
90
+ "passRate": self.pass_rate,
91
+ "failureModes": [item.to_dict() for item in self.failure_modes],
92
+ }
93
+
94
+
95
+ @dataclass(slots=True)
96
+ class NormalizedRunCase:
97
+ case_id: str
98
+ name: str
99
+ file_path: str
100
+ status: RunStatus
101
+ input: str
102
+ expected: str
103
+ actual: str
104
+ passed: bool
105
+ score: float
106
+ duration_ms: float
107
+ error: str | None = None
108
+ raw: dict[str, Any] = field(default_factory=dict)
109
+
110
+
111
+ @dataclass(slots=True)
112
+ class NormalizedRunArtifact:
113
+ run_id: str
114
+ total_run_results: int
115
+ summary: dict[str, Any]
116
+ cases: list[NormalizedRunCase]
117
+ source_format: str
118
+
119
+
120
+ @dataclass(slots=True)
121
+ class RunMetrics:
122
+ pass_rate_ratio: float
123
+ corrected_pass_rate_ratio: float | None
124
+ total_cost_usd: float | None
125
+ total_results: int
126
+
127
+
128
+ def _is_iso_timestamp(value: str) -> bool:
129
+ candidate = value.strip()
130
+ if not candidate:
131
+ return False
132
+ normalized = candidate[:-1] + "+00:00" if candidate.endswith("Z") else candidate
133
+ try:
134
+ datetime.fromisoformat(normalized)
135
+ except ValueError:
136
+ return False
137
+ return True
138
+
139
+
140
+ def _require_string(record: dict[str, Any], key: str, line_number: int) -> str:
141
+ value = record.get(key)
142
+ if not isinstance(value, str):
143
+ raise ValueError(f"Invalid labeled dataset at line {line_number}: {key} must be a string")
144
+ return value
145
+
146
+
147
def parse_labeled_dataset(content: str) -> list[LabeledGoldenCase]:
    """Parse and validate JSONL text into labeled golden cases.

    Blank lines are skipped. Each remaining line must be a JSON object with
    string ``caseId`` (non-empty), ``input``, ``expected``, ``actual`` and
    ``labeledAt`` (ISO timestamp), and a ``label`` of ``"pass"`` or ``"fail"``.
    Failed rows need a non-empty ``failureMode``; passing rows must leave it
    null or empty. ``clusterId`` / ``clusterLabel`` are optional strings.

    Args:
        content: Full text of the JSONL file.

    Returns:
        One ``LabeledGoldenCase`` per non-blank line, in file order.

    Raises:
        ValueError: On the first invalid line. The reported line number is the
            physical line number in *content*, counting blank lines.
    """
    parsed_rows: list[LabeledGoldenCase] = []
    # Enumerate physical lines (not the blank-filtered list) so error messages
    # point at the real line in the file.
    for index, raw_line in enumerate(content.splitlines(), start=1):
        line = raw_line.strip()
        if not line:
            continue
        try:
            parsed = json.loads(line)
        except json.JSONDecodeError as exc:
            raise ValueError(f"Invalid JSONL at line {index}: expected valid JSON object") from exc
        if not isinstance(parsed, dict):
            raise ValueError(f"Invalid JSONL at line {index}: expected JSON object record")

        case_id = _require_string(parsed, "caseId", index).strip()
        if not case_id:
            raise ValueError(f"Invalid labeled dataset at line {index}: caseId must be a non-empty string")
        input_text = _require_string(parsed, "input", index)
        expected = _require_string(parsed, "expected", index)
        actual = _require_string(parsed, "actual", index)
        label = parsed.get("label")
        if label not in ("pass", "fail"):
            raise ValueError(f'Invalid labeled dataset at line {index}: label must be "pass" or "fail"')

        failure_mode_value = parsed.get("failureMode")
        if failure_mode_value == "":
            failure_mode_value = None
        if not (isinstance(failure_mode_value, str) or failure_mode_value is None):
            raise ValueError(f"Invalid labeled dataset at line {index}: failureMode must be string or null")
        if label == "fail" and (failure_mode_value is None or not failure_mode_value.strip()):
            raise ValueError(
                f"Invalid labeled dataset at line {index}: failed rows require a non-empty failureMode"
            )
        if label == "pass" and isinstance(failure_mode_value, str) and failure_mode_value.strip():
            raise ValueError(
                f"Invalid labeled dataset at line {index}: passing rows must set failureMode to null or empty string"
            )
        # A whitespace-only mode (only reachable on passing rows) is stored as
        # None, the same as an explicit empty string.
        failure_mode = failure_mode_value.strip() if isinstance(failure_mode_value, str) else None
        failure_mode = failure_mode or None

        labeled_at = _require_string(parsed, "labeledAt", index)
        if not _is_iso_timestamp(labeled_at):
            raise ValueError(
                f"Invalid labeled dataset at line {index}: labeledAt must be an ISO timestamp string"
            )

        cluster_id = parsed.get("clusterId")
        cluster_label = parsed.get("clusterLabel")
        if cluster_id is not None and not isinstance(cluster_id, str):
            raise ValueError(f"Invalid labeled dataset at line {index}: clusterId must be a string when present")
        if cluster_label is not None and not isinstance(cluster_label, str):
            raise ValueError(f"Invalid labeled dataset at line {index}: clusterLabel must be a string when present")

        parsed_rows.append(
            LabeledGoldenCase(
                case_id=case_id,
                input=input_text,
                expected=expected,
                actual=actual,
                label=label,
                failure_mode=failure_mode,
                labeled_at=labeled_at,
                cluster_id=cluster_id,
                cluster_label=cluster_label,
            )
        )

    return parsed_rows
210
+
211
+
212
def analyze_labeled_dataset(rows: list[LabeledGoldenCase], top: int = 5) -> AnalyzeSummary:
    """Summarize pass rate and the most frequent failure modes.

    Args:
        rows: Labeled cases to analyze.
        top: Maximum number of failure modes to report; values below 1 are
            treated as 1.

    Returns:
        An ``AnalyzeSummary`` whose failure modes are ordered by descending
        count, then by mode name. Each frequency is relative to the number of
        failed rows.
    """
    failures = [row for row in rows if row.label == "fail"]
    total = len(rows)
    failed = len(failures)

    tally: dict[str, int] = {}
    for row in failures:
        stripped = row.failure_mode.strip() if isinstance(row.failure_mode, str) else ""
        # Failed rows without a usable mode are grouped under a fixed bucket.
        bucket = stripped if stripped else "failed_without_mode"
        tally[bucket] = tally.get(bucket, 0) + 1

    limit = max(1, top)
    ranked = sorted(tally.items(), key=lambda pair: (-pair[1], pair[0]))[:limit]

    modes = []
    for mode, count in ranked:
        share = count / failed if failed > 0 else 0.0
        modes.append(FailureModeSummary(mode=mode, count=count, frequency=share))

    return AnalyzeSummary(
        total=total,
        failed=failed,
        pass_rate=(total - failed) / total if total > 0 else 0.0,
        failure_modes=modes,
    )
231
+
232
+
233
def format_analyze_human(summary: AnalyzeSummary) -> str:
    """Render an analyze summary as plain text, one fact per line."""
    failed_pct = (summary.failed / summary.total) * 100 if summary.total else 0.0
    report = [
        "Analyze phase",
        f"Total cases: {summary.total}",
        f"Failed: {summary.failed} ({failed_pct:.1f}%)",
        f"Pass rate: {(summary.pass_rate * 100):.1f}%",
    ]
    if summary.failure_modes:
        report.append("Top failure modes:")
        report.extend(
            f"{rank}. {entry.mode} — {entry.count} ({(entry.frequency * 100):.1f}%)"
            for rank, entry in enumerate(summary.failure_modes, start=1)
        )
    else:
        report.append("Failure modes: none")
    return "\n".join(report)
247
+
248
+
249
def write_jsonl(file_path: str, rows: list[LabeledGoldenCase | SyntheticGoldenCase]) -> None:
    """Write *rows* to *file_path* as compact JSONL, creating parent directories.

    Each row is serialized through its ``to_dict()`` method. A non-empty file
    ends with a trailing newline; an empty *rows* list produces an empty file.
    """
    target = Path(file_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = [json.dumps(row.to_dict(), separators=(",", ":")) for row in rows]
    payload = "\n".join(serialized)
    if payload:
        payload = f"{payload}\n"
    target.write_text(payload, encoding="utf-8")
254
+
255
+
256
+ def _string_value(value: Any) -> str:
257
+ if value is None:
258
+ return ""
259
+ if isinstance(value, str):
260
+ return value
261
+ return json.dumps(value, ensure_ascii=False) if isinstance(value, (dict, list)) else str(value)
262
+
263
+
264
+ def _status_from_mapping(item: dict[str, Any]) -> RunStatus:
265
+ status = item.get("status")
266
+ if isinstance(status, str) and status in {"passed", "failed", "error", "timeout", "skipped"}:
267
+ return status
268
+ passed = item.get("passed")
269
+ if isinstance(passed, bool):
270
+ return "passed" if passed else "failed"
271
+ return "failed"
272
+
273
+
274
+ def _normalize_ratio(value: Any) -> float | None:
275
+ if not isinstance(value, (int, float)):
276
+ return None
277
+ ratio = float(value)
278
+ if ratio > 1.0:
279
+ ratio /= 100.0
280
+ if ratio < 0.0:
281
+ return 0.0
282
+ if ratio > 1.0:
283
+ return 1.0
284
+ return ratio
285
+
286
+
287
def extract_run_metrics(run_data: dict[str, Any]) -> RunMetrics:
    """Pull pass rate, corrected pass rate, cost and result count from a run.

    The raw pass rate comes from the first available source, in order:
    ``summary.passRate``, ``summary.pass_rate``, ``summary.passed / total``,
    then a count over ``results``. The first two go through
    ``_normalize_ratio``; the computed fallbacks are used as-is.
    """
    summary_field = run_data.get("summary")
    summary = summary_field if isinstance(summary_field, dict) else {}
    results_field = run_data.get("results")
    results = results_field if isinstance(results_field, list) else []
    summary_total = summary.get("total")

    def _entry_passed(entry: Any) -> bool:
        """Tell whether one raw result entry counts as passed."""
        if not isinstance(entry, dict):
            return False
        nested = entry.get("result")
        if isinstance(nested, dict):
            # A nested result is decided by its own status alone.
            return nested.get("status") == "passed"
        return entry.get("passed") is True

    # Raw pass rate: explicit summary values first, then derived values.
    raw_ratio = _normalize_ratio(summary.get("passRate"))
    if raw_ratio is None:
        raw_ratio = _normalize_ratio(summary.get("pass_rate"))
    if raw_ratio is None:
        summary_passed = summary.get("passed")
        if isinstance(summary_total, int) and summary_total > 0 and isinstance(summary_passed, int):
            raw_ratio = summary_passed / summary_total
        else:
            passed_count = sum(1 for entry in results if _entry_passed(entry))
            raw_ratio = (passed_count / len(results)) if len(results) > 0 else 0.0

    # Corrected pass rate: camelCase key first, snake_case second.
    corrected_ratio = _normalize_ratio(summary.get("correctedPassRate"))
    if corrected_ratio is None:
        corrected_ratio = _normalize_ratio(summary.get("corrected_pass_rate"))

    # Cost: the camelCase key wins unless it is missing or None.
    cost = summary.get("totalCostUsd")
    if cost is None:
        cost = summary.get("total_cost_usd")

    return RunMetrics(
        pass_rate_ratio=raw_ratio or 0.0,
        corrected_pass_rate_ratio=corrected_ratio,
        total_cost_usd=float(cost) if isinstance(cost, (int, float)) else None,
        total_results=summary_total if isinstance(summary_total, int) else len(results),
    )
334
+
335
+
336
def normalize_run_artifact(run_data: dict[str, Any]) -> NormalizedRunArtifact:
    """Convert a run artifact in any supported layout into normalized form.

    Three result layouts are recognised per entry:

    * ``"typescript"``: the entry carries a nested ``result`` mapping.
    * ``"python_run_report"``: the entry has a ``test_id`` / ``testId`` /
      ``test_name`` / ``testName`` key.
    * ``"legacy"``: anything else.

    ``source_format`` reflects the last entry that matched one of the first
    two layouts, and stays ``"legacy"`` if none did. Entries that are not
    mappings are skipped.

    The returned summary is a copy of ``run_data["summary"]`` with missing
    counters and pass rates filled in. The caller's ``run_data`` is not
    modified.

    Args:
        run_data: Parsed run artifact; must contain a ``results`` list.

    Returns:
        The normalized artifact.

    Raises:
        ValueError: If ``results`` is missing or not a list.
    """
    results = run_data.get("results")
    if not isinstance(results, list):
        raise ValueError("Run artifact must contain a results array")

    normalized_cases: list[NormalizedRunCase] = []
    source_format = "legacy"
    for index, item in enumerate(results):
        if not isinstance(item, dict):
            continue
        # Placeholder identifier used when the entry carries no id or name.
        fallback_name = f"case-{index + 1}"
        if isinstance(item.get("result"), dict):
            source_format = "typescript"
            nested = item["result"]
            status = nested.get("status") if isinstance(nested.get("status"), str) else _status_from_mapping(item)
            case = NormalizedRunCase(
                case_id=_string_value(
                    item.get("specId") or item.get("testId") or item.get("caseId") or item.get("id") or fallback_name
                ),
                name=_string_value(item.get("name") or item.get("testName") or item.get("spec") or fallback_name),
                file_path=_string_value(item.get("filePath") or item.get("file_path") or ""),
                status=status,
                input=_string_value(item.get("input")),
                expected=_string_value(item.get("expected") or item.get("expectedOutput")),
                actual=_string_value(item.get("actual") or item.get("output") or nested.get("error")),
                passed=status == "passed",
                score=float(nested.get("score") or 0.0),
                duration_ms=float(
                    nested.get("duration") or nested.get("durationMs") or item.get("durationMs") or 0.0
                ),
                error=_string_value(nested.get("error")) or None,
                raw=item,
            )
        elif "test_id" in item or "testId" in item or "test_name" in item or "testName" in item:
            source_format = "python_run_report"
            status = _status_from_mapping(item)
            metadata = item.get("metadata") if isinstance(item.get("metadata"), dict) else {}
            case = NormalizedRunCase(
                case_id=_string_value(item.get("test_id") or item.get("testId") or fallback_name),
                name=_string_value(item.get("test_name") or item.get("testName") or fallback_name),
                file_path=_string_value(item.get("file_path") or item.get("filePath") or ""),
                status=status,
                input=_string_value(item.get("input") or metadata.get("input")),
                expected=_string_value(item.get("expected") or metadata.get("expected")),
                actual=_string_value(
                    item.get("actual") or metadata.get("actual") or metadata.get("output") or item.get("error")
                ),
                passed=bool(item.get("passed")),
                score=float(item.get("score") or 0.0),
                duration_ms=float(item.get("duration_ms") or item.get("durationMs") or 0.0),
                error=_string_value(item.get("error")) or None,
                raw=item,
            )
        else:
            status = _status_from_mapping(item)
            case = NormalizedRunCase(
                case_id=_string_value(
                    item.get("specId") or item.get("test_id") or item.get("spec") or item.get("name") or fallback_name
                ),
                name=_string_value(
                    item.get("name")
                    or item.get("spec")
                    or item.get("test_name")
                    or item.get("testName")
                    or fallback_name
                ),
                file_path=_string_value(item.get("file_path") or item.get("filePath") or ""),
                status=status,
                input=_string_value(item.get("input")),
                expected=_string_value(item.get("expected") or item.get("expectedOutput")),
                actual=_string_value(item.get("actual") or item.get("output") or item.get("error")),
                passed=bool(item.get("passed")),
                score=float(item.get("score") or 0.0),
                duration_ms=float(item.get("duration_ms") or item.get("durationMs") or 0.0),
                error=_string_value(item.get("error")) or None,
                raw=item,
            )
        normalized_cases.append(case)

    run_id = _string_value(run_data.get("runId") or run_data.get("run_id") or run_data.get("id") or "run-latest")
    metrics = extract_run_metrics(run_data)

    # Work on a shallow copy so filling in defaults never mutates the caller's
    # run_data["summary"].
    raw_summary = run_data.get("summary")
    summary: dict[str, Any] = dict(raw_summary) if isinstance(raw_summary, dict) else {}

    if "total" not in summary:
        summary["total"] = len(normalized_cases)
    if "passed" not in summary:
        summary["passed"] = sum(1 for case in normalized_cases if case.passed)
    # The remaining counters are tallied by status; the key is the plural form.
    for key, wanted in (("failed", "failed"), ("errors", "error"), ("timeouts", "timeout")):
        if key not in summary:
            summary[key] = sum(1 for case in normalized_cases if case.status == wanted)

    # passRate is a ratio, while pass_rate is stored as a percentage.
    summary.setdefault("passRate", metrics.pass_rate_ratio)
    summary.setdefault("pass_rate", metrics.pass_rate_ratio * 100.0)
    if metrics.corrected_pass_rate_ratio is not None:
        # Both spellings of the corrected rate are stored as ratios.
        summary.setdefault("correctedPassRate", metrics.corrected_pass_rate_ratio)
        summary.setdefault("corrected_pass_rate", metrics.corrected_pass_rate_ratio)

    return NormalizedRunArtifact(
        run_id=run_id,
        total_run_results=len(normalized_cases),
        summary=summary,
        cases=normalized_cases,
        source_format=source_format,
    )
@@ -0,0 +1 @@
1
+ """Framework integrations for OpenAI, Anthropic, and agent frameworks."""
@@ -0,0 +1,99 @@
1
+ """Anthropic tracing integration — wraps Anthropic client calls with EvalAI traces."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+ from typing import Any
7
+
8
+ from evalgate_sdk.types import CreateTraceParams
9
+
10
+
11
async def trace_anthropic_call(
    client: Any,
    name: str,
    fn: Any,
    *,
    metadata: dict[str, Any] | None = None,
) -> Any:
    """Trace a single Anthropic call.

    A trace is created before the call and updated afterwards: with the first
    content block's text, the duration and token usage on success, or with the
    error message on failure.

    Args:
        client: AIEvalClient instance.
        name: Descriptive name for the trace.
        fn: An async callable that performs the Anthropic call.
        metadata: Extra metadata to attach. A ``"provider": "anthropic"``
            entry is always added and overrides any caller-supplied value.

    Returns:
        The result of calling *fn*.

    Raises:
        Exception: Whatever *fn* raises, re-raised after the trace has been
            marked as ``"error"``. Errors from the trace client itself also
            propagate.
    """
    from evalgate_sdk.types import UpdateTraceParams

    trace = await client.traces.create(
        CreateTraceParams(name=name, metadata={**(metadata or {}), "provider": "anthropic"})
    )
    start = time.monotonic()
    # Only the provider call is guarded: a failure while recording a
    # *successful* call must not be reported as a failed Anthropic call.
    try:
        result = await fn()
    except Exception as exc:
        await client.traces.update(
            trace.id,
            UpdateTraceParams(status="error", metadata={"error": str(exc)}),
        )
        raise

    duration_ms = int((time.monotonic() - start) * 1000)

    output_text = ""
    usage: dict[str, Any] = {}
    content = getattr(result, "content", None)
    if content:
        # Only the first content block is recorded.
        first = content[0]
        output_text = first.text if hasattr(first, "text") else str(first)
    usage_info = getattr(result, "usage", None)
    if usage_info:
        # Missing token counters are recorded as None instead of raising.
        usage = {
            "input_tokens": getattr(usage_info, "input_tokens", None),
            "output_tokens": getattr(usage_info, "output_tokens", None),
        }

    await client.traces.update(
        trace.id,
        UpdateTraceParams(
            output=output_text,
            status="completed",
            metadata={"duration_ms": duration_ms, "usage": usage},
        ),
    )
    return result
67
+
68
+
69
def trace_anthropic(anthropic_client: Any, eval_client: Any, **kwargs: Any) -> Any:
    """Wrap an Anthropic client so every ``messages.create`` call is traced.

    Returns a lightweight proxy; the original client is not modified. Extra
    keyword arguments are merged into the metadata of every trace. Attributes
    the proxy does not define are forwarded to the wrapped objects.
    """

    class _TracedMessages:
        """Proxy for ``client.messages`` that traces ``create``."""

        def __init__(self, original: Any) -> None:
            self._original = original

        def __getattr__(self, name: str) -> Any:
            # Reached only for attributes not defined on the proxy itself.
            return getattr(self._original, name)

        async def create(self, **kw: Any) -> Any:
            def _invoke() -> Any:
                return self._original.create(**kw)

            trace_metadata = {"model": kw.get("model"), **kwargs}
            # The trace is named after the requested model when one is given.
            trace_name = kw.get("model", "anthropic-messages")
            return await trace_anthropic_call(
                eval_client,
                trace_name,
                _invoke,
                metadata=trace_metadata,
            )

    class _TracedAnthropic:
        """Proxy for the client itself; swaps in the traced ``messages``."""

        def __init__(self, original: Any) -> None:
            self.messages = _TracedMessages(original.messages)
            self._original = original

        def __getattr__(self, name: str) -> Any:
            return getattr(self._original, name)

    proxy = _TracedAnthropic(anthropic_client)
    return proxy
@@ -0,0 +1,62 @@
1
+ """AutoGen tracing integration — wraps AutoGen conversations with EvalAI workflow traces."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from evalgate_sdk.workflows import WorkflowStatus, WorkflowTracer
8
+
9
+
10
def trace_autogen(
    conversation: Any,
    tracer: WorkflowTracer,
    *,
    conversation_name: str = "AutoGenConversation",
) -> Any:
    """Wrap an AutoGen conversation so ``initiate_chat`` is traced as a full workflow.

    Returns a lightweight proxy; the original conversation is not modified.
    Only the proxy's ``initiate_chat`` is traced; every other attribute is
    forwarded to the wrapped object unchanged.

    Args:
        conversation: An AutoGen agent or ``ConversableAgent`` with
            ``.initiate_chat()`` or ``.a_initiate_chat()`` methods.
        tracer: An active ``WorkflowTracer`` instance.
        conversation_name: Name used for the workflow and spans.

    Example::

        from autogen import ConversableAgent
        from evalgate_sdk import WorkflowTracer, AIEvalClient

        client = AIEvalClient.init()
        tracer = WorkflowTracer(client)
        agent = ConversableAgent(...)

        traced = trace_autogen(agent, tracer, conversation_name="CodeReview")
        result = await traced.initiate_chat(recipient, message="Review this PR")
    """
    import inspect

    class _TracedAutoGen:
        """Proxy that records a workflow and an agent span around the chat."""

        def __init__(self, original: Any) -> None:
            self._original = original

        async def initiate_chat(self, *args: Any, **kwargs: Any) -> Any:
            await tracer.start_workflow(conversation_name)
            span = await tracer.start_agent_span(conversation_name, {"args": str(args), "kwargs": str(kwargs)})
            try:
                # Prefer the async entry point when the wrapped object has one.
                if hasattr(self._original, "a_initiate_chat"):
                    outcome = self._original.a_initiate_chat(*args, **kwargs)
                else:
                    outcome = self._original.initiate_chat(*args, **kwargs)
                # A synchronous initiate_chat returns its result directly;
                # awaiting that unconditionally would raise TypeError.
                result = await outcome if inspect.isawaitable(outcome) else outcome
            except Exception as exc:
                await tracer.end_agent_span(span, error=str(exc))
                await tracer.end_workflow({"error": str(exc)}, WorkflowStatus.FAILED)
                raise
            else:
                # Success bookkeeping sits outside the guarded call so a tracer
                # failure here cannot end the span a second time.
                await tracer.end_agent_span(span, output={"result": str(result)})
                await tracer.end_workflow({"result": str(result)}, WorkflowStatus.COMPLETED)
                return result

        def __getattr__(self, name: str) -> Any:
            # Reached only for attributes not defined on the proxy itself.
            return getattr(self._original, name)

    return _TracedAutoGen(conversation)