evalgate-sdk 3.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalgate_sdk/__init__.py +707 -0
- evalgate_sdk/_version.py +3 -0
- evalgate_sdk/assertions.py +1362 -0
- evalgate_sdk/auto.py +247 -0
- evalgate_sdk/batch.py +174 -0
- evalgate_sdk/cache.py +111 -0
- evalgate_sdk/ci_context.py +123 -0
- evalgate_sdk/cli/__init__.py +111 -0
- evalgate_sdk/cli/api.py +261 -0
- evalgate_sdk/cli/cli_constants.py +20 -0
- evalgate_sdk/cli/commands.py +1041 -0
- evalgate_sdk/cli/config.py +228 -0
- evalgate_sdk/cli/env.py +43 -0
- evalgate_sdk/cli/formatters/types.py +132 -0
- evalgate_sdk/cli/golden_commands.py +322 -0
- evalgate_sdk/cli/manifest.py +301 -0
- evalgate_sdk/cli/new_commands.py +435 -0
- evalgate_sdk/cli/policy_packs.py +103 -0
- evalgate_sdk/cli/profiles.py +12 -0
- evalgate_sdk/cli/regression_gate.py +312 -0
- evalgate_sdk/cli/render/__init__.py +1 -0
- evalgate_sdk/cli/render/snippet.py +18 -0
- evalgate_sdk/cli/render/sort.py +29 -0
- evalgate_sdk/cli/report/__init__.py +1 -0
- evalgate_sdk/cli/report/build_check_report.py +209 -0
- evalgate_sdk/cli/traces.py +186 -0
- evalgate_sdk/cli/workspace.py +63 -0
- evalgate_sdk/client.py +609 -0
- evalgate_sdk/cluster.py +359 -0
- evalgate_sdk/collector.py +161 -0
- evalgate_sdk/constants.py +6 -0
- evalgate_sdk/context.py +151 -0
- evalgate_sdk/errors.py +236 -0
- evalgate_sdk/export.py +238 -0
- evalgate_sdk/formatters/__init__.py +11 -0
- evalgate_sdk/formatters/github.py +51 -0
- evalgate_sdk/formatters/human.py +68 -0
- evalgate_sdk/formatters/json_fmt.py +11 -0
- evalgate_sdk/formatters/pr_comment.py +80 -0
- evalgate_sdk/golden.py +426 -0
- evalgate_sdk/integrations/__init__.py +1 -0
- evalgate_sdk/integrations/anthropic.py +99 -0
- evalgate_sdk/integrations/autogen.py +62 -0
- evalgate_sdk/integrations/crewai.py +61 -0
- evalgate_sdk/integrations/langchain.py +100 -0
- evalgate_sdk/integrations/openai.py +155 -0
- evalgate_sdk/integrations/openai_eval.py +221 -0
- evalgate_sdk/local.py +144 -0
- evalgate_sdk/logger.py +123 -0
- evalgate_sdk/matchers.py +62 -0
- evalgate_sdk/otel.py +256 -0
- evalgate_sdk/pagination.py +145 -0
- evalgate_sdk/py.typed +0 -0
- evalgate_sdk/pytest_plugin.py +96 -0
- evalgate_sdk/reason_codes.py +103 -0
- evalgate_sdk/regression.py +196 -0
- evalgate_sdk/replay_decision.py +115 -0
- evalgate_sdk/runtime/__init__.py +50 -0
- evalgate_sdk/runtime/adapters/__init__.py +1 -0
- evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
- evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
- evalgate_sdk/runtime/context.py +68 -0
- evalgate_sdk/runtime/eval.py +318 -0
- evalgate_sdk/runtime/execution_mode.py +170 -0
- evalgate_sdk/runtime/executor.py +92 -0
- evalgate_sdk/runtime/registry.py +125 -0
- evalgate_sdk/runtime/run_report.py +249 -0
- evalgate_sdk/runtime/types.py +143 -0
- evalgate_sdk/snapshot.py +219 -0
- evalgate_sdk/streaming.py +124 -0
- evalgate_sdk/synthesize.py +226 -0
- evalgate_sdk/testing.py +128 -0
- evalgate_sdk/types.py +666 -0
- evalgate_sdk/utils/__init__.py +1 -0
- evalgate_sdk/utils/input_hash.py +42 -0
- evalgate_sdk/workflows.py +264 -0
- evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
- evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
- evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
- evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""PR comment formatter — Markdown output for pull request comments."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _one_decimal(value: Any) -> str | None:
    """Return *value* rendered with one decimal place, or ``None`` if it cannot be formatted."""
    try:
        return f"{value:.1f}"
    except (TypeError, ValueError):
        # None raises TypeError, strings raise ValueError for the ``f`` format code.
        return None


def format_pr_comment(report: dict[str, Any]) -> str:
    """Format a check/run report as a Markdown PR comment.

    Both camelCase and snake_case keys are accepted for most fields. Values
    that are missing, ``null`` or non-numeric no longer abort rendering: a
    metric that cannot be formatted is shown as ``n/a`` and an uncomputable
    delta is simply omitted.

    Args:
        report: Report mapping. Keys read here: ``verdict``,
            ``evaluationId``/``run_id``, ``score``,
            ``reasonMessage``/``reason_message``, ``summary``,
            ``baselineScore``/``baseline_score``, ``delta``,
            ``failedCases``/``failed_cases`` and
            ``dashboardUrl``/``dashboard_url``.

    Returns:
        The Markdown text of the comment.
    """
    lines: list[str] = []
    verdict = report.get("verdict", "unknown")
    eval_id = report.get("evaluationId", report.get("run_id", "?"))
    score = report.get("score")
    reason = report.get("reasonMessage", report.get("reason_message", ""))

    # Header: anything other than "pass"/"warn" is reported as a failure.
    if verdict == "pass":
        lines.append("## ✅ EvalGate: Pass")
    elif verdict == "warn":
        lines.append("## ⚠️ EvalGate: Warning")
    else:
        lines.append("## ❌ EvalGate: Fail")

    lines.append("")

    # Summary table (only for a non-empty mapping).
    summary = report.get("summary", {})
    if isinstance(summary, dict) and summary:
        lines.append("| Metric | Value |")
        lines.append("|--------|-------|")
        lines.append(f"| Total | {summary.get('total', 0)} |")
        lines.append(f"| Passed | {summary.get('passed', 0)} |")
        lines.append(f"| Failed | {summary.get('failed', 0)} |")
        rate_text = _one_decimal(summary.get("pass_rate", summary.get("passRate", 0)))
        lines.append(f"| Pass Rate | {rate_text}% |" if rate_text is not None else "| Pass Rate | n/a |")
        avg_text = _one_decimal(summary.get("average_score", summary.get("averageScore", 0)))
        lines.append(f"| Avg Score | {avg_text if avg_text is not None else 'n/a'} |")
        lines.append("")

    # Score + baseline
    if score is not None:
        baseline = report.get("baselineScore", report.get("baseline_score"))
        if baseline is None:
            lines.append(f"**Score:** {score}")
        else:
            delta = report.get("delta")
            if delta is None:
                # Only derive the delta when the report does not provide one.
                try:
                    delta = score - baseline
                except TypeError:
                    delta = None
            delta_text = _one_decimal(delta)
            if delta_text is None:
                lines.append(f"**Score:** {score} (baseline: {baseline})")
            else:
                sign = "+" if delta >= 0 else ""
                lines.append(f"**Score:** {score} (baseline: {baseline}, delta: {sign}{delta_text})")
        lines.append("")

    # Reason
    if reason:
        lines.append(f"> {reason}")
        lines.append("")

    # Failed cases: at most 20 are listed, the rest are summarised.
    failed_cases = report.get("failedCases", report.get("failed_cases", []))
    if failed_cases:
        lines.append("<details>")
        lines.append(f"<summary>Failed cases ({len(failed_cases)})</summary>")
        lines.append("")
        for fc in failed_cases[:20]:
            if isinstance(fc, dict):
                name = fc.get("name", fc.get("test_name", "?"))
                msg = fc.get("reason", fc.get("message", ""))
            else:
                # Tolerate entries that are not mappings (e.g. plain strings).
                name, msg = "?", str(fc)
            lines.append(f"- **{name}**: {msg}")
        if len(failed_cases) > 20:
            lines.append(f"- ... and {len(failed_cases) - 20} more")
        lines.append("")
        lines.append("</details>")
        lines.append("")

    # Dashboard link
    dashboard = report.get("dashboardUrl", report.get("dashboard_url"))
    if dashboard:
        lines.append(f"[View in dashboard]({dashboard})")

    # Footer
    lines.append(f"\n<sub>Evaluation: {eval_id}</sub>")

    return "\n".join(lines)
|
evalgate_sdk/golden.py
ADDED
|
@@ -0,0 +1,426 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Literal
|
|
8
|
+
|
|
9
|
+
# Default dataset locations, relative to the current working directory.
DEFAULT_LABELED_DATASET_PATH = str(Path(".evalgate", "golden", "labeled.jsonl"))
DEFAULT_SYNTHETIC_DATASET_PATH = str(Path(".evalgate", "golden", "synthetic.jsonl"))

# Label attached to a golden case.
LabeledOutcome = Literal["pass", "fail"]
# Status values recognised for a single run result.
RunStatus = Literal["passed", "failed", "error", "timeout", "skipped"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(slots=True)
|
|
17
|
+
class LabeledGoldenCase:
|
|
18
|
+
case_id: str
|
|
19
|
+
input: str
|
|
20
|
+
expected: str
|
|
21
|
+
actual: str
|
|
22
|
+
label: LabeledOutcome
|
|
23
|
+
failure_mode: str | None
|
|
24
|
+
labeled_at: str
|
|
25
|
+
cluster_id: str | None = None
|
|
26
|
+
cluster_label: str | None = None
|
|
27
|
+
|
|
28
|
+
def to_dict(self) -> dict[str, Any]:
|
|
29
|
+
data: dict[str, Any] = {
|
|
30
|
+
"caseId": self.case_id,
|
|
31
|
+
"input": self.input,
|
|
32
|
+
"expected": self.expected,
|
|
33
|
+
"actual": self.actual,
|
|
34
|
+
"label": self.label,
|
|
35
|
+
"failureMode": self.failure_mode,
|
|
36
|
+
"labeledAt": self.labeled_at,
|
|
37
|
+
}
|
|
38
|
+
if self.cluster_id is not None:
|
|
39
|
+
data["clusterId"] = self.cluster_id
|
|
40
|
+
if self.cluster_label is not None:
|
|
41
|
+
data["clusterLabel"] = self.cluster_label
|
|
42
|
+
return data
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass(slots=True)
class SyntheticGoldenCase(LabeledGoldenCase):
    """A labeled golden case extended with synthesis metadata."""

    synthetic: bool = True
    synthesized_at: str = ""
    source_case_ids: list[str] = field(default_factory=list)
    dimensions: dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the base record plus the synthesis fields.

        ``synthetic`` is always emitted as ``True``; the list and dict fields
        are copied so the result does not alias this instance's containers.
        """
        payload = LabeledGoldenCase.to_dict(self)
        payload["synthetic"] = True
        payload["synthesizedAt"] = self.synthesized_at
        payload["sourceCaseIds"] = [*self.source_case_ids]
        payload["dimensions"] = {**self.dimensions}
        return payload
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass(slots=True)
|
|
66
|
+
class FailureModeSummary:
|
|
67
|
+
mode: str
|
|
68
|
+
count: int
|
|
69
|
+
frequency: float
|
|
70
|
+
|
|
71
|
+
def to_dict(self) -> dict[str, Any]:
|
|
72
|
+
return {
|
|
73
|
+
"mode": self.mode,
|
|
74
|
+
"count": self.count,
|
|
75
|
+
"frequency": self.frequency,
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass(slots=True)
|
|
80
|
+
class AnalyzeSummary:
|
|
81
|
+
total: int
|
|
82
|
+
failed: int
|
|
83
|
+
pass_rate: float
|
|
84
|
+
failure_modes: list[FailureModeSummary]
|
|
85
|
+
|
|
86
|
+
def to_dict(self) -> dict[str, Any]:
|
|
87
|
+
return {
|
|
88
|
+
"total": self.total,
|
|
89
|
+
"failed": self.failed,
|
|
90
|
+
"passRate": self.pass_rate,
|
|
91
|
+
"failureModes": [item.to_dict() for item in self.failure_modes],
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@dataclass(slots=True)
|
|
96
|
+
class NormalizedRunCase:
|
|
97
|
+
case_id: str
|
|
98
|
+
name: str
|
|
99
|
+
file_path: str
|
|
100
|
+
status: RunStatus
|
|
101
|
+
input: str
|
|
102
|
+
expected: str
|
|
103
|
+
actual: str
|
|
104
|
+
passed: bool
|
|
105
|
+
score: float
|
|
106
|
+
duration_ms: float
|
|
107
|
+
error: str | None = None
|
|
108
|
+
raw: dict[str, Any] = field(default_factory=dict)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@dataclass(slots=True)
|
|
112
|
+
class NormalizedRunArtifact:
|
|
113
|
+
run_id: str
|
|
114
|
+
total_run_results: int
|
|
115
|
+
summary: dict[str, Any]
|
|
116
|
+
cases: list[NormalizedRunCase]
|
|
117
|
+
source_format: str
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@dataclass(slots=True)
|
|
121
|
+
class RunMetrics:
|
|
122
|
+
pass_rate_ratio: float
|
|
123
|
+
corrected_pass_rate_ratio: float | None
|
|
124
|
+
total_cost_usd: float | None
|
|
125
|
+
total_results: int
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _is_iso_timestamp(value: str) -> bool:
    """Tell whether *value* parses with ``datetime.fromisoformat``.

    Surrounding whitespace is ignored and a trailing ``Z`` is rewritten to
    ``+00:00`` before parsing. Blank input is rejected.
    """
    text = value.strip()
    if text == "":
        return False
    if text.endswith("Z"):
        text = f"{text[:-1]}+00:00"
    try:
        datetime.fromisoformat(text)
    except ValueError:
        return False
    else:
        return True
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _require_string(record: dict[str, Any], key: str, line_number: int) -> str:
    """Return ``record[key]`` when it is a string.

    Raises:
        ValueError: If the key is missing or its value is not a ``str``;
            the message names *line_number* and *key*.
    """
    candidate = record.get(key)
    if isinstance(candidate, str):
        return candidate
    raise ValueError(f"Invalid labeled dataset at line {line_number}: {key} must be a string")
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def parse_labeled_dataset(content: str) -> list[LabeledGoldenCase]:
    """Parse JSONL text into validated ``LabeledGoldenCase`` rows.

    Blank lines are skipped. Line numbers in error messages are the 1-based
    positions of the offending line in *content*; previously blank lines were
    dropped before counting, so the reported number was too low whenever the
    input contained blank lines.

    Args:
        content: JSONL text, one JSON object per non-blank line.

    Returns:
        The parsed cases in input order.

    Raises:
        ValueError: If a line is not a JSON object or a field fails validation.
    """
    parsed_rows: list[LabeledGoldenCase] = []
    for index, raw_line in enumerate(content.splitlines(), start=1):
        line = raw_line.strip()
        if not line:
            # Skipped, but still counted so later line numbers stay accurate.
            continue
        try:
            parsed = json.loads(line)
        except json.JSONDecodeError as exc:
            raise ValueError(f"Invalid JSONL at line {index}: expected valid JSON object") from exc
        if not isinstance(parsed, dict):
            raise ValueError(f"Invalid JSONL at line {index}: expected JSON object record")

        case_id = _require_string(parsed, "caseId", index).strip()
        if not case_id:
            raise ValueError(f"Invalid labeled dataset at line {index}: caseId must be a non-empty string")
        input_text = _require_string(parsed, "input", index)
        expected = _require_string(parsed, "expected", index)
        actual = _require_string(parsed, "actual", index)
        label = parsed.get("label")
        if label not in ("pass", "fail"):
            raise ValueError(f'Invalid labeled dataset at line {index}: label must be "pass" or "fail"')

        # An empty failureMode is treated the same as null.
        failure_mode_value = parsed.get("failureMode")
        if failure_mode_value == "":
            failure_mode_value = None
        if not (isinstance(failure_mode_value, str) or failure_mode_value is None):
            raise ValueError(f"Invalid labeled dataset at line {index}: failureMode must be string or null")
        if label == "fail" and (failure_mode_value is None or not failure_mode_value.strip()):
            raise ValueError(
                f"Invalid labeled dataset at line {index}: failed rows require a non-empty failureMode"
            )
        if label == "pass" and isinstance(failure_mode_value, str) and failure_mode_value.strip():
            raise ValueError(
                f"Invalid labeled dataset at line {index}: passing rows must set failureMode to null or empty string"
            )

        labeled_at = _require_string(parsed, "labeledAt", index)
        if not _is_iso_timestamp(labeled_at):
            raise ValueError(
                f"Invalid labeled dataset at line {index}: labeledAt must be an ISO timestamp string"
            )

        cluster_id = parsed.get("clusterId")
        cluster_label = parsed.get("clusterLabel")
        if cluster_id is not None and not isinstance(cluster_id, str):
            raise ValueError(f"Invalid labeled dataset at line {index}: clusterId must be a string when present")
        if cluster_label is not None and not isinstance(cluster_label, str):
            raise ValueError(f"Invalid labeled dataset at line {index}: clusterLabel must be a string when present")

        parsed_rows.append(
            LabeledGoldenCase(
                case_id=case_id,
                input=input_text,
                expected=expected,
                actual=actual,
                label=label,
                failure_mode=failure_mode_value.strip() if isinstance(failure_mode_value, str) else None,
                labeled_at=labeled_at,
                cluster_id=cluster_id,
                cluster_label=cluster_label,
            )
        )

    return parsed_rows
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def analyze_labeled_dataset(rows: list[LabeledGoldenCase], top: int = 5) -> AnalyzeSummary:
    """Summarize pass rate and the most common failure modes of *rows*.

    Failed rows are grouped by stripped ``failure_mode``; rows without a usable
    mode are grouped under ``"failed_without_mode"``. Modes are ordered by
    descending count, then name, and cut to ``max(1, top)`` entries. Each
    frequency is the mode's share of all failed rows.
    """
    failures = [case for case in rows if case.label == "fail"]
    total_count = len(rows)
    failed_count = len(failures)

    tally: dict[str, int] = {}
    for case in failures:
        raw_mode = case.failure_mode
        key = raw_mode.strip() if isinstance(raw_mode, str) else ""
        if not key:
            key = "failed_without_mode"
        tally[key] = tally.get(key, 0) + 1

    limit = max(1, top)
    ranked = sorted(tally.items(), key=lambda pair: (-pair[1], pair[0]))[:limit]

    modes: list[FailureModeSummary] = []
    for mode_name, occurrences in ranked:
        share = occurrences / failed_count if failed_count > 0 else 0.0
        modes.append(FailureModeSummary(mode=mode_name, count=occurrences, frequency=share))

    if total_count > 0:
        rate = (total_count - failed_count) / total_count
    else:
        rate = 0.0
    return AnalyzeSummary(total=total_count, failed=failed_count, pass_rate=rate, failure_modes=modes)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def format_analyze_human(summary: AnalyzeSummary) -> str:
    """Render *summary* as a short plain-text report, one item per line.

    Lists the totals and percentages, followed either by a numbered list of
    failure modes or by ``Failure modes: none`` when there are none.
    """
    failed_pct = (summary.failed / summary.total) * 100 if summary.total else 0.0
    report = [
        "Analyze phase",
        f"Total cases: {summary.total}",
        f"Failed: {summary.failed} ({failed_pct:.1f}%)",
        f"Pass rate: {(summary.pass_rate * 100):.1f}%",
    ]
    if summary.failure_modes:
        report.append("Top failure modes:")
        report.extend(
            f"{rank}. {entry.mode} — {entry.count} ({(entry.frequency * 100):.1f}%)"
            for rank, entry in enumerate(summary.failure_modes, start=1)
        )
    else:
        report.append("Failure modes: none")
    return "\n".join(report)
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def write_jsonl(file_path: str, rows: list[LabeledGoldenCase | SyntheticGoldenCase]) -> None:
    """Write *rows* to *file_path* as compact JSONL, creating parent directories.

    Each row is serialized through its ``to_dict``. The file ends with a
    newline unless there are no rows, in which case an empty file is written.
    """
    target = Path(file_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = [json.dumps(row.to_dict(), separators=(",", ":")) for row in rows]
    body = "\n".join(serialized)
    if body:
        body = f"{body}\n"
    target.write_text(body, encoding="utf-8")
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _string_value(value: Any) -> str:
    """Coerce *value* to text.

    ``None`` becomes ``""``, strings pass through, dicts and lists are
    JSON-encoded without ASCII escaping, and anything else goes through ``str``.
    """
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    if isinstance(value, (dict, list)):
        return json.dumps(value, ensure_ascii=False)
    return str(value)
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _status_from_mapping(item: dict[str, Any]) -> RunStatus:
    """Derive a run status from a result mapping.

    A recognised ``status`` string wins. Otherwise the result is ``"passed"``
    only when ``passed`` is the boolean ``True``; everything else is ``"failed"``.
    """
    declared = item.get("status")
    recognised = ("passed", "failed", "error", "timeout", "skipped")
    if isinstance(declared, str) and declared in recognised:
        return declared
    return "passed" if item.get("passed") is True else "failed"
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def _normalize_ratio(value: Any) -> float | None:
    """Convert a pass-rate-like number to a ratio in ``[0.0, 1.0]``.

    Non-numeric input yields ``None``. Numbers above 1.0 are divided by 100
    (read as a percentage) and the result is then clamped to the unit interval.
    """
    if not isinstance(value, (int, float)):
        return None
    number = float(value)
    scaled = number / 100.0 if number > 1.0 else number
    if scaled < 0.0:
        return 0.0
    return 1.0 if scaled > 1.0 else scaled
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def extract_run_metrics(run_data: dict[str, Any]) -> RunMetrics:
    """Pull pass rate, corrected pass rate, cost and result count out of a run.

    The raw pass rate is taken from ``summary.passRate``, then
    ``summary.pass_rate``, then ``passed / total`` from the summary, and
    finally by counting passed entries in ``results``. The corrected rate
    comes from ``correctedPassRate`` or ``corrected_pass_rate`` and stays
    ``None`` when neither is numeric. Cost comes from ``totalCostUsd`` or
    ``total_cost_usd``. The result count is ``summary.total`` when it is an
    int, otherwise the length of ``results``.
    """
    summary_candidate = run_data.get("summary")
    summary = summary_candidate if isinstance(summary_candidate, dict) else {}
    results_candidate = run_data.get("results")
    results = results_candidate if isinstance(results_candidate, list) else []

    corrected_ratio = _normalize_ratio(summary.get("correctedPassRate"))
    if corrected_ratio is None:
        corrected_ratio = _normalize_ratio(summary.get("corrected_pass_rate"))

    raw_ratio = _normalize_ratio(summary.get("passRate"))
    if raw_ratio is None:
        raw_ratio = _normalize_ratio(summary.get("pass_rate"))
    if raw_ratio is None:
        declared_total = summary.get("total")
        declared_passed = summary.get("passed")
        if isinstance(declared_total, int) and declared_total > 0 and isinstance(declared_passed, int):
            raw_ratio = declared_passed / declared_total
        else:
            passed_count = 0
            for entry in results:
                if not isinstance(entry, dict):
                    continue
                nested = entry.get("result")
                if isinstance(nested, dict):
                    # A nested result decides on its own status; the outer flag is ignored.
                    if nested.get("status") == "passed":
                        passed_count += 1
                elif entry.get("passed") is True:
                    passed_count += 1
            raw_ratio = (passed_count / len(results)) if len(results) > 0 else 0.0

    cost = summary.get("totalCostUsd")
    if cost is None:
        cost = summary.get("total_cost_usd")
    cost_usd = float(cost) if isinstance(cost, (int, float)) else None

    summary_total = summary.get("total")
    result_count = summary_total if isinstance(summary_total, int) else len(results)

    return RunMetrics(
        pass_rate_ratio=raw_ratio or 0.0,
        corrected_pass_rate_ratio=corrected_ratio,
        total_cost_usd=cost_usd,
        total_results=result_count,
    )
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def normalize_run_artifact(run_data: dict[str, Any]) -> NormalizedRunArtifact:
    """Convert a run artifact into a ``NormalizedRunArtifact``.

    Three entry shapes are recognised in ``run_data["results"]``:

    * entries with a nested ``result`` dict (tagged ``"typescript"``),
    * entries carrying ``test_id``/``testId``/``test_name``/``testName``
      (tagged ``"python_run_report"``),
    * anything else (``"legacy"``).

    Entries that are not dicts are skipped. ``source_format`` starts as
    ``"legacy"`` and is overwritten by each typescript or python_run_report
    entry, so the last such entry decides it.

    The returned summary is a copy of ``run_data["summary"]`` with missing
    counters and pass-rate keys filled in. The caller's dict is no longer
    mutated as a side effect.

    Args:
        run_data: Parsed run artifact; must contain a ``results`` list.

    Returns:
        The normalized artifact.

    Raises:
        ValueError: If ``results`` is missing or not a list.
    """
    results = run_data.get("results")
    if not isinstance(results, list):
        raise ValueError("Run artifact must contain a results array")

    normalized_cases: list[NormalizedRunCase] = []
    source_format = "legacy"
    for index, item in enumerate(results):
        if not isinstance(item, dict):
            continue
        fallback_label = f"case-{index + 1}"
        if isinstance(item.get("result"), dict):
            source_format = "typescript"
            nested = item["result"]
            status = nested.get("status") if isinstance(nested.get("status"), str) else _status_from_mapping(item)
            case = NormalizedRunCase(
                case_id=_string_value(
                    item.get("specId") or item.get("testId") or item.get("caseId") or item.get("id") or fallback_label
                ),
                name=_string_value(item.get("name") or item.get("testName") or item.get("spec") or fallback_label),
                file_path=_string_value(item.get("filePath") or item.get("file_path") or ""),
                status=status,
                input=_string_value(item.get("input")),
                expected=_string_value(item.get("expected") or item.get("expectedOutput")),
                actual=_string_value(item.get("actual") or item.get("output") or nested.get("error")),
                passed=status == "passed",
                score=float(nested.get("score") or 0.0),
                duration_ms=float(nested.get("duration") or nested.get("durationMs") or item.get("durationMs") or 0.0),
                error=_string_value(nested.get("error")) or None,
                raw=item,
            )
        elif "test_id" in item or "testId" in item or "test_name" in item or "testName" in item:
            source_format = "python_run_report"
            status = _status_from_mapping(item)
            metadata = item.get("metadata") if isinstance(item.get("metadata"), dict) else {}
            case = NormalizedRunCase(
                case_id=_string_value(item.get("test_id") or item.get("testId") or fallback_label),
                name=_string_value(item.get("test_name") or item.get("testName") or fallback_label),
                file_path=_string_value(item.get("file_path") or item.get("filePath") or ""),
                status=status,
                input=_string_value(item.get("input") or metadata.get("input")),
                expected=_string_value(item.get("expected") or metadata.get("expected")),
                actual=_string_value(
                    item.get("actual") or metadata.get("actual") or metadata.get("output") or item.get("error")
                ),
                passed=bool(item.get("passed")),
                score=float(item.get("score") or 0.0),
                duration_ms=float(item.get("duration_ms") or item.get("durationMs") or 0.0),
                error=_string_value(item.get("error")) or None,
                raw=item,
            )
        else:
            status = _status_from_mapping(item)
            case = NormalizedRunCase(
                case_id=_string_value(
                    item.get("specId") or item.get("test_id") or item.get("spec") or item.get("name") or fallback_label
                ),
                name=_string_value(
                    item.get("name")
                    or item.get("spec")
                    or item.get("test_name")
                    or item.get("testName")
                    or fallback_label
                ),
                file_path=_string_value(item.get("file_path") or item.get("filePath") or ""),
                status=status,
                input=_string_value(item.get("input")),
                expected=_string_value(item.get("expected") or item.get("expectedOutput")),
                actual=_string_value(item.get("actual") or item.get("output") or item.get("error")),
                passed=bool(item.get("passed")),
                score=float(item.get("score") or 0.0),
                duration_ms=float(item.get("duration_ms") or item.get("durationMs") or 0.0),
                error=_string_value(item.get("error")) or None,
                raw=item,
            )
        normalized_cases.append(case)

    run_id = _string_value(run_data.get("runId") or run_data.get("run_id") or run_data.get("id") or "run-latest")
    metrics = extract_run_metrics(run_data)

    # Work on a shallow copy so filling in defaults never alters the caller's input.
    raw_summary = run_data.get("summary")
    summary: dict[str, Any] = dict(raw_summary) if isinstance(raw_summary, dict) else {}
    if "total" not in summary:
        summary["total"] = len(normalized_cases)
    if "passed" not in summary:
        summary["passed"] = sum(1 for case in normalized_cases if case.passed)
    if "failed" not in summary:
        summary["failed"] = sum(1 for case in normalized_cases if case.status == "failed")
    if "errors" not in summary:
        summary["errors"] = sum(1 for case in normalized_cases if case.status == "error")
    if "timeouts" not in summary:
        summary["timeouts"] = sum(1 for case in normalized_cases if case.status == "timeout")
    # "passRate" is stored as a ratio, "pass_rate" as a percentage.
    summary.setdefault("passRate", metrics.pass_rate_ratio)
    summary.setdefault("pass_rate", metrics.pass_rate_ratio * 100.0)
    if metrics.corrected_pass_rate_ratio is not None:
        summary.setdefault("correctedPassRate", metrics.corrected_pass_rate_ratio)
        summary.setdefault("corrected_pass_rate", metrics.corrected_pass_rate_ratio)
    return NormalizedRunArtifact(
        run_id=run_id,
        total_run_results=len(normalized_cases),
        summary=summary,
        cases=normalized_cases,
        source_format=source_format,
    )
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Framework integrations for OpenAI, Anthropic, and agent frameworks."""
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Anthropic tracing integration — wraps Anthropic client calls with EvalAI traces."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from evalgate_sdk.types import CreateTraceParams
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
async def trace_anthropic_call(
    client: Any,
    name: str,
    fn: Any,
    *,
    metadata: dict[str, Any] | None = None,
) -> Any:
    """Trace a single Anthropic call.

    A trace is created before *fn* runs. On success it is updated with the
    output text, duration and token usage; if *fn* raises, it is updated with
    status ``"error"`` and the exception is re-raised.

    Only the call to *fn* is guarded by the error handler. A failure while
    recording the successful result therefore propagates as-is instead of
    relabelling a successful call as ``"error"``.

    Args:
        client: AIEvalClient instance.
        name: Descriptive name for the trace.
        fn: An async callable that performs the Anthropic call.
        metadata: Extra metadata to attach.

    Returns:
        The result of calling *fn*.
    """
    # Imported inside the function, as the original did.
    from evalgate_sdk.types import UpdateTraceParams

    trace = await client.traces.create(
        CreateTraceParams(name=name, metadata={**(metadata or {}), "provider": "anthropic"})
    )
    start = time.monotonic()
    try:
        result = await fn()
    except Exception as exc:
        await client.traces.update(
            trace.id,
            UpdateTraceParams(status="error", metadata={"error": str(exc)}),
        )
        raise
    duration_ms = int((time.monotonic() - start) * 1000)

    output_text = ""
    usage: dict[str, Any] = {}
    content = getattr(result, "content", None)
    if content:
        # Only the first content block is recorded as the trace output.
        first_part = content[0]
        output_text = first_part.text if hasattr(first_part, "text") else str(first_part)
    reported_usage = getattr(result, "usage", None)
    if reported_usage:
        usage = {
            "input_tokens": reported_usage.input_tokens,
            "output_tokens": reported_usage.output_tokens,
        }

    await client.traces.update(
        trace.id,
        UpdateTraceParams(
            output=output_text,
            status="completed",
            metadata={"duration_ms": duration_ms, "usage": usage},
        ),
    )
    return result
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def trace_anthropic(anthropic_client: Any, eval_client: Any, **kwargs: Any) -> Any:
    """Return a proxy around *anthropic_client* whose ``messages.create`` is traced.

    The original client is left untouched. Every attribute other than
    ``messages`` (and every ``messages`` attribute other than ``create``) is
    forwarded to the wrapped object. Extra keyword arguments are merged into
    the metadata of each trace.
    """

    class _TracedMessages:
        """Stand-in for ``client.messages`` that traces ``create``."""

        def __init__(self, original: Any) -> None:
            self._original = original

        async def create(self, **kw: Any) -> Any:
            wrapped = self._original

            def _invoke() -> Any:
                return wrapped.create(**kw)

            trace_metadata = {"model": kw.get("model"), **kwargs}
            # The model name doubles as the trace name when one is given.
            trace_name = kw.get("model", "anthropic-messages")
            return await trace_anthropic_call(eval_client, trace_name, _invoke, metadata=trace_metadata)

        def __getattr__(self, name: str) -> Any:
            return getattr(self._original, name)

    class _TracedAnthropic:
        """Stand-in for the client itself; only ``messages`` is replaced."""

        def __init__(self, original: Any) -> None:
            traced_messages = _TracedMessages(original.messages)
            self.messages = traced_messages
            self._original = original

        def __getattr__(self, name: str) -> Any:
            return getattr(self._original, name)

    proxy = _TracedAnthropic(anthropic_client)
    return proxy
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""AutoGen tracing integration — wraps AutoGen conversations with EvalAI workflow traces."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from evalgate_sdk.workflows import WorkflowStatus, WorkflowTracer
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def trace_autogen(
    conversation: Any,
    tracer: WorkflowTracer,
    *,
    conversation_name: str = "AutoGenConversation",
) -> Any:
    """Wrap an AutoGen conversation so ``initiate_chat`` is traced as a full workflow.

    Returns a lightweight proxy; the original conversation is not modified.
    The proxy's ``initiate_chat`` is a coroutine. It prefers the wrapped
    object's ``a_initiate_chat``; otherwise it calls ``initiate_chat`` and
    awaits the result only when that result is awaitable, so a synchronous
    ``initiate_chat`` no longer fails with ``TypeError`` on ``await``.

    Args:
        conversation: An AutoGen agent or ``ConversableAgent`` with
            ``.initiate_chat()`` or ``.a_initiate_chat()`` methods.
        tracer: An active ``WorkflowTracer`` instance.
        conversation_name: Name used for the workflow and spans.

    Example::

        from autogen import ConversableAgent
        from evalgate_sdk import WorkflowTracer, AIEvalClient

        client = AIEvalClient.init()
        tracer = WorkflowTracer(client)
        agent = ConversableAgent(...)

        traced = trace_autogen(agent, tracer, conversation_name="CodeReview")
        result = await traced.initiate_chat(recipient, message="Review this PR")
    """
    import inspect

    class _TracedAutoGen:
        def __init__(self, original: Any) -> None:
            self._original = original

        async def initiate_chat(self, *args: Any, **kwargs: Any) -> Any:
            await tracer.start_workflow(conversation_name)
            span = await tracer.start_agent_span(conversation_name, {"args": str(args), "kwargs": str(kwargs)})
            try:
                if hasattr(self._original, "a_initiate_chat"):
                    result = await self._original.a_initiate_chat(*args, **kwargs)
                else:
                    # A synchronous initiate_chat runs inline and returns a plain
                    # value; only await when it actually produced an awaitable.
                    result = self._original.initiate_chat(*args, **kwargs)
                    if inspect.isawaitable(result):
                        result = await result
                await tracer.end_agent_span(span, output={"result": str(result)})
                await tracer.end_workflow({"result": str(result)}, WorkflowStatus.COMPLETED)
                return result
            except Exception as exc:
                await tracer.end_agent_span(span, error=str(exc))
                await tracer.end_workflow({"error": str(exc)}, WorkflowStatus.FAILED)
                raise

        def __getattr__(self, name: str) -> Any:
            # Everything except initiate_chat is forwarded to the wrapped object.
            return getattr(self._original, name)

    return _TracedAutoGen(conversation)
|