evalgate-sdk 3.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalgate_sdk/__init__.py +707 -0
- evalgate_sdk/_version.py +3 -0
- evalgate_sdk/assertions.py +1362 -0
- evalgate_sdk/auto.py +247 -0
- evalgate_sdk/batch.py +174 -0
- evalgate_sdk/cache.py +111 -0
- evalgate_sdk/ci_context.py +123 -0
- evalgate_sdk/cli/__init__.py +111 -0
- evalgate_sdk/cli/api.py +261 -0
- evalgate_sdk/cli/cli_constants.py +20 -0
- evalgate_sdk/cli/commands.py +1041 -0
- evalgate_sdk/cli/config.py +228 -0
- evalgate_sdk/cli/env.py +43 -0
- evalgate_sdk/cli/formatters/types.py +132 -0
- evalgate_sdk/cli/golden_commands.py +322 -0
- evalgate_sdk/cli/manifest.py +301 -0
- evalgate_sdk/cli/new_commands.py +435 -0
- evalgate_sdk/cli/policy_packs.py +103 -0
- evalgate_sdk/cli/profiles.py +12 -0
- evalgate_sdk/cli/regression_gate.py +312 -0
- evalgate_sdk/cli/render/__init__.py +1 -0
- evalgate_sdk/cli/render/snippet.py +18 -0
- evalgate_sdk/cli/render/sort.py +29 -0
- evalgate_sdk/cli/report/__init__.py +1 -0
- evalgate_sdk/cli/report/build_check_report.py +209 -0
- evalgate_sdk/cli/traces.py +186 -0
- evalgate_sdk/cli/workspace.py +63 -0
- evalgate_sdk/client.py +609 -0
- evalgate_sdk/cluster.py +359 -0
- evalgate_sdk/collector.py +161 -0
- evalgate_sdk/constants.py +6 -0
- evalgate_sdk/context.py +151 -0
- evalgate_sdk/errors.py +236 -0
- evalgate_sdk/export.py +238 -0
- evalgate_sdk/formatters/__init__.py +11 -0
- evalgate_sdk/formatters/github.py +51 -0
- evalgate_sdk/formatters/human.py +68 -0
- evalgate_sdk/formatters/json_fmt.py +11 -0
- evalgate_sdk/formatters/pr_comment.py +80 -0
- evalgate_sdk/golden.py +426 -0
- evalgate_sdk/integrations/__init__.py +1 -0
- evalgate_sdk/integrations/anthropic.py +99 -0
- evalgate_sdk/integrations/autogen.py +62 -0
- evalgate_sdk/integrations/crewai.py +61 -0
- evalgate_sdk/integrations/langchain.py +100 -0
- evalgate_sdk/integrations/openai.py +155 -0
- evalgate_sdk/integrations/openai_eval.py +221 -0
- evalgate_sdk/local.py +144 -0
- evalgate_sdk/logger.py +123 -0
- evalgate_sdk/matchers.py +62 -0
- evalgate_sdk/otel.py +256 -0
- evalgate_sdk/pagination.py +145 -0
- evalgate_sdk/py.typed +0 -0
- evalgate_sdk/pytest_plugin.py +96 -0
- evalgate_sdk/reason_codes.py +103 -0
- evalgate_sdk/regression.py +196 -0
- evalgate_sdk/replay_decision.py +115 -0
- evalgate_sdk/runtime/__init__.py +50 -0
- evalgate_sdk/runtime/adapters/__init__.py +1 -0
- evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
- evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
- evalgate_sdk/runtime/context.py +68 -0
- evalgate_sdk/runtime/eval.py +318 -0
- evalgate_sdk/runtime/execution_mode.py +170 -0
- evalgate_sdk/runtime/executor.py +92 -0
- evalgate_sdk/runtime/registry.py +125 -0
- evalgate_sdk/runtime/run_report.py +249 -0
- evalgate_sdk/runtime/types.py +143 -0
- evalgate_sdk/snapshot.py +219 -0
- evalgate_sdk/streaming.py +124 -0
- evalgate_sdk/synthesize.py +226 -0
- evalgate_sdk/testing.py +128 -0
- evalgate_sdk/types.py +666 -0
- evalgate_sdk/utils/__init__.py +1 -0
- evalgate_sdk/utils/input_hash.py +42 -0
- evalgate_sdk/workflows.py +264 -0
- evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
- evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
- evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
- evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
"""evalgate gate — Run the regression gate.
|
|
2
|
+
|
|
3
|
+
Two modes:
|
|
4
|
+
1. Project mode: delegates to eval:regression-gate script (full gate)
|
|
5
|
+
2. Built-in mode: runs tests, compares against baseline
|
|
6
|
+
|
|
7
|
+
Port of ``cli/regression-gate.ts``.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
import re
|
|
15
|
+
import subprocess
|
|
16
|
+
import time
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Any, Literal
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class GateArgs:
    """Options accepted by the ``gate`` command after argument parsing."""

    # Output renderer to use; the plain-text table is the default.
    format: Literal["human", "json", "github"] = "human"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class BuiltinReport:
    """Outcome of one built-in gate run.

    Field defaults describe a passing run with nothing recorded.
    """

    schema_version: int = 1
    timestamp: str = ""
    exit_code: int = 0
    category: str = "pass"
    passed: bool = True
    failures: list[str] = field(default_factory=list)
    deltas: list[dict[str, Any]] = field(default_factory=list)
    baseline: dict[str, str] | None = None
    duration_ms: int = 0
    command: str = ""
    runner: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the report as a dict keyed by camelCase names.

        Values are the attribute objects themselves (no copies are made).
        """
        # (serialized key, attribute name) pairs, in output order.
        key_to_attr = (
            ("schemaVersion", "schema_version"),
            ("timestamp", "timestamp"),
            ("exitCode", "exit_code"),
            ("category", "category"),
            ("passed", "passed"),
            ("failures", "failures"),
            ("deltas", "deltas"),
            ("baseline", "baseline"),
            ("durationMs", "duration_ms"),
            ("command", "command"),
            ("runner", "runner"),
        )
        return {key: getattr(self, attr) for key, attr in key_to_attr}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def parse_gate_args(argv: list[str]) -> GateArgs:
    """Parse gate command-line arguments into a :class:`GateArgs`.

    Only ``--format <value>`` is recognized. An unsupported value is consumed
    and ignored, leaving the default format in place; every other token is
    skipped.
    """
    parsed = GateArgs()
    total = len(argv)
    pos = 0
    while pos < total:
        has_format_value = argv[pos] == "--format" and pos + 1 < total
        if not has_format_value:
            pos += 1
            continue
        candidate = argv[pos + 1]
        if candidate in ("human", "json", "github"):
            parsed.format = candidate  # type: ignore[assignment]
        # Skip the flag and its value, whether or not the value was accepted.
        pos += 2
    return parsed
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _detect_test_runner(cwd: str) -> str:
    """Detect the Python test runner used by the project in ``cwd``.

    The decision is based on the text of ``pyproject.toml``: a mention of
    "pytest" wins, otherwise a mention of "unittest" selects unittest.
    When the file is missing, unreadable, not valid UTF-8, or mentions
    neither runner, "pytest" is returned as the default.
    """
    try:
        text = Path(cwd, "pyproject.toml").read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # UnicodeDecodeError is a ValueError, not an OSError; an undecodable
        # file must fall back to the default instead of crashing the gate.
        return "pytest"

    if "pytest" in text:
        return "pytest"
    if "unittest" in text:
        return "unittest"
    return "pytest"
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _detect_test_command(cwd: str) -> str:
    """Return the test command string matching the runner detected for ``cwd``."""
    uses_pytest = _detect_test_runner(cwd) == "pytest"
    return "python -m pytest" if uses_pytest else "python -m unittest discover"
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def run_builtin_gate(cwd: str) -> BuiltinReport:
    """Run the built-in lightweight gate for the project in ``cwd``.

    Loads ``evals/baseline.json``, runs the detected test command with a
    300-second timeout, and compares the outcome with the baseline's
    ``confidenceTests`` entry.

    Returns:
        A report with exit code 0 (pass), 1 (regression) or 2 (infra error:
        missing or invalid baseline, timeout, or the command failing to start).
    """
    # Monotonic clock: the duration must not be affected by wall-clock changes.
    started = time.monotonic()
    now = datetime.now(timezone.utc).isoformat()
    command = _detect_test_command(cwd)
    runner = _detect_test_runner(cwd)

    def elapsed_ms() -> int:
        return int((time.monotonic() - started) * 1000)

    def infra_error(message: str) -> BuiltinReport:
        # Every infrastructure failure shares the same report shape.
        return BuiltinReport(
            timestamp=now,
            exit_code=2,
            category="infra_error",
            passed=False,
            failures=[message],
            duration_ms=elapsed_ms(),
            command=command,
            runner=runner,
        )

    baseline_path = os.path.join(cwd, "evals", "baseline.json")
    if not os.path.isfile(baseline_path):
        return infra_error("Baseline file not found. Run: evalgate init")

    try:
        baseline_data = json.loads(Path(baseline_path).read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return infra_error("Failed to parse evals/baseline.json")
    if not isinstance(baseline_data, dict):
        # Valid JSON whose root is a list/scalar cannot be queried by key.
        return infra_error("Invalid evals/baseline.json: expected a JSON object")

    baseline_meta = None
    if baseline_data.get("updatedAt"):
        baseline_meta = {
            "updatedAt": baseline_data["updatedAt"],
            "updatedBy": baseline_data.get("updatedBy", "unknown"),
        }

    # Run tests
    try:
        result = subprocess.run(
            command.split(),
            cwd=cwd,
            capture_output=True,
            text=True,
            timeout=300,
        )
    except subprocess.TimeoutExpired:
        return infra_error("Test command timed out after 300s")
    except OSError as exc:
        return infra_error(f"Failed to run test command: {exc}")

    tests_passed = result.returncode == 0
    output = (result.stdout or "") + (result.stderr or "")

    # Extract test count from the runner's summary line, most specific pattern first.
    test_count = 0
    count_match = (
        re.search(r"(\d+)\s+(?:tests?|specs?)\s+(?:passed|completed)", output, re.I)
        or re.search(r"(\d+)\s+passed", output, re.I)
        or re.search(r"(\d+)\s+passing", output, re.I)
    )
    if count_match:
        test_count = int(count_match.group(1))

    # A null or non-object "confidenceTests" is treated like a missing entry.
    baseline_tests = baseline_data.get("confidenceTests")
    if not isinstance(baseline_tests, dict):
        baseline_tests = {}
    baseline_passed = baseline_tests.get("passed", True)
    # A null total counts as 0 so the numeric comparisons below cannot fail.
    baseline_total = baseline_tests.get("total") or 0

    failures: list[str] = []
    deltas: list[dict[str, Any]] = [
        {
            "metric": "tests_passing",
            "baseline": baseline_passed,
            "current": tests_passed,
            "delta": "0" if tests_passed == baseline_passed else ("+1" if tests_passed else "-1"),
            "status": "pass" if tests_passed else "fail",
        }
    ]

    if not tests_passed and baseline_passed:
        failures.append("Tests were passing in baseline but are now failing")

    if test_count > 0 or baseline_total > 0:
        count_delta = test_count - baseline_total
        deltas.append(
            {
                "metric": "test_count",
                "baseline": baseline_total,
                "current": test_count,
                "delta": f"+{count_delta}" if count_delta >= 0 else str(count_delta),
                "status": "pass" if test_count >= baseline_total else "fail",
            }
        )
        if test_count < baseline_total:
            failures.append(f"Test count dropped from {baseline_total} to {test_count} ({count_delta})")

    has_regression = bool(failures)

    return BuiltinReport(
        timestamp=now,
        exit_code=1 if has_regression else 0,
        category="regression" if has_regression else "pass",
        passed=not has_regression,
        failures=failures,
        deltas=deltas,
        baseline=baseline_meta,
        duration_ms=elapsed_ms(),
        command=command,
        runner=runner,
    )
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def format_human(report: BuiltinReport) -> str:
    """Render the report as plain text: headline, delta table, failure list."""
    headline_icon = "✅" if report.passed else "❌"
    out = [f"\n{headline_icon} EvalGate Gate: {report.category.upper()}\n"]

    if report.deltas:

        def cell(value, width):
            # Left-align the string form of ``value`` in a fixed-width column.
            return f"{value!s:<{width}}"

        out.append(f" {cell('Metric', 16)} {cell('Baseline', 10)} {cell('Current', 10)} {cell('Delta', 8)} Status")
        out.append(f" {'-' * 16} {'-' * 10} {'-' * 10} {'-' * 8} ------")
        for entry in report.deltas:
            mark = "✔" if entry["status"] == "pass" else "✖"
            out.append(
                f" {cell(entry['metric'], 16)} {cell(entry['baseline'], 10)} {cell(entry['current'], 10)} {cell(entry['delta'], 8)} {mark}"
            )

    if report.failures:
        out.append("\n Failures:")
        out.extend(f" • {failure}" for failure in report.failures)
        # Trailing empty entry ends the output with a newline.
        out.append("")

    return "\n".join(out)
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def format_github(report: BuiltinReport) -> str:
    """Render the report as GitHub-flavoured markdown.

    Output is a heading, a metrics table, an optional failures list and a
    schema-version footer.
    """
    headline_icon = "✅" if report.passed else "❌"
    out = [
        f"## {headline_icon} EvalGate Gate: {report.category}",
        "",
        "| Metric | Baseline | Current | Delta | Status |",
        "|--------|----------|---------|-------|--------|",
    ]

    for entry in report.deltas:
        mark = "✅" if entry["status"] == "pass" else "❌"
        out.append(f"| {entry['metric']} | {entry['baseline']} | {entry['current']} | {entry['delta']} | {mark} |")

    if report.failures:
        out += ["", "### Failures", ""]
        out += [f"- {failure}" for failure in report.failures]

    out.append(f"\nSchema version: {report.schema_version}")
    return "\n".join(out)
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def run_gate(argv: list[str] | None = None) -> int:
    """Main gate entry point.

    Runs the built-in gate in the current working directory, writes the
    result to ``evals/regression-report.json``, prints it in the requested
    format and returns the report's exit code.

    Args:
        argv: Command-line arguments for the gate; ``None`` means no arguments.
    """
    cwd = os.getcwd()
    args = parse_gate_args(argv or [])
    report = run_builtin_gate(cwd)

    # Serialize once; the same text is written to disk and printed for "json".
    report_json = json.dumps(report.to_dict(), indent=2)

    # Write report artifact
    evals_dir = os.path.join(cwd, "evals")
    os.makedirs(evals_dir, exist_ok=True)
    Path(evals_dir, "regression-report.json").write_text(report_json + "\n", encoding="utf-8")

    if args.format == "json":
        print(report_json)
    elif args.format == "github":
        md = format_github(report)
        summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
        if summary_path:
            try:
                # Explicit UTF-8: the markdown contains emoji, which the
                # locale's default encoding may be unable to encode.
                with open(summary_path, "a", encoding="utf-8") as summary_file:
                    summary_file.write(md + "\n")
            except OSError:
                # Best effort: the markdown is still printed below.
                pass
        print(md)
    else:
        print(format_human(report))

    return report.exit_code
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Render utilities for CLI output."""
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Truncate a string for deterministic output.
|
|
2
|
+
|
|
3
|
+
Port of ``cli/render/snippet.ts``.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def truncate_snippet(s: str | None, max_len: int = 140) -> str:
    """Collapse whitespace runs to single spaces and cap the length.

    ``None`` yields an empty string. Text longer than ``max_len`` is cut to
    ``max_len`` characters and an ellipsis character is appended.
    """
    if s is None:
        return ""
    collapsed = re.sub(r"\s+", " ", s).strip()
    overflows = len(collapsed) > max_len
    return f"{collapsed[:max_len]}…" if overflows else collapsed
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Deterministic ordering for failed cases.
|
|
2
|
+
|
|
3
|
+
Sort by status severity (failed > error > skipped > passed), then by test_case_id asc.
|
|
4
|
+
|
|
5
|
+
Port of ``cli/render/sort.ts``.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
# Sort rank per status; lower ranks sort first. Statuses not listed here
# rank after all of these.
STATUS_SEVERITY: dict[str, int] = {
    "failed": 0,
    "error": 1,
    "skipped": 2,
    "passed": 3,
}


def sort_failed_cases(cases: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Return a new list of cases sorted by status severity, then test case id.

    The id is read from ``test_case_id`` or ``testCaseId``; a missing id
    counts as ``0``. Numeric ids are ordered before non-numeric ones so that
    a mix of the two (for example a missing id next to string ids) sorts
    deterministically instead of raising ``TypeError``.
    """

    def sort_key(case: dict[str, Any]) -> tuple[int, int, Any]:
        status = (case.get("status") or "").lower()
        severity = STATUS_SEVERITY.get(status, 4)
        case_id = case.get("test_case_id") or case.get("testCaseId") or 0
        if isinstance(case_id, (int, float)):
            return (severity, 0, case_id)
        # Non-numeric ids are compared by their string form.
        return (severity, 1, str(case_id))

    return sorted(cases, key=sort_key)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Report building utilities for CLI."""
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
"""Build CheckReport from API data and gate result.
|
|
2
|
+
|
|
3
|
+
Normalizes failed cases (truncate, sort), dashboard URL, top N + more.
|
|
4
|
+
|
|
5
|
+
Port of ``cli/report/build-check-report.ts``.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from evalgate_sdk.cli.formatters.types import (
|
|
13
|
+
CHECK_REPORT_SCHEMA_VERSION,
|
|
14
|
+
CheckReport,
|
|
15
|
+
FailedCase,
|
|
16
|
+
GateThresholds,
|
|
17
|
+
ScoreBreakdown01,
|
|
18
|
+
ScoreContribPts,
|
|
19
|
+
)
|
|
20
|
+
from evalgate_sdk.cli.render.snippet import truncate_snippet
|
|
21
|
+
from evalgate_sdk.cli.render.sort import sort_failed_cases
|
|
22
|
+
|
|
23
|
+
# Cap applied to the number of failed cases when computing the
# "shown" / "more" counts of a report.
TOP_N = 3
# Maximum length passed to truncate_snippet for input/expected/output snippets.
SNIPPET_MAX = 50
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def compute_contrib_pts(b: ScoreBreakdown01) -> ScoreContribPts:
    """Convert a score breakdown into weighted contribution points.

    Weights: passRate*50, safety*25, (0.6*judge+0.4*schema)*15,
    (0.6*latency+0.4*cost)*10. Missing (falsy) components count as 0 and each
    result is rounded to one decimal place.
    """

    def to_tenths(points: float) -> float:
        # Round to one decimal place.
        return round(points * 10) / 10

    pass_rate = b.pass_rate or 0
    safety = b.safety or 0
    judge = b.judge or 0
    schema = b.schema or 0
    latency = b.latency or 0
    cost = b.cost or 0

    compliance = 0.6 * judge + 0.4 * schema
    performance = 0.6 * latency + 0.4 * cost

    return ScoreContribPts(
        pass_rate_pts=to_tenths(pass_rate * 50),
        safety_pts=to_tenths(safety * 25),
        compliance_pts=to_tenths(compliance * 15),
        performance_pts=to_tenths(performance * 10),
    )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _first_present(source: dict[str, Any], *keys: str) -> Any:
    """Return the value of the first key in ``keys`` whose value is not ``None``.

    Unlike chaining lookups with ``or``, legitimate falsy values such as ``0``
    are kept.
    """
    for key in keys:
        value = source.get(key)
        if value is not None:
            return value
    return None


def build_check_report(
    evaluation_id: str,
    quality: dict[str, Any],
    gate_result: dict[str, Any],
    base_url: str = "",
    run_details: dict[str, Any] | None = None,
    request_id: str | None = None,
    share_url: str | None = None,
    baseline_run_id: int | None = None,
    ci_run_url: str | None = None,
    explain: bool = False,
    policy: str | None = None,
    min_score: float | None = None,
    max_drop: float | None = None,
    warn_drop: float | None = None,
    min_n: int | None = None,
    allow_weak_evidence: bool | None = None,
    baseline: str | None = None,
    max_cost_usd: float | None = None,
    max_latency_ms: float | None = None,
    max_cost_delta_usd: float | None = None,
) -> CheckReport:
    """Build a CheckReport from API data and gate result.

    Args:
        evaluation_id: Evaluation the report belongs to; used in the dashboard URL.
        quality: Quality payload; most fields are read under a camelCase or a
            snake_case key.
        gate_result: Gate decision (``passed``, reason code/message, skip
            flag, optional policy evidence), in either key style.
        base_url: Base of the dashboard URL; a trailing slash is removed.
        run_details: Optional run payload whose ``results`` entries with
            status ``"failed"`` become the report's failed cases.
        explain: When true, contribution points and policy evidence are included.

    The remaining arguments are copied into the report or its thresholds as-is.

    Returns:
        The assembled report. The verdict is "warn" for reason code
        ``WARN_REGRESSION``, otherwise "pass" or "fail" from ``gate_result["passed"]``.
    """
    score = quality.get("score", 0)
    total = quality.get("total")
    # Numeric fields: a value of 0 is meaningful and must not be read as "missing".
    baseline_score = _first_present(quality, "baselineScore", "baseline_score")
    regression_delta = _first_present(quality, "regressionDelta", "regression_delta")
    evaluation_run_id = quality.get("evaluationRunId") or quality.get("evaluation_run_id")
    # Explicit nulls in the payload are treated like absent keys.
    breakdown = quality.get("breakdown") or {}
    flags = sorted(quality.get("flags") or [])

    dashboard_url = None
    if evaluation_run_id is not None:
        clean_base = base_url.rstrip("/")
        dashboard_url = f"{clean_base}/evaluations/{evaluation_id}/runs/{evaluation_run_id}"

    # Build failed cases from run details
    failed_cases: list[FailedCase] = []
    if run_details and run_details.get("results") and evaluation_run_id is not None:
        raw = []
        for r in run_details["results"]:
            if r.get("status") != "failed":
                continue
            tc = r.get("test_cases") or {}
            raw.append(
                {
                    "test_case_id": r.get("testCaseId") or r.get("test_case_id"),
                    "status": "failed",
                    "name": tc.get("name"),
                    "input": tc.get("input"),
                    "expected_output": tc.get("expectedOutput") or tc.get("expected_output"),
                    "output": r.get("output"),
                }
            )

        failed_cases = [
            FailedCase(
                test_case_id=fc.get("test_case_id"),
                status="failed",
                name=fc.get("name"),
                input=fc.get("input"),
                input_snippet=truncate_snippet(fc.get("input"), SNIPPET_MAX),
                expected_output=fc.get("expected_output"),
                expected_snippet=truncate_snippet(fc.get("expected_output"), SNIPPET_MAX),
                output=fc.get("output"),
                output_snippet=truncate_snippet(fc.get("output"), SNIPPET_MAX),
            )
            for fc in sort_failed_cases(raw)
        ]

    failed_count = len(failed_cases)
    failed_cases_shown = min(failed_count, TOP_N) if failed_cases else None
    failed_cases_more = (failed_count - TOP_N) if failed_count > TOP_N else None

    gate_skipped = gate_result.get("gate_skipped", gate_result.get("gateSkipped", False))
    gate_applied = not gate_skipped
    gate_mode = "neutral" if gate_skipped else "enforced"
    reason_code = gate_result.get("reason_code", gate_result.get("reasonCode", "UNKNOWN"))
    reason_message = gate_result.get("reason_message") or gate_result.get("reasonMessage")

    if reason_code == "WARN_REGRESSION":
        verdict = "warn"
    elif gate_result.get("passed"):
        verdict = "pass"
    else:
        verdict = "fail"

    if gate_skipped:
        actionable_message = (
            "Gate not applied: baseline missing. Publish a baseline from the dashboard, "
            "or run with --baseline previous once you have runs."
        )
    else:
        actionable_message = reason_message

    breakdown_01 = None
    if breakdown:
        breakdown_01 = ScoreBreakdown01(
            pass_rate=_first_present(breakdown, "passRate", "pass_rate"),
            safety=breakdown.get("safety"),
            judge=breakdown.get("judge"),
            schema=breakdown.get("schema"),
            latency=breakdown.get("latency"),
            cost=breakdown.get("cost"),
        )

    contrib_pts = None
    if explain and breakdown_01:
        contrib_pts = compute_contrib_pts(breakdown_01)

    thresholds = GateThresholds(
        min_score=min_score,
        max_drop=max_drop,
        warn_drop=warn_drop,
        min_n=min_n,
        allow_weak_evidence=allow_weak_evidence,
        baseline=baseline,
        max_cost_usd=max_cost_usd,
        max_latency_ms=max_latency_ms,
        max_cost_delta_usd=max_cost_delta_usd,
    )

    # Resolve the evidence once so the guard and the lookup cannot disagree.
    policy_evidence = None
    pe = gate_result.get("policy_evidence") or gate_result.get("policyEvidence")
    if explain and pe:
        policy_evidence = {
            "failedCheck": pe.get("failed_check") or pe.get("failedCheck"),
            "remediation": pe.get("remediation"),
            "snapshot": pe.get("snapshot"),
        }

    baseline_missing = quality.get("baselineMissing") or quality.get("baseline_missing")
    if baseline_missing:
        baseline_status = "missing"
    else:
        baseline_status = "found" if baseline_score is not None else None

    return CheckReport(
        schema_version=CHECK_REPORT_SCHEMA_VERSION,
        evaluation_id=evaluation_id,
        run_id=evaluation_run_id,
        verdict=verdict,
        gate_applied=gate_applied,
        gate_mode=gate_mode,
        actionable_message=actionable_message,
        share_url=share_url,
        policy=policy,
        baseline_run_id=baseline_run_id or quality.get("baselineRunId") or quality.get("baseline_run_id"),
        ci_run_url=ci_run_url,
        reason_code=reason_code,
        reason_message=reason_message,
        score=score,
        baseline_score=baseline_score,
        delta=regression_delta,
        n=total,
        evidence_level=quality.get("evidenceLevel") or quality.get("evidence_level"),
        baseline_missing=baseline_missing,
        baseline_status=baseline_status,
        flags=flags if flags else None,
        breakdown_01=breakdown_01,
        contrib_pts=contrib_pts,
        thresholds=thresholds,
        dashboard_url=dashboard_url,
        failed_cases=failed_cases,
        failed_cases_shown=failed_cases_shown,
        failed_cases_more=failed_cases_more,
        request_id=request_id,
        explain=explain if explain else None,
        policy_evidence=policy_evidence,
    )
|