evalgate-sdk 3.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalgate_sdk/__init__.py +707 -0
- evalgate_sdk/_version.py +3 -0
- evalgate_sdk/assertions.py +1362 -0
- evalgate_sdk/auto.py +247 -0
- evalgate_sdk/batch.py +174 -0
- evalgate_sdk/cache.py +111 -0
- evalgate_sdk/ci_context.py +123 -0
- evalgate_sdk/cli/__init__.py +111 -0
- evalgate_sdk/cli/api.py +261 -0
- evalgate_sdk/cli/cli_constants.py +20 -0
- evalgate_sdk/cli/commands.py +1041 -0
- evalgate_sdk/cli/config.py +228 -0
- evalgate_sdk/cli/env.py +43 -0
- evalgate_sdk/cli/formatters/types.py +132 -0
- evalgate_sdk/cli/golden_commands.py +322 -0
- evalgate_sdk/cli/manifest.py +301 -0
- evalgate_sdk/cli/new_commands.py +435 -0
- evalgate_sdk/cli/policy_packs.py +103 -0
- evalgate_sdk/cli/profiles.py +12 -0
- evalgate_sdk/cli/regression_gate.py +312 -0
- evalgate_sdk/cli/render/__init__.py +1 -0
- evalgate_sdk/cli/render/snippet.py +18 -0
- evalgate_sdk/cli/render/sort.py +29 -0
- evalgate_sdk/cli/report/__init__.py +1 -0
- evalgate_sdk/cli/report/build_check_report.py +209 -0
- evalgate_sdk/cli/traces.py +186 -0
- evalgate_sdk/cli/workspace.py +63 -0
- evalgate_sdk/client.py +609 -0
- evalgate_sdk/cluster.py +359 -0
- evalgate_sdk/collector.py +161 -0
- evalgate_sdk/constants.py +6 -0
- evalgate_sdk/context.py +151 -0
- evalgate_sdk/errors.py +236 -0
- evalgate_sdk/export.py +238 -0
- evalgate_sdk/formatters/__init__.py +11 -0
- evalgate_sdk/formatters/github.py +51 -0
- evalgate_sdk/formatters/human.py +68 -0
- evalgate_sdk/formatters/json_fmt.py +11 -0
- evalgate_sdk/formatters/pr_comment.py +80 -0
- evalgate_sdk/golden.py +426 -0
- evalgate_sdk/integrations/__init__.py +1 -0
- evalgate_sdk/integrations/anthropic.py +99 -0
- evalgate_sdk/integrations/autogen.py +62 -0
- evalgate_sdk/integrations/crewai.py +61 -0
- evalgate_sdk/integrations/langchain.py +100 -0
- evalgate_sdk/integrations/openai.py +155 -0
- evalgate_sdk/integrations/openai_eval.py +221 -0
- evalgate_sdk/local.py +144 -0
- evalgate_sdk/logger.py +123 -0
- evalgate_sdk/matchers.py +62 -0
- evalgate_sdk/otel.py +256 -0
- evalgate_sdk/pagination.py +145 -0
- evalgate_sdk/py.typed +0 -0
- evalgate_sdk/pytest_plugin.py +96 -0
- evalgate_sdk/reason_codes.py +103 -0
- evalgate_sdk/regression.py +196 -0
- evalgate_sdk/replay_decision.py +115 -0
- evalgate_sdk/runtime/__init__.py +50 -0
- evalgate_sdk/runtime/adapters/__init__.py +1 -0
- evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
- evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
- evalgate_sdk/runtime/context.py +68 -0
- evalgate_sdk/runtime/eval.py +318 -0
- evalgate_sdk/runtime/execution_mode.py +170 -0
- evalgate_sdk/runtime/executor.py +92 -0
- evalgate_sdk/runtime/registry.py +125 -0
- evalgate_sdk/runtime/run_report.py +249 -0
- evalgate_sdk/runtime/types.py +143 -0
- evalgate_sdk/snapshot.py +219 -0
- evalgate_sdk/streaming.py +124 -0
- evalgate_sdk/synthesize.py +226 -0
- evalgate_sdk/testing.py +128 -0
- evalgate_sdk/types.py +666 -0
- evalgate_sdk/utils/__init__.py +1 -0
- evalgate_sdk/utils/input_hash.py +42 -0
- evalgate_sdk/workflows.py +264 -0
- evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
- evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
- evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
- evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""Structured trace writer for evalgate runs.
|
|
2
|
+
|
|
3
|
+
Auto-writes structured JSON to .evalgate/traces/ on every define_eval result.
|
|
4
|
+
Each trace captures: spec identity, timing, assertions, score, and metadata.
|
|
5
|
+
|
|
6
|
+
Port of ``cli/traces.ts``.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import os
|
|
13
|
+
import platform
|
|
14
|
+
import sys
|
|
15
|
+
import time
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from datetime import datetime, timezone
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class SpecTrace:
    """Trace record describing the outcome of a single spec.

    ``to_dict`` exposes the record with camelCase keys; the ``git`` entry is
    only emitted when git metadata is present and non-empty.
    """

    schema_version: int = 1
    timestamp: int = 0
    timestamp_iso: str = ""
    run_id: str = ""
    spec: dict[str, str] = field(default_factory=dict)
    execution: dict[str, Any] = field(default_factory=dict)
    git: dict[str, str] | None = None
    env: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the trace as a plain dict with camelCase keys."""
        payload: dict[str, Any] = dict(
            schemaVersion=self.schema_version,
            timestamp=self.timestamp,
            timestampISO=self.timestamp_iso,
            runId=self.run_id,
            spec=self.spec,
            execution=self.execution,
            env=self.env,
        )
        # Absent or empty git metadata is left out of the payload entirely.
        if not self.git:
            return payload
        payload["git"] = self.git
        return payload
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class RunTrace:
    """Aggregated trace for a whole run: run info, summary, latency and specs."""

    schema_version: int = 1
    run: dict[str, Any] = field(default_factory=dict)
    summary: dict[str, Any] = field(default_factory=dict)
    latency: dict[str, Any] = field(default_factory=dict)
    specs: list[SpecTrace] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the run trace as a plain dict, serializing each spec via its ``to_dict``."""
        spec_dicts = []
        for spec_trace in self.specs:
            spec_dicts.append(spec_trace.to_dict())
        return dict(
            schemaVersion=self.schema_version,
            run=self.run,
            summary=self.summary,
            latency=self.latency,
            specs=spec_dicts,
        )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def calculate_percentiles(durations: list[float]) -> dict[str, float]:
    """Summarize durations as min, max, rounded mean and p50/p95/p99.

    Percentiles are read from the sorted values at index ``int(n * fraction)``,
    clamped to the last element. An empty input yields all zeros.
    """
    if not durations:
        return dict.fromkeys(("min", "max", "mean", "p50", "p95", "p99"), 0)

    ordered = sorted(durations)
    count = len(ordered)
    last_index = count - 1

    def pick(fraction: float) -> float:
        # Clamp so that high fractions on small samples stay in range.
        return ordered[min(int(count * fraction), last_index)]

    stats: dict[str, float] = {"min": ordered[0], "max": ordered[last_index]}
    stats["mean"] = round(sum(ordered) / count)
    for label, fraction in (("p50", 0.5), ("p95", 0.95), ("p99", 0.99)):
        stats[label] = pick(fraction)
    return stats
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def build_run_trace(
    result: dict[str, Any],
    git_info: dict[str, str] | None = None,
) -> RunTrace:
    """Build a RunTrace from a run result dict.

    Args:
        result: Run result mapping. The keys read are ``run_id``/``runId``,
            ``results``, ``metadata`` and ``summary``; snake_case keys take
            precedence over their camelCase twins. ``results``, ``metadata``
            and ``summary`` fall back to empty defaults when missing or
            explicitly ``None``.
        git_info: Optional git metadata, attached as-is to every spec trace.

    Returns:
        A RunTrace with one SpecTrace per entry of ``results``, the run and
        summary sections, and latency percentiles computed over every spec
        whose status is not ``"skipped"``.
    """
    # Capture the clock once and derive the ISO string from the same value so
    # ``timestamp`` and ``timestampISO`` always describe the same instant.
    now = int(time.time() * 1000)
    now_iso = datetime.fromtimestamp(now / 1000, tz=timezone.utc).isoformat()
    is_ci = bool(os.environ.get("CI") or os.environ.get("GITHUB_ACTIONS") or os.environ.get("GITLAB_CI"))

    # Environment facts are loop-invariant; each spec still gets its own copy.
    python_version = sys.version.split()[0]
    platform_name = platform.system().lower()

    run_id = result.get("run_id", result.get("runId", "unknown"))
    # ``or`` also covers keys that are present but explicitly set to None.
    results_list = result.get("results") or []
    metadata = result.get("metadata") or {}
    summary = result.get("summary") or {}

    spec_traces: list[SpecTrace] = []
    for spec in results_list:
        # Results may be nested under "result" or flattened onto the spec.
        spec_result = spec.get("result")
        if spec_result is None:
            spec_result = spec

        # A None duration would break percentile sorting; fall back to
        # ``duration_ms`` and finally to 0.
        duration = spec_result.get("duration")
        if duration is None:
            duration = spec_result.get("duration_ms")
        if duration is None:
            duration = 0

        spec_traces.append(
            SpecTrace(
                timestamp=now,
                timestamp_iso=now_iso,
                run_id=run_id,
                spec={
                    "id": spec.get("spec_id", spec.get("specId", "")),
                    "name": spec.get("name", ""),
                    "filePath": spec.get("file_path", spec.get("filePath", "")),
                },
                execution={
                    "status": spec_result.get("status", "unknown"),
                    "score": spec_result.get("score"),
                    "duration": duration,
                    "error": spec_result.get("error"),
                },
                git=git_info,
                env={
                    "pythonVersion": python_version,
                    "platform": platform_name,
                    "ci": is_ci,
                },
            )
        )

    # Skipped specs did not run, so they are excluded from latency figures.
    durations = [s.execution.get("duration", 0) for s in spec_traces if s.execution.get("status") != "skipped"]
    latency = calculate_percentiles(durations)

    return RunTrace(
        run={
            "id": run_id,
            "startedAt": metadata.get("started_at", metadata.get("startedAt", now)),
            "completedAt": metadata.get("completed_at", metadata.get("completedAt", now)),
            "duration": metadata.get("duration", 0),
            "mode": metadata.get("mode", "spec"),
        },
        summary={
            "total": len(results_list),
            "passed": summary.get("passed", 0),
            "failed": summary.get("failed", 0),
            "skipped": summary.get("skipped", 0),
            "passRate": summary.get("pass_rate", summary.get("passRate", 0)),
        },
        latency=latency,
        specs=spec_traces,
    )
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
async def write_traces(
    result: dict[str, Any],
    project_root: str | None = None,
    git_info: dict[str, str] | None = None,
) -> str:
    """Persist the run trace as JSON under ``<root>/.evalgate/traces/``.

    The trace is written to ``<run id>.trace.json`` and to
    ``latest.trace.json``. ``project_root`` defaults to the current working
    directory. Returns the path of the run-specific file.
    """
    base_dir = project_root if project_root else os.getcwd()
    traces_dir = os.path.join(base_dir, ".evalgate", "traces")
    os.makedirs(traces_dir, exist_ok=True)

    run_trace = build_run_trace(result, git_info)
    trace_name = "{}.trace.json".format(run_trace.run.get("id", "unknown"))
    trace_file = os.path.join(traces_dir, trace_name)

    # Both files carry the same payload, so serialize it only once.
    serialized = json.dumps(run_trace.to_dict(), indent=2)
    for target in (trace_file, os.path.join(traces_dir, "latest.trace.json")):
        Path(target).write_text(serialized, encoding="utf-8")

    return trace_file
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def format_latency_table(latency: dict[str, Any]) -> str:
    """Render latency statistics as a newline-separated, human-readable block.

    Statistics missing from ``latency`` are shown as 0.
    """
    rows = ["⏱️ Latency Percentiles:"]
    for label in ("min", "p50", "p95", "p99", "max", "mean"):
        value = latency.get(label, 0)
        rows.append(f" {label}: {value}ms")
    return "\n".join(rows)
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Centralized .evalgate workspace resolution.
|
|
2
|
+
|
|
3
|
+
Provides unified workspace path resolution for all EvalGate CLI commands.
|
|
4
|
+
Prefers .evalgate/; falls back to .evalai/ for backward compatibility.
|
|
5
|
+
|
|
6
|
+
Port of ``cli/workspace.ts``.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
import warnings
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
|
|
15
|
+
# Set to True by resolve_eval_workspace after it emits the legacy ``.evalai/``
# deprecation warning, so the warning is issued at most once per process.
_LEGACY_WARNING_SHOWN = False
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class EvalWorkspace:
    """Resolved filesystem locations of an EvalGate workspace.

    As populated by ``resolve_eval_workspace``, every path other than
    ``root`` lives under ``eval_dir``.
    """

    # Project root the workspace was resolved against.
    root: str
    # Workspace directory: ``.evalgate`` or the legacy ``.evalai``.
    eval_dir: str
    # ``runs`` directory inside the workspace directory.
    runs_dir: str
    # ``manifest.json`` inside the workspace directory.
    manifest_path: str
    # ``last-run.json`` inside the workspace directory.
    last_run_path: str
    # ``index.json`` inside the runs directory.
    index_path: str
    # ``baseline-run.json`` inside the workspace directory.
    baseline_path: str
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def resolve_eval_workspace(project_root: str | None = None) -> EvalWorkspace:
    """Compute the EvalGate workspace layout for a project.

    ``.evalgate/`` is used unless only the legacy ``.evalai/`` directory
    exists under the root, in which case the legacy directory is used and a
    DeprecationWarning is emitted (once per process). ``project_root``
    defaults to the current working directory.
    """
    global _LEGACY_WARNING_SHOWN

    base = project_root if project_root else os.getcwd()
    preferred_dir = os.path.join(base, ".evalgate")
    legacy_dir = os.path.join(base, ".evalai")

    if os.path.isdir(legacy_dir) and not os.path.isdir(preferred_dir):
        workspace_dir = legacy_dir
        if not _LEGACY_WARNING_SHOWN:
            warnings.warn(
                "[EvalGate] Deprecation: .evalai/ is deprecated. Migrate to .evalgate/ (e.g. rename .evalai to .evalgate).",
                DeprecationWarning,
                stacklevel=2,
            )
            _LEGACY_WARNING_SHOWN = True
    else:
        workspace_dir = preferred_dir

    def inside(*parts: str) -> str:
        # Join path components below the chosen workspace directory.
        return os.path.join(workspace_dir, *parts)

    runs_dir = inside("runs")
    return EvalWorkspace(
        root=base,
        eval_dir=workspace_dir,
        runs_dir=runs_dir,
        manifest_path=inside("manifest.json"),
        last_run_path=inside("last-run.json"),
        index_path=os.path.join(runs_dir, "index.json"),
        baseline_path=inside("baseline-run.json"),
    )
|