evalgate-sdk 3.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalgate_sdk/__init__.py +707 -0
- evalgate_sdk/_version.py +3 -0
- evalgate_sdk/assertions.py +1362 -0
- evalgate_sdk/auto.py +247 -0
- evalgate_sdk/batch.py +174 -0
- evalgate_sdk/cache.py +111 -0
- evalgate_sdk/ci_context.py +123 -0
- evalgate_sdk/cli/__init__.py +111 -0
- evalgate_sdk/cli/api.py +261 -0
- evalgate_sdk/cli/cli_constants.py +20 -0
- evalgate_sdk/cli/commands.py +1041 -0
- evalgate_sdk/cli/config.py +228 -0
- evalgate_sdk/cli/env.py +43 -0
- evalgate_sdk/cli/formatters/types.py +132 -0
- evalgate_sdk/cli/golden_commands.py +322 -0
- evalgate_sdk/cli/manifest.py +301 -0
- evalgate_sdk/cli/new_commands.py +435 -0
- evalgate_sdk/cli/policy_packs.py +103 -0
- evalgate_sdk/cli/profiles.py +12 -0
- evalgate_sdk/cli/regression_gate.py +312 -0
- evalgate_sdk/cli/render/__init__.py +1 -0
- evalgate_sdk/cli/render/snippet.py +18 -0
- evalgate_sdk/cli/render/sort.py +29 -0
- evalgate_sdk/cli/report/__init__.py +1 -0
- evalgate_sdk/cli/report/build_check_report.py +209 -0
- evalgate_sdk/cli/traces.py +186 -0
- evalgate_sdk/cli/workspace.py +63 -0
- evalgate_sdk/client.py +609 -0
- evalgate_sdk/cluster.py +359 -0
- evalgate_sdk/collector.py +161 -0
- evalgate_sdk/constants.py +6 -0
- evalgate_sdk/context.py +151 -0
- evalgate_sdk/errors.py +236 -0
- evalgate_sdk/export.py +238 -0
- evalgate_sdk/formatters/__init__.py +11 -0
- evalgate_sdk/formatters/github.py +51 -0
- evalgate_sdk/formatters/human.py +68 -0
- evalgate_sdk/formatters/json_fmt.py +11 -0
- evalgate_sdk/formatters/pr_comment.py +80 -0
- evalgate_sdk/golden.py +426 -0
- evalgate_sdk/integrations/__init__.py +1 -0
- evalgate_sdk/integrations/anthropic.py +99 -0
- evalgate_sdk/integrations/autogen.py +62 -0
- evalgate_sdk/integrations/crewai.py +61 -0
- evalgate_sdk/integrations/langchain.py +100 -0
- evalgate_sdk/integrations/openai.py +155 -0
- evalgate_sdk/integrations/openai_eval.py +221 -0
- evalgate_sdk/local.py +144 -0
- evalgate_sdk/logger.py +123 -0
- evalgate_sdk/matchers.py +62 -0
- evalgate_sdk/otel.py +256 -0
- evalgate_sdk/pagination.py +145 -0
- evalgate_sdk/py.typed +0 -0
- evalgate_sdk/pytest_plugin.py +96 -0
- evalgate_sdk/reason_codes.py +103 -0
- evalgate_sdk/regression.py +196 -0
- evalgate_sdk/replay_decision.py +115 -0
- evalgate_sdk/runtime/__init__.py +50 -0
- evalgate_sdk/runtime/adapters/__init__.py +1 -0
- evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
- evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
- evalgate_sdk/runtime/context.py +68 -0
- evalgate_sdk/runtime/eval.py +318 -0
- evalgate_sdk/runtime/execution_mode.py +170 -0
- evalgate_sdk/runtime/executor.py +92 -0
- evalgate_sdk/runtime/registry.py +125 -0
- evalgate_sdk/runtime/run_report.py +249 -0
- evalgate_sdk/runtime/types.py +143 -0
- evalgate_sdk/snapshot.py +219 -0
- evalgate_sdk/streaming.py +124 -0
- evalgate_sdk/synthesize.py +226 -0
- evalgate_sdk/testing.py +128 -0
- evalgate_sdk/types.py +666 -0
- evalgate_sdk/utils/__init__.py +1 -0
- evalgate_sdk/utils/input_hash.py +42 -0
- evalgate_sdk/workflows.py +264 -0
- evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
- evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
- evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
- evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
evalgate_sdk/errors.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
"""Error classes for the EvalAI SDK, with rich error information and documentation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
def _error_doc(slug: str, retryable: bool, *solutions: str) -> dict[str, Any]:
    """Build one ``_ERROR_DOCS`` entry.

    ``slug`` is the last path segment of the documentation URL, ``retryable``
    is the flag stored under ``"retryable"``, and ``solutions`` are the
    remediation hints in the order they are stored.
    """
    return {
        "documentation": f"https://docs.evalgate.com/errors/{slug}",
        "solutions": list(solutions),
        "retryable": retryable,
    }


# Catalogue keyed by error code. Each entry carries a documentation URL, a
# list of suggested solutions and a retryable flag.
_ERROR_DOCS: dict[str, dict[str, Any]] = {
    "MISSING_API_KEY": _error_doc(
        "missing-api-key",
        False,
        "Set EVALGATE_API_KEY environment variable",
        'Pass api_key in config: AIEvalClient(api_key="...")',
    ),
    "RATE_LIMIT_EXCEEDED": _error_doc(
        "rate-limit",
        True,
        "Wait before retrying (check retry_after property)",
        "Upgrade your plan for higher rate limits",
        "Implement exponential backoff",
    ),
    "TIMEOUT": _error_doc(
        "timeout",
        True,
        "Increase timeout: AIEvalClient(timeout=60000)",
        "Check your network connection",
    ),
    "NETWORK_ERROR": _error_doc(
        "network",
        True,
        "Check your internet connection",
        "Verify the base_url is correct",
    ),
    "UNAUTHORIZED": _error_doc(
        "unauthorized",
        False,
        "Verify your API key is correct",
        "Check if your API key has expired",
    ),
    "FORBIDDEN": _error_doc(
        "forbidden",
        False,
        "Check if you have permission for this resource",
        "Verify you're using the correct organization ID",
    ),
    "NOT_FOUND": _error_doc(
        "not-found",
        False,
        "Verify the resource ID is correct",
        "Check if the resource was deleted",
    ),
    "VALIDATION_ERROR": _error_doc(
        "validation",
        False,
        "Check the error details for specific validation failures",
        "Verify all required fields are provided",
    ),
    "INTERNAL_SERVER_ERROR": _error_doc(
        "server-error",
        True,
        "Retry the request after a brief delay",
        "Contact support if the issue persists",
    ),
    "FEATURE_LIMIT_REACHED": _error_doc(
        "feature-limit",
        False,
        "Upgrade your plan for higher limits",
        "Wait for your usage to reset",
    ),
}
|
|
91
|
+
|
|
92
|
+
# HTTP status -> SDK error code. Statuses that are absent here are resolved by
# whoever performs the lookup (via ``.get`` and its own fallback).
_STATUS_TO_CODE: dict[int, str] = {
    # Authentication / authorisation
    401: "UNAUTHORIZED",
    403: "FORBIDDEN",
    # Resource and request problems
    404: "NOT_FOUND",
    408: "TIMEOUT",
    422: "VALIDATION_ERROR",
    # Throttling
    429: "RATE_LIMIT_EXCEEDED",
}
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class EvalGateError(Exception):
    """Base error for the EvalGate SDK with rich diagnostics.

    Besides the message, each instance carries an error ``code``, the HTTP
    ``status_code``, a documentation URL, suggested ``solutions`` and a
    ``retryable`` flag, all looked up in ``_ERROR_DOCS`` by ``code``.
    """

    code: str
    status_code: int
    documentation: str
    solutions: list[str]
    retryable: bool
    details: Any | None
    retry_after: int | None
    reset_at: datetime | None
    request_id: str | None

    def __init__(
        self,
        message: str,
        code: str = "UNKNOWN_ERROR",
        status_code: int = 0,
        details: Any | None = None,
    ) -> None:
        """Initialise the error.

        Args:
            message: Human-readable description, stored as ``args[0]``.
            code: Error code used to look up documentation in ``_ERROR_DOCS``.
            status_code: HTTP status associated with the error (``0`` by default).
            details: Arbitrary payload. When it is a dict, ``requestId`` is
                read from it, plus ``retryAfter`` for ``RATE_LIMIT_EXCEEDED``
                and ``resetAt`` for ``FEATURE_LIMIT_REACHED``.
        """
        super().__init__(message)
        self.code = code
        self.status_code = status_code
        self.details = details

        doc = _ERROR_DOCS.get(code, {})
        self.documentation = doc.get("documentation", f"https://docs.evalgate.com/errors/{code}")
        # Copy the list: handing out the catalogue's own list would let a
        # caller's ``err.solutions.append(...)`` alter every later error.
        self.solutions = list(doc.get("solutions", ["Check the error details for more information"]))
        self.retryable = doc.get("retryable", False)
        self.retry_after = None
        self.reset_at = None
        self.request_id = None

        if isinstance(details, dict):
            if code == "RATE_LIMIT_EXCEEDED" and "retryAfter" in details:
                self.retry_after = self._parse_retry_after(details["retryAfter"])
            if code == "FEATURE_LIMIT_REACHED" and "resetAt" in details:
                self.reset_at = self._parse_reset_at(details["resetAt"])
            self.request_id = details.get("requestId")

    @staticmethod
    def _parse_retry_after(value: Any) -> int | None:
        """Return *value* as an int, or ``None`` when it is not integer-like.

        A malformed payload must not raise from inside the exception
        constructor, because that would hide the error being reported.
        """
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return None

    @staticmethod
    def _parse_reset_at(value: Any) -> datetime | None:
        """Return *value* as a ``datetime``, or ``None`` when it cannot be parsed.

        Accepts ``datetime`` instances and ISO-8601 strings. A trailing ``Z``
        is rewritten to ``+00:00`` because ``datetime.fromisoformat`` only
        accepts the ``Z`` suffix from Python 3.11 onwards.
        """
        if isinstance(value, datetime):
            return value
        if not isinstance(value, str):
            return None
        text = value.strip()
        if text.endswith(("Z", "z")):
            text = f"{text[:-1]}+00:00"
        try:
            return datetime.fromisoformat(text)
        except ValueError:
            return None

    @property
    def message(self) -> str:
        """Return the error message string, matching the TS ``error.message`` API."""
        return str(self.args[0]) if self.args else ""

    def should_retry(self) -> bool:
        """Return the ``retryable`` flag."""
        return self.retryable

    def detailed_message(self) -> str:
        """Return a multi-line description: code, docs URL, solutions and timing hints."""
        lines = [f"{self.code}: {self}", "", f"Documentation: {self.documentation}", ""]
        lines.append("Suggested solutions:")
        for i, s in enumerate(self.solutions, 1):
            lines.append(f" {i}. {s}")
        if self.retry_after is not None:
            lines.append(f"\nRetry after: {self.retry_after} seconds")
        if self.reset_at is not None:
            lines.append(f"\nLimit resets at: {self.reset_at.isoformat()}")
        return "\n".join(lines)

    def to_dict(self) -> dict[str, Any]:
        """Return the error's fields as a plain dict (``reset_at`` is not included)."""
        return {
            "code": self.code,
            "message": str(self),
            "status_code": self.status_code,
            "documentation": self.documentation,
            "solutions": self.solutions,
            "retryable": self.retryable,
            "retry_after": self.retry_after,
            "request_id": self.request_id,
            "details": self.details,
        }
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
class RateLimitError(EvalGateError):
    """Error with code ``RATE_LIMIT_EXCEEDED`` and status 429.

    Args:
        message: Human-readable description.
        retry_after: Seconds to wait before retrying, if known. It is passed
            to the base class as ``{"retryAfter": retry_after}``.
    """

    def __init__(self, message: str = "Rate limit exceeded", retry_after: int | None = None):
        # Compare against None rather than truthiness so that an explicit
        # ``retry_after=0`` is still recorded.
        details = {"retryAfter": retry_after} if retry_after is not None else None
        super().__init__(message, "RATE_LIMIT_EXCEEDED", 429, details)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
class AuthenticationError(EvalGateError):
    """Error with code ``UNAUTHORIZED`` and status 401."""

    def __init__(self, message: str = "Authentication failed"):
        super().__init__(message, code="UNAUTHORIZED", status_code=401)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
class ValidationError(EvalGateError):
    """Error with code ``VALIDATION_ERROR`` and status 400.

    ``details`` is forwarded unchanged to the base class.
    """

    def __init__(self, message: str = "Validation failed", details: Any | None = None):
        super().__init__(message, code="VALIDATION_ERROR", status_code=400, details=details)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class NetworkError(EvalGateError):
    """Error with code ``NETWORK_ERROR`` and status 0; always marked retryable."""

    def __init__(self, message: str = "Network request failed"):
        super().__init__(message, code="NETWORK_ERROR", status_code=0)
        # Set explicitly, whatever the base class derived from the catalogue.
        self.retryable = True
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def create_error_from_response(status_code: int, data: Any) -> EvalGateError:
    """Create an EvalGateError from an HTTP response status and body.

    Args:
        status_code: HTTP status of the response.
        data: Response body. For a dict, ``code``, ``message``, ``requestId``
            and ``retryAfter`` are read from the nested ``error`` object
            and/or the top level. Any other value is stringified and used as
            the message.

    Returns:
        ``RateLimitError`` for status 429, ``AuthenticationError`` for status
        401 or code ``UNAUTHORIZED``, ``ValidationError`` for status 400 or
        code ``VALIDATION_ERROR``, otherwise a plain ``EvalGateError``. The
        returned error reports the real ``status_code`` of the response.
    """
    nested: dict[str, Any] = {}
    if isinstance(data, dict):
        error_obj = data.get("error", data)
        if isinstance(error_obj, str):
            # ``{"error": "text"}``: the string is the message and the
            # remaining fields live on the top-level object.
            error_obj = data
        if isinstance(error_obj, dict):
            nested = error_obj
        code = (
            nested.get("code")
            or data.get("code")
            or _STATUS_TO_CODE.get(status_code)
            or ("INTERNAL_SERVER_ERROR" if status_code >= 500 else "UNKNOWN_ERROR")
        )
        message = (
            (data["error"] if isinstance(data.get("error"), str) else None)
            or nested.get("message")
            or data.get("message")
            or "Unknown error"
        )
        request_id = nested.get("requestId") or data.get("requestId")
    else:
        code = _STATUS_TO_CODE.get(status_code, "INTERNAL_SERVER_ERROR" if status_code >= 500 else "UNKNOWN_ERROR")
        message = str(data) if data else "Unknown error"
        request_id = None

    err: EvalGateError
    if status_code == 429:
        raw_retry_after = None
        if isinstance(data, dict):
            raw_retry_after = data.get("retryAfter") or nested.get("retryAfter")
        retry_after = None
        if raw_retry_after:
            # A non-numeric value must not crash error construction; fall
            # back to "unknown" instead.
            try:
                retry_after = int(raw_retry_after)
            except (TypeError, ValueError, OverflowError):
                retry_after = None
        err = RateLimitError(message, retry_after=retry_after)
    elif status_code == 401 or code == "UNAUTHORIZED":
        err = AuthenticationError(message)
    elif status_code == 400 or code == "VALIDATION_ERROR":
        err = ValidationError(message, details=data)
    else:
        err = EvalGateError(message, code, status_code, data)

    # The specialised subclasses hard-code a status (e.g. ValidationError is
    # always 400), which would misreport a 422 response. Keep the real one.
    if status_code and err.status_code != status_code:
        err.status_code = status_code
    if request_id:
        err.request_id = request_id
    return err
|
evalgate_sdk/export.py
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
"""Data export and import — JSON, CSV, JSONL formats with LangSmith conversion."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
import io
|
|
7
|
+
import json
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any, Literal
|
|
11
|
+
|
|
12
|
+
ExportFormat = Literal["json", "csv", "jsonl"]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class ExportOptions:
    """Options accepted by ``export_data``."""

    # Output format recorded on the resulting export.
    format: ExportFormat = "json"

    # Which resource kinds to include.
    include_traces: bool = True
    include_evaluations: bool = True
    include_test_cases: bool = True
    include_runs: bool = True

    # Optional filters.
    start_date: str | None = None
    end_date: str | None = None
    organization_id: int | None = None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class ImportOptions:
    """Options accepted by ``import_data``."""

    # Count failures whose message mentions "duplicate" as skipped, not errors.
    skip_duplicates: bool = True
    # Only count what would be imported; create nothing.
    dry_run: bool = False
    organization_id: int | None = None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class ExportData:
    """Container for exported records; every collection starts out empty."""

    format: ExportFormat = "json"

    # One list of plain dicts per resource kind.
    traces: list[dict[str, Any]] = field(default_factory=list)
    evaluations: list[dict[str, Any]] = field(default_factory=list)
    test_cases: list[dict[str, Any]] = field(default_factory=list)
    runs: list[dict[str, Any]] = field(default_factory=list)

    # Free-form information about the export itself.
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class ImportResult:
    """Outcome counters reported by ``import_data``."""

    # Number of records created (or that would be created in a dry run).
    imported: int = 0
    # Number of records counted as duplicates.
    skipped: int = 0
    # Stringified exceptions for records that failed for any other reason.
    errors: list[str] = field(default_factory=list)
    dry_run: bool = False
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
async def export_data(client: Any, options: ExportOptions | None = None) -> ExportData:
    """Export traces, evaluations, test cases, and runs from the platform.

    Args:
        client: Object exposing ``traces.list``, ``evaluations.list``,
            ``evaluations.list_test_cases`` and ``evaluations.list_runs``
            as awaitables.
        options: What to export; defaults to ``ExportOptions()``.

    Returns:
        An ``ExportData`` whose ``metadata`` holds ``exported_at`` and
        ``total_items`` (traces plus evaluations).
    """
    # NOTE(review): ``opts.start_date`` / ``opts.end_date`` are not applied
    # anywhere in this function.
    opts = options or ExportOptions()
    data = ExportData(format=opts.format)

    if opts.include_traces:
        from evalgate_sdk.types import ListTracesParams

        # Single request with limit=100; no further pages are fetched here.
        params = ListTracesParams(limit=100)
        if opts.organization_id:
            params.organization_id = opts.organization_id
        traces = await client.traces.list(params)
        data.traces = [t.model_dump(mode="json", by_alias=True) for t in traces]

    # Test cases and runs are listed per evaluation, so the evaluation list is
    # needed whenever any of the three is requested. Fetching it only under
    # ``include_evaluations`` left ``evals`` unbound in the loops below.
    evals: list[Any] = []
    if opts.include_evaluations or opts.include_test_cases or opts.include_runs:
        evals = await client.evaluations.list()

    if opts.include_evaluations:
        data.evaluations = [e.model_dump(mode="json", by_alias=True) for e in evals]

    if opts.include_test_cases:
        for ev in evals:
            tcs = await client.evaluations.list_test_cases(ev.id)
            data.test_cases.extend(tc.model_dump(mode="json", by_alias=True) for tc in tcs)

    if opts.include_runs:
        for ev in evals:
            runs = await client.evaluations.list_runs(ev.id)
            data.runs.extend(r.model_dump(mode="json", by_alias=True) for r in runs)

    data.metadata = {"exported_at": _now_iso(), "total_items": len(data.traces) + len(data.evaluations)}
    return data
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
async def import_data(data: ExportData, options: ImportOptions | None = None, *, client: Any = None) -> ImportResult:
    """Import data back into the platform.

    Only ``data.traces`` and ``data.evaluations`` are processed. Traces are
    created from their ``name`` and ``metadata``, evaluations from their
    ``name`` and ``description``.

    The *client* parameter is keyword-only. When omitted the function
    attempts to use the global default client (``get_default_client()``).
    This 2-arg signature matches the TS public export.

    Raises:
        TypeError: if no client was passed and the default client could not
            be obtained.
    """
    if client is None:
        try:
            from evalgate_sdk.client import get_default_client

            client = get_default_client()
        except Exception as err:
            raise TypeError(
                "import_data() requires a client. Either pass client=... or initialise a default client first."
            ) from err

    settings = options or ImportOptions()
    outcome = ImportResult(dry_run=settings.dry_run)

    # Dry run: report how many records would be attempted and stop.
    if settings.dry_run:
        outcome.imported = len(data.traces) + len(data.evaluations)
        return outcome

    from evalgate_sdk.types import CreateEvaluationParams, CreateTraceParams

    def _note_failure(exc: Exception) -> None:
        """Count *exc* as a skipped duplicate or record it as an error."""
        if settings.skip_duplicates and "duplicate" in str(exc).lower():
            outcome.skipped += 1
        else:
            outcome.errors.append(str(exc))

    for trace_record in data.traces:
        try:
            trace_params = CreateTraceParams(
                name=trace_record.get("name", "imported"),
                metadata=trace_record.get("metadata"),
            )
            await client.traces.create(trace_params)
        except Exception as exc:
            _note_failure(exc)
        else:
            outcome.imported += 1

    for eval_record in data.evaluations:
        try:
            eval_params = CreateEvaluationParams(
                name=eval_record.get("name", "imported"),
                description=eval_record.get("description"),
            )
            await client.evaluations.create(eval_params)
        except Exception as exc:
            _note_failure(exc)
        else:
            outcome.imported += 1

    return outcome
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def export_to_file(data: ExportData, file_path: str) -> None:
    """Write export data to a file in the specified format.

    The format is taken from ``data.format``:

    * ``"json"``  - one document with traces, evaluations, test cases, runs
      and metadata.
    * ``"jsonl"`` - one JSON object per line, traces then evaluations, each
      tagged with a ``"type"`` key.
    * ``"csv"``   - the traces only, via ``convert_to_csv``.

    Any other format value writes nothing. Missing parent directories are
    created.
    """
    path = Path(file_path)
    path.parent.mkdir(parents=True, exist_ok=True)

    if data.format == "json":
        payload = {
            "traces": data.traces,
            "evaluations": data.evaluations,
            "test_cases": data.test_cases,
            "runs": data.runs,
            "metadata": data.metadata,
        }
        path.write_text(json.dumps(payload, indent=2, default=str), encoding="utf-8")

    elif data.format == "jsonl":
        # NOTE(review): the record's own keys are spread after the tag, so a
        # record that already has a "type" key overrides the tag.
        lines = [json.dumps({"type": "trace", **t}, default=str) for t in data.traces]
        lines.extend(json.dumps({"type": "evaluation", **e}, default=str) for e in data.evaluations)
        path.write_text("\n".join(lines), encoding="utf-8")

    elif data.format == "csv":
        # The csv module emits its own "\r\n" terminators. Opening with
        # newline="" stops text mode translating them again, which would
        # produce "\r\r\n" on Windows.
        with path.open("w", encoding="utf-8", newline="") as handle:
            handle.write(convert_to_csv(data, "traces"))
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def import_from_file(file_path: str) -> ExportData:
    """Read export data from a file.

    A file whose name ends in ``.jsonl`` (any letter case) is read line by
    line. Each object's ``"type"`` key (default ``"trace"``) decides whether
    it is stored as a trace or an evaluation, and other types are ignored.
    Every other file is parsed as a single JSON document.

    Args:
        file_path: Location of the file, as a string or path-like object.

    Raises:
        json.JSONDecodeError: if the content is not valid JSON / JSONL.
    """
    path = Path(file_path)
    text = path.read_text(encoding="utf-8")

    # Test the file name taken from ``path`` so that path-like arguments and
    # upper-case extensions are handled as well as plain strings.
    if path.name.lower().endswith(".jsonl"):
        data = ExportData(format="jsonl")
        for line in text.splitlines():
            # Blank lines (e.g. a trailing newline or a gap between records)
            # are not records; feeding them to json.loads would raise.
            if not line.strip():
                continue
            obj = json.loads(line)
            record_type = obj.pop("type", "trace")
            if record_type == "trace":
                data.traces.append(obj)
            elif record_type == "evaluation":
                data.evaluations.append(obj)
        return data

    raw = json.loads(text)
    return ExportData(
        format="json",
        traces=raw.get("traces", []),
        evaluations=raw.get("evaluations", []),
        test_cases=raw.get("test_cases", []),
        runs=raw.get("runs", []),
        metadata=raw.get("metadata", {}),
    )
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def convert_to_csv(data: ExportData, resource_type: Literal["traces", "evaluations"] = "traces") -> str:
    """Convert export data to CSV string.

    Args:
        data: Export whose ``traces`` or ``evaluations`` list is rendered.
        resource_type: ``"traces"`` selects ``data.traces``; any other value
            selects ``data.evaluations``.

    Returns:
        CSV text with a header row, or ``""`` when there are no records.
        Columns are the sorted union of all record keys. ``dict`` and ``list``
        values are embedded as JSON, and keys missing from a record are left
        empty.
    """
    items = data.traces if resource_type == "traces" else data.evaluations
    if not items:
        return ""

    # Use every record's keys for the header. Taking only the first record's
    # keys makes DictWriter raise ValueError as soon as a later record has a
    # key the first one lacks.
    fieldnames = sorted({key for item in items for key in item})

    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(
        {k: json.dumps(v) if isinstance(v, (dict, list)) else v for k, v in item.items()}
        for item in items
    )
    return buf.getvalue()
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def import_from_langsmith(langsmith_data: Any) -> ExportData:
    """Convert LangSmith export format to EvalAI format.

    Each element of a list input becomes one trace: ``name`` (falling back to
    ``run_type``, then ``"langsmith-import"``), JSON-encoded ``inputs`` and
    ``outputs``, and metadata recording the source id and run type. Any
    non-list input yields an empty ``ExportData``.
    """
    converted = ExportData()
    if not isinstance(langsmith_data, list):
        return converted

    for run in langsmith_data:
        trace_name = run.get("name", run.get("run_type", "langsmith-import"))
        origin = {
            "langsmith_id": run.get("id"),
            "run_type": run.get("run_type"),
            "source": "langsmith",
        }
        trace = {
            "name": trace_name,
            "input": json.dumps(run.get("inputs", {})),
            "output": json.dumps(run.get("outputs", {})),
            "metadata": origin,
        }
        converted.traces.append(trace)
    return converted
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    from datetime import datetime, timezone

    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Output formatters for evaluation results (T10).
|
|
2
|
+
|
|
3
|
+
Provides human, JSON, GitHub, and PR comment output formats.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from evalgate_sdk.formatters.github import format_github
|
|
7
|
+
from evalgate_sdk.formatters.human import format_human
|
|
8
|
+
from evalgate_sdk.formatters.json_fmt import format_json
|
|
9
|
+
from evalgate_sdk.formatters.pr_comment import format_pr_comment
|
|
10
|
+
|
|
11
|
+
__all__ = ["format_human", "format_json", "format_github", "format_pr_comment"]
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""GitHub Actions output formatter."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _escape_workflow_data(value: Any) -> str:
    """Escape *value* for the message part of a workflow command."""
    # "%" first, otherwise the escapes produced below would be escaped again.
    return str(value).replace("%", "%25").replace("\r", "%0D").replace("\n", "%0A")


def _escape_workflow_property(value: Any) -> str:
    """Escape *value* for a ``key=value`` property of a workflow command."""
    return _escape_workflow_data(value).replace(":", "%3A").replace(",", "%2C")


def format_github(report: dict[str, Any]) -> str:
    """Format a check/run report as GitHub Actions workflow commands.

    Uses ``::error``, ``::warning``, and ``::notice`` annotations: one summary
    line chosen by ``verdict`` followed by an ``::error`` line for each of the
    first ten failed cases. When the ``GITHUB_OUTPUT`` environment variable is
    set, ``verdict`` (and ``score`` if present) are appended to that file.

    Messages and property values are percent-escaped so that newlines, ``%``,
    ``,`` and ``:`` in report text cannot truncate or corrupt a command.

    Args:
        report: Report dict; both camelCase and snake_case keys are accepted.

    Returns:
        The workflow commands joined with newlines.
    """
    import os

    def pick(mapping: dict[str, Any], first: str, second: str, default: Any) -> Any:
        """Return the first non-None value among the two keys, else *default*."""
        for key in (first, second):
            value = mapping.get(key)
            if value is not None:
                return value
        return default

    lines: list[str] = []
    verdict = report.get("verdict", "unknown")
    eval_id = pick(report, "evaluationId", "run_id", "?")
    score = report.get("score")
    reason = pick(report, "reasonMessage", "reason_message", "")

    # Summary annotation
    if verdict == "pass":
        summary = _escape_workflow_data(f"Evaluation {eval_id} passed (score={score})")
        lines.append(f"::notice title=EvalGate Pass::{summary}")
    elif verdict == "warn":
        summary = _escape_workflow_data(f"Evaluation {eval_id}: {reason}")
        lines.append(f"::warning title=EvalGate Warning::{summary}")
    else:
        summary = _escape_workflow_data(f"Evaluation {eval_id} failed: {reason}")
        lines.append(f"::error title=EvalGate Fail::{summary}")

    # Set output variables via GITHUB_OUTPUT (::set-output is deprecated since Oct 2022)
    github_output = os.environ.get("GITHUB_OUTPUT")
    if github_output:
        try:
            with open(github_output, "a", encoding="utf-8") as f:
                # The file is line-oriented ``name=value``; flatten newlines so
                # a value cannot spill into extra output entries.
                flat_verdict = str(verdict).replace("\r", " ").replace("\n", " ")
                f.write(f"verdict={flat_verdict}\n")
                if score is not None:
                    flat_score = str(score).replace("\r", " ").replace("\n", " ")
                    f.write(f"score={flat_score}\n")
        except OSError:
            # Best effort: failing to write outputs must not block the report.
            pass

    # Failed cases as annotations (``or []`` tolerates an explicit null).
    failed_cases = pick(report, "failedCases", "failed_cases", None) or []
    for fc in failed_cases[:10]:
        name = fc.get("name", fc.get("test_name", "?"))
        msg = fc.get("reason", fc.get("message", "failed"))
        file_path = fc.get("file_path", fc.get("filePath", ""))
        text = _escape_workflow_data(f"{name}: {msg}")
        if file_path:
            lines.append(f"::error file={_escape_workflow_property(file_path)}::{text}")
        else:
            lines.append(f"::error::{text}")

    return "\n".join(lines)
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Human-readable output formatter."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _first_value(mapping: dict[str, Any], *keys: str, default: Any = None) -> Any:
    """Return the value of the first key in *keys* that is present and not None."""
    for key in keys:
        value = mapping.get(key)
        if value is not None:
            return value
    return default


def format_human(report: dict[str, Any]) -> str:
    """Format a check/run report as a human-readable string.

    The output has a PASS / WARN / FAIL header, then (when present) the
    summary counters, score with baseline and delta, the reason, and up to
    five failed cases. camelCase and snake_case keys are both accepted.
    Keys explicitly set to ``None`` are treated like missing keys, so a report
    with null fields does not crash the formatter.
    """
    lines: list[str] = []

    summary = report.get("summary")
    if not isinstance(summary, dict):
        summary = {}

    verdict = report.get("verdict")
    if verdict is None:
        # Fall back to the summary's boolean ``success`` flag.
        verdict = _first_value(summary, "success", default="unknown")
    eval_id = _first_value(report, "evaluationId", "run_id", default="?")

    # Header
    if verdict == "pass" or verdict is True:
        lines.append(f"✅ PASS — {eval_id}")
    elif verdict == "warn":
        lines.append(f"⚠️ WARN — {eval_id}")
    else:
        lines.append(f"❌ FAIL — {eval_id}")

    lines.append("")

    # Summary
    if summary:
        total = _first_value(summary, "total", default=0)
        passed = _first_value(summary, "passed", default=0)
        failed = _first_value(summary, "failed", default=0)
        pass_rate = _first_value(summary, "pass_rate", "passRate", default=0)
        avg_score = _first_value(summary, "average_score", "averageScore", default=0)
        duration = _first_value(summary, "total_duration_ms", "totalDurationMs", default=0)

        lines.append(f" Total: {total}")
        lines.append(f" Passed: {passed}")
        lines.append(f" Failed: {failed}")
        lines.append(f" Pass rate: {pass_rate:.1f}%")
        lines.append(f" Avg score: {avg_score:.1f}")
        lines.append(f" Duration: {duration:.0f}ms")

    # Score
    score = report.get("score")
    if score is not None:
        lines.append(f"\n Score: {score}")

    baseline = _first_value(report, "baselineScore", "baseline_score")
    if baseline is not None:
        delta = report.get("delta")
        if delta is None:
            # No usable delta supplied: derive it from score and baseline.
            delta = (score or 0) - baseline
        lines.append(f" Baseline: {baseline}")
        lines.append(f" Delta: {delta:+.1f}")

    # Reason
    reason = _first_value(report, "reasonMessage", "reason_message")
    if reason:
        lines.append(f"\n Reason: {reason}")

    # Failed cases
    failed_cases = _first_value(report, "failedCases", "failed_cases", default=[])
    if failed_cases:
        lines.append(f"\n Failed cases ({len(failed_cases)}):")
        for fc in failed_cases[:5]:
            name = fc.get("name", fc.get("test_name", "?"))
            reason_text = fc.get("reason", fc.get("message", ""))
            lines.append(f" • {name}: {reason_text}")
        if len(failed_cases) > 5:
            lines.append(f" ... and {len(failed_cases) - 5} more")

    return "\n".join(lines)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""JSON output formatter."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def format_json(report: dict[str, Any], *, indent: int = 2) -> str:
    """Format a check/run report as a JSON string.

    ``indent`` is handed to ``json.dumps``; values the encoder cannot handle
    natively are rendered with ``str``.
    """
    encoder_options: dict[str, Any] = {"indent": indent, "default": str}
    return json.dumps(report, **encoder_options)
|