evalgate-sdk 3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. evalgate_sdk/__init__.py +707 -0
  2. evalgate_sdk/_version.py +3 -0
  3. evalgate_sdk/assertions.py +1362 -0
  4. evalgate_sdk/auto.py +247 -0
  5. evalgate_sdk/batch.py +174 -0
  6. evalgate_sdk/cache.py +111 -0
  7. evalgate_sdk/ci_context.py +123 -0
  8. evalgate_sdk/cli/__init__.py +111 -0
  9. evalgate_sdk/cli/api.py +261 -0
  10. evalgate_sdk/cli/cli_constants.py +20 -0
  11. evalgate_sdk/cli/commands.py +1041 -0
  12. evalgate_sdk/cli/config.py +228 -0
  13. evalgate_sdk/cli/env.py +43 -0
  14. evalgate_sdk/cli/formatters/types.py +132 -0
  15. evalgate_sdk/cli/golden_commands.py +322 -0
  16. evalgate_sdk/cli/manifest.py +301 -0
  17. evalgate_sdk/cli/new_commands.py +435 -0
  18. evalgate_sdk/cli/policy_packs.py +103 -0
  19. evalgate_sdk/cli/profiles.py +12 -0
  20. evalgate_sdk/cli/regression_gate.py +312 -0
  21. evalgate_sdk/cli/render/__init__.py +1 -0
  22. evalgate_sdk/cli/render/snippet.py +18 -0
  23. evalgate_sdk/cli/render/sort.py +29 -0
  24. evalgate_sdk/cli/report/__init__.py +1 -0
  25. evalgate_sdk/cli/report/build_check_report.py +209 -0
  26. evalgate_sdk/cli/traces.py +186 -0
  27. evalgate_sdk/cli/workspace.py +63 -0
  28. evalgate_sdk/client.py +609 -0
  29. evalgate_sdk/cluster.py +359 -0
  30. evalgate_sdk/collector.py +161 -0
  31. evalgate_sdk/constants.py +6 -0
  32. evalgate_sdk/context.py +151 -0
  33. evalgate_sdk/errors.py +236 -0
  34. evalgate_sdk/export.py +238 -0
  35. evalgate_sdk/formatters/__init__.py +11 -0
  36. evalgate_sdk/formatters/github.py +51 -0
  37. evalgate_sdk/formatters/human.py +68 -0
  38. evalgate_sdk/formatters/json_fmt.py +11 -0
  39. evalgate_sdk/formatters/pr_comment.py +80 -0
  40. evalgate_sdk/golden.py +426 -0
  41. evalgate_sdk/integrations/__init__.py +1 -0
  42. evalgate_sdk/integrations/anthropic.py +99 -0
  43. evalgate_sdk/integrations/autogen.py +62 -0
  44. evalgate_sdk/integrations/crewai.py +61 -0
  45. evalgate_sdk/integrations/langchain.py +100 -0
  46. evalgate_sdk/integrations/openai.py +155 -0
  47. evalgate_sdk/integrations/openai_eval.py +221 -0
  48. evalgate_sdk/local.py +144 -0
  49. evalgate_sdk/logger.py +123 -0
  50. evalgate_sdk/matchers.py +62 -0
  51. evalgate_sdk/otel.py +256 -0
  52. evalgate_sdk/pagination.py +145 -0
  53. evalgate_sdk/py.typed +0 -0
  54. evalgate_sdk/pytest_plugin.py +96 -0
  55. evalgate_sdk/reason_codes.py +103 -0
  56. evalgate_sdk/regression.py +196 -0
  57. evalgate_sdk/replay_decision.py +115 -0
  58. evalgate_sdk/runtime/__init__.py +50 -0
  59. evalgate_sdk/runtime/adapters/__init__.py +1 -0
  60. evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
  61. evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
  62. evalgate_sdk/runtime/context.py +68 -0
  63. evalgate_sdk/runtime/eval.py +318 -0
  64. evalgate_sdk/runtime/execution_mode.py +170 -0
  65. evalgate_sdk/runtime/executor.py +92 -0
  66. evalgate_sdk/runtime/registry.py +125 -0
  67. evalgate_sdk/runtime/run_report.py +249 -0
  68. evalgate_sdk/runtime/types.py +143 -0
  69. evalgate_sdk/snapshot.py +219 -0
  70. evalgate_sdk/streaming.py +124 -0
  71. evalgate_sdk/synthesize.py +226 -0
  72. evalgate_sdk/testing.py +128 -0
  73. evalgate_sdk/types.py +666 -0
  74. evalgate_sdk/utils/__init__.py +1 -0
  75. evalgate_sdk/utils/input_hash.py +42 -0
  76. evalgate_sdk/workflows.py +264 -0
  77. evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
  78. evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
  79. evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
  80. evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
evalgate_sdk/errors.py ADDED
@@ -0,0 +1,236 @@
1
+ """Error classes for the EvalAI SDK, with rich error information and documentation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime
6
+ from typing import Any
7
+
8
+ _ERROR_DOCS: dict[str, dict[str, Any]] = {
9
+ "MISSING_API_KEY": {
10
+ "documentation": "https://docs.evalgate.com/errors/missing-api-key",
11
+ "solutions": [
12
+ "Set EVALGATE_API_KEY environment variable",
13
+ 'Pass api_key in config: AIEvalClient(api_key="...")',
14
+ ],
15
+ "retryable": False,
16
+ },
17
+ "RATE_LIMIT_EXCEEDED": {
18
+ "documentation": "https://docs.evalgate.com/errors/rate-limit",
19
+ "solutions": [
20
+ "Wait before retrying (check retry_after property)",
21
+ "Upgrade your plan for higher rate limits",
22
+ "Implement exponential backoff",
23
+ ],
24
+ "retryable": True,
25
+ },
26
+ "TIMEOUT": {
27
+ "documentation": "https://docs.evalgate.com/errors/timeout",
28
+ "solutions": [
29
+ "Increase timeout: AIEvalClient(timeout=60000)",
30
+ "Check your network connection",
31
+ ],
32
+ "retryable": True,
33
+ },
34
+ "NETWORK_ERROR": {
35
+ "documentation": "https://docs.evalgate.com/errors/network",
36
+ "solutions": [
37
+ "Check your internet connection",
38
+ "Verify the base_url is correct",
39
+ ],
40
+ "retryable": True,
41
+ },
42
+ "UNAUTHORIZED": {
43
+ "documentation": "https://docs.evalgate.com/errors/unauthorized",
44
+ "solutions": [
45
+ "Verify your API key is correct",
46
+ "Check if your API key has expired",
47
+ ],
48
+ "retryable": False,
49
+ },
50
+ "FORBIDDEN": {
51
+ "documentation": "https://docs.evalgate.com/errors/forbidden",
52
+ "solutions": [
53
+ "Check if you have permission for this resource",
54
+ "Verify you're using the correct organization ID",
55
+ ],
56
+ "retryable": False,
57
+ },
58
+ "NOT_FOUND": {
59
+ "documentation": "https://docs.evalgate.com/errors/not-found",
60
+ "solutions": [
61
+ "Verify the resource ID is correct",
62
+ "Check if the resource was deleted",
63
+ ],
64
+ "retryable": False,
65
+ },
66
+ "VALIDATION_ERROR": {
67
+ "documentation": "https://docs.evalgate.com/errors/validation",
68
+ "solutions": [
69
+ "Check the error details for specific validation failures",
70
+ "Verify all required fields are provided",
71
+ ],
72
+ "retryable": False,
73
+ },
74
+ "INTERNAL_SERVER_ERROR": {
75
+ "documentation": "https://docs.evalgate.com/errors/server-error",
76
+ "solutions": [
77
+ "Retry the request after a brief delay",
78
+ "Contact support if the issue persists",
79
+ ],
80
+ "retryable": True,
81
+ },
82
+ "FEATURE_LIMIT_REACHED": {
83
+ "documentation": "https://docs.evalgate.com/errors/feature-limit",
84
+ "solutions": [
85
+ "Upgrade your plan for higher limits",
86
+ "Wait for your usage to reset",
87
+ ],
88
+ "retryable": False,
89
+ },
90
+ }
91
+
92
+ _STATUS_TO_CODE = {
93
+ 401: "UNAUTHORIZED",
94
+ 403: "FORBIDDEN",
95
+ 404: "NOT_FOUND",
96
+ 408: "TIMEOUT",
97
+ 422: "VALIDATION_ERROR",
98
+ 429: "RATE_LIMIT_EXCEEDED",
99
+ }
100
+
101
+
102
+ class EvalGateError(Exception):
103
+ """Base error for the EvalAI SDK with rich diagnostics."""
104
+
105
+ code: str
106
+ status_code: int
107
+ documentation: str
108
+ solutions: list[str]
109
+ retryable: bool
110
+ details: Any | None
111
+ retry_after: int | None
112
+ reset_at: datetime | None
113
+ request_id: str | None
114
+
115
+ def __init__(
116
+ self,
117
+ message: str,
118
+ code: str = "UNKNOWN_ERROR",
119
+ status_code: int = 0,
120
+ details: Any | None = None,
121
+ ) -> None:
122
+ super().__init__(message)
123
+ self.code = code
124
+ self.status_code = status_code
125
+ self.details = details
126
+
127
+ doc = _ERROR_DOCS.get(code, {})
128
+ self.documentation = doc.get("documentation", f"https://docs.evalgate.com/errors/{code}")
129
+ self.solutions = doc.get("solutions", ["Check the error details for more information"])
130
+ self.retryable = doc.get("retryable", False)
131
+ self.retry_after = None
132
+ self.reset_at = None
133
+ self.request_id = None
134
+
135
+ if isinstance(details, dict):
136
+ if code == "RATE_LIMIT_EXCEEDED" and "retryAfter" in details:
137
+ self.retry_after = int(details["retryAfter"])
138
+ if code == "FEATURE_LIMIT_REACHED" and "resetAt" in details:
139
+ self.reset_at = datetime.fromisoformat(details["resetAt"])
140
+ self.request_id = details.get("requestId")
141
+
142
+ @property
143
+ def message(self) -> str:
144
+ """Return the error message string, matching the TS ``error.message`` API."""
145
+ return str(self.args[0]) if self.args else ""
146
+
147
+ def should_retry(self) -> bool:
148
+ return self.retryable
149
+
150
+ def detailed_message(self) -> str:
151
+ lines = [f"{self.code}: {self}", "", f"Documentation: {self.documentation}", ""]
152
+ lines.append("Suggested solutions:")
153
+ for i, s in enumerate(self.solutions, 1):
154
+ lines.append(f" {i}. {s}")
155
+ if self.retry_after is not None:
156
+ lines.append(f"\nRetry after: {self.retry_after} seconds")
157
+ if self.reset_at is not None:
158
+ lines.append(f"\nLimit resets at: {self.reset_at.isoformat()}")
159
+ return "\n".join(lines)
160
+
161
+ def to_dict(self) -> dict[str, Any]:
162
+ return {
163
+ "code": self.code,
164
+ "message": str(self),
165
+ "status_code": self.status_code,
166
+ "documentation": self.documentation,
167
+ "solutions": self.solutions,
168
+ "retryable": self.retryable,
169
+ "retry_after": self.retry_after,
170
+ "request_id": self.request_id,
171
+ "details": self.details,
172
+ }
173
+
174
+
175
class RateLimitError(EvalGateError):
    """Rate-limit failure, fixed to code ``RATE_LIMIT_EXCEEDED`` and status 429."""

    def __init__(self, message: str = "Rate limit exceeded", retry_after: int | None = None):
        """Create the error.

        Args:
            message: Human-readable description.
            retry_after: Seconds to wait before retrying, or ``None`` if unknown.
                ``0`` is a real value and is kept; only ``None`` means "absent".
        """
        details = None if retry_after is None else {"retryAfter": retry_after}
        super().__init__(message, "RATE_LIMIT_EXCEEDED", 429, details)
178
+
179
+
180
class AuthenticationError(EvalGateError):
    """Authentication failure, fixed to code ``UNAUTHORIZED`` and status 401."""

    def __init__(self, message: str = "Authentication failed"):
        super().__init__(message, code="UNAUTHORIZED", status_code=401)
183
+
184
+
185
class ValidationError(EvalGateError):
    """Validation failure, fixed to code ``VALIDATION_ERROR`` and status 400."""

    def __init__(self, message: str = "Validation failed", details: Any | None = None):
        # *details* is forwarded unchanged to the base class.
        super().__init__(message, code="VALIDATION_ERROR", status_code=400, details=details)
188
+
189
+
190
class NetworkError(EvalGateError):
    """Transport-level failure, fixed to code ``NETWORK_ERROR`` and status 0."""

    def __init__(self, message: str = "Network request failed"):
        super().__init__(message, code="NETWORK_ERROR", status_code=0)
        # Set explicitly after the base initialiser has run, so the flag is
        # on regardless of what the base class derived.
        self.retryable = True
194
+
195
+
196
def _parse_retry_after_seconds(value: Any) -> int | None:
    """Return *value* as whole seconds, or ``None`` when it is missing or not numeric."""
    if not value:
        return None
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return None


def create_error_from_response(status_code: int, data: Any) -> EvalGateError:
    """Create an EvalGateError from an HTTP response status and body.

    Args:
        status_code: HTTP status of the response.
        data: Decoded response body. A dict may carry ``code``, ``message``,
            ``requestId`` and ``retryAfter`` either at the top level or inside
            a nested ``error`` object; ``error`` may also be a plain string
            holding the message. Any other value is stringified as the message.

    Returns:
        ``RateLimitError`` for 429, ``AuthenticationError`` for 401 or code
        ``UNAUTHORIZED``, ``ValidationError`` for 400 or code
        ``VALIDATION_ERROR``, otherwise a plain ``EvalGateError``.

    This function never raises for a malformed body: an unparseable
    ``retryAfter`` is treated as absent. The returned error reports the real
    *status_code* even when the subclass has its own default (for example a
    422 mapped to ``ValidationError``).
    """
    status_default = _STATUS_TO_CODE.get(status_code) or (
        "INTERNAL_SERVER_ERROR" if status_code >= 500 else "UNKNOWN_ERROR"
    )

    nested: dict[str, Any] = {}
    request_id = None
    if isinstance(data, dict):
        error_field = data.get("error", data)
        if isinstance(error_field, str):
            # A string "error" is the message itself; look up the rest at top level.
            error_field = data
        if isinstance(error_field, dict):
            nested = error_field
        code = nested.get("code") or data.get("code") or status_default
        message = (
            (data["error"] if isinstance(data.get("error"), str) else None)
            or nested.get("message")
            or data.get("message")
            or "Unknown error"
        )
        request_id = nested.get("requestId") or data.get("requestId")
    else:
        code = status_default
        message = str(data) if data else "Unknown error"

    err: EvalGateError
    if status_code == 429:
        retry_after = None
        if isinstance(data, dict):
            retry_after = data.get("retryAfter") or nested.get("retryAfter")
        err = RateLimitError(message, retry_after=_parse_retry_after_seconds(retry_after))
    elif status_code == 401 or code == "UNAUTHORIZED":
        err = AuthenticationError(message)
    elif status_code == 400 or code == "VALIDATION_ERROR":
        err = ValidationError(message, details=data)
    else:
        err = EvalGateError(message, code, status_code, data)

    if status_code:
        # The subclasses hard-code their status; keep the one actually received.
        err.status_code = status_code
    if request_id:
        err.request_id = request_id
    return err
evalgate_sdk/export.py ADDED
@@ -0,0 +1,238 @@
1
+ """Data export and import — JSON, CSV, JSONL formats with LangSmith conversion."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import io
7
+ import json
8
+ from dataclasses import dataclass, field
9
+ from pathlib import Path
10
+ from typing import Any, Literal
11
+
12
+ ExportFormat = Literal["json", "csv", "jsonl"]
13
+
14
+
15
@dataclass
class ExportOptions:
    """Options accepted by ``export_data``: which resources to fetch and the format tag."""

    # Format tag copied onto the resulting ExportData.
    format: ExportFormat = "json"

    # Resource kinds to include in the export.
    include_traces: bool = True
    include_evaluations: bool = True
    include_test_cases: bool = True
    include_runs: bool = True

    # NOTE(review): nothing in this module reads start_date / end_date;
    # confirm whether date filtering is applied elsewhere.
    start_date: str | None = None
    end_date: str | None = None

    # Set on the trace listing parameters when truthy.
    organization_id: int | None = None
25
+
26
+
27
+ @dataclass
28
+ class ImportOptions:
29
+ skip_duplicates: bool = True
30
+ dry_run: bool = False
31
+ organization_id: int | None = None
32
+
33
+
34
@dataclass
class ExportData:
    """In-memory export bundle: one list of plain dicts per resource kind, plus metadata."""

    # Format used when the bundle is written to a file.
    format: ExportFormat = "json"

    # Every container gets its own fresh default per instance.
    traces: list[dict[str, Any]] = field(default_factory=list)
    evaluations: list[dict[str, Any]] = field(default_factory=list)
    test_cases: list[dict[str, Any]] = field(default_factory=list)
    runs: list[dict[str, Any]] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)
42
+
43
+
44
@dataclass
class ImportResult:
    """Tally returned by ``import_data``."""

    # Records created (or, for a dry run, records that would be attempted).
    imported: int = 0

    # Records skipped because the failure message mentioned "duplicate".
    skipped: int = 0

    # String form of every other failure, in the order encountered.
    errors: list[str] = field(default_factory=list)

    # Mirrors the dry_run option the import was started with.
    dry_run: bool = False
50
+
51
+
52
async def export_data(client: Any, options: ExportOptions | None = None) -> ExportData:
    """Export traces, evaluations, test cases, and runs from the platform.

    Args:
        client: SDK client exposing ``traces.list``, ``evaluations.list``,
            ``evaluations.list_test_cases`` and ``evaluations.list_runs``.
        options: What to export; defaults to ``ExportOptions()``.

    Returns:
        An ``ExportData`` whose lists hold ``model_dump(mode="json",
        by_alias=True)`` dicts and whose metadata records the export time.

    Test cases and runs are listed per evaluation, so the evaluation list is
    fetched whenever either is requested, even if ``include_evaluations`` is
    off (in which case the evaluations themselves are not stored).
    """
    opts = options or ExportOptions()
    data = ExportData(format=opts.format)

    if opts.include_traces:
        from evalgate_sdk.types import ListTracesParams

        # NOTE(review): a single listing call with limit=100 is made here;
        # whether the client paginates beyond that is not visible in this
        # module. start_date / end_date are likewise not applied.
        params = ListTracesParams(limit=100)
        if opts.organization_id:
            params.organization_id = opts.organization_id
        traces = await client.traces.list(params)
        data.traces = [t.model_dump(mode="json", by_alias=True) for t in traces]

    needs_evaluations = opts.include_evaluations or opts.include_test_cases or opts.include_runs
    evals = await client.evaluations.list() if needs_evaluations else []

    if opts.include_evaluations:
        data.evaluations = [e.model_dump(mode="json", by_alias=True) for e in evals]

    if opts.include_test_cases:
        for ev in evals:
            tcs = await client.evaluations.list_test_cases(ev.id)
            data.test_cases.extend(tc.model_dump(mode="json", by_alias=True) for tc in tcs)

    if opts.include_runs:
        for ev in evals:
            runs = await client.evaluations.list_runs(ev.id)
            data.runs.extend(r.model_dump(mode="json", by_alias=True) for r in runs)

    # total_items counts traces and evaluations only.
    data.metadata = {"exported_at": _now_iso(), "total_items": len(data.traces) + len(data.evaluations)}
    return data
82
+
83
+
84
async def import_data(data: ExportData, options: ImportOptions | None = None, *, client: Any = None) -> ImportResult:
    """Import data back into the platform.

    *client* is keyword-only. When it is omitted, the global default client
    from ``get_default_client()`` is used; if that lookup fails a
    ``TypeError`` is raised. The two positional parameters match the TS
    public export.

    Only traces and evaluations are created. Each record is attempted
    independently: a failure is counted as skipped when ``skip_duplicates``
    is set and the message contains "duplicate", otherwise its text is
    appended to ``errors``. A dry run creates nothing and reports the number
    of traces plus evaluations as ``imported``.
    """
    if client is None:
        try:
            from evalgate_sdk.client import get_default_client

            client = get_default_client()
        except Exception as err:
            raise TypeError(
                "import_data() requires a client. Either pass client=... or initialise a default client first."
            ) from err

    opts = options or ImportOptions()
    result = ImportResult(dry_run=opts.dry_run)
    if opts.dry_run:
        result.imported = len(data.traces) + len(data.evaluations)
        return result

    from evalgate_sdk.types import CreateEvaluationParams, CreateTraceParams

    async def _attempt(start_call: Any) -> None:
        # start_call builds the params and starts the request; it is invoked
        # inside the try so that failures while building are recorded too.
        try:
            await start_call()
        except Exception as exc:
            text = str(exc)
            if opts.skip_duplicates and "duplicate" in text.lower():
                result.skipped += 1
            else:
                result.errors.append(text)
        else:
            result.imported += 1

    for trace_data in data.traces:
        await _attempt(
            lambda: client.traces.create(
                CreateTraceParams(
                    name=trace_data.get("name", "imported"),
                    metadata=trace_data.get("metadata"),
                )
            )
        )

    for eval_data in data.evaluations:
        await _attempt(
            lambda: client.evaluations.create(
                CreateEvaluationParams(
                    name=eval_data.get("name", "imported"),
                    description=eval_data.get("description"),
                )
            )
        )

    return result
140
+
141
+
142
def export_to_file(data: ExportData, file_path: str) -> None:
    """Write export data to a file in the format named by ``data.format``.

    * ``json``  - one object with all four resource lists and the metadata.
    * ``jsonl`` - one line per trace, then one per evaluation, each tagged
      with a ``type`` field (test cases, runs and metadata are not written).
    * ``csv``   - the traces only, via ``convert_to_csv``.

    Missing parent directories are created.

    Raises:
        ValueError: If ``data.format`` is not one of the formats above, so an
            unsupported value no longer returns without writing anything.
    """
    if data.format not in ("json", "jsonl", "csv"):
        raise ValueError(f"Unsupported export format: {data.format!r}")

    path = Path(file_path)
    path.parent.mkdir(parents=True, exist_ok=True)

    if data.format == "json":
        path.write_text(
            json.dumps(
                {
                    "traces": data.traces,
                    "evaluations": data.evaluations,
                    "test_cases": data.test_cases,
                    "runs": data.runs,
                    "metadata": data.metadata,
                },
                indent=2,
                default=str,
            ),
            encoding="utf-8",
        )
    elif data.format == "jsonl":
        # NOTE(review): the record is unpacked after the tag, so a record
        # that has its own "type" key overrides it; confirm this is intended.
        lines = [json.dumps({"type": "trace", **t}, default=str) for t in data.traces]
        lines.extend(json.dumps({"type": "evaluation", **e}, default=str) for e in data.evaluations)
        path.write_text("\n".join(lines), encoding="utf-8")
    else:
        # csv output already ends rows with "\r\n"; newline="" stops the text
        # layer translating "\n" again, which would yield "\r\r\n" on Windows.
        with path.open("w", encoding="utf-8", newline="") as handle:
            handle.write(convert_to_csv(data, "traces"))
173
+
174
+
175
def import_from_file(file_path: str) -> ExportData:
    """Read export data from a JSON or JSONL file.

    A path ending in ``.jsonl`` (any letter case) is read line by line: each
    non-blank line is a JSON object whose ``type`` field (default
    ``"trace"``) routes it to ``traces`` or ``evaluations``; lines with any
    other type are ignored. Every other path is parsed as a single JSON
    object holding the resource lists.

    Raises:
        ValueError: If the JSON document is not an object
            (``json.JSONDecodeError``, raised for invalid JSON, is a
            subclass of ``ValueError``).
    """
    path = Path(file_path)
    text = path.read_text(encoding="utf-8")

    if str(file_path).lower().endswith(".jsonl"):
        data = ExportData(format="jsonl")
        for line in text.splitlines():
            if not line.strip():
                # Blank separators or a trailing newline are not records.
                continue
            obj = json.loads(line)
            record_type = obj.pop("type", "trace")
            if record_type == "trace":
                data.traces.append(obj)
            elif record_type == "evaluation":
                data.evaluations.append(obj)
        return data

    raw = json.loads(text)
    if not isinstance(raw, dict):
        raise ValueError(f"Expected a JSON object in {file_path!r}, got {type(raw).__name__}")
    # "or" also covers keys that are present but null.
    return ExportData(
        format="json",
        traces=raw.get("traces") or [],
        evaluations=raw.get("evaluations") or [],
        test_cases=raw.get("test_cases") or [],
        runs=raw.get("runs") or [],
        metadata=raw.get("metadata") or {},
    )
200
+
201
+
202
def convert_to_csv(data: ExportData, resource_type: Literal["traces", "evaluations"] = "traces") -> str:
    """Convert one resource list of *data* to CSV text.

    Args:
        data: Export bundle to read from.
        resource_type: ``"traces"`` selects ``data.traces``; any other value
            selects ``data.evaluations``.

    Returns:
        CSV text with a header row, or ``""`` when the selected list is empty.
        Columns are the sorted union of the keys of every record, so records
        with differing keys no longer raise; cells for keys a record lacks
        are left empty. Dict and list values are embedded as JSON text.
    """
    items = data.traces if resource_type == "traces" else data.evaluations
    if not items:
        return ""

    fieldnames = sorted({key for item in items for key in item})
    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(
        {k: json.dumps(v, default=str) if isinstance(v, (dict, list)) else v for k, v in item.items()}
        for item in items
    )
    return buf.getvalue()
213
+
214
+
215
def import_from_langsmith(langsmith_data: Any) -> ExportData:
    """Convert a LangSmith export (a list of run dicts) to an ``ExportData``.

    Each run becomes one trace whose ``input`` / ``output`` hold the JSON
    text of the run's ``inputs`` / ``outputs`` and whose metadata records the
    LangSmith id and run type. Anything other than a list yields an empty
    ``ExportData``.

    The trace name falls back to ``run_type`` and then to
    ``"langsmith-import"`` when ``name`` is missing, null or empty. Values
    that JSON cannot encode are stringified instead of aborting the
    conversion.
    """
    data = ExportData()
    if not isinstance(langsmith_data, list):
        return data

    for item in langsmith_data:
        name = item.get("name") or item.get("run_type") or "langsmith-import"
        data.traces.append(
            {
                "name": name,
                "input": json.dumps(item.get("inputs", {}), default=str),
                "output": json.dumps(item.get("outputs", {}), default=str),
                "metadata": {
                    "langsmith_id": item.get("id"),
                    "run_type": item.get("run_type"),
                    "source": "langsmith",
                },
            }
        )
    return data
233
+
234
+
235
+ def _now_iso() -> str:
236
+ from datetime import datetime, timezone
237
+
238
+ return datetime.now(timezone.utc).isoformat()
@@ -0,0 +1,11 @@
1
+ """Output formatters for evaluation results (T10).
2
+
3
+ Provides human, JSON, GitHub, and PR comment output formats.
4
+ """
5
+
6
+ from evalgate_sdk.formatters.github import format_github
7
+ from evalgate_sdk.formatters.human import format_human
8
+ from evalgate_sdk.formatters.json_fmt import format_json
9
+ from evalgate_sdk.formatters.pr_comment import format_pr_comment
10
+
11
+ __all__ = ["format_human", "format_json", "format_github", "format_pr_comment"]
@@ -0,0 +1,51 @@
1
+ """GitHub Actions output formatter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+
8
def _escape_data(value: Any) -> str:
    """Escape *value* for the message part of a GitHub workflow command."""
    return str(value).replace("%", "%25").replace("\r", "%0D").replace("\n", "%0A")


def _escape_property(value: Any) -> str:
    """Escape *value* for a workflow-command property such as ``file=``."""
    return _escape_data(value).replace(":", "%3A").replace(",", "%2C")


def _single_line(value: Any) -> str:
    """Collapse *value* onto one line for a ``name=value`` output entry."""
    return " ".join(str(value).splitlines())


def format_github(report: dict[str, Any]) -> str:
    """Format a check/run report as GitHub Actions workflow commands.

    Uses ``::error``, ``::warning``, and ``::notice`` annotations: one
    summary annotation for the verdict, then one ``::error`` per failed case
    (first ten only). Report keys are accepted in camelCase or snake_case.

    All report-derived text is escaped for the workflow-command syntax, so a
    newline, ``%``, ``:`` or ``,`` in a name, reason or path can neither cut
    the annotation short nor start another command.

    Side effect: when the ``GITHUB_OUTPUT`` environment variable is set,
    ``verdict`` and (if present) ``score`` are appended to that file. This is
    best-effort; an ``OSError`` while writing is ignored.
    """
    import os

    lines: list[str] = []
    verdict = report.get("verdict", "unknown")
    eval_id = _escape_data(report.get("evaluationId", report.get("run_id", "?")))
    score = report.get("score")
    reason = _escape_data(report.get("reasonMessage", report.get("reason_message", "")))

    # Summary annotation
    if verdict == "pass":
        lines.append(f"::notice title=EvalGate Pass::Evaluation {eval_id} passed (score={_escape_data(score)})")
    elif verdict == "warn":
        lines.append(f"::warning title=EvalGate Warning::Evaluation {eval_id}: {reason}")
    else:
        lines.append(f"::error title=EvalGate Fail::Evaluation {eval_id} failed: {reason}")

    # Set output variables via GITHUB_OUTPUT (::set-output is deprecated since Oct 2022)
    github_output = os.environ.get("GITHUB_OUTPUT")
    if github_output:
        try:
            with open(github_output, "a", encoding="utf-8") as f:
                # One entry per line: an embedded newline would otherwise
                # start a second, attacker-chosen output entry.
                f.write(f"verdict={_single_line(verdict)}\n")
                if score is not None:
                    f.write(f"score={_single_line(score)}\n")
        except OSError:
            # Deliberately ignored: the annotations are still returned.
            pass

    # Failed cases as annotations ("or []" covers a key that is present but null)
    failed_cases = report.get("failedCases", report.get("failed_cases", [])) or []
    for fc in failed_cases[:10]:
        name = _escape_data(fc.get("name", fc.get("test_name", "?")))
        msg = _escape_data(fc.get("reason", fc.get("message", "failed")))
        file_path = fc.get("file_path", fc.get("filePath", ""))
        if file_path:
            lines.append(f"::error file={_escape_property(file_path)}::{name}: {msg}")
        else:
            lines.append(f"::error::{name}: {msg}")

    return "\n".join(lines)
@@ -0,0 +1,68 @@
1
+ """Human-readable output formatter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+
8
def _format_metric(value: Any, spec: str, suffix: str = "") -> str:
    """Format *value* with *spec* plus *suffix*; fall back to plain text if it is not numeric."""
    try:
        return f"{format(value, spec)}{suffix}"
    except (TypeError, ValueError):
        return "n/a" if value is None else str(value)


def format_human(report: dict[str, Any]) -> str:
    """Format a check/run report as a human-readable string.

    The output has a PASS / WARN / FAIL header, then whichever of these the
    report provides: the summary counters, the score with baseline and
    delta, the reason, and up to five failed cases. Report keys are accepted
    in camelCase or snake_case.

    Missing or non-numeric metrics are rendered as text (``n/a`` for
    ``None``) instead of raising from the numeric format spec, and a null
    ``summary`` or failed-case list is treated as absent.
    """
    lines: list[str] = []
    summary = report.get("summary") or {}
    verdict = report.get("verdict", summary.get("success", "unknown"))
    eval_id = report.get("evaluationId", report.get("run_id", "?"))

    # Header
    if verdict == "pass" or verdict is True:
        lines.append(f"✅ PASS — {eval_id}")
    elif verdict == "warn":
        lines.append(f"⚠️ WARN — {eval_id}")
    else:
        lines.append(f"❌ FAIL — {eval_id}")

    lines.append("")

    # Summary
    if summary:
        total = summary.get("total", 0)
        passed = summary.get("passed", 0)
        failed = summary.get("failed", 0)
        pass_rate = summary.get("pass_rate", summary.get("passRate", 0))
        avg_score = summary.get("average_score", summary.get("averageScore", 0))
        duration = summary.get("total_duration_ms", summary.get("totalDurationMs", 0))

        lines.append(f" Total: {total}")
        lines.append(f" Passed: {passed}")
        lines.append(f" Failed: {failed}")
        lines.append(f" Pass rate: {_format_metric(pass_rate, '.1f', '%')}")
        lines.append(f" Avg score: {_format_metric(avg_score, '.1f')}")
        lines.append(f" Duration: {_format_metric(duration, '.0f', 'ms')}")

    # Score
    score = report.get("score")
    if score is not None:
        lines.append(f"\n Score: {score}")

    baseline = report.get("baselineScore", report.get("baseline_score"))
    if baseline is not None:
        # Derive the delta only when the report does not supply one, and
        # tolerate operands that cannot be subtracted.
        delta = report.get("delta")
        if delta is None:
            try:
                delta = (score or 0) - baseline
            except TypeError:
                delta = None
        lines.append(f" Baseline: {baseline}")
        lines.append(f" Delta: {_format_metric(delta, '+.1f')}")

    # Reason
    reason = report.get("reasonMessage", report.get("reason_message"))
    if reason:
        lines.append(f"\n Reason: {reason}")

    # Failed cases
    failed_cases = report.get("failedCases", report.get("failed_cases", [])) or []
    if failed_cases:
        lines.append(f"\n Failed cases ({len(failed_cases)}):")
        for fc in failed_cases[:5]:
            name = fc.get("name", fc.get("test_name", "?"))
            reason_text = fc.get("reason", fc.get("message", ""))
            lines.append(f" • {name}: {reason_text}")
        if len(failed_cases) > 5:
            lines.append(f" ... and {len(failed_cases) - 5} more")

    return "\n".join(lines)
@@ -0,0 +1,11 @@
1
+ """JSON output formatter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Any
7
+
8
+
9
def format_json(report: dict[str, Any], *, indent: int = 2) -> str:
    """Serialise *report* to JSON text.

    *indent* is passed to ``json.dumps``; values JSON cannot encode natively
    are converted with ``str``.
    """
    dump_options: dict[str, Any] = {"indent": indent, "default": str}
    return json.dumps(report, **dump_options)