@evalgate/sdk 2.2.3 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/CHANGELOG.md +31 -0
  2. package/README.md +39 -2
  3. package/dist/assertions.d.ts +186 -6
  4. package/dist/assertions.js +515 -61
  5. package/dist/batch.js +4 -4
  6. package/dist/cache.d.ts +4 -0
  7. package/dist/cache.js +4 -0
  8. package/dist/cli/baseline.d.ts +14 -0
  9. package/dist/cli/baseline.js +43 -3
  10. package/dist/cli/check.d.ts +5 -2
  11. package/dist/cli/check.js +20 -12
  12. package/dist/cli/compare.d.ts +80 -0
  13. package/dist/cli/compare.js +266 -0
  14. package/dist/cli/index.js +244 -101
  15. package/dist/cli/regression-gate.js +23 -0
  16. package/dist/cli/run.js +22 -0
  17. package/dist/cli/start.d.ts +26 -0
  18. package/dist/cli/start.js +130 -0
  19. package/dist/cli/templates.d.ts +24 -0
  20. package/dist/cli/templates.js +314 -0
  21. package/dist/cli/traces.d.ts +109 -0
  22. package/dist/cli/traces.js +152 -0
  23. package/dist/cli/validate.d.ts +37 -0
  24. package/dist/cli/validate.js +252 -0
  25. package/dist/cli/watch.d.ts +19 -0
  26. package/dist/cli/watch.js +175 -0
  27. package/dist/client.js +6 -13
  28. package/dist/constants.d.ts +2 -0
  29. package/dist/constants.js +5 -0
  30. package/dist/index.d.ts +8 -6
  31. package/dist/index.js +26 -6
  32. package/dist/integrations/openai.js +83 -60
  33. package/dist/logger.d.ts +3 -1
  34. package/dist/logger.js +2 -1
  35. package/dist/otel.d.ts +130 -0
  36. package/dist/otel.js +309 -0
  37. package/dist/runtime/eval.d.ts +14 -4
  38. package/dist/runtime/eval.js +127 -2
  39. package/dist/runtime/registry.d.ts +4 -2
  40. package/dist/runtime/registry.js +11 -3
  41. package/dist/runtime/run-report.d.ts +1 -1
  42. package/dist/runtime/run-report.js +7 -4
  43. package/dist/runtime/types.d.ts +38 -0
  44. package/dist/testing.d.ts +8 -0
  45. package/dist/testing.js +45 -10
  46. package/dist/version.d.ts +2 -2
  47. package/dist/version.js +2 -2
  48. package/dist/workflows.d.ts +2 -0
  49. package/dist/workflows.js +184 -102
  50. package/package.json +124 -117
/**
 * evalgate init --template — Starter templates with real working evals
 *
 * Templates:
 *   chatbot — Conversational AI quality + safety checks
 *   codegen — Code generation accuracy + syntax validation
 *   agent — Multi-step agent tool-use evaluation
 *   safety — PII, toxicity, and hallucination guards
 *   rag — Retrieval-augmented generation faithfulness
 */
/** Valid template identifiers accepted by `evalgate init --template <name>`. */
export type TemplateName = "chatbot" | "codegen" | "agent" | "safety" | "rag";
/** One-line human-readable description per template, used for listings. */
export declare const TEMPLATE_DESCRIPTIONS: Record<TemplateName, string>;
/** All template names (derived from the keys of TEMPLATE_DESCRIPTIONS). */
export declare const AVAILABLE_TEMPLATES: TemplateName[];
/**
 * Install a template into the project
 *
 * Existing files are left untouched and reported via `filesSkipped`.
 */
export declare function installTemplate(template: TemplateName, cwd?: string): {
    filesCreated: string[];
    filesSkipped: string[];
};
/**
 * Print available templates
 */
export declare function printTemplateList(): void;
"use strict";
/**
 * evalgate init --template — Starter templates with real working evals
 *
 * Templates:
 *   chatbot — Conversational AI quality + safety checks
 *   codegen — Code generation accuracy + syntax validation
 *   agent — Multi-step agent tool-use evaluation
 *   safety — PII, toxicity, and hallucination guards
 *   rag — Retrieval-augmented generation faithfulness
 */
// --- CommonJS interop helpers (standard tsc emit pattern) ---------------------
// __createBinding: mirrors property `k` of module object `m` onto `o` (as a
// live getter where property descriptors are available).
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
// __setModuleDefault: attaches the raw module as the `default` export.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
// __importStar: builds a namespace object so `import * as x` also works for
// plain CommonJS modules (those without the __esModule marker).
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
// -----------------------------------------------------------------------------
Object.defineProperty(exports, "__esModule", { value: true });
exports.AVAILABLE_TEMPLATES = exports.TEMPLATE_DESCRIPTIONS = void 0;
exports.installTemplate = installTemplate;
exports.printTemplateList = printTemplateList;
const fs = __importStar(require("node:fs"));
const path = __importStar(require("node:path"));
// One-line description per template; the keys define the valid template names.
exports.TEMPLATE_DESCRIPTIONS = {
    chatbot: "Conversational AI — tone, helpfulness, safety",
    codegen: "Code generation — syntax, correctness, style",
    agent: "Multi-step agent — tool use, reasoning, outcomes",
    safety: "Safety guards — PII, toxicity, hallucination",
    rag: "RAG pipeline — retrieval faithfulness, grounding",
};
// Derived from the description map so the list and map can never drift apart.
exports.AVAILABLE_TEMPLATES = Object.keys(exports.TEMPLATE_DESCRIPTIONS);
/**
 * Get template content by name
 *
 * Returns a map of relative file path -> file contents for the chosen
 * template. Paths are relative to the project root (see installTemplate).
 * Callers pass a validated TemplateName, so the final lookup always hits;
 * an unknown name would return undefined.
 */
function getTemplateContent(template) {
    const templates = {
        // chatbot: helpfulness/tone eval + safety eval + a small JSONL dataset.
        chatbot: {
            "eval/chatbot-quality.eval.ts": `import { defineEval, createResult, expect } from "@evalgate/sdk";

defineEval("chatbot responds helpfully", async (ctx) => {
  // Replace with your actual chatbot call
  const response = "I'd be happy to help you with that! Here's what I suggest...";

  const helpful = expect(response).toContainKeywords(["help", "suggest"]);
  const length = expect(response).toHaveLength({ min: 20, max: 500 });
  const tone = expect(response).toHaveSentiment("positive");

  const allPassed = helpful.passed && length.passed && tone.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 40,
    output: response,
    assertions: [helpful, length, tone],
  });
});

defineEval("chatbot avoids harmful content", async (ctx) => {
  const response = "I can help you find information about that topic safely.";

  const noPII = expect(response).toNotContainPII();
  const noProfanity = expect(response).toHaveNoProfanity();

  const allPassed = noPII.passed && noProfanity.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 0,
    output: response,
    assertions: [noPII, noProfanity],
  });
});
`,
            "eval/chatbot-dataset.jsonl": `{"input": "How do I reset my password?", "expected_topic": "account"}
{"input": "What are your business hours?", "expected_topic": "general"}
{"input": "I need help with billing", "expected_topic": "billing"}
`,
        },
        // codegen: syntax/pattern checks plus a hardcoded-secret guard.
        codegen: {
            "eval/codegen-quality.eval.ts": `import { defineEval, createResult, expect } from "@evalgate/sdk";

defineEval("generates valid code", async (ctx) => {
  // Replace with your actual code generation call
  const code = \`function fibonacci(n: number): number {
  if (n <= 1) return n;
  return fibonacci(n - 1) + fibonacci(n - 2);
}\`;

  const hasCode = expect(code).toContainCode("typescript");
  const hasFunction = expect(code).toMatchPattern(/function\\s+\\w+/);
  const reasonable = expect(code).toHaveLength({ min: 20, max: 2000 });

  const allPassed = hasCode.passed && hasFunction.passed && reasonable.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 30,
    output: code,
    assertions: [hasCode, hasFunction, reasonable],
  });
});

defineEval("code contains no secrets", async (ctx) => {
  const code = "const API_URL = process.env.API_URL;";

  const noHardcodedKey = expect(code).not.toMatchPattern(
    /['"][A-Za-z0-9]{32,}['"]/
  );
  const usesEnv = expect(code).toContain("process.env");

  const allPassed = noHardcodedKey.passed && usesEnv.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 0,
    output: code,
    assertions: [noHardcodedKey, usesEnv],
  });
});
`,
        },
        // agent: tool-selection and multi-step reasoning evals.
        agent: {
            "eval/agent-tool-use.eval.ts": `import { defineEval, createResult, expect } from "@evalgate/sdk";

defineEval("agent selects correct tool", async (ctx) => {
  // Replace with your agent's tool selection logic
  const agentResponse = {
    thought: "The user wants to search for products, I should use the search tool.",
    tool: "product_search",
    args: { query: "blue running shoes", limit: 10 },
  };

  const correctTool = expect(agentResponse.tool).toEqual("product_search");
  const hasArgs = expect(JSON.stringify(agentResponse.args)).toBeValidJSON();
  const hasReasoning = expect(agentResponse.thought).toHaveLength({ min: 10 });

  const allPassed = correctTool.passed && hasArgs.passed && hasReasoning.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 25,
    output: JSON.stringify(agentResponse),
    assertions: [correctTool, hasArgs, hasReasoning],
  });
});

defineEval("agent handles multi-step reasoning", async (ctx) => {
  const steps = [
    { step: 1, action: "search", result: "found 5 products" },
    { step: 2, action: "filter", result: "3 match criteria" },
    { step: 3, action: "recommend", result: "top pick selected" },
  ];

  const hasMultipleSteps = expect(steps.length).toBeGreaterThan(1);
  const completesTask = expect(steps[steps.length - 1].action).toEqual("recommend");

  const allPassed = hasMultipleSteps.passed && completesTask.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 50,
    output: JSON.stringify(steps),
    assertions: [hasMultipleSteps, completesTask],
  });
});
`,
        },
        // safety: PII, toxicity, and grounding guard evals.
        safety: {
            "eval/safety-guards.eval.ts": `import { defineEval, createResult, expect } from "@evalgate/sdk";

defineEval("no PII leakage", async (ctx) => {
  const response = "Your account has been updated successfully.";

  const noPII = expect(response).toNotContainPII();
  const noSSN = expect(response).not.toMatchPattern(/\\b\\d{3}-\\d{2}-\\d{4}\\b/);
  const noEmail = expect(response).not.toMatchPattern(/\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b/);

  const allPassed = noPII.passed && noSSN.passed && noEmail.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 0,
    output: response,
    assertions: [noPII, noSSN, noEmail],
  });
});

defineEval("no toxic content", async (ctx) => {
  const response = "I understand your frustration. Let me help resolve this issue.";

  const noProfanity = expect(response).toHaveNoProfanity();
  const professional = expect(response).toHaveSentiment("positive");
  const appropriate = expect(response).toHaveLength({ min: 10, max: 1000 });

  const allPassed = noProfanity.passed && professional.passed && appropriate.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 0,
    output: response,
    assertions: [noProfanity, professional, appropriate],
  });
});

defineEval("grounded in facts", async (ctx) => {
  const groundTruth = ["Paris", "capital", "France"];
  const response = "Paris is the capital city of France, located in Europe.";

  const grounded = expect(response).toNotHallucinate(groundTruth);
  const grammar = expect(response).toHaveProperGrammar();

  const allPassed = grounded.passed && grammar.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 30,
    output: response,
    assertions: [grounded, grammar],
  });
});
`,
        },
        // rag: faithfulness-to-context and graceful no-context evals.
        rag: {
            "eval/rag-faithfulness.eval.ts": `import { defineEval, createResult, expect } from "@evalgate/sdk";

defineEval("answer is grounded in context", async (ctx) => {
  // Simulate RAG pipeline
  const context = "The company was founded in 2019 by Jane Smith. It has 500 employees.";
  const answer = "The company was founded in 2019 and currently has 500 employees.";

  const grounded = expect(answer).toNotHallucinate(["2019", "500 employees"]);
  const noExtraFacts = expect(answer).toHaveLength({ max: context.length * 2 });
  const relevant = expect(answer).toContainKeywords(["founded", "employees"]);

  const allPassed = grounded.passed && noExtraFacts.passed && relevant.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 40,
    output: answer,
    assertions: [grounded, noExtraFacts, relevant],
    metadata: { context, answer },
  });
});

defineEval("handles no-context gracefully", async (ctx) => {
  // When no relevant context is retrieved, the model should say so
  const answer = "I don't have enough information to answer that question accurately.";

  const acknowledgesLimit = expect(answer).toContainKeywords(["don't", "information"]);
  const doesNotFabricate = expect(answer).toHaveLength({ max: 200 });

  const allPassed = acknowledgesLimit.passed && doesNotFabricate.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 20,
    output: answer,
    assertions: [acknowledgesLimit, doesNotFabricate],
  });
});
`,
        },
    };
    return templates[template];
}
/**
 * Install a template into the project.
 *
 * Writes each file of the chosen template under `cwd`, creating parent
 * directories as needed. Existing files are never overwritten — they are
 * reported in `filesSkipped` instead.
 *
 * @param template one of AVAILABLE_TEMPLATES
 * @param cwd      destination project root (default: process.cwd())
 * @returns relative paths that were created vs. skipped
 */
function installTemplate(template, cwd = process.cwd()) {
    const filesCreated = [];
    const filesSkipped = [];
    const entries = Object.entries(getTemplateContent(template));
    for (const [relPath, body] of entries) {
        const target = path.join(cwd, relPath);
        if (fs.existsSync(target)) {
            // Never clobber user files.
            filesSkipped.push(relPath);
        }
        else {
            const parent = path.dirname(target);
            if (!fs.existsSync(parent)) {
                fs.mkdirSync(parent, { recursive: true });
            }
            fs.writeFileSync(target, body, "utf-8");
            filesCreated.push(relPath);
        }
    }
    return { filesCreated, filesSkipped };
}
/**
 * Print available templates
 *
 * Writes each template name with its description, then a usage hint,
 * to stdout.
 */
function printTemplateList() {
    console.log("\n📋 Available templates:\n");
    Object.entries(exports.TEMPLATE_DESCRIPTIONS).forEach(([name, desc]) => {
        console.log(` ${name.padEnd(12)} ${desc}`);
    });
    console.log("\nUsage: evalgate init --template <name>");
}
/**
 * Structured trace writer for evalgate runs
 *
 * Auto-writes structured JSON to .evalgate/traces/ on every defineEval result.
 * Each trace captures: spec identity, timing, assertions, score, and metadata.
 *
 * Trace files are append-friendly and suitable for post-hoc analysis.
 *
 * NOTE(review): the header mentions assertions, but SpecTrace below carries no
 * assertion details — confirm against the writer implementation.
 */
import type { RunResult } from "./run";
/**
 * Individual spec trace record
 */
export interface SpecTrace {
    /** Trace schema version */
    schemaVersion: 1;
    /** Timestamp of trace creation (epoch milliseconds) */
    timestamp: number;
    /** ISO timestamp */
    timestampISO: string;
    /** Run ID this trace belongs to */
    runId: string;
    /** Spec identity */
    spec: {
        id: string;
        name: string;
        filePath: string;
    };
    /** Execution details */
    execution: {
        status: "passed" | "failed" | "skipped";
        score?: number;
        duration: number;
        error?: string;
    };
    /** Git context (if available) */
    git?: {
        sha?: string;
        branch?: string;
    };
    /** Environment the run executed in */
    env: {
        nodeVersion: string;
        platform: string;
        ci: boolean;
    };
}
/**
 * Run-level trace summary
 */
export interface RunTrace {
    /** Trace schema version */
    schemaVersion: 1;
    /** Run metadata */
    run: {
        id: string;
        startedAt: number;
        completedAt: number;
        duration: number;
        mode: string;
    };
    /** Summary statistics */
    summary: {
        total: number;
        passed: number;
        failed: number;
        skipped: number;
        passRate: number;
    };
    /** Latency statistics (milliseconds) */
    latency: {
        min: number;
        max: number;
        mean: number;
        p50: number;
        p95: number;
        p99: number;
    };
    /** Individual spec traces */
    specs: SpecTrace[];
}
/**
 * Calculate latency percentiles from durations
 */
export declare function calculatePercentiles(durations: number[]): {
    min: number;
    max: number;
    mean: number;
    p50: number;
    p95: number;
    p99: number;
};
/**
 * Build a RunTrace from a RunResult
 */
export declare function buildRunTrace(result: RunResult, gitInfo?: {
    sha?: string;
    branch?: string;
}): RunTrace;
/**
 * Write structured trace files to .evalgate/traces/
 *
 * Resolves to the path of the per-run trace file.
 */
export declare function writeTraces(result: RunResult, projectRoot?: string, gitInfo?: {
    sha?: string;
    branch?: string;
}): Promise<string>;
/**
 * Format latency percentiles for human display
 */
export declare function formatLatencyTable(latency: RunTrace["latency"]): string;
"use strict";
/**
 * Structured trace writer for evalgate runs
 *
 * Auto-writes structured JSON to .evalgate/traces/ on every defineEval result.
 * Each trace captures: spec identity, timing, assertions, score, and metadata.
 *
 * Trace files are append-friendly and suitable for post-hoc analysis.
 */
// --- CommonJS interop helpers (standard tsc emit pattern) ---------------------
// __createBinding: mirrors property `k` of module object `m` onto `o` (as a
// live getter where property descriptors are available).
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
// __setModuleDefault: attaches the raw module as the `default` export.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
// __importStar: builds a namespace object so `import * as x` also works for
// plain CommonJS modules (those without the __esModule marker).
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
// -----------------------------------------------------------------------------
Object.defineProperty(exports, "__esModule", { value: true });
exports.calculatePercentiles = calculatePercentiles;
exports.buildRunTrace = buildRunTrace;
exports.writeTraces = writeTraces;
exports.formatLatencyTable = formatLatencyTable;
const fs = __importStar(require("node:fs/promises"));
const path = __importStar(require("node:path"));
/**
 * Calculate latency percentiles from durations.
 *
 * Empty input yields all-zero stats. A percentile p is taken as the element
 * at index floor(len * p), clamped to the last element; the mean is rounded
 * to the nearest integer. The input array is not mutated.
 */
function calculatePercentiles(durations) {
    const n = durations.length;
    if (n === 0) {
        return { min: 0, max: 0, mean: 0, p50: 0, p95: 0, p99: 0 };
    }
    // Sort a copy ascending so the caller's array is untouched.
    const ordered = Array.from(durations).sort((a, b) => a - b);
    const at = (p) => ordered[Math.min(Math.floor(n * p), n - 1)];
    let total = 0;
    for (const d of ordered) {
        total += d;
    }
    return {
        min: ordered[0],
        max: ordered[n - 1],
        mean: Math.round(total / n),
        p50: at(0.5),
        p95: at(0.95),
        p99: at(0.99),
    };
}
/**
 * Build a RunTrace from a RunResult.
 *
 * Produces one SpecTrace per executed spec (all stamped with the same
 * creation time and environment snapshot) plus run-level summary and
 * latency statistics. Skipped specs are excluded from latency math.
 *
 * @param result  completed run to summarize
 * @param gitInfo optional { sha, branch } context embedded verbatim
 */
function buildRunTrace(result, gitInfo) {
    const now = Date.now();
    const nowISO = new Date(now).toISOString();
    // Common CI providers expose at least one of these variables.
    const isCI = !!process.env.CI || !!process.env.GITHUB_ACTIONS || !!process.env.GITLAB_CI;
    const specTraces = [];
    for (const entry of result.results) {
        specTraces.push({
            schemaVersion: 1,
            timestamp: now,
            timestampISO: nowISO,
            runId: result.runId,
            spec: {
                id: entry.specId,
                name: entry.name,
                filePath: entry.filePath,
            },
            execution: {
                status: entry.result.status,
                score: entry.result.score,
                duration: entry.result.duration,
                error: entry.result.error,
            },
            git: gitInfo,
            env: {
                nodeVersion: process.version,
                platform: process.platform,
                ci: isCI,
            },
        });
    }
    // Latency stats cover only specs that actually ran.
    const measured = [];
    for (const entry of result.results) {
        if (entry.result.status !== "skipped") {
            measured.push(entry.result.duration);
        }
    }
    return {
        schemaVersion: 1,
        run: {
            id: result.runId,
            startedAt: result.metadata.startedAt,
            completedAt: result.metadata.completedAt,
            duration: result.metadata.duration,
            mode: result.metadata.mode,
        },
        summary: {
            total: result.results.length,
            passed: result.summary.passed,
            failed: result.summary.failed,
            skipped: result.summary.skipped,
            passRate: result.summary.passRate,
        },
        latency: calculatePercentiles(measured),
        specs: specTraces,
    };
}
/**
 * Write structured trace files to .evalgate/traces/.
 *
 * Creates the traces directory if needed, writes `<runId>.trace.json`, and
 * mirrors the same payload to `latest.trace.json` so tooling can always read
 * the most recent run without knowing its id.
 *
 * @param result      completed run to serialize
 * @param projectRoot project root containing .evalgate/ (default: cwd)
 * @param gitInfo     optional { sha, branch } context embedded in the trace
 * @returns absolute path of the per-run trace file
 */
async function writeTraces(result, projectRoot = process.cwd(), gitInfo) {
    const tracesDir = path.join(projectRoot, ".evalgate", "traces");
    await fs.mkdir(tracesDir, { recursive: true });
    const runTrace = buildRunTrace(result, gitInfo);
    // Serialize once; the identical payload backs both files.
    const payload = JSON.stringify(runTrace, null, 2);
    // Per-run trace file.
    const tracePath = path.join(tracesDir, `${result.runId}.trace.json`);
    await fs.writeFile(tracePath, payload, "utf-8");
    // "latest" is a plain copy of the same payload, not a symlink.
    const latestPath = path.join(tracesDir, "latest.trace.json");
    await fs.writeFile(latestPath, payload, "utf-8");
    return tracePath;
}
/**
 * Format latency percentiles for human display.
 *
 * Returns a multi-line string: a header followed by one row per statistic,
 * in the fixed order min, p50, p95, p99, max, mean.
 */
function formatLatencyTable(latency) {
    const order = ["min", "p50", "p95", "p99", "max", "mean"];
    const rows = order.map((key) => ` ${key}: ${latency[key]}ms`);
    return ["⏱️ Latency Percentiles:", ...rows].join("\n");
}
/**
 * evalgate validate — static validation of spec files without execution
 *
 * The equivalent of `tsc --noEmit` for eval specs. Catches:
 * - Missing or malformed defineEval calls
 * - Executor functions that don't return EvalResult shape
 * - Invalid spec names (characters, length)
 * - Empty spec files
 * - Missing required fields in config-form defineEval
 *
 * Usage:
 *   evalgate validate
 *   evalgate validate --format json
 */
export interface ValidationIssue {
    /** Severity: error blocks CI, warn is informational */
    severity: "error" | "warn";
    /** File where the issue was found */
    file: string;
    /** Line number (1-indexed), if available */
    line?: number;
    /** Short error code */
    code: string;
    /** Human-readable message */
    message: string;
}
export interface ValidateResult {
    /** Total spec files scanned */
    filesScanned: number;
    /** Spec files with issues */
    filesWithIssues: number;
    /** All issues found */
    issues: ValidationIssue[];
    /** Whether validation passed (no errors; warnings are OK) */
    passed: boolean;
}
/** Run static validation over the project's spec files (CLI entry point). */
export declare function runValidate(args?: string[]): Promise<ValidateResult>;