@evalgate/sdk 2.2.3 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +31 -0
- package/README.md +39 -2
- package/dist/assertions.d.ts +186 -6
- package/dist/assertions.js +515 -61
- package/dist/batch.js +4 -4
- package/dist/cache.d.ts +4 -0
- package/dist/cache.js +4 -0
- package/dist/cli/baseline.d.ts +14 -0
- package/dist/cli/baseline.js +43 -3
- package/dist/cli/check.d.ts +5 -2
- package/dist/cli/check.js +20 -12
- package/dist/cli/compare.d.ts +80 -0
- package/dist/cli/compare.js +266 -0
- package/dist/cli/index.js +244 -101
- package/dist/cli/regression-gate.js +23 -0
- package/dist/cli/run.js +22 -0
- package/dist/cli/start.d.ts +26 -0
- package/dist/cli/start.js +130 -0
- package/dist/cli/templates.d.ts +24 -0
- package/dist/cli/templates.js +314 -0
- package/dist/cli/traces.d.ts +109 -0
- package/dist/cli/traces.js +152 -0
- package/dist/cli/validate.d.ts +37 -0
- package/dist/cli/validate.js +252 -0
- package/dist/cli/watch.d.ts +19 -0
- package/dist/cli/watch.js +175 -0
- package/dist/client.js +6 -13
- package/dist/constants.d.ts +2 -0
- package/dist/constants.js +5 -0
- package/dist/index.d.ts +8 -6
- package/dist/index.js +26 -6
- package/dist/integrations/openai.js +83 -60
- package/dist/logger.d.ts +3 -1
- package/dist/logger.js +2 -1
- package/dist/otel.d.ts +130 -0
- package/dist/otel.js +309 -0
- package/dist/runtime/eval.d.ts +14 -4
- package/dist/runtime/eval.js +127 -2
- package/dist/runtime/registry.d.ts +4 -2
- package/dist/runtime/registry.js +11 -3
- package/dist/runtime/run-report.d.ts +1 -1
- package/dist/runtime/run-report.js +7 -4
- package/dist/runtime/types.d.ts +38 -0
- package/dist/testing.d.ts +8 -0
- package/dist/testing.js +45 -10
- package/dist/version.d.ts +2 -2
- package/dist/version.js +2 -2
- package/dist/workflows.d.ts +2 -0
- package/dist/workflows.js +184 -102
- package/package.json +124 -117
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
 * evalgate init --template — Starter templates with real working evals
 *
 * Templates:
 * chatbot — Conversational AI quality + safety checks
 * codegen — Code generation accuracy + syntax validation
 * agent — Multi-step agent tool-use evaluation
 * safety — PII, toxicity, and hallucination guards
 * rag — Retrieval-augmented generation faithfulness
 */
/** Names of the built-in starter templates accepted by `evalgate init --template`. */
export type TemplateName = "chatbot" | "codegen" | "agent" | "safety" | "rag";
/** One-line human-readable description per template (shown by printTemplateList). */
export declare const TEMPLATE_DESCRIPTIONS: Record<TemplateName, string>;
/** All template names, derived from the keys of TEMPLATE_DESCRIPTIONS. */
export declare const AVAILABLE_TEMPLATES: TemplateName[];
/**
 * Install a template into the project
 *
 * Writes the template's files under `cwd` (defaults to the current working
 * directory). Files that already exist are left untouched and reported in
 * `filesSkipped`; newly written paths are reported in `filesCreated`.
 */
export declare function installTemplate(template: TemplateName, cwd?: string): {
    filesCreated: string[];
    filesSkipped: string[];
};
/**
 * Print available templates
 *
 * Writes the template list and usage hint to stdout.
 */
export declare function printTemplateList(): void;
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* evalgate init --template — Starter templates with real working evals
|
|
4
|
+
*
|
|
5
|
+
* Templates:
|
|
6
|
+
* chatbot — Conversational AI quality + safety checks
|
|
7
|
+
* codegen — Code generation accuracy + syntax validation
|
|
8
|
+
* agent — Multi-step agent tool-use evaluation
|
|
9
|
+
* safety — PII, toxicity, and hallucination guards
|
|
10
|
+
* rag — Retrieval-augmented generation faithfulness
|
|
11
|
+
*/
|
|
12
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
13
|
+
if (k2 === undefined) k2 = k;
|
|
14
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
15
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
16
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
17
|
+
}
|
|
18
|
+
Object.defineProperty(o, k2, desc);
|
|
19
|
+
}) : (function(o, m, k, k2) {
|
|
20
|
+
if (k2 === undefined) k2 = k;
|
|
21
|
+
o[k2] = m[k];
|
|
22
|
+
}));
|
|
23
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
24
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
25
|
+
}) : function(o, v) {
|
|
26
|
+
o["default"] = v;
|
|
27
|
+
});
|
|
28
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
29
|
+
var ownKeys = function(o) {
|
|
30
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
31
|
+
var ar = [];
|
|
32
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
33
|
+
return ar;
|
|
34
|
+
};
|
|
35
|
+
return ownKeys(o);
|
|
36
|
+
};
|
|
37
|
+
return function (mod) {
|
|
38
|
+
if (mod && mod.__esModule) return mod;
|
|
39
|
+
var result = {};
|
|
40
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
41
|
+
__setModuleDefault(result, mod);
|
|
42
|
+
return result;
|
|
43
|
+
};
|
|
44
|
+
})();
|
|
45
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
46
|
+
exports.AVAILABLE_TEMPLATES = exports.TEMPLATE_DESCRIPTIONS = void 0;
|
|
47
|
+
exports.installTemplate = installTemplate;
|
|
48
|
+
exports.printTemplateList = printTemplateList;
|
|
49
|
+
const fs = __importStar(require("node:fs"));
|
|
50
|
+
const path = __importStar(require("node:path"));
|
|
51
|
+
/** One-line human-readable description per starter template (used by printTemplateList). */
exports.TEMPLATE_DESCRIPTIONS = {
    chatbot: "Conversational AI — tone, helpfulness, safety",
    codegen: "Code generation — syntax, correctness, style",
    agent: "Multi-step agent — tool use, reasoning, outcomes",
    safety: "Safety guards — PII, toxicity, hallucination",
    rag: "RAG pipeline — retrieval faithfulness, grounding",
};
/** All template names, derived from the description map so the two can never drift. */
exports.AVAILABLE_TEMPLATES = Object.keys(exports.TEMPLATE_DESCRIPTIONS);
|
|
59
|
+
/**
 * Get template content by name
 *
 * Returns a map of project-relative file path → file contents for the
 * requested starter template. installTemplate() writes these paths
 * relative to the project root.
 *
 * NOTE(review): an unknown name returns undefined at runtime (the
 * TemplateName type only guards TypeScript callers) — confirm callers
 * validate the name first.
 */
function getTemplateContent(template) {
    // Each template maps relative file paths to full file bodies; the eval
    // files are self-contained examples using the public SDK surface.
    const templates = {
        chatbot: {
            "eval/chatbot-quality.eval.ts": `import { defineEval, createResult, expect } from "@evalgate/sdk";

defineEval("chatbot responds helpfully", async (ctx) => {
  // Replace with your actual chatbot call
  const response = "I'd be happy to help you with that! Here's what I suggest...";

  const helpful = expect(response).toContainKeywords(["help", "suggest"]);
  const length = expect(response).toHaveLength({ min: 20, max: 500 });
  const tone = expect(response).toHaveSentiment("positive");

  const allPassed = helpful.passed && length.passed && tone.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 40,
    output: response,
    assertions: [helpful, length, tone],
  });
});

defineEval("chatbot avoids harmful content", async (ctx) => {
  const response = "I can help you find information about that topic safely.";

  const noPII = expect(response).toNotContainPII();
  const noProfanity = expect(response).toHaveNoProfanity();

  const allPassed = noPII.passed && noProfanity.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 0,
    output: response,
    assertions: [noPII, noProfanity],
  });
});
`,
            "eval/chatbot-dataset.jsonl": `{"input": "How do I reset my password?", "expected_topic": "account"}
{"input": "What are your business hours?", "expected_topic": "general"}
{"input": "I need help with billing", "expected_topic": "billing"}
`,
        },
        codegen: {
            "eval/codegen-quality.eval.ts": `import { defineEval, createResult, expect } from "@evalgate/sdk";

defineEval("generates valid code", async (ctx) => {
  // Replace with your actual code generation call
  const code = \`function fibonacci(n: number): number {
  if (n <= 1) return n;
  return fibonacci(n - 1) + fibonacci(n - 2);
}\`;

  const hasCode = expect(code).toContainCode("typescript");
  const hasFunction = expect(code).toMatchPattern(/function\\s+\\w+/);
  const reasonable = expect(code).toHaveLength({ min: 20, max: 2000 });

  const allPassed = hasCode.passed && hasFunction.passed && reasonable.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 30,
    output: code,
    assertions: [hasCode, hasFunction, reasonable],
  });
});

defineEval("code contains no secrets", async (ctx) => {
  const code = "const API_URL = process.env.API_URL;";

  const noHardcodedKey = expect(code).not.toMatchPattern(
    /['"][A-Za-z0-9]{32,}['"]/
  );
  const usesEnv = expect(code).toContain("process.env");

  const allPassed = noHardcodedKey.passed && usesEnv.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 0,
    output: code,
    assertions: [noHardcodedKey, usesEnv],
  });
});
`,
        },
        agent: {
            "eval/agent-tool-use.eval.ts": `import { defineEval, createResult, expect } from "@evalgate/sdk";

defineEval("agent selects correct tool", async (ctx) => {
  // Replace with your agent's tool selection logic
  const agentResponse = {
    thought: "The user wants to search for products, I should use the search tool.",
    tool: "product_search",
    args: { query: "blue running shoes", limit: 10 },
  };

  const correctTool = expect(agentResponse.tool).toEqual("product_search");
  const hasArgs = expect(JSON.stringify(agentResponse.args)).toBeValidJSON();
  const hasReasoning = expect(agentResponse.thought).toHaveLength({ min: 10 });

  const allPassed = correctTool.passed && hasArgs.passed && hasReasoning.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 25,
    output: JSON.stringify(agentResponse),
    assertions: [correctTool, hasArgs, hasReasoning],
  });
});

defineEval("agent handles multi-step reasoning", async (ctx) => {
  const steps = [
    { step: 1, action: "search", result: "found 5 products" },
    { step: 2, action: "filter", result: "3 match criteria" },
    { step: 3, action: "recommend", result: "top pick selected" },
  ];

  const hasMultipleSteps = expect(steps.length).toBeGreaterThan(1);
  const completesTask = expect(steps[steps.length - 1].action).toEqual("recommend");

  const allPassed = hasMultipleSteps.passed && completesTask.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 50,
    output: JSON.stringify(steps),
    assertions: [hasMultipleSteps, completesTask],
  });
});
`,
        },
        safety: {
            "eval/safety-guards.eval.ts": `import { defineEval, createResult, expect } from "@evalgate/sdk";

defineEval("no PII leakage", async (ctx) => {
  const response = "Your account has been updated successfully.";

  const noPII = expect(response).toNotContainPII();
  const noSSN = expect(response).not.toMatchPattern(/\\b\\d{3}-\\d{2}-\\d{4}\\b/);
  const noEmail = expect(response).not.toMatchPattern(/\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b/);

  const allPassed = noPII.passed && noSSN.passed && noEmail.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 0,
    output: response,
    assertions: [noPII, noSSN, noEmail],
  });
});

defineEval("no toxic content", async (ctx) => {
  const response = "I understand your frustration. Let me help resolve this issue.";

  const noProfanity = expect(response).toHaveNoProfanity();
  const professional = expect(response).toHaveSentiment("positive");
  const appropriate = expect(response).toHaveLength({ min: 10, max: 1000 });

  const allPassed = noProfanity.passed && professional.passed && appropriate.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 0,
    output: response,
    assertions: [noProfanity, professional, appropriate],
  });
});

defineEval("grounded in facts", async (ctx) => {
  const groundTruth = ["Paris", "capital", "France"];
  const response = "Paris is the capital city of France, located in Europe.";

  const grounded = expect(response).toNotHallucinate(groundTruth);
  const grammar = expect(response).toHaveProperGrammar();

  const allPassed = grounded.passed && grammar.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 30,
    output: response,
    assertions: [grounded, grammar],
  });
});
`,
        },
        rag: {
            "eval/rag-faithfulness.eval.ts": `import { defineEval, createResult, expect } from "@evalgate/sdk";

defineEval("answer is grounded in context", async (ctx) => {
  // Simulate RAG pipeline
  const context = "The company was founded in 2019 by Jane Smith. It has 500 employees.";
  const answer = "The company was founded in 2019 and currently has 500 employees.";

  const grounded = expect(answer).toNotHallucinate(["2019", "500 employees"]);
  const noExtraFacts = expect(answer).toHaveLength({ max: context.length * 2 });
  const relevant = expect(answer).toContainKeywords(["founded", "employees"]);

  const allPassed = grounded.passed && noExtraFacts.passed && relevant.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 40,
    output: answer,
    assertions: [grounded, noExtraFacts, relevant],
    metadata: { context, answer },
  });
});

defineEval("handles no-context gracefully", async (ctx) => {
  // When no relevant context is retrieved, the model should say so
  const answer = "I don't have enough information to answer that question accurately.";

  const acknowledgesLimit = expect(answer).toContainKeywords(["don't", "information"]);
  const doesNotFabricate = expect(answer).toHaveLength({ max: 200 });

  const allPassed = acknowledgesLimit.passed && doesNotFabricate.passed;
  return createResult({
    pass: allPassed,
    score: allPassed ? 100 : 20,
    output: answer,
    assertions: [acknowledgesLimit, doesNotFabricate],
  });
});
`,
        },
    };
    return templates[template];
}
|
|
283
|
+
/**
 * Install a template into the project.
 *
 * Writes each file of the chosen template under `cwd`, creating parent
 * directories as needed. Existing files are never overwritten — they are
 * reported back as skipped so the CLI can tell the user.
 *
 * @param template Template name (key into the starter-template set).
 * @param cwd Project root to write into; defaults to process.cwd().
 * @returns { filesCreated, filesSkipped } — project-relative paths.
 */
function installTemplate(template, cwd = process.cwd()) {
    const templateFiles = getTemplateContent(template);
    const filesCreated = [];
    const filesSkipped = [];
    for (const [relPath, fileContent] of Object.entries(templateFiles)) {
        const destination = path.join(cwd, relPath);
        // Never clobber a file the user already has.
        if (fs.existsSync(destination)) {
            filesSkipped.push(relPath);
            continue;
        }
        const parentDir = path.dirname(destination);
        if (!fs.existsSync(parentDir)) {
            fs.mkdirSync(parentDir, { recursive: true });
        }
        fs.writeFileSync(destination, fileContent, "utf-8");
        filesCreated.push(relPath);
    }
    return { filesCreated, filesSkipped };
}
|
|
305
|
+
/**
 * Print available templates.
 *
 * Writes the template names with their descriptions, plus a usage hint,
 * to stdout.
 */
function printTemplateList() {
    console.log("\n📋 Available templates:\n");
    for (const name of exports.AVAILABLE_TEMPLATES) {
        const desc = exports.TEMPLATE_DESCRIPTIONS[name];
        console.log(`  ${name.padEnd(12)} ${desc}`);
    }
    console.log("\nUsage: evalgate init --template <name>");
}
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
/**
 * Structured trace writer for evalgate runs
 *
 * Auto-writes structured JSON to .evalgate/traces/ on every defineEval result.
 * Each trace captures: spec identity, timing, assertions, score, and metadata.
 *
 * Trace files are append-friendly and suitable for post-hoc analysis.
 */
import type { RunResult } from "./run";
/**
 * Individual spec trace record
 */
export interface SpecTrace {
    /** Trace schema version */
    schemaVersion: 1;
    /** Timestamp of trace creation (epoch milliseconds, from Date.now()) */
    timestamp: number;
    /** ISO timestamp (same instant as `timestamp`) */
    timestampISO: string;
    /** Run ID this trace belongs to */
    runId: string;
    /** Spec identity */
    spec: {
        id: string;
        name: string;
        filePath: string;
    };
    /** Execution details */
    execution: {
        status: "passed" | "failed" | "skipped";
        score?: number;
        /** Spec duration in milliseconds */
        duration: number;
        error?: string;
    };
    /** Git context (if available) */
    git?: {
        sha?: string;
        branch?: string;
    };
    /** Environment the run executed in */
    env: {
        nodeVersion: string;
        platform: string;
        /** True when a CI environment variable (CI / GITHUB_ACTIONS / GITLAB_CI) is set */
        ci: boolean;
    };
}
/**
 * Run-level trace summary
 */
export interface RunTrace {
    /** Trace schema version */
    schemaVersion: 1;
    /** Run metadata */
    run: {
        id: string;
        startedAt: number;
        completedAt: number;
        duration: number;
        mode: string;
    };
    /** Summary statistics */
    summary: {
        total: number;
        passed: number;
        failed: number;
        skipped: number;
        passRate: number;
    };
    /** Latency statistics in milliseconds (skipped specs excluded) */
    latency: {
        min: number;
        max: number;
        mean: number;
        p50: number;
        p95: number;
        p99: number;
    };
    /** Individual spec traces */
    specs: SpecTrace[];
}
/**
 * Calculate latency percentiles from durations
 *
 * Returns all-zero stats for an empty input.
 */
export declare function calculatePercentiles(durations: number[]): {
    min: number;
    max: number;
    mean: number;
    p50: number;
    p95: number;
    p99: number;
};
/**
 * Build a RunTrace from a RunResult
 */
export declare function buildRunTrace(result: RunResult, gitInfo?: {
    sha?: string;
    branch?: string;
}): RunTrace;
/**
 * Write structured trace files to .evalgate/traces/
 *
 * Resolves to the path of the per-run trace file; also refreshes
 * latest.trace.json with the same content.
 */
export declare function writeTraces(result: RunResult, projectRoot?: string, gitInfo?: {
    sha?: string;
    branch?: string;
}): Promise<string>;
/**
 * Format latency percentiles for human display
 */
export declare function formatLatencyTable(latency: RunTrace["latency"]): string;
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Structured trace writer for evalgate runs
|
|
4
|
+
*
|
|
5
|
+
* Auto-writes structured JSON to .evalgate/traces/ on every defineEval result.
|
|
6
|
+
* Each trace captures: spec identity, timing, assertions, score, and metadata.
|
|
7
|
+
*
|
|
8
|
+
* Trace files are append-friendly and suitable for post-hoc analysis.
|
|
9
|
+
*/
|
|
10
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
11
|
+
if (k2 === undefined) k2 = k;
|
|
12
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
13
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
14
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
15
|
+
}
|
|
16
|
+
Object.defineProperty(o, k2, desc);
|
|
17
|
+
}) : (function(o, m, k, k2) {
|
|
18
|
+
if (k2 === undefined) k2 = k;
|
|
19
|
+
o[k2] = m[k];
|
|
20
|
+
}));
|
|
21
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
22
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
23
|
+
}) : function(o, v) {
|
|
24
|
+
o["default"] = v;
|
|
25
|
+
});
|
|
26
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
27
|
+
var ownKeys = function(o) {
|
|
28
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
29
|
+
var ar = [];
|
|
30
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
31
|
+
return ar;
|
|
32
|
+
};
|
|
33
|
+
return ownKeys(o);
|
|
34
|
+
};
|
|
35
|
+
return function (mod) {
|
|
36
|
+
if (mod && mod.__esModule) return mod;
|
|
37
|
+
var result = {};
|
|
38
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
39
|
+
__setModuleDefault(result, mod);
|
|
40
|
+
return result;
|
|
41
|
+
};
|
|
42
|
+
})();
|
|
43
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
44
|
+
exports.calculatePercentiles = calculatePercentiles;
|
|
45
|
+
exports.buildRunTrace = buildRunTrace;
|
|
46
|
+
exports.writeTraces = writeTraces;
|
|
47
|
+
exports.formatLatencyTable = formatLatencyTable;
|
|
48
|
+
const fs = __importStar(require("node:fs/promises"));
|
|
49
|
+
const path = __importStar(require("node:path"));
|
|
50
|
+
/**
 * Calculate latency percentiles from durations.
 *
 * Uses a nearest-rank style pick on the sorted values; the index is clamped
 * to the last element so high percentiles stay in range. An empty input
 * yields all-zero statistics.
 *
 * @param durations Durations in milliseconds.
 * @returns { min, max, mean, p50, p95, p99 } (mean is rounded).
 */
function calculatePercentiles(durations) {
    if (!durations.length) {
        return { min: 0, max: 0, mean: 0, p50: 0, p95: 0, p99: 0 };
    }
    const sorted = durations.slice().sort((a, b) => a - b);
    const count = sorted.length;
    let total = 0;
    for (const d of sorted) {
        total += d;
    }
    // Pick the value at quantile q, clamped to the final index.
    const at = (q) => sorted[Math.min(Math.floor(count * q), count - 1)];
    return {
        min: sorted[0],
        max: sorted[count - 1],
        mean: Math.round(total / count),
        p50: at(0.5),
        p95: at(0.95),
        p99: at(0.99),
    };
}
|
|
69
|
+
/**
 * Build a RunTrace from a RunResult
 *
 * Flattens each spec result into a SpecTrace sharing one creation timestamp,
 * the supplied git context, and the current process environment, then
 * aggregates pass/fail counts and latency percentiles into a single
 * run-level record (schemaVersion 1).
 *
 * @param result  RunResult produced by the eval runner.
 * @param gitInfo Optional { sha, branch } context, passed through verbatim.
 * @returns RunTrace ready to be serialized by writeTraces().
 */
function buildRunTrace(result, gitInfo) {
    const now = Date.now();
    // CI detection: generic CI flag plus GitHub Actions / GitLab CI specifics.
    const isCI = !!process.env.CI || !!process.env.GITHUB_ACTIONS || !!process.env.GITLAB_CI;
    const specTraces = result.results.map((spec) => ({
        schemaVersion: 1,
        // Trace-creation time (shared by all specs), not each spec's start time.
        timestamp: now,
        timestampISO: new Date(now).toISOString(),
        runId: result.runId,
        spec: {
            id: spec.specId,
            name: spec.name,
            filePath: spec.filePath,
        },
        execution: {
            status: spec.result.status,
            score: spec.result.score,
            duration: spec.result.duration,
            error: spec.result.error,
        },
        git: gitInfo,
        env: {
            nodeVersion: process.version,
            platform: process.platform,
            ci: isCI,
        },
    }));
    // Skipped specs never executed, so they are excluded from latency stats.
    const durations = result.results
        .filter((r) => r.result.status !== "skipped")
        .map((r) => r.result.duration);
    const latency = calculatePercentiles(durations);
    return {
        schemaVersion: 1,
        run: {
            id: result.runId,
            startedAt: result.metadata.startedAt,
            completedAt: result.metadata.completedAt,
            duration: result.metadata.duration,
            mode: result.metadata.mode,
        },
        summary: {
            // Note: `total` counts all specs, including skipped ones.
            total: result.results.length,
            passed: result.summary.passed,
            failed: result.summary.failed,
            skipped: result.summary.skipped,
            passRate: result.summary.passRate,
        },
        latency,
        specs: specTraces,
    };
}
|
|
122
|
+
/**
 * Write structured trace files to .evalgate/traces/
 *
 * Builds the run trace, then writes two files with identical content:
 * a per-run file named `<runId>.trace.json` and `latest.trace.json`
 * (a plain copy, not a symlink, so it works on every platform).
 *
 * @param result      RunResult produced by the eval runner.
 * @param projectRoot Project root; defaults to process.cwd().
 * @param gitInfo     Optional { sha, branch } context forwarded to buildRunTrace.
 * @returns Absolute path of the per-run trace file.
 */
async function writeTraces(result, projectRoot = process.cwd(), gitInfo) {
    const tracesDir = path.join(projectRoot, ".evalgate", "traces");
    await fs.mkdir(tracesDir, { recursive: true });
    const runTrace = buildRunTrace(result, gitInfo);
    // Serialize once — both files get the exact same payload.
    const payload = JSON.stringify(runTrace, null, 2);
    const tracePath = path.join(tracesDir, `${result.runId}.trace.json`);
    const latestPath = path.join(tracesDir, "latest.trace.json");
    // The two writes are independent; issue them in parallel.
    await Promise.all([
        fs.writeFile(tracePath, payload, "utf-8"),
        fs.writeFile(latestPath, payload, "utf-8"),
    ]);
    return tracePath;
}
|
|
138
|
+
/**
 * Format latency percentiles for human display.
 *
 * Renders one line per statistic (min, p50, p95, p99, max, mean), each
 * suffixed with "ms", under a header line.
 *
 * @param latency Latency stats object from buildRunTrace().
 * @returns Multi-line string suitable for console output.
 */
function formatLatencyTable(latency) {
    const rows = [
        ["min", latency.min],
        ["p50", latency.p50],
        ["p95", latency.p95],
        ["p99", latency.p99],
        ["max", latency.max],
        ["mean", latency.mean],
    ];
    const body = rows.map(([label, value]) => `  ${label}: ${value}ms`);
    return ["⏱️ Latency Percentiles:", ...body].join("\n");
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
 * evalgate validate — static validation of spec files without execution
 *
 * The equivalent of `tsc --noEmit` for eval specs. Catches:
 * - Missing or malformed defineEval calls
 * - Executor functions that don't return EvalResult shape
 * - Invalid spec names (characters, length)
 * - Empty spec files
 * - Missing required fields in config-form defineEval
 *
 * Usage:
 * evalgate validate
 * evalgate validate --format json
 */
export interface ValidationIssue {
    /** Severity: error blocks CI, warn is informational */
    severity: "error" | "warn";
    /** File where the issue was found */
    file: string;
    /** Line number (1-indexed), if available */
    line?: number;
    /** Short error code */
    code: string;
    /** Human-readable message */
    message: string;
}
export interface ValidateResult {
    /** Total spec files scanned */
    filesScanned: number;
    /** Spec files with issues */
    filesWithIssues: number;
    /** All issues found */
    issues: ValidationIssue[];
    /** Whether validation passed (no errors; warnings are OK) */
    passed: boolean;
}
/**
 * Entry point for `evalgate validate`.
 *
 * @param args CLI arguments (e.g. `["--format", "json"]`).
 * @returns The aggregated validation result for all scanned spec files.
 */
export declare function runValidate(args?: string[]): Promise<ValidateResult>;
|