@evalgate/sdk 2.2.2 → 2.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +40 -1
- package/dist/assertions.d.ts +194 -10
- package/dist/assertions.js +525 -73
- package/dist/batch.js +4 -4
- package/dist/cache.d.ts +5 -1
- package/dist/cache.js +5 -1
- package/dist/cli/baseline.d.ts +14 -0
- package/dist/cli/baseline.js +43 -3
- package/dist/cli/check.d.ts +5 -2
- package/dist/cli/check.js +20 -12
- package/dist/cli/compare.d.ts +80 -0
- package/dist/cli/compare.js +266 -0
- package/dist/cli/index.js +244 -101
- package/dist/cli/regression-gate.js +23 -0
- package/dist/cli/run.js +22 -0
- package/dist/cli/start.d.ts +26 -0
- package/dist/cli/start.js +130 -0
- package/dist/cli/templates.d.ts +24 -0
- package/dist/cli/templates.js +314 -0
- package/dist/cli/traces.d.ts +109 -0
- package/dist/cli/traces.js +152 -0
- package/dist/cli/upgrade.js +5 -0
- package/dist/cli/validate.d.ts +37 -0
- package/dist/cli/validate.js +252 -0
- package/dist/cli/watch.d.ts +19 -0
- package/dist/cli/watch.js +175 -0
- package/dist/client.js +6 -13
- package/dist/constants.d.ts +2 -0
- package/dist/constants.js +5 -0
- package/dist/errors.js +7 -0
- package/dist/export.js +2 -2
- package/dist/index.d.ts +10 -9
- package/dist/index.js +24 -7
- package/dist/integrations/anthropic.js +6 -6
- package/dist/integrations/openai.js +84 -61
- package/dist/logger.d.ts +3 -1
- package/dist/logger.js +2 -1
- package/dist/otel.d.ts +130 -0
- package/dist/otel.js +309 -0
- package/dist/pagination.d.ts +13 -2
- package/dist/pagination.js +28 -2
- package/dist/runtime/adapters/testsuite-to-dsl.js +1 -6
- package/dist/runtime/eval.d.ts +14 -4
- package/dist/runtime/eval.js +127 -2
- package/dist/runtime/executor.d.ts +3 -2
- package/dist/runtime/executor.js +3 -2
- package/dist/runtime/registry.d.ts +8 -3
- package/dist/runtime/registry.js +15 -4
- package/dist/runtime/run-report.d.ts +1 -1
- package/dist/runtime/run-report.js +7 -4
- package/dist/runtime/types.d.ts +38 -0
- package/dist/snapshot.d.ts +12 -0
- package/dist/snapshot.js +24 -1
- package/dist/testing.d.ts +8 -0
- package/dist/testing.js +45 -10
- package/dist/version.d.ts +2 -2
- package/dist/version.js +2 -2
- package/dist/workflows.d.ts +2 -0
- package/dist/workflows.js +184 -102
- package/package.json +8 -1
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* evalgate validate — static validation of spec files without execution
|
|
4
|
+
*
|
|
5
|
+
* The equivalent of `tsc --noEmit` for eval specs. Catches:
|
|
6
|
+
* - Missing or malformed defineEval calls
|
|
7
|
+
* - Executor functions that don't return EvalResult shape
|
|
8
|
+
* - Invalid spec names (characters, length)
|
|
9
|
+
* - Empty spec files
|
|
10
|
+
* - Missing required fields in config-form defineEval
|
|
11
|
+
*
|
|
12
|
+
* Usage:
|
|
13
|
+
* evalgate validate
|
|
14
|
+
* evalgate validate --format json
|
|
15
|
+
*/
|
|
16
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
17
|
+
if (k2 === undefined) k2 = k;
|
|
18
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
19
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
20
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
21
|
+
}
|
|
22
|
+
Object.defineProperty(o, k2, desc);
|
|
23
|
+
}) : (function(o, m, k, k2) {
|
|
24
|
+
if (k2 === undefined) k2 = k;
|
|
25
|
+
o[k2] = m[k];
|
|
26
|
+
}));
|
|
27
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
28
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
29
|
+
}) : function(o, v) {
|
|
30
|
+
o["default"] = v;
|
|
31
|
+
});
|
|
32
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
33
|
+
var ownKeys = function(o) {
|
|
34
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
35
|
+
var ar = [];
|
|
36
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
37
|
+
return ar;
|
|
38
|
+
};
|
|
39
|
+
return ownKeys(o);
|
|
40
|
+
};
|
|
41
|
+
return function (mod) {
|
|
42
|
+
if (mod && mod.__esModule) return mod;
|
|
43
|
+
var result = {};
|
|
44
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
45
|
+
__setModuleDefault(result, mod);
|
|
46
|
+
return result;
|
|
47
|
+
};
|
|
48
|
+
})();
|
|
49
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
50
|
+
exports.runValidate = runValidate;
|
|
51
|
+
const fs = __importStar(require("node:fs"));
|
|
52
|
+
const path = __importStar(require("node:path"));
|
|
53
|
+
const execution_mode_1 = require("../runtime/execution-mode");
|
|
54
|
+
/**
 * Name validation regex — must match the runtime's validateSpecName
 */
const VALID_NAME_RE = /^[a-zA-Z0-9\s\-_]+$/;
const MAX_NAME_LENGTH = 100;
/**
 * Static patterns we look for in spec files.
 *
 * Regexes consumed via String#match / String#matchAll keep the /g flag
 * (match resets lastIndex itself and matchAll iterates an internal clone,
 * so neither leaks state). The two regexes probed with RegExp#test must
 * NOT carry /g: a global regex's .test() statefully advances lastIndex,
 * so module-level /g regexes produced alternating false results when
 * analyzeFile() was invoked over several files in a row.
 */
const DEFINE_EVAL_RE = /defineEval\s*[.(]/g;
const DEFINE_EVAL_NAME_RE = /defineEval\s*\(\s*["'`]([^"'`]*)["'`]/g;
const DEFINE_EVAL_CONFIG_RE = /defineEval\s*\(\s*\{/g;
const DEFINE_EVAL_SKIP_RE = /defineEval\.skip\s*\(/g;
const DEFINE_EVAL_ONLY_RE = /defineEval\.only\s*\(/g;
const DEFINE_EVAL_FROM_DATASET_RE = /defineEval\.fromDataset\s*\(/g;
const EXECUTOR_RETURN_RE = /return\s*\{[^}]*pass\s*:/;
const CREATE_RESULT_RE = /createResult\s*\(/;
/**
 * Statically analyze one spec file and collect validation issues.
 *
 * Purely textual/heuristic — the file is read but never imported or
 * executed, so this is safe to run on broken or partial specs.
 *
 * @param {string} filePath - path of the spec file to analyze
 * @returns {Array<{severity: ("error"|"warn"), file: string, line?: number, code: string, message: string}>}
 *   list of issues; empty when the file looks valid
 */
function analyzeFile(filePath) {
    const issues = [];
    const relPath = path.relative(process.cwd(), filePath);
    let content;
    try {
        content = fs.readFileSync(filePath, "utf8");
    }
    catch {
        issues.push({
            severity: "error",
            file: relPath,
            code: "FILE_UNREADABLE",
            message: `Cannot read file: ${relPath}`,
        });
        return issues;
    }
    if (content.trim().length === 0) {
        issues.push({
            severity: "error",
            file: relPath,
            code: "EMPTY_FILE",
            message: "Spec file is empty",
        });
        return issues;
    }
    const lines = content.split("\n");
    // Check for defineEval calls. Note: DEFINE_EVAL_RE also matches the
    // "defineEval." prefix of .skip/.only/.fromDataset, so those calls are
    // counted twice in totalCalls — harmless, since totalCalls is only ever
    // compared against zero.
    const defineEvalMatches = content.match(DEFINE_EVAL_RE);
    const skipMatches = content.match(DEFINE_EVAL_SKIP_RE);
    const onlyMatches = content.match(DEFINE_EVAL_ONLY_RE);
    const fromDatasetMatches = content.match(DEFINE_EVAL_FROM_DATASET_RE);
    const totalCalls = (defineEvalMatches?.length ?? 0) +
        (skipMatches?.length ?? 0) +
        (onlyMatches?.length ?? 0) +
        (fromDatasetMatches?.length ?? 0);
    if (totalCalls === 0) {
        issues.push({
            severity: "warn",
            file: relPath,
            code: "NO_DEFINE_EVAL",
            message: "No defineEval() calls found. File may not define any specs.",
        });
    }
    // Validate spec names (string-literal first argument form)
    const nameMatches = [...content.matchAll(DEFINE_EVAL_NAME_RE)];
    for (const match of nameMatches) {
        const name = match[1];
        const matchIndex = match.index ?? 0;
        // 1-based line number of the match, derived from the newline count
        const lineNum = content.substring(0, matchIndex).split("\n").length;
        if (!name || name.trim() === "") {
            issues.push({
                severity: "error",
                file: relPath,
                line: lineNum,
                code: "EMPTY_NAME",
                message: "Spec name is empty",
            });
            continue;
        }
        if (name.length > MAX_NAME_LENGTH) {
            issues.push({
                severity: "error",
                file: relPath,
                line: lineNum,
                code: "NAME_TOO_LONG",
                message: `Spec name "${name.slice(0, 30)}..." exceeds ${MAX_NAME_LENGTH} characters`,
            });
        }
        if (!VALID_NAME_RE.test(name)) {
            issues.push({
                severity: "error",
                file: relPath,
                line: lineNum,
                code: "INVALID_NAME",
                message: `Spec name "${name}" contains invalid characters (only letters, numbers, spaces, hyphens, underscores allowed)`,
            });
        }
    }
    // Check config-form defineEval calls have required fields
    const configMatches = [...content.matchAll(DEFINE_EVAL_CONFIG_RE)];
    for (const match of configMatches) {
        const matchIndex = match.index ?? 0;
        const lineNum = content.substring(0, matchIndex).split("\n").length;
        // Simple heuristic: look for 'name:' and 'executor:' in the next ~20 lines
        const contextLines = lines.slice(lineNum - 1, lineNum + 19).join("\n");
        if (!contextLines.includes("name:") && !contextLines.includes("name :")) {
            issues.push({
                severity: "error",
                file: relPath,
                line: lineNum,
                code: "MISSING_NAME",
                message: "Config-form defineEval() missing required 'name' field",
            });
        }
        if (!contextLines.includes("executor:") &&
            !contextLines.includes("executor :")) {
            issues.push({
                severity: "error",
                file: relPath,
                line: lineNum,
                code: "MISSING_EXECUTOR",
                message: "Config-form defineEval() missing required 'executor' field",
            });
        }
    }
    // Check that executors return EvalResult shape.
    // Bug fix: these regexes were /g and probed with .test(), whose stateful
    // lastIndex made results depend on which file was analyzed previously.
    const hasCreateResult = CREATE_RESULT_RE.test(content);
    const hasReturnPass = EXECUTOR_RETURN_RE.test(content);
    if (totalCalls > 0 && !hasCreateResult && !hasReturnPass) {
        issues.push({
            severity: "warn",
            file: relPath,
            code: "NO_RESULT_SHAPE",
            message: "No createResult() or return { pass: ... } found. Executors may not return the required EvalResult shape.",
        });
    }
    return issues;
}
|
|
188
|
+
/**
 * Run static validation over every discovered spec file.
 *
 * Resolves the project's execution mode to find spec files, analyzes each
 * one with analyzeFile(), and reports the outcome as JSON (--format json)
 * or as a human-readable listing. Validation passes when no error-severity
 * issues were found; warnings alone do not fail it.
 *
 * @param {string[]} [args] - CLI arguments (only "--format <fmt>" is read)
 * @returns {Promise<{filesScanned: number, filesWithIssues: number, issues: Array, passed: boolean}>}
 */
async function runValidate(args = []) {
    const flagPos = args.indexOf("--format");
    const format = flagPos === -1 ? "human" : args[flagPos + 1];
    const projectRoot = process.cwd();
    const executionMode = await (0, execution_mode_1.getExecutionMode)(projectRoot);
    const { specFiles } = executionMode;
    if (specFiles.length === 0) {
        // Nothing to validate — still emit a well-formed (passing) result.
        const emptyResult = {
            filesScanned: 0,
            filesWithIssues: 0,
            issues: [],
            passed: true,
        };
        if (format === "json") {
            console.log(JSON.stringify(emptyResult, null, 2));
        }
        else {
            console.log("\n✨ No spec files found. Nothing to validate.");
            console.log("💡 Create files with defineEval() calls to get started.");
        }
        return emptyResult;
    }
    // Gather issues across all files and track which files produced any.
    const collected = specFiles.flatMap((file) => analyzeFile(file));
    const touchedFiles = new Set(collected.map((issue) => issue.file));
    const errors = collected.filter((issue) => issue.severity === "error");
    const warnings = collected.filter((issue) => issue.severity === "warn");
    const passed = errors.length === 0;
    const result = {
        filesScanned: specFiles.length,
        filesWithIssues: touchedFiles.size,
        issues: collected,
        passed,
    };
    if (format === "json") {
        console.log(JSON.stringify(result, null, 2));
        return result;
    }
    console.log(`\n🔍 Validated ${specFiles.length} spec file${specFiles.length === 1 ? "" : "s"}`);
    if (collected.length === 0) {
        console.log("✅ All spec files are valid.\n");
        return result;
    }
    for (const issue of collected) {
        const loc = issue.line ? `:${issue.line}` : "";
        const icon = issue.severity === "error" ? "❌" : "⚠️";
        console.log(` ${icon} ${issue.file}${loc} [${issue.code}] ${issue.message}`);
    }
    console.log(`\n${errors.length} error${errors.length === 1 ? "" : "s"}, ${warnings.length} warning${warnings.length === 1 ? "" : "s"}`);
    console.log(passed
        ? "✅ Validation passed (warnings only).\n"
        : "❌ Validation failed.\n");
    return result;
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
 * Watch mode for evalgate run
 *
 * Re-executes evaluation specs when source files change.
 * Uses Node.js fs.watch with debouncing to avoid rapid re-runs.
 */
import type { RunOptions } from "./run";
export interface WatchOptions extends RunOptions {
    /** Debounce interval in milliseconds (default: 300) */
    debounceMs?: number;
    /** Additional directories to watch beyond spec files */
    extraWatchDirs?: string[];
    /** Clear terminal between runs */
    clearScreen?: boolean;
}
/**
 * Start watch mode — runs evaluations and re-runs on file changes.
 *
 * Note: the returned promise never resolves; watch mode runs until the
 * process receives SIGINT/SIGTERM, at which point it exits itself.
 */
export declare function runWatch(options: WatchOptions, projectRoot?: string): Promise<void>;
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Watch mode for evalgate run
|
|
4
|
+
*
|
|
5
|
+
* Re-executes evaluation specs when source files change.
|
|
6
|
+
* Uses Node.js fs.watch with debouncing to avoid rapid re-runs.
|
|
7
|
+
*/
|
|
8
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
9
|
+
if (k2 === undefined) k2 = k;
|
|
10
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
11
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
12
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
13
|
+
}
|
|
14
|
+
Object.defineProperty(o, k2, desc);
|
|
15
|
+
}) : (function(o, m, k, k2) {
|
|
16
|
+
if (k2 === undefined) k2 = k;
|
|
17
|
+
o[k2] = m[k];
|
|
18
|
+
}));
|
|
19
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
20
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
21
|
+
}) : function(o, v) {
|
|
22
|
+
o["default"] = v;
|
|
23
|
+
});
|
|
24
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
25
|
+
var ownKeys = function(o) {
|
|
26
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
27
|
+
var ar = [];
|
|
28
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
29
|
+
return ar;
|
|
30
|
+
};
|
|
31
|
+
return ownKeys(o);
|
|
32
|
+
};
|
|
33
|
+
return function (mod) {
|
|
34
|
+
if (mod && mod.__esModule) return mod;
|
|
35
|
+
var result = {};
|
|
36
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
37
|
+
__setModuleDefault(result, mod);
|
|
38
|
+
return result;
|
|
39
|
+
};
|
|
40
|
+
})();
|
|
41
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
42
|
+
exports.runWatch = runWatch;
|
|
43
|
+
const fs = __importStar(require("node:fs"));
|
|
44
|
+
const path = __importStar(require("node:path"));
|
|
45
|
+
const run_1 = require("./run");
|
|
46
|
+
/**
|
|
47
|
+
* Start watch mode — runs evaluations and re-runs on file changes
|
|
48
|
+
*/
|
|
49
|
+
/**
 * Start watch mode — runs evaluations and re-runs on file changes.
 *
 * Watches eval/, evals/, and src/ (when present) plus any extraWatchDirs,
 * debounces change events, and re-executes the run via executeRun(). The
 * returned promise never resolves; the process exits through the
 * SIGINT/SIGTERM cleanup handler.
 *
 * @param {import("./watch").WatchOptions} options - run options plus watch knobs
 * @param {string} [projectRoot] - project root (defaults to process.cwd())
 */
async function runWatch(options, projectRoot = process.cwd()) {
    const debounceMs = options.debounceMs ?? 300;
    const clearScreen = options.clearScreen ?? true;
    // Directories to watch
    const watchDirs = new Set();
    // Always watch the eval/ directory if it exists
    const evalDir = path.join(projectRoot, "eval");
    if (fs.existsSync(evalDir))
        watchDirs.add(evalDir);
    // Watch evals/ directory too
    const evalsDir = path.join(projectRoot, "evals");
    if (fs.existsSync(evalsDir))
        watchDirs.add(evalsDir);
    // Watch src/ for code changes that may affect evals
    const srcDir = path.join(projectRoot, "src");
    if (fs.existsSync(srcDir))
        watchDirs.add(srcDir);
    // Add extra watch dirs (relative paths resolve against the project root)
    if (options.extraWatchDirs) {
        for (const dir of options.extraWatchDirs) {
            const resolved = path.isAbsolute(dir) ? dir : path.join(projectRoot, dir);
            if (fs.existsSync(resolved))
                watchDirs.add(resolved);
        }
    }
    if (watchDirs.size === 0) {
        console.error("❌ No directories to watch. Create eval/, evals/, or src/ directory.");
        process.exit(1);
    }
    console.log("👁️ Watch mode enabled");
    console.log(` Watching: ${[...watchDirs].map((d) => path.relative(projectRoot, d) || ".").join(", ")}`);
    console.log(` Debounce: ${debounceMs}ms`);
    console.log(" Press Ctrl+C to stop\n");
    // Initial run
    await executeRun(options, projectRoot, clearScreen, false);
    // Set up watchers with debouncing. If a change lands while a run is in
    // flight we remember it and re-run once the current run finishes — the
    // previous code silently dropped such events.
    let debounceTimer = null;
    let isRunning = false;
    let rerunQueued = false;
    const triggerRun = () => {
        if (debounceTimer)
            clearTimeout(debounceTimer);
        debounceTimer = setTimeout(async () => {
            if (isRunning) {
                rerunQueued = true;
                return;
            }
            isRunning = true;
            try {
                await executeRun(options, projectRoot, clearScreen, true);
            }
            finally {
                isRunning = false;
                if (rerunQueued) {
                    rerunQueued = false;
                    triggerRun();
                }
            }
        }, debounceMs);
    };
    const watchers = [];
    for (const dir of watchDirs) {
        try {
            const watcher = fs.watch(dir, { recursive: true }, (eventType, filename) => {
                if (!filename)
                    return;
                // Skip hidden files and node_modules
                if (filename.startsWith(".") || filename.includes("node_modules"))
                    return;
                // Only watch relevant file types
                const ext = path.extname(filename).toLowerCase();
                if ([".ts", ".tsx", ".js", ".jsx", ".json", ".jsonl", ".csv"].includes(ext)) {
                    // Bug fix: this log previously printed the literal text
                    // "$(unknown)" (shell-style syntax) instead of interpolating
                    // the changed file's name.
                    console.log(`\n🔄 Change detected: ${filename} (${eventType})`);
                    triggerRun();
                }
            });
            watchers.push(watcher);
        }
        catch (err) {
            // fs.watch({ recursive: true }) can throw on platforms without
            // recursive support; degrade to a warning and keep going.
            console.warn(`⚠️ Could not watch ${path.relative(projectRoot, dir)}: ${err instanceof Error ? err.message : String(err)}`);
        }
    }
    // Handle graceful shutdown
    const cleanup = () => {
        console.log("\n\n👋 Watch mode stopped.");
        for (const watcher of watchers) {
            watcher.close();
        }
        if (debounceTimer)
            clearTimeout(debounceTimer);
        process.exit(0);
    };
    process.on("SIGINT", cleanup);
    process.on("SIGTERM", cleanup);
    // Keep process alive
    await new Promise(() => {
        // Never resolves — watch runs until interrupted
    });
}
|
|
141
|
+
/**
 * Execute a single run and print results (without process.exit).
 *
 * Optionally clears the terminal on re-runs, delegates to runEvaluations,
 * prints the results in the requested format, then appends a one-line
 * watch-mode status. Failures are caught and logged so watch mode keeps
 * running.
 *
 * @returns the run result, or null when the run threw
 */
async function executeRun(options, projectRoot, clearScreen, isRerun) {
    if (clearScreen && isRerun) {
        // Clear screen using ANSI escape
        process.stdout.write("\x1B[2J\x1B[0f");
    }
    const startedAt = new Date().toLocaleTimeString();
    const verb = isRerun ? "🔄 Re-running" : "▶️ Running";
    console.log(`${verb} evaluations... (${startedAt})`);
    try {
        const runOptions = {
            specIds: options.specIds,
            impactedOnly: options.impactedOnly,
            baseBranch: options.baseBranch,
            format: options.format,
            writeResults: options.writeResults,
        };
        const result = await (0, run_1.runEvaluations)(runOptions, projectRoot);
        const printer = options.format === "json"
            ? run_1.printJsonResults
            : run_1.printHumanResults;
        printer(result);
        // Print watch-specific summary
        const statusIcon = result.summary.failed > 0 ? "❌" : "✅";
        console.log(`\n${statusIcon} ${result.summary.passed}/${result.results.length} passed | Waiting for changes...`);
        return result;
    }
    catch (error) {
        console.error("❌ Run failed:", error instanceof Error ? error.message : String(error));
        console.log("\n⏳ Waiting for changes...");
        return null;
    }
}
|
package/dist/client.js
CHANGED
|
@@ -3,6 +3,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
3
3
|
exports.AIEvalClient = void 0;
|
|
4
4
|
const batch_1 = require("./batch");
|
|
5
5
|
const cache_1 = require("./cache");
|
|
6
|
+
const constants_1 = require("./constants");
|
|
6
7
|
const context_1 = require("./context");
|
|
7
8
|
const errors_1 = require("./errors");
|
|
8
9
|
const logger_1 = require("./logger");
|
|
@@ -72,7 +73,7 @@ class AIEvalClient {
|
|
|
72
73
|
this.baseUrl =
|
|
73
74
|
config.baseUrl ||
|
|
74
75
|
getEnvVar("EVALGATE_BASE_URL", "EVALAI_BASE_URL") ||
|
|
75
|
-
(isBrowser ? "" :
|
|
76
|
+
(isBrowser ? "" : constants_1.DEFAULT_BASE_URL);
|
|
76
77
|
this.timeout = config.timeout || 30000;
|
|
77
78
|
// Tier 4.17: Debug mode with request logging
|
|
78
79
|
const logLevel = config.logLevel || (config.debug ? "debug" : "info");
|
|
@@ -100,7 +101,7 @@ class AIEvalClient {
|
|
|
100
101
|
const MAX_CONCURRENCY = 5;
|
|
101
102
|
this.batcher = new batch_1.RequestBatcher(async (requests) => {
|
|
102
103
|
const results = [];
|
|
103
|
-
const executing =
|
|
104
|
+
const executing = new Set();
|
|
104
105
|
for (const req of requests) {
|
|
105
106
|
const task = (async () => {
|
|
106
107
|
try {
|
|
@@ -121,18 +122,10 @@ class AIEvalClient {
|
|
|
121
122
|
});
|
|
122
123
|
}
|
|
123
124
|
})();
|
|
124
|
-
executing.
|
|
125
|
-
|
|
125
|
+
const tracked = task.finally(() => executing.delete(tracked));
|
|
126
|
+
executing.add(tracked);
|
|
127
|
+
if (executing.size >= MAX_CONCURRENCY) {
|
|
126
128
|
await Promise.race(executing);
|
|
127
|
-
// Remove settled promises
|
|
128
|
-
for (let i = executing.length - 1; i >= 0; i--) {
|
|
129
|
-
const settled = await Promise.race([
|
|
130
|
-
executing[i].then(() => true),
|
|
131
|
-
Promise.resolve(false),
|
|
132
|
-
]);
|
|
133
|
-
if (settled)
|
|
134
|
-
executing.splice(i, 1);
|
|
135
|
-
}
|
|
136
129
|
}
|
|
137
130
|
}
|
|
138
131
|
await Promise.allSettled(executing);
|
package/dist/errors.js
CHANGED
|
@@ -271,6 +271,10 @@ class RateLimitError extends EvalGateError {
|
|
|
271
271
|
constructor(message, retryAfter) {
|
|
272
272
|
super(message, "RATE_LIMIT_EXCEEDED", 429, { retryAfter });
|
|
273
273
|
this.name = "RateLimitError";
|
|
274
|
+
if (retryAfter !== undefined) {
|
|
275
|
+
this.retryAfter = retryAfter;
|
|
276
|
+
}
|
|
277
|
+
Object.setPrototypeOf(this, RateLimitError.prototype);
|
|
274
278
|
}
|
|
275
279
|
}
|
|
276
280
|
exports.RateLimitError = RateLimitError;
|
|
@@ -278,6 +282,7 @@ class AuthenticationError extends EvalGateError {
|
|
|
278
282
|
constructor(message = "Authentication failed") {
|
|
279
283
|
super(message, "AUTHENTICATION_ERROR", 401);
|
|
280
284
|
this.name = "AuthenticationError";
|
|
285
|
+
Object.setPrototypeOf(this, AuthenticationError.prototype);
|
|
281
286
|
}
|
|
282
287
|
}
|
|
283
288
|
exports.AuthenticationError = AuthenticationError;
|
|
@@ -285,6 +290,7 @@ class ValidationError extends EvalGateError {
|
|
|
285
290
|
constructor(message = "Validation failed", details) {
|
|
286
291
|
super(message, "VALIDATION_ERROR", 400, details);
|
|
287
292
|
this.name = "ValidationError";
|
|
293
|
+
Object.setPrototypeOf(this, ValidationError.prototype);
|
|
288
294
|
}
|
|
289
295
|
}
|
|
290
296
|
exports.ValidationError = ValidationError;
|
|
@@ -293,6 +299,7 @@ class NetworkError extends EvalGateError {
|
|
|
293
299
|
super(message, "NETWORK_ERROR", 0);
|
|
294
300
|
this.name = "NetworkError";
|
|
295
301
|
this.retryable = true;
|
|
302
|
+
Object.setPrototypeOf(this, NetworkError.prototype);
|
|
296
303
|
}
|
|
297
304
|
}
|
|
298
305
|
exports.NetworkError = NetworkError;
|
package/dist/export.js
CHANGED
|
@@ -155,7 +155,7 @@ async function importData(client, data, options = {}) {
|
|
|
155
155
|
return result;
|
|
156
156
|
}
|
|
157
157
|
// Import traces
|
|
158
|
-
if (data.traces) {
|
|
158
|
+
if (data.traces && client?.traces) {
|
|
159
159
|
const traceResults = { imported: 0, skipped: 0, failed: 0 };
|
|
160
160
|
for (const trace of data.traces) {
|
|
161
161
|
try {
|
|
@@ -191,7 +191,7 @@ async function importData(client, data, options = {}) {
|
|
|
191
191
|
result.summary.total += data.traces.length;
|
|
192
192
|
}
|
|
193
193
|
// Import evaluations
|
|
194
|
-
if (data.evaluations) {
|
|
194
|
+
if (data.evaluations && client?.evaluations) {
|
|
195
195
|
const evalResults = { imported: 0, skipped: 0, failed: 0 };
|
|
196
196
|
for (const evaluation of data.evaluations) {
|
|
197
197
|
try {
|
package/dist/index.d.ts
CHANGED
|
@@ -7,34 +7,35 @@
|
|
|
7
7
|
* @packageDocumentation
|
|
8
8
|
*/
|
|
9
9
|
export { AIEvalClient } from "./client";
|
|
10
|
-
import { AuthenticationError, EvalGateError, NetworkError, RateLimitError,
|
|
11
|
-
export { EvalGateError, RateLimitError, AuthenticationError,
|
|
12
|
-
|
|
13
|
-
export {
|
|
10
|
+
import { AuthenticationError, EvalGateError, NetworkError, RateLimitError, ValidationError } from "./errors";
|
|
11
|
+
export { EvalGateError, RateLimitError, AuthenticationError, ValidationError, NetworkError, };
|
|
12
|
+
export { type AssertionLLMConfig, configureAssertions, containsAllRequiredFields, containsJSON, containsKeywords, containsLanguage, containsLanguageAsync, expect, followsInstructions, getAssertionConfig, hasConsistency, hasConsistencyAsync, hasFactualAccuracy, hasFactualAccuracyAsync, hasLength, hasNoHallucinations, hasNoHallucinationsAsync, hasNoToxicity, hasNoToxicityAsync, hasPII, hasReadabilityScore, hasSentiment, hasSentimentAsync, hasSentimentWithScore, hasValidCodeSyntax, hasValidCodeSyntaxAsync, isValidEmail, isValidURL, matchesPattern, matchesSchema, notContainsPII, respondedWithinDuration, respondedWithinTime, respondedWithinTimeSince, type SentimentAsyncResult, similarTo, toSemanticallyContain, withinRange, } from "./assertions";
|
|
13
|
+
export { EvalGateError as SDKError } from "./errors";
|
|
14
14
|
import { createContext, EvalContext, getCurrentContext, withContext } from "./context";
|
|
15
15
|
export { createContext, getCurrentContext as getContext, withContext, EvalContext as ContextManager, };
|
|
16
16
|
export { cloneContext, mergeContexts, validateContext, } from "./runtime/context";
|
|
17
|
-
export { createContext as createEvalContext, createResult, defineEval, defineSuite, evalai, } from "./runtime/eval";
|
|
17
|
+
export { createContext as createEvalContext, createResult, defineEval, defineSuite, evalai, getFilteredSpecs, } from "./runtime/eval";
|
|
18
18
|
export { createLocalExecutor, defaultLocalExecutor, } from "./runtime/executor";
|
|
19
19
|
export { createEvalRuntime, disposeActiveRuntime, getActiveRuntime, setActiveRuntime, } from "./runtime/registry";
|
|
20
20
|
export type { CloudExecutor, DefineEvalFunction, EvalContext, EvalExecutor, EvalExecutorInterface, EvalOptions, EvalResult, EvalRuntime, EvalSpec, ExecutorCapabilities, LocalExecutor, SpecConfig, SpecOptions, WorkerExecutor, } from "./runtime/types";
|
|
21
21
|
export { EvalRuntimeError, RuntimeError, SpecExecutionError, SpecRegistrationError, } from "./runtime/types";
|
|
22
22
|
export { createTestSuite, type TestCaseResult, TestSuite, TestSuiteCase, TestSuiteCaseResult, TestSuiteConfig, TestSuiteResult, } from "./testing";
|
|
23
|
-
import { compareWithSnapshot, snapshot } from "./snapshot";
|
|
24
|
-
export { snapshot, compareWithSnapshot, snapshot as saveSnapshot,
|
|
23
|
+
import { compareSnapshots, compareWithSnapshot, snapshot } from "./snapshot";
|
|
24
|
+
export { snapshot, compareWithSnapshot, compareSnapshots, snapshot as saveSnapshot, };
|
|
25
25
|
import type { ExportFormat } from "./export";
|
|
26
26
|
import { exportData, importData } from "./export";
|
|
27
27
|
export { exportData, importData };
|
|
28
28
|
export type { ExportFormat, ExportFormat as ExportType };
|
|
29
29
|
export { RequestBatcher } from "./batch";
|
|
30
|
-
export { CacheTTL
|
|
30
|
+
export { CacheTTL } from "./cache";
|
|
31
31
|
export { type CheckArgs, EXIT, parseArgs, runCheck } from "./cli/check";
|
|
32
32
|
export { traceAnthropic } from "./integrations/anthropic";
|
|
33
33
|
export { traceOpenAI } from "./integrations/openai";
|
|
34
34
|
export { type OpenAIChatEvalCase, type OpenAIChatEvalOptions, type OpenAIChatEvalResult, openAIChatEval, } from "./integrations/openai-eval";
|
|
35
35
|
export { Logger } from "./logger";
|
|
36
36
|
export { extendExpectWithToPassGate } from "./matchers";
|
|
37
|
-
export {
|
|
37
|
+
export { createOTelExporter, type OTelAttribute, type OTelEvent, OTelExporter, type OTelExporterOptions, type OTelExportPayload, type OTelSpan, } from "./otel";
|
|
38
|
+
export { autoPaginate, autoPaginateGenerator, createPaginatedIterator, decodeCursor, encodeCursor, PaginatedIterator, type PaginatedResponse, type PaginationParams, } from "./pagination";
|
|
38
39
|
export { ARTIFACTS, type Baseline, type BaselineTolerance, GATE_CATEGORY, GATE_EXIT, type GateCategory, type GateExitCode, REPORT_SCHEMA_VERSION, type RegressionDelta, type RegressionReport, } from "./regression";
|
|
39
40
|
export { batchProcess, batchRead, RateLimiter, streamEvaluation, } from "./streaming";
|
|
40
41
|
export type { Annotation, AnnotationItem, AnnotationTask, APIKey, APIKeyUsage, APIKeyWithSecret, BatchOptions, ClientConfig as AIEvalConfig, CreateAnnotationItemParams, CreateAnnotationParams, CreateAnnotationTaskParams, CreateAPIKeyParams, CreateLLMJudgeConfigParams, CreateWebhookParams, Evaluation as EvaluationData, EvaluationRun, EvaluationRunDetail, ExportOptions, GenericMetadata as AnnotationData, GetLLMJudgeAlignmentParams, GetUsageParams, ImportOptions, ListAnnotationItemsParams, ListAnnotationsParams, ListAnnotationTasksParams, ListAPIKeysParams, ListLLMJudgeConfigsParams, ListLLMJudgeResultsParams, ListWebhookDeliveriesParams, ListWebhooksParams, LLMJudgeAlignment, LLMJudgeConfig, LLMJudgeEvaluateResult, LLMJudgeResult as LLMJudgeData, Organization, RetryConfig, SnapshotData, Span as SpanData, StreamOptions, TestCase, TestResult, Trace as TraceData, TraceDetail, TracedResponse, UpdateAPIKeyParams, UpdateWebhookParams, UsageStats, UsageSummary, Webhook, WebhookDelivery, } from "./types";
|