@pauly4010/evalai-sdk 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +289 -0
- package/LICENSE +21 -0
- package/README.md +565 -0
- package/dist/assertions.d.ts +189 -0
- package/dist/assertions.js +596 -0
- package/dist/batch.d.ts +68 -0
- package/dist/batch.js +178 -0
- package/dist/cache.d.ts +65 -0
- package/dist/cache.js +135 -0
- package/dist/cli/index.d.ts +6 -0
- package/dist/cli/index.js +181 -0
- package/dist/client.d.ts +358 -0
- package/dist/client.js +802 -0
- package/dist/context.d.ts +134 -0
- package/dist/context.js +215 -0
- package/dist/errors.d.ts +80 -0
- package/dist/errors.js +285 -0
- package/dist/export.d.ts +195 -0
- package/dist/export.js +334 -0
- package/dist/index.d.ts +35 -0
- package/dist/index.js +111 -0
- package/dist/integrations/anthropic.d.ts +72 -0
- package/dist/integrations/anthropic.js +159 -0
- package/dist/integrations/openai.d.ts +69 -0
- package/dist/integrations/openai.js +156 -0
- package/dist/local.d.ts +39 -0
- package/dist/local.js +146 -0
- package/dist/logger.d.ts +128 -0
- package/dist/logger.js +227 -0
- package/dist/pagination.d.ts +74 -0
- package/dist/pagination.js +135 -0
- package/dist/snapshot.d.ts +176 -0
- package/dist/snapshot.js +322 -0
- package/dist/streaming.d.ts +173 -0
- package/dist/streaming.js +268 -0
- package/dist/testing.d.ts +204 -0
- package/dist/testing.js +252 -0
- package/dist/types.d.ts +715 -0
- package/dist/types.js +54 -0
- package/dist/workflows.d.ts +378 -0
- package/dist/workflows.js +628 -0
- package/package.json +102 -0
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Streaming & Batch Operations
|
|
4
|
+
* Tier 2.8: Handle large datasets efficiently
|
|
5
|
+
*
|
|
6
|
+
* @example
|
|
7
|
+
* ```typescript
|
|
8
|
+
* import { streamEvaluation, batchProcess } from '@pauly4010/evalai-sdk';
|
|
9
|
+
*
|
|
10
|
+
* // Stream large evaluation results
|
|
11
|
+
* for await (const result of streamEvaluation(config)) {
|
|
12
|
+
* console.log(`Progress: ${result.completed}/${result.total}`);
|
|
13
|
+
* }
|
|
14
|
+
*
|
|
15
|
+
* // Batch create traces
|
|
16
|
+
* await batchProcess((trace) => client.traces.create(trace), traces, { batchSize: 100 });
|
|
17
|
+
* ```
|
|
18
|
+
*/
|
|
19
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
20
|
+
exports.RateLimiter = void 0;
|
|
21
|
+
exports.batchProcess = batchProcess;
|
|
22
|
+
exports.streamEvaluation = streamEvaluation;
|
|
23
|
+
exports.batchRead = batchRead;
|
|
24
|
+
exports.chunk = chunk;
|
|
25
|
+
/**
 * Run `processor` over `items` in fixed-size batches and collect the outcomes.
 *
 * Items are split into consecutive slices of `batchSize`. Batches always run
 * one after another. Inside a batch, items run concurrently when `parallel`
 * is true and strictly one at a time (in input order) when it is false.
 *
 * @param {Function} processor - Called once per item; may return a promise.
 * @param {Array} items - Items to process.
 * @param {Object} [options]
 * @param {number} [options.batchSize=100] - Items per batch; must be greater than 0.
 * @param {boolean} [options.parallel=true] - Run the items of a batch concurrently.
 * @param {number} [options.delayMs=0] - Pause between batches (skipped after the last batch).
 * @param {Function} [options.onProgress] - Called after each batch with
 *   `{ total, completed, failed, batch, totalBatches }` (`batch` is 1-based).
 * @param {Function} [options.onError] - Called for each failed item with
 *   `{ batch, index, error, item }` (`batch` and `index` are 0-based).
 * @param {boolean} [options.continueOnError=true] - When false, the first
 *   failure is rethrown and the returned promise rejects.
 * @returns {Promise<Object>} `{ successful, failed, summary }` where
 *   `successful` holds processor outputs in completion order, `failed` holds
 *   `{ item, error }` records and `summary` holds the counters.
 * @throws {RangeError} If `batchSize` is not greater than 0.
 *
 * @example
 * ```typescript
 * const traces = [
 *   { name: 'trace-1', traceId: 'id-1' },
 *   { name: 'trace-2', traceId: 'id-2' },
 *   // ... 1000 more
 * ];
 *
 * const result = await batchProcess(
 *   (item) => client.traces.create(item),
 *   traces,
 *   {
 *     batchSize: 100,
 *     onProgress: (p) => console.log(`${p.completed}/${p.total}`)
 *   }
 * );
 * ```
 */
async function batchProcess(processor, items, options = {}) {
    const { batchSize = 100, parallel = true, delayMs = 0, onProgress, onError, continueOnError = true } = options;
    // A zero, negative or NaN step would never advance the slicing loop below.
    if (!(batchSize > 0)) {
        throw new RangeError(`batchSize must be greater than 0, received ${String(batchSize)}`);
    }
    const result = {
        successful: [],
        failed: [],
        summary: {
            total: items.length,
            successful: 0,
            failed: 0
        }
    };
    // Split into batches
    const batches = [];
    for (let i = 0; i < items.length; i += batchSize) {
        batches.push(items.slice(i, i + batchSize));
    }
    // Process batches
    for (let batchIndex = 0; batchIndex < batches.length; batchIndex++) {
        const batch = batches[batchIndex];
        // Processes one item and records its outcome on `result`.
        const processItem = async (item, itemIndex) => {
            try {
                const output = await processor(item);
                result.successful.push(output);
                result.summary.successful++;
                return { success: true, output };
            }
            catch (error) {
                const batchError = {
                    batch: batchIndex,
                    index: itemIndex,
                    error: error instanceof Error ? error : new Error(String(error)),
                    item
                };
                result.failed.push({
                    item,
                    error: batchError.error
                });
                result.summary.failed++;
                if (onError)
                    onError(batchError);
                if (!continueOnError) {
                    throw error;
                }
                return { success: false, error };
            }
        };
        if (parallel) {
            await Promise.all(batch.map((item, itemIndex) => processItem(item, itemIndex)));
        }
        else {
            // Start each item only after the previous one has settled; mapping
            // first would launch every item of the batch at once.
            for (let itemIndex = 0; itemIndex < batch.length; itemIndex++) {
                await processItem(batch[itemIndex], itemIndex);
            }
        }
        // Progress callback
        if (onProgress) {
            onProgress({
                total: items.length,
                completed: result.summary.successful + result.summary.failed,
                failed: result.summary.failed,
                batch: batchIndex + 1,
                totalBatches: batches.length
            });
        }
        // Delay between batches
        if (delayMs > 0 && batchIndex < batches.length - 1) {
            await new Promise(resolve => setTimeout(resolve, delayMs));
        }
    }
    return result;
}
|
|
120
|
+
/**
 * Lazily evaluate test cases, yielding one record per case as it finishes.
 *
 * Cases are executed one at a time in iteration order. A record is yielded
 * whether the executor resolves (`passed: true`, `result` is its value) or
 * throws (`passed: false`, `result` is the thrown value).
 *
 * @param {Object} config
 * @param {Array} config.cases - Cases to evaluate; each is passed to `executor` as-is.
 * @param {Function} config.executor - Called with a single case; may return a promise.
 * @returns {AsyncGenerator<Object>} Records shaped
 *   `{ caseId, case, result, passed, completed, total }`.
 *
 * @example
 * ```typescript
 * const config = {
 *   cases: [...],
 *   executor: async (input) => callLLM(input)
 * };
 *
 * for await (const result of streamEvaluation(config)) {
 *   console.log(`Case ${result.caseId}: ${result.passed ? 'PASS' : 'FAIL'}`);
 *   console.log(`Progress: ${result.completed}/${result.total}`);
 * }
 * ```
 */
async function* streamEvaluation(config) {
    const cases = config.cases;
    const executor = config.executor;
    let finished = 0;
    for (const [position, current] of cases.entries()) {
        try {
            const outcome = await executor(current);
            finished += 1;
            yield {
                caseId: `case-${position}`,
                case: current,
                result: outcome,
                passed: true,
                completed: finished,
                total: cases.length
            };
        }
        catch (failure) {
            // The thrown value itself becomes the record's result.
            finished += 1;
            yield {
                caseId: `case-${position}`,
                case: current,
                result: failure,
                passed: false,
                completed: finished,
                total: cases.length
            };
        }
    }
}
|
|
165
|
+
/**
 * Read every page from a limit/offset style source and return all items.
 *
 * `fetcher` is called with `{ limit, offset }` for page 0, 1, 2, ... until it
 * returns an empty page, returns fewer than `pageSize` items, or `maxPages`
 * pages have been read.
 *
 * @param {Function} fetcher - Receives `{ limit, offset }` and resolves to one page of items.
 * @param {Object} [options]
 * @param {number} [options.pageSize=100] - Items requested per page; must be greater than 0.
 * @param {number} [options.maxPages] - Maximum number of pages to read; a
 *   falsy value (including 0) means no limit.
 * @param {Function} [options.onProgress] - Called after each non-empty page
 *   with `(pagesRead, totalItemsSoFar)`.
 * @returns {Promise<Array>} All fetched items in page order.
 * @throws {RangeError} If `pageSize` is not greater than 0.
 *
 * @example
 * ```typescript
 * const allTraces = await batchRead(
 *   (params) => client.traces.list(params),
 *   { pageSize: 100 }
 * );
 * ```
 */
async function batchRead(fetcher, options = {}) {
    const { pageSize = 100, maxPages, onProgress } = options;
    // With a non-positive page size the offset never advances, so a fetcher
    // that keeps returning items would be polled forever.
    if (!(pageSize > 0)) {
        throw new RangeError(`pageSize must be greater than 0, received ${String(pageSize)}`);
    }
    const allItems = [];
    let page = 0;
    let hasMore = true;
    while (hasMore && (!maxPages || page < maxPages)) {
        const items = await fetcher({
            limit: pageSize,
            offset: page * pageSize
        });
        if (items.length === 0) {
            hasMore = false;
        }
        else {
            // Append one by one: spreading a very large page into push() can
            // exceed the engine's argument limit and throw a RangeError.
            for (const item of items) {
                allItems.push(item);
            }
            page++;
            if (onProgress) {
                onProgress(page, allItems.length);
            }
            // A short page means the source is exhausted.
            if (items.length < pageSize) {
                hasMore = false;
            }
        }
    }
    return allItems;
}
|
|
202
|
+
/**
 * Serial task queue that spaces out calls.
 *
 * Queued functions run one at a time in submission order. After each one
 * settles, the limiter waits `1000 / requestsPerSecond` milliseconds before
 * starting the next.
 *
 * @example
 * ```typescript
 * const limiter = new RateLimiter({ requestsPerSecond: 10 });
 *
 * for (const item of items) {
 *   await limiter.throttle(() => client.traces.create(item));
 * }
 * ```
 */
class RateLimiter {
    /**
     * @param {Object} options
     * @param {number} options.requestsPerSecond - Used to derive the pause between calls.
     */
    constructor(options) {
        this.queue = [];
        this.processing = false;
        this.requestsPerSecond = options.requestsPerSecond;
        this.interval = 1000 / options.requestsPerSecond;
    }
    /**
     * Queue `fn` and resolve (or reject) with its outcome once it has run.
     *
     * @param {Function} fn - Work to run; may return a promise.
     * @returns {Promise<*>} Settles with whatever `fn` returns or throws.
     */
    async throttle(fn) {
        let fulfill;
        let fail;
        const outcome = new Promise((resolve, reject) => {
            fulfill = resolve;
            fail = reject;
        });
        const job = async () => {
            let value;
            try {
                value = await fn();
            }
            catch (error) {
                fail(error);
                return;
            }
            fulfill(value);
        };
        this.queue.push(job);
        // Start draining only if no drain loop is currently active.
        if (!this.processing) {
            this.process();
        }
        return outcome;
    }
    /**
     * Drain the queue, pausing `interval` ms after every job.
     */
    async process() {
        this.processing = true;
        while (this.queue.length !== 0) {
            const next = this.queue.shift();
            if (!next) {
                continue;
            }
            await next();
            await new Promise((wake) => setTimeout(wake, this.interval));
        }
        this.processing = false;
    }
}
|
|
252
|
+
exports.RateLimiter = RateLimiter;
|
|
253
|
+
/**
 * Split an array into consecutive slices of at most `size` elements.
 *
 * The input array is not modified; the last slice may be shorter.
 *
 * @param {Array} array - Source array.
 * @param {number} size - Maximum slice length; must be greater than 0.
 * @returns {Array<Array>} The slices, in order.
 * @throws {RangeError} If `size` is not greater than 0.
 *
 * @example
 * ```typescript
 * const chunks = chunk([1, 2, 3, 4, 5], 2);
 * // [[1, 2], [3, 4], [5]]
 * ```
 */
function chunk(array, size) {
    // A zero, negative or NaN step would never advance the loop below.
    if (!(size > 0)) {
        throw new RangeError(`size must be greater than 0, received ${String(size)}`);
    }
    const chunks = [];
    for (let i = 0; i < array.length; i += size) {
        chunks.push(array.slice(i, i + size));
    }
    return chunks;
}
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test Suite Builder
|
|
3
|
+
* Tier 2.7: Declarative test definitions
|
|
4
|
+
*
|
|
5
|
+
* @example
|
|
6
|
+
* ```typescript
|
|
7
|
+
* import { createTestSuite, expect } from '@pauly4010/evalai-sdk';
|
|
8
|
+
*
|
|
9
|
+
* const suite = createTestSuite('chatbot-responses', {
|
|
10
|
+
* cases: [
|
|
11
|
+
* {
|
|
12
|
+
* input: 'Hello',
|
|
13
|
+
* assertions: [
|
|
14
|
+
* (output) => expect(output).toContain('greeting'),
|
|
15
|
+
* (output) => expect(output).toHaveSentiment('positive')
|
|
16
|
+
* ]
|
|
17
|
+
* }
|
|
18
|
+
* ]
|
|
19
|
+
* });
|
|
20
|
+
*
|
|
21
|
+
* const results = await suite.run();
|
|
22
|
+
* ```
|
|
23
|
+
*/
|
|
24
|
+
import { AssertionResult } from './assertions';
|
|
25
|
+
/**
|
|
26
|
+
* Test suite case definition (different from API TestCase type)
|
|
27
|
+
* Use this for defining test cases in test suites with assertions
|
|
28
|
+
*/
|
|
29
|
+
export interface TestSuiteCase {
|
|
30
|
+
/** Unique identifier for the test case */
|
|
31
|
+
id?: string;
|
|
32
|
+
/** Input to the LLM */
|
|
33
|
+
input: string;
|
|
34
|
+
/** Expected output (optional) */
|
|
35
|
+
expected?: string;
|
|
36
|
+
/** Metadata for the test case */
|
|
37
|
+
metadata?: Record<string, any>;
|
|
38
|
+
/** Assertion functions to run */
|
|
39
|
+
assertions?: ((output: string) => AssertionResult)[];
|
|
40
|
+
}
|
|
41
|
+
/** @deprecated Use TestSuiteCase instead to avoid confusion with API TestCase type */
|
|
42
|
+
export type TestCase = TestSuiteCase;
|
|
43
|
+
export interface TestSuiteConfig {
|
|
44
|
+
/** Test cases to run */
|
|
45
|
+
cases: TestSuiteCase[];
|
|
46
|
+
/** Function that generates output from input */
|
|
47
|
+
executor?: (input: string) => Promise<string>;
|
|
48
|
+
/** Run tests in parallel; when omitted or false, cases run sequentially */
|
|
49
|
+
parallel?: boolean;
|
|
50
|
+
/** Stop on first failure; only honored when cases run sequentially (default: false) */
|
|
51
|
+
stopOnFailure?: boolean;
|
|
52
|
+
/** Timeout per test case in ms (default: 30000) */
|
|
53
|
+
timeout?: number;
|
|
54
|
+
}
|
|
55
|
+
export interface TestSuiteCaseResult {
|
|
56
|
+
/** Test case ID */
|
|
57
|
+
id: string;
|
|
58
|
+
/** Input that was tested */
|
|
59
|
+
input: string;
|
|
60
|
+
/** Expected output */
|
|
61
|
+
expected?: string;
|
|
62
|
+
/** Actual output */
|
|
63
|
+
actual: string;
|
|
64
|
+
/** Whether test passed */
|
|
65
|
+
passed: boolean;
|
|
66
|
+
/** Assertion results */
|
|
67
|
+
assertions: AssertionResult[];
|
|
68
|
+
/** Duration in milliseconds */
|
|
69
|
+
durationMs: number;
|
|
70
|
+
/** Error if test failed to execute */
|
|
71
|
+
error?: string;
|
|
72
|
+
}
|
|
73
|
+
/** @deprecated Use TestSuiteCaseResult instead */
|
|
74
|
+
export type TestCaseResult = TestSuiteCaseResult;
|
|
75
|
+
export interface TestSuiteResult {
|
|
76
|
+
/** Suite name */
|
|
77
|
+
name: string;
|
|
78
|
+
/** Total number of test cases */
|
|
79
|
+
total: number;
|
|
80
|
+
/** Number of passed tests */
|
|
81
|
+
passed: number;
|
|
82
|
+
/** Number of failed tests */
|
|
83
|
+
failed: number;
|
|
84
|
+
/** Total duration in milliseconds */
|
|
85
|
+
durationMs: number;
|
|
86
|
+
/** Individual test results */
|
|
87
|
+
results: TestSuiteCaseResult[];
|
|
88
|
+
}
|
|
89
|
+
/**
|
|
90
|
+
* Test Suite for declarative evaluation testing
|
|
91
|
+
*/
|
|
92
|
+
export declare class TestSuite {
|
|
93
|
+
private name;
|
|
94
|
+
private config;
|
|
95
|
+
constructor(name: string, config: TestSuiteConfig);
|
|
96
|
+
/**
|
|
97
|
+
* Run all test cases
|
|
98
|
+
*
|
|
99
|
+
* @example
|
|
100
|
+
* ```typescript
|
|
101
|
+
* const results = await suite.run();
|
|
102
|
+
* console.log(`${results.passed}/${results.total} tests passed`);
|
|
103
|
+
* ```
|
|
104
|
+
*/
|
|
105
|
+
run(): Promise<TestSuiteResult>;
|
|
106
|
+
/**
|
|
107
|
+
* Add a test case to the suite
|
|
108
|
+
*/
|
|
109
|
+
addCase(testCase: TestSuiteCase): void;
|
|
110
|
+
/**
|
|
111
|
+
* Get suite configuration
|
|
112
|
+
*/
|
|
113
|
+
getConfig(): TestSuiteConfig;
|
|
114
|
+
}
|
|
115
|
+
/**
|
|
116
|
+
* Create a test suite
|
|
117
|
+
*
|
|
118
|
+
* @example
|
|
119
|
+
* ```typescript
|
|
120
|
+
* const suite = createTestSuite('my-tests', {
|
|
121
|
+
* cases: [
|
|
122
|
+
* {
|
|
123
|
+
* input: 'Hello',
|
|
124
|
+
* assertions: [
|
|
125
|
+
* (output) => expect(output).toContain('hi'),
|
|
126
|
+
* (output) => expect(output).toHaveSentiment('positive')
|
|
127
|
+
* ]
|
|
128
|
+
* }
|
|
129
|
+
* ],
|
|
130
|
+
* executor: async (input) => {
|
|
131
|
+
* // Your LLM call here
|
|
132
|
+
* return callLLM(input);
|
|
133
|
+
* }
|
|
134
|
+
* });
|
|
135
|
+
* ```
|
|
136
|
+
*/
|
|
137
|
+
export declare function createTestSuite(name: string, config: TestSuiteConfig): TestSuite;
|
|
138
|
+
/**
|
|
139
|
+
* Helper to create assertions from expected keywords
|
|
140
|
+
*
|
|
141
|
+
* @example
|
|
142
|
+
* ```typescript
|
|
143
|
+
* const suite = createTestSuite('tests', {
|
|
144
|
+
* cases: [
|
|
145
|
+
* {
|
|
146
|
+
* input: 'refund policy',
|
|
147
|
+
* assertions: [containsKeywords(['refund', 'return', 'policy'])]
|
|
148
|
+
* }
|
|
149
|
+
* ]
|
|
150
|
+
* });
|
|
151
|
+
* ```
|
|
152
|
+
*/
|
|
153
|
+
export declare function containsKeywords(keywords: string[]): (output: string) => AssertionResult;
|
|
154
|
+
/**
|
|
155
|
+
* Helper to create pattern matching assertion
|
|
156
|
+
*
|
|
157
|
+
* @example
|
|
158
|
+
* ```typescript
|
|
159
|
+
* const suite = createTestSuite('tests', {
|
|
160
|
+
* cases: [
|
|
161
|
+
* {
|
|
162
|
+
* input: 'What time is it?',
|
|
163
|
+
* assertions: [matchesPattern(/\d{1,2}:\d{2}/)]
|
|
164
|
+
* }
|
|
165
|
+
* ]
|
|
166
|
+
* });
|
|
167
|
+
* ```
|
|
168
|
+
*/
|
|
169
|
+
export declare function matchesPattern(pattern: RegExp): (output: string) => AssertionResult;
|
|
170
|
+
/**
|
|
171
|
+
* Helper to create sentiment assertion
|
|
172
|
+
*
|
|
173
|
+
* @example
|
|
174
|
+
* ```typescript
|
|
175
|
+
* const suite = createTestSuite('tests', {
|
|
176
|
+
* cases: [
|
|
177
|
+
* {
|
|
178
|
+
* input: 'Thank you!',
|
|
179
|
+
* assertions: [hasSentiment('positive')]
|
|
180
|
+
* }
|
|
181
|
+
* ]
|
|
182
|
+
* });
|
|
183
|
+
* ```
|
|
184
|
+
*/
|
|
185
|
+
export declare function hasSentiment(sentiment: 'positive' | 'negative' | 'neutral'): (output: string) => AssertionResult;
|
|
186
|
+
/**
|
|
187
|
+
* Helper to create length range assertion
|
|
188
|
+
*
|
|
189
|
+
* @example
|
|
190
|
+
* ```typescript
|
|
191
|
+
* const suite = createTestSuite('tests', {
|
|
192
|
+
* cases: [
|
|
193
|
+
* {
|
|
194
|
+
* input: 'Summarize this',
|
|
195
|
+
* assertions: [hasLength({ min: 50, max: 200 })]
|
|
196
|
+
* }
|
|
197
|
+
* ]
|
|
198
|
+
* });
|
|
199
|
+
* ```
|
|
200
|
+
*/
|
|
201
|
+
export declare function hasLength(range: {
|
|
202
|
+
min?: number;
|
|
203
|
+
max?: number;
|
|
204
|
+
}): (output: string) => AssertionResult;
|
package/dist/testing.js
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Test Suite Builder
|
|
4
|
+
* Tier 2.7: Declarative test definitions
|
|
5
|
+
*
|
|
6
|
+
* @example
|
|
7
|
+
* ```typescript
|
|
8
|
+
* import { createTestSuite, expect } from '@pauly4010/evalai-sdk';
|
|
9
|
+
*
|
|
10
|
+
* const suite = createTestSuite('chatbot-responses', {
|
|
11
|
+
* cases: [
|
|
12
|
+
* {
|
|
13
|
+
* input: 'Hello',
|
|
14
|
+
* assertions: [
|
|
15
|
+
* (output) => expect(output).toContain('greeting'),
|
|
16
|
+
* (output) => expect(output).toHaveSentiment('positive')
|
|
17
|
+
* ]
|
|
18
|
+
* }
|
|
19
|
+
* ]
|
|
20
|
+
* });
|
|
21
|
+
*
|
|
22
|
+
* const results = await suite.run();
|
|
23
|
+
* ```
|
|
24
|
+
*/
|
|
25
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
26
|
+
exports.TestSuite = void 0;
|
|
27
|
+
exports.createTestSuite = createTestSuite;
|
|
28
|
+
exports.containsKeywords = containsKeywords;
|
|
29
|
+
exports.matchesPattern = matchesPattern;
|
|
30
|
+
exports.hasSentiment = hasSentiment;
|
|
31
|
+
exports.hasLength = hasLength;
|
|
32
|
+
const assertions_1 = require("./assertions");
|
|
33
|
+
/**
 * Test Suite for declarative evaluation testing.
 *
 * Holds a suite name and a config (`cases`, optional `executor`, `parallel`,
 * `stopOnFailure`, `timeout`) and runs every case through its assertions.
 */
class TestSuite {
    /**
     * @param {string} name - Suite name, echoed in the run result.
     * @param {Object} config - Suite configuration; stored by reference.
     */
    constructor(name, config) {
        this.name = name;
        this.config = config;
    }
    /**
     * Run all test cases.
     *
     * For each case the output is produced by `config.executor(input)`, raced
     * against a timeout of `config.timeout` ms (30000 when unset or 0). With
     * no executor, the case's `expected` value is used as the output. The
     * case's `assertions` are then applied; if it has none but has an
     * `expected` value, a single equality assertion is applied instead.
     * Anything thrown while running a case is reported on that case's
     * `error` field rather than rejecting the run.
     *
     * Cases run concurrently only when `config.parallel` is truthy.
     * `config.stopOnFailure` is honored only in the sequential path.
     *
     * @returns {Promise<Object>} `{ name, total, passed, failed, durationMs, results }`.
     *
     * @example
     * ```typescript
     * const results = await suite.run();
     * console.log(`${results.passed}/${results.total} tests passed`);
     * ```
     */
    async run() {
        const startTime = Date.now();
        const results = [];
        const runTestCase = async (testCase, index) => {
            const caseStartTime = Date.now();
            const id = testCase.id || `case-${index}`;
            // An empty string is a legitimate expected value, so test for
            // presence rather than truthiness.
            const hasExpected = testCase.expected !== undefined && testCase.expected !== null;
            try {
                // Execute to get output
                let actual;
                if (this.config.executor) {
                    const timeout = this.config.timeout || 30000;
                    let timer;
                    const timeoutPromise = new Promise((_, reject) => {
                        timer = setTimeout(() => reject(new Error(`Test timeout after ${timeout}ms`)), timeout);
                    });
                    try {
                        actual = await Promise.race([
                            this.config.executor(testCase.input),
                            timeoutPromise
                        ]);
                    }
                    finally {
                        // Without this the pending timer keeps the process
                        // alive for the full timeout after the case finishes.
                        clearTimeout(timer);
                    }
                }
                else if (hasExpected) {
                    actual = testCase.expected; // Use expected as actual if no executor
                }
                else {
                    throw new Error('No executor provided and no expected output');
                }
                // Run assertions
                const assertions = [];
                let allPassed = true;
                // Run custom assertions
                if (testCase.assertions) {
                    for (const assertion of testCase.assertions) {
                        const result = assertion(actual);
                        assertions.push(result);
                        if (!result.passed)
                            allPassed = false;
                    }
                }
                // Default equality check if expected provided
                if (hasExpected && !testCase.assertions) {
                    const result = (0, assertions_1.expect)(actual).toEqual(testCase.expected);
                    assertions.push(result);
                    if (!result.passed)
                        allPassed = false;
                }
                const durationMs = Date.now() - caseStartTime;
                return {
                    id,
                    input: testCase.input,
                    expected: testCase.expected,
                    actual,
                    passed: allPassed,
                    assertions,
                    durationMs
                };
            }
            catch (error) {
                const durationMs = Date.now() - caseStartTime;
                return {
                    id,
                    input: testCase.input,
                    expected: testCase.expected,
                    actual: '',
                    passed: false,
                    assertions: [],
                    durationMs,
                    error: error instanceof Error ? error.message : String(error)
                };
            }
        };
        // Run tests
        if (this.config.parallel) {
            results.push(...await Promise.all(this.config.cases.map((tc, i) => runTestCase(tc, i))));
        }
        else {
            for (let i = 0; i < this.config.cases.length; i++) {
                const result = await runTestCase(this.config.cases[i], i);
                results.push(result);
                if (this.config.stopOnFailure && !result.passed) {
                    break;
                }
            }
        }
        const durationMs = Date.now() - startTime;
        const passed = results.filter(r => r.passed).length;
        const failed = results.filter(r => !r.passed).length;
        return {
            name: this.name,
            total: results.length,
            passed,
            failed,
            durationMs,
            results
        };
    }
    /**
     * Add a test case to the suite (appends to `config.cases`).
     *
     * @param {Object} testCase - Case to append.
     */
    addCase(testCase) {
        this.config.cases.push(testCase);
    }
    /**
     * Get suite configuration.
     *
     * @returns {Object} A shallow copy of the config; the `cases` array is
     *   still the suite's own array.
     */
    getConfig() {
        return { ...this.config };
    }
}
|
|
155
|
+
exports.TestSuite = TestSuite;
|
|
156
|
+
/**
 * Factory for a TestSuite; equivalent to `new TestSuite(name, config)`.
 *
 * @param {string} name - Suite name.
 * @param {Object} config - Suite configuration (cases, executor, ...).
 * @returns {TestSuite} The new suite.
 *
 * @example
 * ```typescript
 * const suite = createTestSuite('my-tests', {
 *   cases: [
 *     {
 *       input: 'Hello',
 *       assertions: [
 *         (output) => expect(output).toContain('hi'),
 *         (output) => expect(output).toHaveSentiment('positive')
 *       ]
 *     }
 *   ],
 *   executor: async (input) => {
 *     // Your LLM call here
 *     return callLLM(input);
 *   }
 * });
 * ```
 */
function createTestSuite(name, config) {
    const suite = new TestSuite(name, config);
    return suite;
}
|
|
181
|
+
/**
 * Build an assertion function that checks an output for the given keywords.
 *
 * The returned function forwards to `expect(output).toContainKeywords(keywords)`.
 * Because a case's `assertions` is a list, wrap the returned function in an array.
 *
 * @param {string[]} keywords - Keywords handed to `toContainKeywords`.
 * @returns {Function} `(output) => AssertionResult`.
 *
 * @example
 * ```typescript
 * const suite = createTestSuite('tests', {
 *   cases: [
 *     {
 *       input: 'refund policy',
 *       assertions: [containsKeywords(['refund', 'return', 'policy'])]
 *     }
 *   ]
 * });
 * ```
 */
function containsKeywords(keywords) {
    return function (output) {
        const check = assertions_1.expect;
        return check(output).toContainKeywords(keywords);
    };
}
|
|
199
|
+
/**
 * Build an assertion function that checks an output against a pattern.
 *
 * The returned function forwards to `expect(output).toMatchPattern(pattern)`.
 * Because a case's `assertions` is a list, wrap the returned function in an array.
 *
 * @param {RegExp} pattern - Pattern handed to `toMatchPattern`.
 * @returns {Function} `(output) => AssertionResult`.
 *
 * @example
 * ```typescript
 * const suite = createTestSuite('tests', {
 *   cases: [
 *     {
 *       input: 'What time is it?',
 *       assertions: [matchesPattern(/\d{1,2}:\d{2}/)]
 *     }
 *   ]
 * });
 * ```
 */
function matchesPattern(pattern) {
    return function (output) {
        const check = assertions_1.expect;
        return check(output).toMatchPattern(pattern);
    };
}
|
|
217
|
+
/**
 * Build an assertion function that checks the sentiment of an output.
 *
 * The returned function forwards to `expect(output).toHaveSentiment(sentiment)`.
 * Because a case's `assertions` is a list, wrap the returned function in an array.
 *
 * @param {'positive' | 'negative' | 'neutral'} sentiment - Value handed to `toHaveSentiment`.
 * @returns {Function} `(output) => AssertionResult`.
 *
 * @example
 * ```typescript
 * const suite = createTestSuite('tests', {
 *   cases: [
 *     {
 *       input: 'Thank you!',
 *       assertions: [hasSentiment('positive')]
 *     }
 *   ]
 * });
 * ```
 */
function hasSentiment(sentiment) {
    return function (output) {
        const check = assertions_1.expect;
        return check(output).toHaveSentiment(sentiment);
    };
}
|
|
235
|
+
/**
 * Build an assertion function that checks the length of an output.
 *
 * The returned function forwards to `expect(output).toHaveLength(range)`.
 * Because a case's `assertions` is a list, wrap the returned function in an array.
 *
 * @param {{min?: number, max?: number}} range - Bounds handed to `toHaveLength`.
 * @returns {Function} `(output) => AssertionResult`.
 *
 * @example
 * ```typescript
 * const suite = createTestSuite('tests', {
 *   cases: [
 *     {
 *       input: 'Summarize this',
 *       assertions: [hasLength({ min: 50, max: 200 })]
 *     }
 *   ]
 * });
 * ```
 */
function hasLength(range) {
    return function (output) {
        const check = assertions_1.expect;
        return check(output).toHaveLength(range);
    };
}
|