@pauly4010/evalai-sdk 1.5.7 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +46 -1
- package/README.md +12 -3
- package/dist/assertions.d.ts +11 -11
- package/dist/assertions.js +1 -1
- package/dist/batch.d.ts +3 -3
- package/dist/batch.js +1 -1
- package/dist/cache.d.ts +3 -3
- package/dist/cache.js +1 -1
- package/dist/cli/baseline.d.ts +10 -0
- package/dist/cli/baseline.js +172 -0
- package/dist/cli/formatters/github.js +1 -1
- package/dist/cli/formatters/human.js +1 -1
- package/dist/cli/formatters/pr-comment.js +1 -1
- package/dist/cli/index.js +20 -4
- package/dist/cli/regression-gate.d.ts +11 -0
- package/dist/cli/regression-gate.js +150 -0
- package/dist/client.d.ts +3 -3
- package/dist/client.js +3 -2
- package/dist/client.request.test.d.ts +1 -0
- package/dist/client.request.test.js +157 -0
- package/dist/context.d.ts +4 -4
- package/dist/context.js +1 -1
- package/dist/errors.d.ts +5 -5
- package/dist/errors.js +21 -24
- package/dist/export.d.ts +1 -1
- package/dist/export.js +4 -2
- package/dist/index.d.ts +1 -0
- package/dist/index.js +7 -1
- package/dist/integrations/openai-eval.js +1 -1
- package/dist/logger.d.ts +10 -10
- package/dist/pagination.d.ts +2 -2
- package/dist/regression.d.ts +100 -0
- package/dist/regression.js +44 -0
- package/dist/snapshot.d.ts +3 -3
- package/dist/streaming.d.ts +4 -4
- package/dist/testing.d.ts +1 -1
- package/dist/types.d.ts +33 -33
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/dist/workflows.d.ts +29 -18
- package/package.json +7 -3
package/dist/client.d.ts
CHANGED
|
@@ -111,7 +111,7 @@ declare class TraceAPI {
|
|
|
111
111
|
* });
|
|
112
112
|
* ```
|
|
113
113
|
*/
|
|
114
|
-
create<TMetadata = Record<string,
|
|
114
|
+
create<TMetadata = Record<string, unknown>>(params: CreateTraceParams<TMetadata>): Promise<Trace<TMetadata>>;
|
|
115
115
|
/**
|
|
116
116
|
* List traces with optional filtering
|
|
117
117
|
*/
|
|
@@ -138,7 +138,7 @@ declare class TraceAPI {
|
|
|
138
138
|
* });
|
|
139
139
|
* ```
|
|
140
140
|
*/
|
|
141
|
-
update<TMetadata = Record<string,
|
|
141
|
+
update<TMetadata = Record<string, unknown>>(id: number, params: UpdateTraceParams<TMetadata>): Promise<Trace<TMetadata>>;
|
|
142
142
|
/**
|
|
143
143
|
* Create a span for a trace
|
|
144
144
|
*/
|
|
@@ -208,7 +208,7 @@ declare class LLMJudgeAPI {
|
|
|
208
208
|
*/
|
|
209
209
|
evaluate(params: RunLLMJudgeParams): Promise<{
|
|
210
210
|
result: LLMJudgeResult;
|
|
211
|
-
config:
|
|
211
|
+
config: unknown;
|
|
212
212
|
}>;
|
|
213
213
|
/**
|
|
214
214
|
* Create an LLM judge configuration
|
package/dist/client.js
CHANGED
|
@@ -94,11 +94,12 @@ class AIEvalClient {
|
|
|
94
94
|
results.push({ id: req.id, status: 200, data });
|
|
95
95
|
}
|
|
96
96
|
catch (err) {
|
|
97
|
+
const errorObj = err;
|
|
97
98
|
results.push({
|
|
98
99
|
id: req.id,
|
|
99
|
-
status:
|
|
100
|
+
status: errorObj?.statusCode || 500,
|
|
100
101
|
data: null,
|
|
101
|
-
error:
|
|
102
|
+
error: errorObj?.message || "Unknown error",
|
|
102
103
|
});
|
|
103
104
|
}
|
|
104
105
|
})();
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
const vitest_1 = require("vitest");
|
|
37
|
+
const client_1 = require("./client");
|
|
38
|
+
const errorsModule = __importStar(require("./errors"));
|
|
39
|
+
vitest_1.vi.mock("./cache", () => {
|
|
40
|
+
const cacheTracker = { invalidatedPatterns: [] };
|
|
41
|
+
const shouldCache = vitest_1.vi.fn().mockReturnValue(true);
|
|
42
|
+
const getTTL = vitest_1.vi.fn().mockReturnValue(1000);
|
|
43
|
+
const makeKey = (method, url, params) => `${method}:${url}:${JSON.stringify(params ?? null)}`;
|
|
44
|
+
return {
|
|
45
|
+
__esModule: true,
|
|
46
|
+
shouldCache,
|
|
47
|
+
getTTL,
|
|
48
|
+
cacheTracker,
|
|
49
|
+
RequestCache: class RequestCache {
|
|
50
|
+
constructor() {
|
|
51
|
+
this.store = new Map();
|
|
52
|
+
}
|
|
53
|
+
get(method, url, params) {
|
|
54
|
+
const key = makeKey(method, url, params);
|
|
55
|
+
return this.store.get(key) ?? null;
|
|
56
|
+
}
|
|
57
|
+
set(method, url, data, _ttl, params) {
|
|
58
|
+
const key = makeKey(method, url, params);
|
|
59
|
+
this.store.set(key, data);
|
|
60
|
+
}
|
|
61
|
+
invalidatePattern(pattern) {
|
|
62
|
+
cacheTracker.invalidatedPatterns.push(pattern);
|
|
63
|
+
}
|
|
64
|
+
invalidate(_method, _url, _params) {
|
|
65
|
+
// no-op for tests
|
|
66
|
+
}
|
|
67
|
+
clear() {
|
|
68
|
+
this.store.clear();
|
|
69
|
+
}
|
|
70
|
+
},
|
|
71
|
+
};
|
|
72
|
+
});
|
|
73
|
+
const cache_1 = require("./cache");
|
|
74
|
+
(0, vitest_1.describe)("AIEvalClient.request", () => {
|
|
75
|
+
(0, vitest_1.beforeEach)(() => {
|
|
76
|
+
process.env.EVALAI_API_KEY = "test";
|
|
77
|
+
cache_1.shouldCache.mockReset().mockReturnValue(true);
|
|
78
|
+
cache_1.getTTL.mockReset().mockReturnValue(1000);
|
|
79
|
+
cache_1.cacheTracker.invalidatedPatterns.length = 0;
|
|
80
|
+
});
|
|
81
|
+
(0, vitest_1.it)("caches GET responses and reuses data without re-fetching", async () => {
|
|
82
|
+
const client = new client_1.AIEvalClient({ apiKey: "test", baseUrl: "http://localhost", timeout: 1000 });
|
|
83
|
+
const payload = { items: [1, 2, 3] };
|
|
84
|
+
const fetchMock = vitest_1.vi.fn().mockResolvedValue({
|
|
85
|
+
ok: true,
|
|
86
|
+
status: 200,
|
|
87
|
+
json: async () => payload,
|
|
88
|
+
});
|
|
89
|
+
globalThis.fetch = fetchMock;
|
|
90
|
+
const first = await client.request("/api/traces", { method: "GET" });
|
|
91
|
+
const second = await client.request("/api/traces", { method: "GET" });
|
|
92
|
+
(0, vitest_1.expect)(first).toEqual(payload);
|
|
93
|
+
(0, vitest_1.expect)(second).toEqual(payload);
|
|
94
|
+
(0, vitest_1.expect)(fetchMock).toHaveBeenCalledTimes(1);
|
|
95
|
+
});
|
|
96
|
+
(0, vitest_1.it)("propagates non-ok responses as SDK errors", async () => {
|
|
97
|
+
const client = new client_1.AIEvalClient({ apiKey: "test", baseUrl: "http://localhost" });
|
|
98
|
+
const fetchMock = vitest_1.vi.fn().mockResolvedValue({
|
|
99
|
+
ok: false,
|
|
100
|
+
status: 429,
|
|
101
|
+
json: async () => ({ error: { code: "RATE_LIMIT_EXCEEDED" } }),
|
|
102
|
+
});
|
|
103
|
+
globalThis.fetch = fetchMock;
|
|
104
|
+
const createErrorSpy = vitest_1.vi
|
|
105
|
+
.spyOn(errorsModule, "createErrorFromResponse")
|
|
106
|
+
.mockReturnValue(new errorsModule.EvalAIError("rate limited", "RATE_LIMIT_EXCEEDED", 429));
|
|
107
|
+
await (0, vitest_1.expect)(client.request("/api/fail", { method: "GET" })).rejects.toHaveProperty("code", "RATE_LIMIT_EXCEEDED");
|
|
108
|
+
createErrorSpy.mockRestore();
|
|
109
|
+
});
|
|
110
|
+
(0, vitest_1.it)("retries on retryable SDK errors and eventually succeeds", async () => {
|
|
111
|
+
const client = new client_1.AIEvalClient({ apiKey: "test", baseUrl: "http://localhost", timeout: 1000 });
|
|
112
|
+
vitest_1.vi.spyOn(client, "calculateBackoff").mockReturnValue(0);
|
|
113
|
+
const failureResponse = {
|
|
114
|
+
ok: false,
|
|
115
|
+
status: 429,
|
|
116
|
+
json: async () => ({ error: { code: "RATE_LIMIT_EXCEEDED" } }),
|
|
117
|
+
};
|
|
118
|
+
const successResponse = {
|
|
119
|
+
ok: true,
|
|
120
|
+
status: 200,
|
|
121
|
+
json: async () => ({ ok: true }),
|
|
122
|
+
};
|
|
123
|
+
const createErrorSpy = vitest_1.vi
|
|
124
|
+
.spyOn(errorsModule, "createErrorFromResponse")
|
|
125
|
+
.mockReturnValue(new errorsModule.EvalAIError("rate limited", "RATE_LIMIT_EXCEEDED", 429));
|
|
126
|
+
const fetchMock = vitest_1.vi
|
|
127
|
+
.fn()
|
|
128
|
+
.mockResolvedValueOnce(failureResponse)
|
|
129
|
+
.mockResolvedValueOnce(successResponse);
|
|
130
|
+
globalThis.fetch = fetchMock;
|
|
131
|
+
const result = await client.request("/api/retry", { method: "GET" });
|
|
132
|
+
(0, vitest_1.expect)(result).toEqual({ ok: true });
|
|
133
|
+
(0, vitest_1.expect)(fetchMock).toHaveBeenCalledTimes(2);
|
|
134
|
+
createErrorSpy.mockRestore();
|
|
135
|
+
});
|
|
136
|
+
(0, vitest_1.it)("throws a TIMEOUT SDK error when fetch aborts", async () => {
|
|
137
|
+
const client = new client_1.AIEvalClient({ apiKey: "test", baseUrl: "http://localhost", timeout: 1000 });
|
|
138
|
+
const abortError = Object.assign(new Error("aborted"), { name: "AbortError" });
|
|
139
|
+
const fetchMock = vitest_1.vi.fn().mockRejectedValue(abortError);
|
|
140
|
+
globalThis.fetch = fetchMock;
|
|
141
|
+
await (0, vitest_1.expect)(client.request("/api/timeout", { method: "GET" })).rejects.toMatchObject({
|
|
142
|
+
code: "TIMEOUT",
|
|
143
|
+
});
|
|
144
|
+
});
|
|
145
|
+
(0, vitest_1.it)("invalidates related cache entries for mutation requests", async () => {
|
|
146
|
+
const client = new client_1.AIEvalClient({ apiKey: "test", baseUrl: "http://localhost", timeout: 1000 });
|
|
147
|
+
cache_1.shouldCache.mockReturnValue(false);
|
|
148
|
+
const fetchMock = vitest_1.vi.fn().mockResolvedValue({
|
|
149
|
+
ok: true,
|
|
150
|
+
status: 201,
|
|
151
|
+
json: async () => ({ result: "ok" }),
|
|
152
|
+
});
|
|
153
|
+
globalThis.fetch = fetchMock;
|
|
154
|
+
await client.request("/api/evaluations", { method: "POST", body: JSON.stringify({}) });
|
|
155
|
+
(0, vitest_1.expect)(cache_1.cacheTracker.invalidatedPatterns).toContain("evaluations");
|
|
156
|
+
});
|
|
157
|
+
});
|
package/dist/context.d.ts
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
* Context metadata that will be automatically injected
|
|
23
23
|
*/
|
|
24
24
|
export interface ContextMetadata {
|
|
25
|
-
[key: string]:
|
|
25
|
+
[key: string]: unknown;
|
|
26
26
|
}
|
|
27
27
|
/**
|
|
28
28
|
* Context manager for automatic metadata propagation
|
|
@@ -75,7 +75,7 @@ export declare class EvalContext {
|
|
|
75
75
|
*/
|
|
76
76
|
export declare function createContext(metadata: ContextMetadata): EvalContext;
|
|
77
77
|
/**
|
|
78
|
-
* Get the current context metadata (if
|
|
78
|
+
* Get the current context metadata (if unknown)
|
|
79
79
|
*
|
|
80
80
|
* @example
|
|
81
81
|
* ```typescript
|
|
@@ -98,7 +98,7 @@ export declare function getCurrentContext(): ContextMetadata | undefined;
|
|
|
98
98
|
* };
|
|
99
99
|
* ```
|
|
100
100
|
*/
|
|
101
|
-
export declare function mergeWithContext(metadata?: Record<string,
|
|
101
|
+
export declare function mergeWithContext(metadata?: Record<string, unknown>): Record<string, unknown>;
|
|
102
102
|
/**
|
|
103
103
|
* Run with nested context (merges parent context)
|
|
104
104
|
*
|
|
@@ -131,4 +131,4 @@ export declare function withContextSync<T>(metadata: ContextMetadata, fn: () =>
|
|
|
131
131
|
* }
|
|
132
132
|
* ```
|
|
133
133
|
*/
|
|
134
|
-
export declare function WithContext(metadata: ContextMetadata): (_target:
|
|
134
|
+
export declare function WithContext(metadata: ContextMetadata): (_target: unknown, _propertyKey: string, descriptor: PropertyDescriptor) => PropertyDescriptor;
|
package/dist/context.js
CHANGED
package/dist/errors.d.ts
CHANGED
|
@@ -42,14 +42,14 @@ export declare class EvalAIError extends Error {
|
|
|
42
42
|
/** Whether this error is retryable */
|
|
43
43
|
retryable: boolean;
|
|
44
44
|
/** Additional error details from the API */
|
|
45
|
-
details?:
|
|
45
|
+
details?: unknown;
|
|
46
46
|
/** When to retry (for rate limit errors) in seconds */
|
|
47
47
|
retryAfter?: number;
|
|
48
48
|
/** When the limit resets (for feature limit errors) */
|
|
49
49
|
resetAt?: Date;
|
|
50
50
|
/** Request ID from API (for correlation/debugging) */
|
|
51
51
|
requestId?: string;
|
|
52
|
-
constructor(message: string, code: string, statusCode: number, details?:
|
|
52
|
+
constructor(message: string, code: string, statusCode: number, details?: unknown);
|
|
53
53
|
/**
|
|
54
54
|
* Get formatted error message with solutions
|
|
55
55
|
*/
|
|
@@ -61,12 +61,12 @@ export declare class EvalAIError extends Error {
|
|
|
61
61
|
/**
|
|
62
62
|
* Convert to JSON for logging
|
|
63
63
|
*/
|
|
64
|
-
toJSON(): Record<string,
|
|
64
|
+
toJSON(): Record<string, unknown>;
|
|
65
65
|
}
|
|
66
66
|
/**
|
|
67
67
|
* Create an error from an HTTP response
|
|
68
68
|
*/
|
|
69
|
-
export declare function createErrorFromResponse(response: Response, data:
|
|
69
|
+
export declare function createErrorFromResponse(response: Response, data: unknown): EvalAIError;
|
|
70
70
|
export declare class RateLimitError extends EvalAIError {
|
|
71
71
|
constructor(message: string, retryAfter?: number);
|
|
72
72
|
}
|
|
@@ -74,7 +74,7 @@ export declare class AuthenticationError extends EvalAIError {
|
|
|
74
74
|
constructor(message?: string);
|
|
75
75
|
}
|
|
76
76
|
export declare class ValidationError extends EvalAIError {
|
|
77
|
-
constructor(message?: string, details?:
|
|
77
|
+
constructor(message?: string, details?: unknown);
|
|
78
78
|
}
|
|
79
79
|
export declare class NetworkError extends EvalAIError {
|
|
80
80
|
constructor(message?: string);
|
package/dist/errors.js
CHANGED
|
@@ -160,27 +160,21 @@ class EvalAIError extends Error {
|
|
|
160
160
|
this.code = code;
|
|
161
161
|
this.statusCode = statusCode;
|
|
162
162
|
this.details = details;
|
|
163
|
-
//
|
|
164
|
-
const
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
this.retryable = errorDoc.retryable;
|
|
169
|
-
}
|
|
170
|
-
else {
|
|
171
|
-
this.documentation = "https://docs.ai-eval-platform.com/errors";
|
|
172
|
-
this.solutions = ["Check the API documentation for more information"];
|
|
173
|
-
this.retryable = false;
|
|
174
|
-
}
|
|
163
|
+
// Initialize required properties from ERROR_DOCS
|
|
164
|
+
const doc = ERROR_DOCS[code];
|
|
165
|
+
this.documentation = doc?.documentation ?? `https://docs.ai-eval-platform.com/errors/${code}`;
|
|
166
|
+
this.solutions = doc?.solutions ?? ["Check the error details for more information"];
|
|
167
|
+
this.retryable = doc?.retryable ?? false;
|
|
175
168
|
// Extract retry-after for rate limits
|
|
176
|
-
|
|
177
|
-
|
|
169
|
+
const errorDetails = details;
|
|
170
|
+
if (code === "RATE_LIMIT_EXCEEDED" && errorDetails?.retryAfter) {
|
|
171
|
+
this.retryAfter = errorDetails.retryAfter;
|
|
178
172
|
}
|
|
179
173
|
// Extract reset time for feature limits
|
|
180
|
-
if (code === "FEATURE_LIMIT_REACHED" &&
|
|
181
|
-
this.resetAt = new Date(
|
|
174
|
+
if (code === "FEATURE_LIMIT_REACHED" && errorDetails?.resetAt) {
|
|
175
|
+
this.resetAt = new Date(errorDetails.resetAt);
|
|
182
176
|
}
|
|
183
|
-
this.requestId =
|
|
177
|
+
this.requestId = errorDetails?.error?.requestId ?? errorDetails?.requestId;
|
|
184
178
|
// Ensure proper prototype chain
|
|
185
179
|
Object.setPrototypeOf(this, EvalAIError.prototype);
|
|
186
180
|
}
|
|
@@ -234,14 +228,17 @@ exports.SDKError = EvalAIError;
|
|
|
234
228
|
*/
|
|
235
229
|
function createErrorFromResponse(response, data) {
|
|
236
230
|
const status = response.status;
|
|
237
|
-
const
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
const
|
|
231
|
+
const errorData = data;
|
|
232
|
+
const errObj = errorData?.error && typeof errorData.error === "object"
|
|
233
|
+
? errorData.error
|
|
234
|
+
: errorData;
|
|
235
|
+
let code = errObj?.code ?? errorData?.code ?? "UNKNOWN_ERROR";
|
|
236
|
+
const message = typeof errorData?.error === "string"
|
|
237
|
+
? errorData.error
|
|
238
|
+
: (errObj?.message ?? errorData?.message ?? response.statusText);
|
|
239
|
+
const requestId = errObj?.requestId ?? errorData?.requestId ?? response.headers.get("x-request-id") ?? undefined;
|
|
243
240
|
// Map HTTP status to error codes when code not in response
|
|
244
|
-
if (!errObj?.code && !
|
|
241
|
+
if (!errObj?.code && !errorData?.code) {
|
|
245
242
|
if (status === 401)
|
|
246
243
|
code = "UNAUTHORIZED";
|
|
247
244
|
else if (status === 403)
|
package/dist/export.d.ts
CHANGED
|
@@ -181,7 +181,7 @@ export declare function importFromFile(client: AIEvalClient, filePath: string, o
|
|
|
181
181
|
* });
|
|
182
182
|
* ```
|
|
183
183
|
*/
|
|
184
|
-
export declare function importFromLangSmith(client: AIEvalClient, langsmithData:
|
|
184
|
+
export declare function importFromLangSmith(client: AIEvalClient, langsmithData: unknown, options: ImportOptions): Promise<ImportResult>;
|
|
185
185
|
/**
|
|
186
186
|
* Convert export data to CSV format
|
|
187
187
|
*
|
package/dist/export.js
CHANGED
|
@@ -292,8 +292,10 @@ async function importFromLangSmith(client, langsmithData, options) {
|
|
|
292
292
|
traces: [],
|
|
293
293
|
};
|
|
294
294
|
// Transform runs to traces
|
|
295
|
-
|
|
296
|
-
|
|
295
|
+
const lsData = langsmithData;
|
|
296
|
+
if (lsData.runs && Array.isArray(lsData.runs)) {
|
|
297
|
+
transformedData.traces = lsData.runs.map((run) => ({
|
|
298
|
+
id: run.id || 0,
|
|
297
299
|
name: run.name || "Imported Trace",
|
|
298
300
|
traceId: run.id || `langsmith-${Date.now()}-${Math.random()}`,
|
|
299
301
|
organizationId: options.organizationId,
|
package/dist/index.d.ts
CHANGED
|
@@ -33,5 +33,6 @@ export { batchProcess, batchRead, RateLimiter, streamEvaluation } from "./stream
|
|
|
33
33
|
export type { Annotation, AnnotationItem, AnnotationTask, APIKey, APIKeyUsage, APIKeyWithSecret, BatchOptions, ClientConfig as AIEvalConfig, CreateAnnotationItemParams, CreateAnnotationParams, CreateAnnotationTaskParams, CreateAPIKeyParams, CreateLLMJudgeConfigParams, CreateWebhookParams, Evaluation as EvaluationData, ExportOptions, GenericMetadata as AnnotationData, GetLLMJudgeAlignmentParams, GetUsageParams, ImportOptions, ListAnnotationItemsParams, ListAnnotationsParams, ListAnnotationTasksParams, ListAPIKeysParams, ListLLMJudgeConfigsParams, ListLLMJudgeResultsParams, ListWebhookDeliveriesParams, ListWebhooksParams, LLMJudgeAlignment, LLMJudgeConfig, LLMJudgeResult as LLMJudgeData, Organization, RetryConfig, SnapshotData, Span as SpanData, StreamOptions, TestCase, TestResult, Trace as TraceData, TracedResponse, UpdateAPIKeyParams, UpdateWebhookParams, UsageStats, UsageSummary, Webhook, WebhookDelivery, } from "./types";
|
|
34
34
|
export { EvaluationTemplates, type EvaluationTemplateType, type FeatureUsage, type OrganizationLimits, } from "./types";
|
|
35
35
|
export { type AgentHandoff, type AgentSpanContext, type CostCategory, type CostRecord, createWorkflowTracer, type DecisionAlternative, type DecisionType, type HandoffType, type LLMProvider, type RecordCostParams, type RecordDecisionParams, traceAutoGen, traceCrewAI, traceLangChainAgent, traceWorkflowStep, type WorkflowContext, type WorkflowDefinition, type WorkflowEdge, type WorkflowNode, type WorkflowStatus, WorkflowTracer, type WorkflowTracerOptions, } from "./workflows";
|
|
36
|
+
export { ARTIFACTS, type Baseline, type BaselineTolerance, GATE_CATEGORY, GATE_EXIT, type GateCategory, type GateExitCode, type RegressionDelta, type RegressionReport, REPORT_SCHEMA_VERSION, } from "./regression";
|
|
36
37
|
import { AIEvalClient } from "./client";
|
|
37
38
|
export default AIEvalClient;
|
package/dist/index.js
CHANGED
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
*/
|
|
10
10
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
11
11
|
exports.extendExpectWithToPassGate = exports.Logger = exports.openAIChatEval = exports.traceOpenAI = exports.traceAnthropic = exports.runCheck = exports.parseArgs = exports.EXIT = exports.RequestCache = exports.CacheTTL = exports.RequestBatcher = exports.importData = exports.exportData = exports.compareSnapshots = exports.saveSnapshot = exports.compareWithSnapshot = exports.snapshot = exports.TestSuite = exports.createTestSuite = exports.ContextManager = exports.withContext = exports.getContext = exports.createContext = exports.withinRange = exports.similarTo = exports.respondedWithinTime = exports.notContainsPII = exports.matchesSchema = exports.matchesPattern = exports.isValidURL = exports.isValidEmail = exports.hasValidCodeSyntax = exports.hasSentiment = exports.hasReadabilityScore = exports.hasNoToxicity = exports.hasNoHallucinations = exports.hasLength = exports.hasFactualAccuracy = exports.followsInstructions = exports.expect = exports.containsLanguage = exports.containsKeywords = exports.containsJSON = exports.containsAllRequiredFields = exports.NetworkError = exports.ValidationError = exports.AuthenticationError = exports.RateLimitError = exports.EvalAIError = exports.AIEvalClient = void 0;
|
|
12
|
-
exports.WorkflowTracer = exports.traceWorkflowStep = exports.traceLangChainAgent = exports.traceCrewAI = exports.traceAutoGen = exports.createWorkflowTracer = exports.EvaluationTemplates = exports.streamEvaluation = exports.RateLimiter = exports.batchRead = exports.batchProcess = exports.PaginatedIterator = exports.encodeCursor = exports.decodeCursor = exports.createPaginatedIterator = exports.autoPaginate = void 0;
|
|
12
|
+
exports.REPORT_SCHEMA_VERSION = exports.GATE_EXIT = exports.GATE_CATEGORY = exports.ARTIFACTS = exports.WorkflowTracer = exports.traceWorkflowStep = exports.traceLangChainAgent = exports.traceCrewAI = exports.traceAutoGen = exports.createWorkflowTracer = exports.EvaluationTemplates = exports.streamEvaluation = exports.RateLimiter = exports.batchRead = exports.batchProcess = exports.PaginatedIterator = exports.encodeCursor = exports.decodeCursor = exports.createPaginatedIterator = exports.autoPaginate = void 0;
|
|
13
13
|
// Main SDK exports
|
|
14
14
|
var client_1 = require("./client");
|
|
15
15
|
Object.defineProperty(exports, "AIEvalClient", { enumerable: true, get: function () { return client_1.AIEvalClient; } });
|
|
@@ -117,6 +117,12 @@ Object.defineProperty(exports, "traceCrewAI", { enumerable: true, get: function
|
|
|
117
117
|
Object.defineProperty(exports, "traceLangChainAgent", { enumerable: true, get: function () { return workflows_1.traceLangChainAgent; } });
|
|
118
118
|
Object.defineProperty(exports, "traceWorkflowStep", { enumerable: true, get: function () { return workflows_1.traceWorkflowStep; } });
|
|
119
119
|
Object.defineProperty(exports, "WorkflowTracer", { enumerable: true, get: function () { return workflows_1.WorkflowTracer; } });
|
|
120
|
+
// Regression gate constants & types (v1.6.0)
|
|
121
|
+
var regression_1 = require("./regression");
|
|
122
|
+
Object.defineProperty(exports, "ARTIFACTS", { enumerable: true, get: function () { return regression_1.ARTIFACTS; } });
|
|
123
|
+
Object.defineProperty(exports, "GATE_CATEGORY", { enumerable: true, get: function () { return regression_1.GATE_CATEGORY; } });
|
|
124
|
+
Object.defineProperty(exports, "GATE_EXIT", { enumerable: true, get: function () { return regression_1.GATE_EXIT; } });
|
|
125
|
+
Object.defineProperty(exports, "REPORT_SCHEMA_VERSION", { enumerable: true, get: function () { return regression_1.REPORT_SCHEMA_VERSION; } });
|
|
120
126
|
// Default export for convenience
|
|
121
127
|
const client_2 = require("./client");
|
|
122
128
|
exports.default = client_2.AIEvalClient;
|
|
@@ -141,7 +141,7 @@ async function openAIChatEval(options) {
|
|
|
141
141
|
for (let i = 0; i < result.results.length; i++) {
|
|
142
142
|
const tcId = cases[i]?.testCaseId;
|
|
143
143
|
if (tcId == null) {
|
|
144
|
-
console.log("reportToEvalAI: All cases must have testCaseId when
|
|
144
|
+
console.log("reportToEvalAI: All cases must have testCaseId when unknown has it.");
|
|
145
145
|
return evalResult;
|
|
146
146
|
}
|
|
147
147
|
importResults.push({
|
package/dist/logger.d.ts
CHANGED
|
@@ -29,7 +29,7 @@ export interface LogEntry {
|
|
|
29
29
|
level: LogLevel;
|
|
30
30
|
message: string;
|
|
31
31
|
timestamp: string;
|
|
32
|
-
data?:
|
|
32
|
+
data?: unknown;
|
|
33
33
|
prefix?: string;
|
|
34
34
|
}
|
|
35
35
|
/**
|
|
@@ -41,31 +41,31 @@ export declare class Logger {
|
|
|
41
41
|
/**
|
|
42
42
|
* Log a trace message
|
|
43
43
|
*/
|
|
44
|
-
trace(message: string, data?:
|
|
44
|
+
trace(message: string, data?: unknown): void;
|
|
45
45
|
/**
|
|
46
46
|
* Log a debug message
|
|
47
47
|
*/
|
|
48
|
-
debug(message: string, data?:
|
|
48
|
+
debug(message: string, data?: unknown): void;
|
|
49
49
|
/**
|
|
50
50
|
* Log an info message
|
|
51
51
|
*/
|
|
52
|
-
info(message: string, data?:
|
|
52
|
+
info(message: string, data?: unknown): void;
|
|
53
53
|
/**
|
|
54
54
|
* Log a warning message
|
|
55
55
|
*/
|
|
56
|
-
warn(message: string, data?:
|
|
56
|
+
warn(message: string, data?: unknown): void;
|
|
57
57
|
/**
|
|
58
58
|
* Log an error message
|
|
59
59
|
*/
|
|
60
|
-
error(message: string, data?:
|
|
60
|
+
error(message: string, data?: unknown): void;
|
|
61
61
|
/**
|
|
62
62
|
* Log HTTP request
|
|
63
63
|
*/
|
|
64
|
-
logRequest(method: string, url: string, data?:
|
|
64
|
+
logRequest(method: string, url: string, data?: unknown): void;
|
|
65
65
|
/**
|
|
66
66
|
* Log HTTP response
|
|
67
67
|
*/
|
|
68
|
-
logResponse(method: string, url: string, status: number, duration: number, data?:
|
|
68
|
+
logResponse(method: string, url: string, status: number, duration: number, data?: unknown): void;
|
|
69
69
|
/**
|
|
70
70
|
* Create child logger with prefix
|
|
71
71
|
*/
|
|
@@ -112,7 +112,7 @@ export declare class RequestLogger {
|
|
|
112
112
|
method: string;
|
|
113
113
|
url: string;
|
|
114
114
|
headers?: Record<string, string>;
|
|
115
|
-
body?:
|
|
115
|
+
body?: unknown;
|
|
116
116
|
}): void;
|
|
117
117
|
/**
|
|
118
118
|
* Log response after receiving
|
|
@@ -123,6 +123,6 @@ export declare class RequestLogger {
|
|
|
123
123
|
status: number;
|
|
124
124
|
duration: number;
|
|
125
125
|
headers?: Record<string, string>;
|
|
126
|
-
body?:
|
|
126
|
+
body?: unknown;
|
|
127
127
|
}): void;
|
|
128
128
|
}
|
package/dist/pagination.d.ts
CHANGED
|
@@ -56,11 +56,11 @@ export declare function autoPaginate<T>(fetchFn: (offset: number, limit: number)
|
|
|
56
56
|
/**
|
|
57
57
|
* Encode cursor for pagination (base64)
|
|
58
58
|
*/
|
|
59
|
-
export declare function encodeCursor(data:
|
|
59
|
+
export declare function encodeCursor(data: unknown): string;
|
|
60
60
|
/**
|
|
61
61
|
* Decode cursor from base64
|
|
62
62
|
*/
|
|
63
|
-
export declare function decodeCursor(cursor: string):
|
|
63
|
+
export declare function decodeCursor(cursor: string): unknown;
|
|
64
64
|
/**
|
|
65
65
|
* Create pagination metadata from response
|
|
66
66
|
*/
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Regression gate constants and types.
|
|
3
|
+
*
|
|
4
|
+
* These mirror the contracts defined in scripts/regression-gate.ts
|
|
5
|
+
* and evals/schemas/regression-report.schema.json so that SDK consumers
|
|
6
|
+
* can programmatically inspect gate results without parsing strings.
|
|
7
|
+
*
|
|
8
|
+
* @packageDocumentation
|
|
9
|
+
*/
|
|
10
|
+
/** Exit codes emitted by `evalai gate` / `scripts/regression-gate.ts`. */
|
|
11
|
+
export declare const GATE_EXIT: {
|
|
12
|
+
/** Gate passed — no regressions detected */
|
|
13
|
+
readonly PASS: 0;
|
|
14
|
+
/** One or more regression thresholds exceeded */
|
|
15
|
+
readonly REGRESSION: 1;
|
|
16
|
+
/** Infrastructure error (baseline missing, summary missing, etc.) */
|
|
17
|
+
readonly INFRA_ERROR: 2;
|
|
18
|
+
/** Confidence tests failed (test suite red) */
|
|
19
|
+
readonly CONFIDENCE_FAILED: 3;
|
|
20
|
+
/** Confidence summary file missing (test infra crashed) */
|
|
21
|
+
readonly CONFIDENCE_MISSING: 4;
|
|
22
|
+
};
|
|
23
|
+
export type GateExitCode = (typeof GATE_EXIT)[keyof typeof GATE_EXIT];
|
|
24
|
+
/** Categories written to regression-report.json `category` field. */
|
|
25
|
+
export declare const GATE_CATEGORY: {
|
|
26
|
+
readonly PASS: "pass";
|
|
27
|
+
readonly REGRESSION: "regression";
|
|
28
|
+
readonly INFRA_ERROR: "infra_error";
|
|
29
|
+
};
|
|
30
|
+
export type GateCategory = (typeof GATE_CATEGORY)[keyof typeof GATE_CATEGORY];
|
|
31
|
+
/** Current schema version for regression-report.json. */
|
|
32
|
+
export declare const REPORT_SCHEMA_VERSION = 1;
|
|
33
|
+
export interface RegressionDelta {
|
|
34
|
+
metric: string;
|
|
35
|
+
baseline: number | string;
|
|
36
|
+
current: number | string;
|
|
37
|
+
delta: string;
|
|
38
|
+
status: "pass" | "fail";
|
|
39
|
+
}
|
|
40
|
+
export interface RegressionReport {
|
|
41
|
+
schemaVersion: number;
|
|
42
|
+
timestamp: string;
|
|
43
|
+
exitCode: GateExitCode;
|
|
44
|
+
category: GateCategory;
|
|
45
|
+
passed: boolean;
|
|
46
|
+
failures: string[];
|
|
47
|
+
deltas: RegressionDelta[];
|
|
48
|
+
}
|
|
49
|
+
export interface BaselineTolerance {
|
|
50
|
+
scoreDrop: number;
|
|
51
|
+
passRateDrop: number;
|
|
52
|
+
maxLatencyIncreaseMs: number;
|
|
53
|
+
maxCostIncreaseUsd: number;
|
|
54
|
+
}
|
|
55
|
+
export interface Baseline {
|
|
56
|
+
schemaVersion: number;
|
|
57
|
+
description: string;
|
|
58
|
+
generatedAt: string;
|
|
59
|
+
generatedBy: string;
|
|
60
|
+
commitSha: string;
|
|
61
|
+
updatedAt: string;
|
|
62
|
+
updatedBy: string;
|
|
63
|
+
tolerance: BaselineTolerance;
|
|
64
|
+
goldenEval: {
|
|
65
|
+
score: number;
|
|
66
|
+
passRate: number;
|
|
67
|
+
totalCases: number;
|
|
68
|
+
passedCases: number;
|
|
69
|
+
};
|
|
70
|
+
qualityScore: {
|
|
71
|
+
overall: number;
|
|
72
|
+
grade: string;
|
|
73
|
+
accuracy: number;
|
|
74
|
+
safety: number;
|
|
75
|
+
latency: number;
|
|
76
|
+
cost: number;
|
|
77
|
+
consistency: number;
|
|
78
|
+
};
|
|
79
|
+
confidenceTests: {
|
|
80
|
+
unitPassed: boolean;
|
|
81
|
+
unitTotal: number;
|
|
82
|
+
dbPassed: boolean;
|
|
83
|
+
dbTotal: number;
|
|
84
|
+
};
|
|
85
|
+
productMetrics: {
|
|
86
|
+
p95ApiLatencyMs?: number;
|
|
87
|
+
goldenCostUsd?: number;
|
|
88
|
+
};
|
|
89
|
+
qualityMetrics?: {
|
|
90
|
+
unitLaneDurationMs?: number;
|
|
91
|
+
dbLaneDurationMs?: number;
|
|
92
|
+
};
|
|
93
|
+
}
|
|
94
|
+
/** Well-known artifact paths relative to project root. */
|
|
95
|
+
export declare const ARTIFACTS: {
|
|
96
|
+
readonly BASELINE: "evals/baseline.json";
|
|
97
|
+
readonly REGRESSION_REPORT: "evals/regression-report.json";
|
|
98
|
+
readonly CONFIDENCE_SUMMARY: "evals/confidence-summary.json";
|
|
99
|
+
readonly LATENCY_BENCHMARK: "evals/latency-benchmark.json";
|
|
100
|
+
};
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Regression gate constants and types.
|
|
4
|
+
*
|
|
5
|
+
* These mirror the contracts defined in scripts/regression-gate.ts
|
|
6
|
+
* and evals/schemas/regression-report.schema.json so that SDK consumers
|
|
7
|
+
* can programmatically inspect gate results without parsing strings.
|
|
8
|
+
*
|
|
9
|
+
* @packageDocumentation
|
|
10
|
+
*/
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.ARTIFACTS = exports.REPORT_SCHEMA_VERSION = exports.GATE_CATEGORY = exports.GATE_EXIT = void 0;
|
|
13
|
+
// ── Exit codes ──
|
|
14
|
+
/** Exit codes emitted by `evalai gate` / `scripts/regression-gate.ts`. */
|
|
15
|
+
exports.GATE_EXIT = {
|
|
16
|
+
/** Gate passed — no regressions detected */
|
|
17
|
+
PASS: 0,
|
|
18
|
+
/** One or more regression thresholds exceeded */
|
|
19
|
+
REGRESSION: 1,
|
|
20
|
+
/** Infrastructure error (baseline missing, summary missing, etc.) */
|
|
21
|
+
INFRA_ERROR: 2,
|
|
22
|
+
/** Confidence tests failed (test suite red) */
|
|
23
|
+
CONFIDENCE_FAILED: 3,
|
|
24
|
+
/** Confidence summary file missing (test infra crashed) */
|
|
25
|
+
CONFIDENCE_MISSING: 4,
|
|
26
|
+
};
|
|
27
|
+
// ── Report categories ──
|
|
28
|
+
/** Categories written to regression-report.json `category` field. */
|
|
29
|
+
exports.GATE_CATEGORY = {
|
|
30
|
+
PASS: "pass",
|
|
31
|
+
REGRESSION: "regression",
|
|
32
|
+
INFRA_ERROR: "infra_error",
|
|
33
|
+
};
|
|
34
|
+
// ── Schema version ──
|
|
35
|
+
/** Current schema version for regression-report.json. */
|
|
36
|
+
exports.REPORT_SCHEMA_VERSION = 1;
|
|
37
|
+
// ── Artifact paths ──
|
|
38
|
+
/** Well-known artifact paths relative to project root. */
|
|
39
|
+
exports.ARTIFACTS = {
|
|
40
|
+
BASELINE: "evals/baseline.json",
|
|
41
|
+
REGRESSION_REPORT: "evals/regression-report.json",
|
|
42
|
+
CONFIDENCE_SUMMARY: "evals/confidence-summary.json",
|
|
43
|
+
LATENCY_BENCHMARK: "evals/latency-benchmark.json",
|
|
44
|
+
};
|