elasticdash-test 0.1.26 → 0.1.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +100 -0
- package/dist/cli.js +175 -0
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +62 -1
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/tool-registry.d.ts +31 -0
- package/dist/tool-registry.d.ts.map +1 -0
- package/dist/tool-registry.js +73 -0
- package/dist/tool-registry.js.map +1 -0
- package/dist/tool-runner-worker.js +19 -2
- package/dist/tool-runner-worker.js.map +1 -1
- package/dist/utils/debug.d.ts +1 -1
- package/dist/utils/debug.d.ts.map +1 -1
- package/dist/utils/debug.js +2 -2
- package/dist/utils/debug.js.map +1 -1
- package/docs/observability_contract.md +192 -0
- package/package.json +2 -2
- package/src/cli.ts +184 -0
- package/src/index.ts +4 -0
- package/src/tool-registry.ts +94 -0
- package/src/tool-runner-worker.ts +17 -2
- package/src/utils/debug.ts +2 -2
- package/dist/cloud-client.d.ts +0 -34
- package/dist/cloud-client.d.ts.map +0 -1
- package/dist/cloud-client.js +0 -103
- package/dist/cloud-client.js.map +0 -1
- package/dist/evaluators/determinism.d.ts +0 -3
- package/dist/evaluators/determinism.d.ts.map +0 -1
- package/dist/evaluators/determinism.js +0 -116
- package/dist/evaluators/determinism.js.map +0 -1
- package/dist/evaluators/index.d.ts +0 -4
- package/dist/evaluators/index.d.ts.map +0 -1
- package/dist/evaluators/index.js +0 -61
- package/dist/evaluators/index.js.map +0 -1
- package/dist/evaluators/latency-budget.d.ts +0 -3
- package/dist/evaluators/latency-budget.d.ts.map +0 -1
- package/dist/evaluators/latency-budget.js +0 -45
- package/dist/evaluators/latency-budget.js.map +0 -1
- package/dist/evaluators/llm-judge.d.ts +0 -3
- package/dist/evaluators/llm-judge.d.ts.map +0 -1
- package/dist/evaluators/llm-judge.js +0 -125
- package/dist/evaluators/llm-judge.js.map +0 -1
- package/dist/evaluators/output-contains.d.ts +0 -3
- package/dist/evaluators/output-contains.d.ts.map +0 -1
- package/dist/evaluators/output-contains.js +0 -52
- package/dist/evaluators/output-contains.js.map +0 -1
- package/dist/evaluators/output-schema.d.ts +0 -3
- package/dist/evaluators/output-schema.d.ts.map +0 -1
- package/dist/evaluators/output-schema.js +0 -58
- package/dist/evaluators/output-schema.js.map +0 -1
- package/dist/evaluators/token-budget.d.ts +0 -3
- package/dist/evaluators/token-budget.d.ts.map +0 -1
- package/dist/evaluators/token-budget.js +0 -45
- package/dist/evaluators/token-budget.js.map +0 -1
- package/dist/evaluators/types.d.ts +0 -104
- package/dist/evaluators/types.d.ts.map +0 -1
- package/dist/evaluators/types.js +0 -6
- package/dist/evaluators/types.js.map +0 -1
- package/dist/test-group/cli.d.ts +0 -8
- package/dist/test-group/cli.d.ts.map +0 -1
- package/dist/test-group/cli.js +0 -162
- package/dist/test-group/cli.js.map +0 -1
- package/dist/test-group/git-context.d.ts +0 -3
- package/dist/test-group/git-context.d.ts.map +0 -1
- package/dist/test-group/git-context.js +0 -59
- package/dist/test-group/git-context.js.map +0 -1
- package/dist/test-group/reporter.d.ts +0 -4
- package/dist/test-group/reporter.d.ts.map +0 -1
- package/dist/test-group/reporter.js +0 -54
- package/dist/test-group/reporter.js.map +0 -1
- package/dist/test-group/runner.d.ts +0 -18
- package/dist/test-group/runner.d.ts.map +0 -1
- package/dist/test-group/runner.js +0 -234
- package/dist/test-group/runner.js.map +0 -1
- package/dist/tracing-universal.d.ts +0 -13
- package/dist/tracing-universal.d.ts.map +0 -1
- package/dist/tracing-universal.js +0 -33
- package/dist/tracing-universal.js.map +0 -1
- package/docs/backend_rerun_alignment.md +0 -291
- package/docs/backend_traceid_update.md +0 -141
- package/docs/observability_backend_contract.md +0 -577
- package/docs/observability_rerun_backend_plan.md +0 -596
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Global registry for tools defined via edTool().
|
|
3
|
+
*
|
|
4
|
+
* Tools registered here are discoverable for reruns regardless of which
|
|
5
|
+
* module they live in. The helper also applies wrapTool() so telemetry
|
|
6
|
+
* fires automatically — symmetric with the Python @ed_tool decorator.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { wrapTool } from './interceptors/tool.js'
|
|
10
|
+
|
|
11
|
+
export interface RegisteredTool {
|
|
12
|
+
name: string
|
|
13
|
+
fn: (...args: unknown[]) => unknown | Promise<unknown>
|
|
14
|
+
wrapped: (...args: unknown[]) => Promise<unknown>
|
|
15
|
+
isAsync: boolean
|
|
16
|
+
signature: string
|
|
17
|
+
sourceFile: string | null
|
|
18
|
+
lineNumber: number | null
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
const _registry: Map<string, RegisteredTool> = new Map()
|
|
22
|
+
|
|
23
|
+
export function getRegisteredTools(): RegisteredTool[] {
|
|
24
|
+
return Array.from(_registry.values())
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export function getRegisteredTool(name: string): RegisteredTool | undefined {
|
|
28
|
+
return _registry.get(name)
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
export function clearToolRegistry(): void {
|
|
32
|
+
_registry.clear()
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
/**
 * Best-effort lookup of the source location that invoked the registering
 * function, i.e. the caller of whichever function called this helper.
 *
 * Works by parsing the textual `Error().stack`. Only V8-style frames
 * (`at fn (file:line:col)` and `at file:line:col`) are recognised; on engines
 * with a different stack format, or when no stack is available, both fields
 * come back as `null`.
 *
 * The first two parseable frames are this helper and its direct caller, so
 * they are skipped by position rather than by file name. This keeps the
 * result correct when the SDK is bundled into a single file, and does not
 * discard user files whose path happens to contain the SDK module's name.
 *
 * @returns The caller's file path (with any `file://` URL prefix removed and
 *   percent-escapes decoded) and 1-based line number, or `null` for whichever
 *   could not be determined.
 */
function inferCallerLocation(): { file: string | null; line: number | null } {
  const unknownLocation = { file: null, line: null }
  const stack = new Error().stack
  if (!stack) return unknownLocation

  // `at fn (/path/file.js:10:5)`: the location sits inside the trailing parentheses.
  // Tried first so the function name never leaks into the captured path.
  const namedFrame = /\((.+):(\d+):\d+\)\s*$/
  // `at /path/file.js:10:5` or `at file:///path/file.js:10:5`: anonymous frame.
  const bareFrame = /^\s*at\s+(?:async\s+)?(.+):(\d+):\d+\s*$/
  // Frames to ignore: this helper itself, then the function that called it.
  const framesToSkip = 2

  let framesSeen = 0
  for (const rawLine of stack.split('\n')) {
    const frame = namedFrame.exec(rawLine) ?? bareFrame.exec(rawLine)
    if (!frame) continue // header line or a frame without a file location
    framesSeen++
    if (framesSeen <= framesToSkip) continue

    let file = frame[1] ?? ''
    if (!file) continue
    if (file.startsWith('file://')) {
      file = file.slice('file://'.length)
      try {
        // ESM frames are URLs, so spaces etc. arrive percent-encoded.
        file = decodeURIComponent(file)
      } catch {
        // Malformed escape sequence: keep the raw text rather than fail.
      }
      // `file:///C:/dir/x.js` leaves `/C:/dir/x.js`; drop the slash before the drive letter.
      if (/^\/[A-Za-z]:[\\/]/.test(file)) file = file.slice(1)
    }

    const line = Number.parseInt(frame[2] ?? '', 10)
    return { file, line: Number.isNaN(line) ? null : line }
  }
  return unknownLocation
}
|
|
50
|
+
|
|
51
|
+
/**
 * Splits `text` on every `separator` character that sits outside brackets
 * (`()`, `[]`, `{}`) and outside string/template literals.
 *
 * When `terminator` is given, scanning stops at the first un-nested
 * occurrence of it and the pieces collected so far are returned; if the
 * terminator is never found the result is `null`.
 */
function splitOutsideNesting(text: string, separator: string, terminator?: string): string[] | null {
  const parts: string[] = []
  let depth = 0
  let quote = ''
  let partStart = 0
  for (let i = 0; i < text.length; i++) {
    const ch = text.charAt(i)
    if (quote) {
      if (ch === '\\') i++ // skip the escaped character
      else if (ch === quote) quote = ''
      continue
    }
    if (ch === '"' || ch === "'" || ch === '`') {
      quote = ch
      continue
    }
    if (depth === 0 && terminator !== undefined && ch === terminator) {
      parts.push(text.slice(partStart, i))
      return parts
    }
    if (ch === '(' || ch === '[' || ch === '{') depth++
    else if (ch === ')' || ch === ']' || ch === '}') depth--
    else if (depth === 0 && ch === separator) {
      parts.push(text.slice(partStart, i))
      partStart = i + 1
    }
  }
  if (terminator !== undefined) return null
  parts.push(text.slice(partStart))
  return parts
}

/**
 * Derives a human-readable parameter list such as `(query, limit, ...rest)`
 * from a function's source text (`fn.toString()`).
 *
 * Default values and type annotations are dropped; rest parameters keep their
 * `...` prefix; destructured parameters are shown as their pattern with
 * whitespace collapsed. Handles parenthesis-free arrow functions
 * (`x => ...`, `async x => ...`) and defaults that themselves contain
 * parentheses or commas.
 *
 * @param fn Function to inspect.
 * @returns The parameter list wrapped in parentheses, or `'()'` when there
 *   are no parameters or the source cannot be parsed (e.g. native functions).
 */
function inferSignature(fn: Function): string {
  const src = fn.toString()
  const arrowAt = src.indexOf('=>')
  const parenAt = src.indexOf('(')

  // An arrow appearing before any parenthesis means a bare single parameter.
  if (arrowAt !== -1 && (parenAt === -1 || arrowAt < parenAt)) {
    const bare = src.slice(0, arrowAt).trim().replace(/^async\s+/, '')
    return bare ? `(${bare})` : '()'
  }
  if (parenAt === -1) return '()'

  // Everything after the first "(" up to its matching ")" is the parameter list.
  const rawParams = splitOutsideNesting(src.slice(parenAt + 1), ',', ')')
  if (rawParams === null) return '()'

  const params = rawParams
    .map(raw => {
      // Drop a default value: keep only what precedes the first un-nested "=".
      const beforeDefault = (splitOutsideNesting(raw, '=') ?? [raw])[0] ?? ''
      const param = beforeDefault.trim()
      // Destructuring patterns have no single name; show the pattern itself.
      if (/^(?:\.\.\.)?\s*[{[]/.test(param)) return param.replace(/\s+/g, ' ')
      return param.split(/[\s:]/)[0] ?? ''
    })
    .filter(name => name.length > 0)
  return `(${params.join(', ')})`
}
|
|
61
|
+
|
|
62
|
+
/**
 * Register a function as a rerunnable tool and return its telemetry-wrapped
 * counterpart.
 *
 *   export const myTool = edTool('my_tool', async (query: string) => { ... })
 *
 * The returned function is the `wrapTool()` version, so calling it from
 * normal code paths still produces traces. The registry entry keeps both the
 * original (`fn`) and the wrapped function (`wrapped`), so lookups by name via
 * `getRegisteredTool()` can choose either; the `run-tool` worker invokes the
 * wrapped one.
 *
 * Registering a second tool under an existing name replaces the earlier
 * entry.
 *
 * @param name Unique, non-empty name the tool is registered and looked up by.
 * @param fn The tool implementation (sync or async).
 * @returns The telemetry-wrapped function.
 * @throws TypeError If `name` is not a non-empty string or `fn` is not a function.
 */
export function edTool<Args extends unknown[], R>(
  name: string,
  fn: (...args: Args) => R | Promise<R>,
): (...args: Args) => Promise<R> {
  // Fail early with a clear message instead of an opaque property-access
  // error further down (or an unreachable registry entry keyed by '').
  if (typeof name !== 'string' || name.length === 0) {
    throw new TypeError('edTool(): "name" must be a non-empty string')
  }
  if (typeof fn !== 'function') {
    throw new TypeError(`edTool("${name}"): "fn" must be a function`)
  }

  // True only for native async functions; async code down-levelled by a
  // transpiler is an ordinary function and is reported as not async.
  const isAsync = fn.constructor.name === 'AsyncFunction'
  const signature = inferSignature(fn)
  // Must be called directly from edTool so the captured stack points at edTool's caller.
  const { file, line } = inferCallerLocation()

  const wrapped = wrapTool(name, fn as (...args: unknown[]) => Promise<R>) as unknown as (...args: Args) => Promise<R>

  _registry.set(name, {
    name,
    fn: fn as RegisteredTool['fn'],
    wrapped: wrapped as RegisteredTool['wrapped'],
    isAsync,
    signature,
    sourceFile: file,
    lineNumber: line,
  })

  return wrapped
}
|
|
93
|
+
|
|
94
|
+
export const defineTool = edTool
|
|
@@ -205,9 +205,24 @@ async function main() {
|
|
|
205
205
|
originalExit(1)
|
|
206
206
|
return
|
|
207
207
|
}
|
|
208
|
-
|
|
208
|
+
|
|
209
|
+
// Registry first: covers tools defined via edTool() anywhere in the project,
|
|
210
|
+
// as long as their containing module is reachable from toolsModulePath's
|
|
211
|
+
// import graph. Falls back to ed_tools-style module export lookup.
|
|
212
|
+
let fn: ((...a: unknown[]) => unknown) | undefined
|
|
213
|
+
try {
|
|
214
|
+
const reg = await import('./tool-registry.js')
|
|
215
|
+
const registered = reg.getRegisteredTool(toolName)
|
|
216
|
+
if (registered) fn = registered.wrapped
|
|
217
|
+
} catch {
|
|
218
|
+
// Registry module not available (older SDK build); fall through to export lookup.
|
|
219
|
+
}
|
|
220
|
+
if (!fn) {
|
|
221
|
+
const exported = mod[toolName]
|
|
222
|
+
if (typeof exported === 'function') fn = exported
|
|
223
|
+
}
|
|
209
224
|
if (typeof fn !== 'function') {
|
|
210
|
-
await writeResult({ ok: false, error: `"${toolName}"
|
|
225
|
+
await writeResult({ ok: false, error: `"${toolName}" not found via edTool() registry or as an exported function in the module.` })
|
|
211
226
|
originalExit(1)
|
|
212
227
|
return
|
|
213
228
|
}
|
package/src/utils/debug.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
const DEBUG_KEY = 'ELASTICDASH_DEBUG'
|
|
2
2
|
|
|
3
|
-
/** Log only when ELASTICDASH_DEBUG=1 is set.
|
|
3
|
+
/** Log only when ELASTICDASH_DEBUG=1 is set. Writes to stderr so callers parsing stdout (e.g. `elasticdash run-tool`) get a clean JSON channel. */
|
|
4
4
|
export function debugLog(...args: unknown[]): void {
|
|
5
5
|
if (typeof process !== 'undefined' && process.env?.[DEBUG_KEY] === '1') {
|
|
6
|
-
console.
|
|
6
|
+
console.error(...args)
|
|
7
7
|
}
|
|
8
8
|
}
|
package/dist/cloud-client.d.ts
DELETED
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Cloud API client — communicates with the ElasticDash backend.
|
|
3
|
-
* Used by the test-group CLI to fetch definitions and push results.
|
|
4
|
-
*/
|
|
5
|
-
import type { TestGroup, TestGroupRunResult } from './evaluators/types.js';
|
|
6
|
-
export interface CloudClientConfig {
|
|
7
|
-
apiUrl: string;
|
|
8
|
-
apiKey: string;
|
|
9
|
-
projectId: number;
|
|
10
|
-
}
|
|
11
|
-
export interface FetchFilters {
|
|
12
|
-
workflowName?: string;
|
|
13
|
-
tags?: string[];
|
|
14
|
-
}
|
|
15
|
-
export declare function fetchTestGroups(config: CloudClientConfig, filters?: FetchFilters): Promise<TestGroup[]>;
|
|
16
|
-
export declare function pushRunResult(config: CloudClientConfig, testGroupId: number, result: TestGroupRunResult): Promise<{
|
|
17
|
-
id: number;
|
|
18
|
-
}>;
|
|
19
|
-
export declare function pushBatchResult(config: CloudClientConfig, batch: {
|
|
20
|
-
testGroupRunIds: number[];
|
|
21
|
-
gitBranch?: string;
|
|
22
|
-
gitCommit?: string;
|
|
23
|
-
passed?: boolean;
|
|
24
|
-
summary?: string;
|
|
25
|
-
}): Promise<{
|
|
26
|
-
id: number;
|
|
27
|
-
}>;
|
|
28
|
-
export declare function exportTestGroups(config: CloudClientConfig): Promise<TestGroup[]>;
|
|
29
|
-
export declare function resolveCloudConfig(options: {
|
|
30
|
-
apiUrl?: string;
|
|
31
|
-
apiKey?: string;
|
|
32
|
-
projectId?: string | number;
|
|
33
|
-
}): CloudClientConfig;
|
|
34
|
-
//# sourceMappingURL=cloud-client.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"cloud-client.d.ts","sourceRoot":"","sources":["../src/cloud-client.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,SAAS,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAA;AAE1E,MAAM,WAAW,iBAAiB;IAChC,MAAM,EAAE,MAAM,CAAA;IACd,MAAM,EAAE,MAAM,CAAA;IACd,SAAS,EAAE,MAAM,CAAA;CAClB;AAED,MAAM,WAAW,YAAY;IAC3B,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,IAAI,CAAC,EAAE,MAAM,EAAE,CAAA;CAChB;AAyED,wBAAsB,eAAe,CACnC,MAAM,EAAE,iBAAiB,EACzB,OAAO,CAAC,EAAE,YAAY,GACrB,OAAO,CAAC,SAAS,EAAE,CAAC,CAKtB;AAED,wBAAsB,aAAa,CACjC,MAAM,EAAE,iBAAiB,EACzB,WAAW,EAAE,MAAM,EACnB,MAAM,EAAE,kBAAkB,GACzB,OAAO,CAAC;IAAE,EAAE,EAAE,MAAM,CAAA;CAAE,CAAC,CAMzB;AAED,wBAAsB,eAAe,CACnC,MAAM,EAAE,iBAAiB,EACzB,KAAK,EAAE;IACL,eAAe,EAAE,MAAM,EAAE,CAAA;IACzB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,MAAM,CAAC,EAAE,OAAO,CAAA;IAChB,OAAO,CAAC,EAAE,MAAM,CAAA;CACjB,GACA,OAAO,CAAC;IAAE,EAAE,EAAE,MAAM,CAAA;CAAE,CAAC,CAMzB;AAED,wBAAsB,gBAAgB,CACpC,MAAM,EAAE,iBAAiB,GACxB,OAAO,CAAC,SAAS,EAAE,CAAC,CAItB;AAED,wBAAgB,kBAAkB,CAAC,OAAO,EAAE;IAC1C,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,SAAS,CAAC,EAAE,MAAM,GAAG,MAAM,CAAA;CAC5B,GAAG,iBAAiB,CAyBpB"}
|
package/dist/cloud-client.js
DELETED
|
@@ -1,103 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Cloud API client — communicates with the ElasticDash backend.
|
|
3
|
-
* Used by the test-group CLI to fetch definitions and push results.
|
|
4
|
-
*/
|
|
5
|
-
class CloudClientError extends Error {
|
|
6
|
-
statusCode;
|
|
7
|
-
responseBody;
|
|
8
|
-
constructor(message, statusCode, responseBody) {
|
|
9
|
-
super(message);
|
|
10
|
-
this.statusCode = statusCode;
|
|
11
|
-
this.responseBody = responseBody;
|
|
12
|
-
this.name = 'CloudClientError';
|
|
13
|
-
}
|
|
14
|
-
}
|
|
15
|
-
async function request(url, apiKey, options = {}) {
|
|
16
|
-
const { method = 'GET', body, retries = 2 } = options;
|
|
17
|
-
let lastError;
|
|
18
|
-
for (let attempt = 0; attempt <= retries; attempt++) {
|
|
19
|
-
if (attempt > 0) {
|
|
20
|
-
const delayMs = Math.min(1000 * 2 ** (attempt - 1), 5000);
|
|
21
|
-
await new Promise((r) => setTimeout(r, delayMs));
|
|
22
|
-
}
|
|
23
|
-
try {
|
|
24
|
-
const res = await fetch(url, {
|
|
25
|
-
method,
|
|
26
|
-
headers: {
|
|
27
|
-
'Authorization': `Bearer ${apiKey}`,
|
|
28
|
-
'Content-Type': 'application/json',
|
|
29
|
-
},
|
|
30
|
-
body: body ? JSON.stringify(body) : undefined,
|
|
31
|
-
signal: AbortSignal.timeout(30_000),
|
|
32
|
-
});
|
|
33
|
-
if (res.status === 401) {
|
|
34
|
-
throw new CloudClientError('Authentication failed. Check that your ELASTICDASH_API_KEY is valid and active.', 401);
|
|
35
|
-
}
|
|
36
|
-
if (!res.ok) {
|
|
37
|
-
const text = await res.text().catch(() => '');
|
|
38
|
-
throw new CloudClientError(`API request failed: ${res.status} ${res.statusText}`, res.status, text);
|
|
39
|
-
}
|
|
40
|
-
const json = await res.json();
|
|
41
|
-
// The backend wraps responses in { status, data } via generalApiResponseSender
|
|
42
|
-
return (json.data !== undefined ? json.data : json);
|
|
43
|
-
}
|
|
44
|
-
catch (err) {
|
|
45
|
-
lastError = err;
|
|
46
|
-
// Don't retry auth errors or client errors
|
|
47
|
-
if (err instanceof CloudClientError && err.statusCode && err.statusCode < 500) {
|
|
48
|
-
throw err;
|
|
49
|
-
}
|
|
50
|
-
// Retry on network/timeout/5xx errors
|
|
51
|
-
if (attempt === retries) {
|
|
52
|
-
throw err;
|
|
53
|
-
}
|
|
54
|
-
}
|
|
55
|
-
}
|
|
56
|
-
throw lastError ?? new Error('Request failed');
|
|
57
|
-
}
|
|
58
|
-
export async function fetchTestGroups(config, filters) {
|
|
59
|
-
const url = new URL(`${config.apiUrl}/api/testgroups/by-project/${encodeURIComponent(String(config.projectId))}`);
|
|
60
|
-
if (filters?.workflowName)
|
|
61
|
-
url.searchParams.set('workflowName', filters.workflowName);
|
|
62
|
-
if (filters?.tags?.length)
|
|
63
|
-
url.searchParams.set('tags', filters.tags.join(','));
|
|
64
|
-
return request(url.toString(), config.apiKey);
|
|
65
|
-
}
|
|
66
|
-
export async function pushRunResult(config, testGroupId, result) {
|
|
67
|
-
const url = `${config.apiUrl}/api/testgroups/${encodeURIComponent(String(testGroupId))}/runs`;
|
|
68
|
-
return request(url, config.apiKey, {
|
|
69
|
-
method: 'POST',
|
|
70
|
-
body: result,
|
|
71
|
-
});
|
|
72
|
-
}
|
|
73
|
-
export async function pushBatchResult(config, batch) {
|
|
74
|
-
const url = `${config.apiUrl}/api/testgroups/batches`;
|
|
75
|
-
return request(url, config.apiKey, {
|
|
76
|
-
method: 'POST',
|
|
77
|
-
body: batch,
|
|
78
|
-
});
|
|
79
|
-
}
|
|
80
|
-
export async function exportTestGroups(config) {
|
|
81
|
-
const url = new URL(`${config.apiUrl}/api/testgroups/export`);
|
|
82
|
-
url.searchParams.set('projectId', String(config.projectId));
|
|
83
|
-
return request(url.toString(), config.apiKey);
|
|
84
|
-
}
|
|
85
|
-
export function resolveCloudConfig(options) {
|
|
86
|
-
const apiUrl = options.apiUrl ||
|
|
87
|
-
process.env.ELASTICDASH_API_URL ||
|
|
88
|
-
'';
|
|
89
|
-
const apiKey = options.apiKey ||
|
|
90
|
-
process.env.ELASTICDASH_API_KEY ||
|
|
91
|
-
'';
|
|
92
|
-
const projectId = Number(options.projectId ||
|
|
93
|
-
process.env.ELASTICDASH_PROJECT_ID ||
|
|
94
|
-
'1');
|
|
95
|
-
if (!apiUrl) {
|
|
96
|
-
throw new CloudClientError('Missing API URL. Set --api-url or ELASTICDASH_API_URL.');
|
|
97
|
-
}
|
|
98
|
-
if (!apiKey) {
|
|
99
|
-
throw new CloudClientError('Missing API key. Set --api-key or ELASTICDASH_API_KEY.');
|
|
100
|
-
}
|
|
101
|
-
return { apiUrl: apiUrl.replace(/\/+$/, ''), apiKey, projectId };
|
|
102
|
-
}
|
|
103
|
-
//# sourceMappingURL=cloud-client.js.map
|
package/dist/cloud-client.js.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"cloud-client.js","sourceRoot":"","sources":["../src/cloud-client.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAeH,MAAM,gBAAiB,SAAQ,KAAK;IAGzB;IACA;IAHT,YACE,OAAe,EACR,UAAmB,EACnB,YAAqB;QAE5B,KAAK,CAAC,OAAO,CAAC,CAAA;QAHP,eAAU,GAAV,UAAU,CAAS;QACnB,iBAAY,GAAZ,YAAY,CAAS;QAG5B,IAAI,CAAC,IAAI,GAAG,kBAAkB,CAAA;IAChC,CAAC;CACF;AAED,KAAK,UAAU,OAAO,CACpB,GAAW,EACX,MAAc,EACd,UAAiE,EAAE;IAEnE,MAAM,EAAE,MAAM,GAAG,KAAK,EAAE,IAAI,EAAE,OAAO,GAAG,CAAC,EAAE,GAAG,OAAO,CAAA;IACrD,IAAI,SAA4B,CAAA;IAEhC,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,OAAO,EAAE,OAAO,EAAE,EAAE,CAAC;QACpD,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;YAChB,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,IAAI,CAAC,OAAO,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,CAAA;YACzD,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAA;QAClD,CAAC;QAED,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAC3B,MAAM;gBACN,OAAO,EAAE;oBACP,eAAe,EAAE,UAAU,MAAM,EAAE;oBACnC,cAAc,EAAE,kBAAkB;iBACnC;gBACD,IAAI,EAAE,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS;gBAC7C,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,MAAM,CAAC;aACpC,CAAC,CAAA;YAEF,IAAI,GAAG,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;gBACvB,MAAM,IAAI,gBAAgB,CACxB,iFAAiF,EACjF,GAAG,CACJ,CAAA;YACH,CAAC;YAED,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;gBACZ,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,CAAA;gBAC7C,MAAM,IAAI,gBAAgB,CACxB,uBAAuB,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,EAAE,EACrD,GAAG,CAAC,MAAM,EACV,IAAI,CACL,CAAA;YACH,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAA6B,CAAA;YACxD,+EAA+E;YAC/E,OAAO,CAAC,IAAI,CAAC,IAAI,KAAK,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAM,CAAA;QAC1D,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,SAAS,GAAG,GAAY,CAAA;YACxB,2CAA2C;YAC3C,IAAI,GAAG,YAAY,gBAAgB,IAAI,GAAG,CAAC,UAAU,IAAI,GAAG,CAAC,UAAU,GAAG,GAAG,EAAE,CAAC;gBAC9E,MAAM,GAAG,CAAA;YACX,CAAC;YACD,sCAAsC;YACtC,IAAI,OAAO,KAAK,OAAO,EAAE,CAAC;gBACxB,MAAM,GAAG,CAAA;YACX,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,SAAS,IAAI,IAAI,KAAK,CAAC,gBAAgB,CAAC,CAAA
;AAChD,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,MAAyB,EACzB,OAAsB;IAEtB,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,GAAG,MAAM,CAAC,MAAM,8BAA8B,kBAAkB,CAAC,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,EAAE,CAAC,CAAA;IACjH,IAAI,OAAO,EAAE,YAAY;QAAE,GAAG,CAAC,YAAY,CAAC,GAAG,CAAC,cAAc,EAAE,OAAO,CAAC,YAAY,CAAC,CAAA;IACrF,IAAI,OAAO,EAAE,IAAI,EAAE,MAAM;QAAE,GAAG,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,EAAE,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;IAC/E,OAAO,OAAO,CAAc,GAAG,CAAC,QAAQ,EAAE,EAAE,MAAM,CAAC,MAAM,CAAC,CAAA;AAC5D,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,MAAyB,EACzB,WAAmB,EACnB,MAA0B;IAE1B,MAAM,GAAG,GAAG,GAAG,MAAM,CAAC,MAAM,mBAAmB,kBAAkB,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,OAAO,CAAA;IAC7F,OAAO,OAAO,CAAiB,GAAG,EAAE,MAAM,CAAC,MAAM,EAAE;QACjD,MAAM,EAAE,MAAM;QACd,IAAI,EAAE,MAAM;KACb,CAAC,CAAA;AACJ,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,MAAyB,EACzB,KAMC;IAED,MAAM,GAAG,GAAG,GAAG,MAAM,CAAC,MAAM,yBAAyB,CAAA;IACrD,OAAO,OAAO,CAAiB,GAAG,EAAE,MAAM,CAAC,MAAM,EAAE;QACjD,MAAM,EAAE,MAAM;QACd,IAAI,EAAE,KAAK;KACZ,CAAC,CAAA;AACJ,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,MAAyB;IAEzB,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,GAAG,MAAM,CAAC,MAAM,wBAAwB,CAAC,CAAA;IAC7D,GAAG,CAAC,YAAY,CAAC,GAAG,CAAC,WAAW,EAAE,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAA;IAC3D,OAAO,OAAO,CAAc,GAAG,CAAC,QAAQ,EAAE,EAAE,MAAM,CAAC,MAAM,CAAC,CAAA;AAC5D,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,OAIlC;IACC,MAAM,MAAM,GACV,OAAO,CAAC,MAAM;QACd,OAAO,CAAC,GAAG,CAAC,mBAAmB;QAC/B,EAAE,CAAA;IAEJ,MAAM,MAAM,GACV,OAAO,CAAC,MAAM;QACd,OAAO,CAAC,GAAG,CAAC,mBAAmB;QAC/B,EAAE,CAAA;IAEJ,MAAM,SAAS,GAAG,MAAM,CACtB,OAAO,CAAC,SAAS;QACjB,OAAO,CAAC,GAAG,CAAC,sBAAsB;QAClC,GAAG,CACJ,CAAA;IAED,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,gBAAgB,CAAC,wDAAwD,CAAC,CAAA;IACtF,CAAC;IACD,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,gBAAgB,CAAC,wDAAwD,CAAC,CAAA;IACtF,CAAC;IAED,OAAO,EAAE,MAAM,EAAE,MAAM,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,MAAM,EAAE,SAAS,EAAE,CAAA;AAClE,CAAC"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"determinism.d.ts","sourceRoot":"","sources":["../../src/evaluators/determinism.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,aAAa,EAAE,iBAAiB,EAAgB,MAAM,YAAY,CAAA;AA+D7F,wBAAsB,mBAAmB,CACvC,WAAW,EAAE,WAAW,EACxB,IAAI,EAAE,aAAa,EAAE,GACpB,OAAO,CAAC,iBAAiB,CAAC,CA4D5B"}
|
|
@@ -1,116 +0,0 @@
|
|
|
1
|
-
function stringify(value) {
|
|
2
|
-
if (typeof value === 'string')
|
|
3
|
-
return value;
|
|
4
|
-
if (value == null)
|
|
5
|
-
return '';
|
|
6
|
-
return JSON.stringify(value);
|
|
7
|
-
}
|
|
8
|
-
function stringSimilarity(a, b) {
|
|
9
|
-
if (a === b)
|
|
10
|
-
return 1.0;
|
|
11
|
-
if (a.length === 0 || b.length === 0)
|
|
12
|
-
return 0.0;
|
|
13
|
-
// Simple character-level Jaccard similarity for fast local comparison
|
|
14
|
-
const setA = new Set(a.split(' '));
|
|
15
|
-
const setB = new Set(b.split(' '));
|
|
16
|
-
const intersection = new Set([...setA].filter((x) => setB.has(x)));
|
|
17
|
-
const union = new Set([...setA, ...setB]);
|
|
18
|
-
return union.size > 0 ? intersection.size / union.size : 0;
|
|
19
|
-
}
|
|
20
|
-
async function llmSimilarity(a, b) {
|
|
21
|
-
const apiKey = process.env.OPENAI_API_KEY;
|
|
22
|
-
if (!apiKey) {
|
|
23
|
-
// Fallback to string similarity if no API key
|
|
24
|
-
return stringSimilarity(a, b);
|
|
25
|
-
}
|
|
26
|
-
try {
|
|
27
|
-
const res = await fetch('https://api.openai.com/v1/chat/completions', {
|
|
28
|
-
method: 'POST',
|
|
29
|
-
headers: {
|
|
30
|
-
'Authorization': `Bearer ${apiKey}`,
|
|
31
|
-
'Content-Type': 'application/json',
|
|
32
|
-
},
|
|
33
|
-
body: JSON.stringify({
|
|
34
|
-
model: 'gpt-4o-mini',
|
|
35
|
-
messages: [
|
|
36
|
-
{
|
|
37
|
-
role: 'system',
|
|
38
|
-
content: 'You compare two outputs for semantic similarity. Respond with ONLY a number between 0.0 and 1.0 where 1.0 means identical meaning and 0.0 means completely different.',
|
|
39
|
-
},
|
|
40
|
-
{
|
|
41
|
-
role: 'user',
|
|
42
|
-
content: `Output A:\n${a}\n\nOutput B:\n${b}\n\nSimilarity score (0.0-1.0):`,
|
|
43
|
-
},
|
|
44
|
-
],
|
|
45
|
-
max_tokens: 10,
|
|
46
|
-
temperature: 0,
|
|
47
|
-
}),
|
|
48
|
-
signal: AbortSignal.timeout(15_000),
|
|
49
|
-
});
|
|
50
|
-
if (!res.ok)
|
|
51
|
-
return stringSimilarity(a, b);
|
|
52
|
-
const json = await res.json();
|
|
53
|
-
const content = json.choices?.[0]?.message?.content ?? '';
|
|
54
|
-
const score = parseFloat(content.trim());
|
|
55
|
-
return isNaN(score) ? stringSimilarity(a, b) : Math.max(0, Math.min(1, score));
|
|
56
|
-
}
|
|
57
|
-
catch {
|
|
58
|
-
return stringSimilarity(a, b);
|
|
59
|
-
}
|
|
60
|
-
}
|
|
61
|
-
export async function evaluateDeterminism(expectation, runs) {
|
|
62
|
-
const threshold = expectation.similarityThreshold ?? 0.8;
|
|
63
|
-
if (runs.length < 2) {
|
|
64
|
-
return {
|
|
65
|
-
expectationId: expectation.id,
|
|
66
|
-
type: 'determinism',
|
|
67
|
-
passed: true,
|
|
68
|
-
detail: 'Only 1 run, determinism check skipped',
|
|
69
|
-
};
|
|
70
|
-
}
|
|
71
|
-
const outputs = runs.map((r) => stringify(r.output));
|
|
72
|
-
// Pairwise comparison (for N ≤ 5, compare all pairs; for larger N, sample)
|
|
73
|
-
const pairs = [];
|
|
74
|
-
if (runs.length <= 5) {
|
|
75
|
-
for (let i = 0; i < runs.length; i++) {
|
|
76
|
-
for (let j = i + 1; j < runs.length; j++) {
|
|
77
|
-
pairs.push([i, j]);
|
|
78
|
-
}
|
|
79
|
-
}
|
|
80
|
-
}
|
|
81
|
-
else {
|
|
82
|
-
// Sample: compare each run against the first run
|
|
83
|
-
for (let i = 1; i < runs.length; i++) {
|
|
84
|
-
pairs.push([0, i]);
|
|
85
|
-
}
|
|
86
|
-
}
|
|
87
|
-
const scores = [];
|
|
88
|
-
for (const [i, j] of pairs) {
|
|
89
|
-
const score = await llmSimilarity(outputs[i], outputs[j]);
|
|
90
|
-
scores.push(score);
|
|
91
|
-
}
|
|
92
|
-
const avgScore = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
93
|
-
const allAboveThreshold = scores.every((s) => s >= threshold);
|
|
94
|
-
const perRun = runs.map((run, i) => {
|
|
95
|
-
// For each run, report its average similarity to other runs
|
|
96
|
-
const relevantScores = pairs
|
|
97
|
-
.map(([a, b], idx) => (a === i || b === i ? scores[idx] : null))
|
|
98
|
-
.filter((s) => s !== null);
|
|
99
|
-
const runAvg = relevantScores.length > 0
|
|
100
|
-
? relevantScores.reduce((a, b) => a + b, 0) / relevantScores.length
|
|
101
|
-
: 1.0;
|
|
102
|
-
return {
|
|
103
|
-
runIndex: run.runIndex,
|
|
104
|
-
passed: relevantScores.every((s) => s >= threshold),
|
|
105
|
-
value: parseFloat(runAvg.toFixed(3)),
|
|
106
|
-
};
|
|
107
|
-
});
|
|
108
|
-
return {
|
|
109
|
-
expectationId: expectation.id,
|
|
110
|
-
type: 'determinism',
|
|
111
|
-
passed: allAboveThreshold,
|
|
112
|
-
detail: `avg similarity ${avgScore.toFixed(3)} (threshold: ${threshold}), ${scores.filter((s) => s >= threshold).length}/${scores.length} pairs pass`,
|
|
113
|
-
perRun,
|
|
114
|
-
};
|
|
115
|
-
}
|
|
116
|
-
//# sourceMappingURL=determinism.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"determinism.js","sourceRoot":"","sources":["../../src/evaluators/determinism.ts"],"names":[],"mappings":"AAEA,SAAS,SAAS,CAAC,KAAc;IAC/B,IAAI,OAAO,KAAK,KAAK,QAAQ;QAAE,OAAO,KAAK,CAAA;IAC3C,IAAI,KAAK,IAAI,IAAI;QAAE,OAAO,EAAE,CAAA;IAC5B,OAAO,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAA;AAC9B,CAAC;AAED,SAAS,gBAAgB,CAAC,CAAS,EAAE,CAAS;IAC5C,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,GAAG,CAAA;IACvB,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,GAAG,CAAA;IAEhD,sEAAsE;IACtE,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAA;IAClC,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAA;IAClC,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;IAClE,MAAM,KAAK,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,IAAI,EAAE,GAAG,IAAI,CAAC,CAAC,CAAA;IACzC,OAAO,KAAK,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;AAC5D,CAAC;AAED,KAAK,UAAU,aAAa,CAAC,CAAS,EAAE,CAAS;IAC/C,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,cAAc,CAAA;IACzC,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,8CAA8C;QAC9C,OAAO,gBAAgB,CAAC,CAAC,EAAE,CAAC,CAAC,CAAA;IAC/B,CAAC;IAED,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,4CAA4C,EAAE;YACpE,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,eAAe,EAAE,UAAU,MAAM,EAAE;gBACnC,cAAc,EAAE,kBAAkB;aACnC;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;gBACnB,KAAK,EAAE,aAAa;gBACpB,QAAQ,EAAE;oBACR;wBACE,IAAI,EAAE,QAAQ;wBACd,OAAO,EAAE,uKAAuK;qBACjL;oBACD;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE,cAAc,CAAC,kBAAkB,CAAC,iCAAiC;qBAC7E;iBACF;gBACD,UAAU,EAAE,EAAE;gBACd,WAAW,EAAE,CAAC;aACf,CAAC;YACF,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,MAAM,CAAC;SACpC,CAAC,CAAA;QAEF,IAAI,CAAC,GAAG,CAAC,EAAE;YAAE,OAAO,gBAAgB,CAAC,CAAC,EAAE,CAAC,CAAC,CAAA;QAE1C,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAAS,CAAA;QACpC,MAAM,OAAO,GAAW,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,IAAI,EAAE,CAAA;QACjE,MAAM,KAAK,GAAG,UAAU,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAA;QACxC,OAAO,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,gBAAgB,CAA
C,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,CAAA;IAChF,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,gBAAgB,CAAC,CAAC,EAAE,CAAC,CAAC,CAAA;IAC/B,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,mBAAmB,CACvC,WAAwB,EACxB,IAAqB;IAErB,MAAM,SAAS,GAAG,WAAW,CAAC,mBAAmB,IAAI,GAAG,CAAA;IAExD,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACpB,OAAO;YACL,aAAa,EAAE,WAAW,CAAC,EAAE;YAC7B,IAAI,EAAE,aAAa;YACnB,MAAM,EAAE,IAAI;YACZ,MAAM,EAAE,uCAAuC;SAChD,CAAA;IACH,CAAC;IAED,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAA;IAEpD,2EAA2E;IAC3E,MAAM,KAAK,GAA4B,EAAE,CAAA;IACzC,IAAI,IAAI,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QACrB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACzC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;YACpB,CAAC;QACH,CAAC;IACH,CAAC;SAAM,CAAC;QACN,iDAAiD;QACjD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;QACpB,CAAC;IACH,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAA;IAC3B,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,KAAK,EAAE,CAAC;QAC3B,MAAM,KAAK,GAAG,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAA;QACzD,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IACpB,CAAC;IAED,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,MAAM,CAAA;IAClE,MAAM,iBAAiB,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,IAAI,SAAS,CAAC,CAAA;IAE7D,MAAM,MAAM,GAAmB,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE;QACjD,4DAA4D;QAC5D,MAAM,cAAc,GAAG,KAAK;aACzB,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;aAC/D,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,CAAC,KAAK,IAAI,CAAC,CAAA;QACzC,MAAM,MAAM,GAAG,cAAc,CAAC,MAAM,G
AAG,CAAC;YACtC,CAAC,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,cAAc,CAAC,MAAM;YACnE,CAAC,CAAC,GAAG,CAAA;QACP,OAAO;YACL,QAAQ,EAAE,GAAG,CAAC,QAAQ;YACtB,MAAM,EAAE,cAAc,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,IAAI,SAAS,CAAC;YACnD,KAAK,EAAE,UAAU,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;SACrC,CAAA;IACH,CAAC,CAAC,CAAA;IAEF,OAAO;QACL,aAAa,EAAE,WAAW,CAAC,EAAE;QAC7B,IAAI,EAAE,aAAa;QACnB,MAAM,EAAE,iBAAiB;QACzB,MAAM,EAAE,kBAAkB,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,gBAAgB,SAAS,MAAM,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,IAAI,SAAS,CAAC,CAAC,MAAM,IAAI,MAAM,CAAC,MAAM,aAAa;QACrJ,MAAM;KACP,CAAA;AACH,CAAC"}
|
|
@@ -1,4 +0,0 @@
|
|
|
1
|
-
import type { Expectation, SingleRunData, EvaluationResult } from './types.js';
|
|
2
|
-
export { type Expectation, type SingleRunData, type ExpectationResult, type EvaluationResult } from './types.js';
|
|
3
|
-
export declare function evaluateExpectations(expectations: Expectation[], runs: SingleRunData[], passThreshold?: 'all' | number): Promise<EvaluationResult>;
|
|
4
|
-
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/evaluators/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,aAAa,EAAqB,gBAAgB,EAAE,MAAM,YAAY,CAAA;AAQjG,OAAO,EAAE,KAAK,WAAW,EAAE,KAAK,aAAa,EAAE,KAAK,iBAAiB,EAAE,KAAK,gBAAgB,EAAE,MAAM,YAAY,CAAA;AA6BhH,wBAAsB,oBAAoB,CACxC,YAAY,EAAE,WAAW,EAAE,EAC3B,IAAI,EAAE,aAAa,EAAE,EACrB,aAAa,GAAE,KAAK,GAAG,MAAc,GACpC,OAAO,CAAC,gBAAgB,CAAC,CAmC3B"}
|
package/dist/evaluators/index.js
DELETED
|
@@ -1,61 +0,0 @@
|
|
|
1
|
-
import { evaluateTokenBudget } from './token-budget.js';
|
|
2
|
-
import { evaluateLatencyBudget } from './latency-budget.js';
|
|
3
|
-
import { evaluateOutputContains } from './output-contains.js';
|
|
4
|
-
import { evaluateOutputSchema } from './output-schema.js';
|
|
5
|
-
import { evaluateLLMJudge } from './llm-judge.js';
|
|
6
|
-
import { evaluateDeterminism } from './determinism.js';
|
|
7
|
-
/**
 * Routes a single expectation to the evaluator that handles its `type`.
 *
 * @param expectation - Expectation to evaluate; `type` selects the evaluator
 *   and `id` is echoed back when the type is not recognised.
 * @param runs - Run data forwarded unchanged to the selected evaluator.
 * @returns The selected evaluator's result, or a failed result whose detail
 *   names the unknown expectation type.
 */
async function evaluateOne(expectation, runs) {
    // Thunks keep the table lazy: only the evaluator that matches is invoked.
    const evaluatorsByType = new Map([
        ['token-budget', () => evaluateTokenBudget(expectation, runs)],
        ['latency-budget', () => evaluateLatencyBudget(expectation, runs)],
        ['output-contains', () => evaluateOutputContains(expectation, runs)],
        ['output-schema', () => evaluateOutputSchema(expectation, runs)],
        ['llm-judge', () => evaluateLLMJudge(expectation, runs)],
        ['determinism', () => evaluateDeterminism(expectation, runs)],
    ]);
    const runEvaluator = evaluatorsByType.get(expectation.type);
    if (runEvaluator !== undefined) {
        return runEvaluator();
    }
    // No evaluator is registered for this type: report it as a failure.
    return {
        expectationId: expectation.id,
        type: expectation.type,
        passed: false,
        detail: `Unknown expectation type: ${expectation.type}`,
    };
}
|
|
30
|
-
/**
 * Evaluates each expectation in turn and folds the results into one verdict.
 *
 * @param expectations - Expectations to evaluate; each is passed to
 *   `evaluateOne` together with `runs`.
 * @param runs - Run data shared by all expectations.
 * @param passThreshold - `'all'` (default) requires every expectation to pass;
 *   a number is the minimum count of passing expectations.
 * @returns An object with the overall `passed` flag, a one-line `summary` of
 *   the form `"<passed>/<total> expectations passed"`, and the per-expectation
 *   `results` in input order. Failure details are available on the individual
 *   entries of `results`; they are not part of `summary`.
 */
export async function evaluateExpectations(expectations, runs, passThreshold = 'all') {
    const results = [];
    // Evaluated one at a time so `results` lines up with `expectations`.
    for (const expectation of expectations) {
        results.push(await evaluateOne(expectation, runs));
    }
    const totalExpectations = results.length;
    const passedExpectations = results.filter((r) => r.passed).length;
    const passed = passThreshold === 'all'
        ? results.every((r) => r.passed)
        : passedExpectations >= passThreshold;
    // Only the count line was ever returned as the summary, so the per-failure
    // lines that used to be assembled and then discarded are no longer built.
    return {
        passed,
        summary: `${passedExpectations}/${totalExpectations} expectations passed`,
        results,
    };
}
|
|
61
|
-
//# sourceMappingURL=index.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/evaluators/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,mBAAmB,EAAE,MAAM,mBAAmB,CAAA;AACvD,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAA;AAC3D,OAAO,EAAE,sBAAsB,EAAE,MAAM,sBAAsB,CAAA;AAC7D,OAAO,EAAE,oBAAoB,EAAE,MAAM,oBAAoB,CAAA;AACzD,OAAO,EAAE,gBAAgB,EAAE,MAAM,gBAAgB,CAAA;AACjD,OAAO,EAAE,mBAAmB,EAAE,MAAM,kBAAkB,CAAA;AAItD,KAAK,UAAU,WAAW,CACxB,WAAwB,EACxB,IAAqB;IAErB,QAAQ,WAAW,CAAC,IAAI,EAAE,CAAC;QACzB,KAAK,cAAc;YACjB,OAAO,mBAAmB,CAAC,WAAW,EAAE,IAAI,CAAC,CAAA;QAC/C,KAAK,gBAAgB;YACnB,OAAO,qBAAqB,CAAC,WAAW,EAAE,IAAI,CAAC,CAAA;QACjD,KAAK,iBAAiB;YACpB,OAAO,sBAAsB,CAAC,WAAW,EAAE,IAAI,CAAC,CAAA;QAClD,KAAK,eAAe;YAClB,OAAO,oBAAoB,CAAC,WAAW,EAAE,IAAI,CAAC,CAAA;QAChD,KAAK,WAAW;YACd,OAAO,gBAAgB,CAAC,WAAW,EAAE,IAAI,CAAC,CAAA;QAC5C,KAAK,aAAa;YAChB,OAAO,mBAAmB,CAAC,WAAW,EAAE,IAAI,CAAC,CAAA;QAC/C;YACE,OAAO;gBACL,aAAa,EAAE,WAAW,CAAC,EAAE;gBAC7B,IAAI,EAAE,WAAW,CAAC,IAAI;gBACtB,MAAM,EAAE,KAAK;gBACb,MAAM,EAAE,6BAA6B,WAAW,CAAC,IAAI,EAAE;aACxD,CAAA;IACL,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,YAA2B,EAC3B,IAAqB,EACrB,gBAAgC,KAAK;IAErC,MAAM,OAAO,GAAwB,EAAE,CAAA;IAEvC,KAAK,MAAM,WAAW,IAAI,YAAY,EAAE,CAAC;QACvC,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC,WAAW,EAAE,IAAI,CAAC,CAAA;QACnD,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IACtB,CAAC;IAED,8BAA8B;IAC9B,MAAM,kBAAkB,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAA;IACjE,MAAM,iBAAiB,GAAG,OAAO,CAAC,MAAM,CAAA;IACxC,IAAI,MAAe,CAAA;IAEnB,IAAI,aAAa,KAAK,KAAK,EAAE,CAAC;QAC5B,MAAM,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAA;IACzC,CAAC;SAAM,CAAC;QACN,MAAM,GAAG,kBAAkB,IAAI,aAAa,CAAA;IAC9C,CAAC;IAED,gBAAgB;IAChB,MAAM,KAAK,GAAa,EAAE,CAAA;IAC1B,KAAK,CAAC,IAAI,CAAC,GAAG,kBAAkB,IAAI,iBAAiB,sBAAsB,CAAC,CAAA;IAE5E,MAAM,aAAa,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAA;IACtD,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7B,KAAK,MAAM,EAAE,IAAI,aAAa,EAAE,CAAC;YAC/B,KAAK,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC,IAAI,KAAK,
EAAE,CAAC,MAAM,EAAE,CAAC,CAAA;QAC1C,CAAC;IACH,CAAC;IAED,OAAO;QACL,MAAM;QACN,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC;QACjB,OAAO;KACR,CAAA;AACH,CAAC"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"latency-budget.d.ts","sourceRoot":"","sources":["../../src/evaluators/latency-budget.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,aAAa,EAAE,iBAAiB,EAAgB,MAAM,YAAY,CAAA;AAE7F,wBAAgB,qBAAqB,CACnC,WAAW,EAAE,WAAW,EACxB,IAAI,EAAE,aAAa,EAAE,GACpB,iBAAiB,CAgDnB"}
|
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
/**
 * Checks run durations against an optional per-run limit and an optional
 * limit on the summed duration of all runs.
 *
 * @param expectation - Reads `id`, `maxDurationMs` and `maxTotalDurationMs`; a
 *   limit that is `null` or `undefined` is not enforced.
 * @param runs - Runs to inspect; each supplies `runIndex` and `durationMs`.
 * @returns A `latency-budget` result. `perRun` is populated when the per-run
 *   limit is set or when no limit is set at all; otherwise (and whenever it
 *   would be empty) it is `undefined`.
 */
export function evaluateLatencyBudget(expectation, runs) {
    const perRunLimit = expectation.maxDurationMs;
    const totalLimit = expectation.maxTotalDurationMs;
    const enforcePerRun = perRunLimit != null;
    const enforceTotal = totalLimit != null;

    // Values below one second are printed as-is with an "ms" suffix; anything
    // longer is printed in seconds with one decimal place.
    function formatMs(ms) {
        if (ms >= 1000) {
            return `${(ms / 1000).toFixed(1)}s`;
        }
        return `${ms}ms`;
    }

    let totalDuration = 0;
    for (const run of runs) {
        totalDuration += run.durationMs;
    }

    // Per-run entries exist in two cases: a per-run limit is enforced, or no
    // limit is configured at all (every run then passes by default).
    let perRun = [];
    if (enforcePerRun) {
        perRun = runs.map((run) => ({
            runIndex: run.runIndex,
            passed: run.durationMs <= perRunLimit,
            value: run.durationMs,
        }));
    }
    else if (!enforceTotal) {
        perRun = runs.map((run) => ({
            runIndex: run.runIndex,
            passed: true,
            value: run.durationMs,
        }));
    }

    const overTotalBudget = enforceTotal && totalDuration > totalLimit;
    const allPassed = perRun.every((entry) => entry.passed) && !overTotalBudget;

    // Assemble the human-readable detail line.
    const segments = [];
    if (enforcePerRun) {
        const withinLimit = perRun.reduce((count, entry) => (entry.passed ? count + 1 : count), 0);
        segments.push(`${withinLimit}/${runs.length} under ${formatMs(perRunLimit)}`);
    }
    if (enforceTotal) {
        const verdict = totalDuration <= totalLimit ? 'within' : 'exceeds';
        segments.push(`total ${formatMs(totalDuration)} ${verdict} ${formatMs(totalLimit)} budget`);
    }
    const averageMs = runs.length > 0 ? Math.round(totalDuration / runs.length) : 0;
    segments.push(`avg ${formatMs(averageMs)}`);

    return {
        expectationId: expectation.id,
        type: 'latency-budget',
        passed: allPassed,
        detail: segments.join(', '),
        perRun: perRun.length > 0 ? perRun : undefined,
    };
}
|
|
45
|
-
//# sourceMappingURL=latency-budget.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"latency-budget.js","sourceRoot":"","sources":["../../src/evaluators/latency-budget.ts"],"names":[],"mappings":"AAEA,MAAM,UAAU,qBAAqB,CACnC,WAAwB,EACxB,IAAqB;IAErB,MAAM,MAAM,GAAmB,EAAE,CAAA;IACjC,IAAI,SAAS,GAAG,IAAI,CAAA;IAEpB,gBAAgB;IAChB,IAAI,WAAW,CAAC,aAAa,IAAI,IAAI,EAAE,CAAC;QACtC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,MAAM,MAAM,GAAG,GAAG,CAAC,UAAU,IAAI,WAAW,CAAC,aAAa,CAAA;YAC1D,IAAI,CAAC,MAAM;gBAAE,SAAS,GAAG,KAAK,CAAA;YAC9B,MAAM,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,GAAG,CAAC,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,GAAG,CAAC,UAAU,EAAE,CAAC,CAAA;QACxE,CAAC;IACH,CAAC;IAED,cAAc;IACd,MAAM,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,CAAA;IACpE,IAAI,WAAW,CAAC,kBAAkB,IAAI,IAAI,IAAI,aAAa,GAAG,WAAW,CAAC,kBAAkB,EAAE,CAAC;QAC7F,SAAS,GAAG,KAAK,CAAA;IACnB,CAAC;IAED,gDAAgD;IAChD,IAAI,WAAW,CAAC,aAAa,IAAI,IAAI,IAAI,WAAW,CAAC,kBAAkB,IAAI,IAAI,EAAE,CAAC;QAChF,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,MAAM,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,GAAG,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,CAAC,UAAU,EAAE,CAAC,CAAA;QAC9E,CAAC;IACH,CAAC;IAED,eAAe;IACf,MAAM,GAAG,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;IACzE,MAAM,QAAQ,GAAG,CAAC,EAAU,EAAE,EAAE,CAAC,EAAE,IAAI,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,EAAE,IAAI,CAAA;IACtF,MAAM,KAAK,GAAa,EAAE,CAAA;IAE1B,IAAI,WAAW,CAAC,aAAa,IAAI,IAAI,EAAE,CAAC;QACtC,MAAM,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAA;QACzD,KAAK,CAAC,IAAI,CAAC,GAAG,WAAW,IAAI,IAAI,CAAC,MAAM,UAAU,QAAQ,CAAC,WAAW,CAAC,aAAa,CAAC,EAAE,CAAC,CAAA;IAC1F,CAAC;IACD,IAAI,WAAW,CAAC,kBAAkB,IAAI,IAAI,EAAE,CAAC;QAC3C,MAAM,WAAW,GAAG,aAAa,IAAI,WAAW,CAAC,kBAAkB,CAAA;QACnE,KAAK,CAAC,IAAI,CAAC,SAAS,QAAQ,CAAC,aAAa,CAAC,IAAI,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,IAAI,QAAQ,CAAC,WAAW,CAAC,kBAAkB,CAAC,SAAS,CAAC,CAAA;IACzI,CAAC;IACD,KAAK,CAAC,IAAI,CAAC
,OAAO,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,CAAA;IAElC,OAAO;QACL,aAAa,EAAE,WAAW,CAAC,EAAE;QAC7B,IAAI,EAAE,gBAAgB;QACtB,MAAM,EAAE,SAAS;QACjB,MAAM,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC;QACxB,MAAM,EAAE,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS;KAC/C,CAAA;AACH,CAAC"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"llm-judge.d.ts","sourceRoot":"","sources":["../../src/evaluators/llm-judge.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,aAAa,EAAE,iBAAiB,EAAgB,MAAM,YAAY,CAAA;AAqG7F,wBAAsB,gBAAgB,CACpC,WAAW,EAAE,WAAW,EACxB,IAAI,EAAE,aAAa,EAAE,GACpB,OAAO,CAAC,iBAAiB,CAAC,CA+C5B"}
|