elasticdash-test 0.1.26 → 0.1.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/README.md +100 -0
  2. package/dist/cli.js +175 -0
  3. package/dist/cli.js.map +1 -1
  4. package/dist/index.cjs +62 -1
  5. package/dist/index.d.ts +2 -0
  6. package/dist/index.d.ts.map +1 -1
  7. package/dist/index.js +2 -0
  8. package/dist/index.js.map +1 -1
  9. package/dist/tool-registry.d.ts +31 -0
  10. package/dist/tool-registry.d.ts.map +1 -0
  11. package/dist/tool-registry.js +73 -0
  12. package/dist/tool-registry.js.map +1 -0
  13. package/dist/tool-runner-worker.js +19 -2
  14. package/dist/tool-runner-worker.js.map +1 -1
  15. package/dist/utils/debug.d.ts +1 -1
  16. package/dist/utils/debug.d.ts.map +1 -1
  17. package/dist/utils/debug.js +2 -2
  18. package/dist/utils/debug.js.map +1 -1
  19. package/docs/observability_contract.md +192 -0
  20. package/package.json +2 -2
  21. package/src/cli.ts +184 -0
  22. package/src/index.ts +4 -0
  23. package/src/tool-registry.ts +94 -0
  24. package/src/tool-runner-worker.ts +17 -2
  25. package/src/utils/debug.ts +2 -2
  26. package/dist/cloud-client.d.ts +0 -34
  27. package/dist/cloud-client.d.ts.map +0 -1
  28. package/dist/cloud-client.js +0 -103
  29. package/dist/cloud-client.js.map +0 -1
  30. package/dist/evaluators/determinism.d.ts +0 -3
  31. package/dist/evaluators/determinism.d.ts.map +0 -1
  32. package/dist/evaluators/determinism.js +0 -116
  33. package/dist/evaluators/determinism.js.map +0 -1
  34. package/dist/evaluators/index.d.ts +0 -4
  35. package/dist/evaluators/index.d.ts.map +0 -1
  36. package/dist/evaluators/index.js +0 -61
  37. package/dist/evaluators/index.js.map +0 -1
  38. package/dist/evaluators/latency-budget.d.ts +0 -3
  39. package/dist/evaluators/latency-budget.d.ts.map +0 -1
  40. package/dist/evaluators/latency-budget.js +0 -45
  41. package/dist/evaluators/latency-budget.js.map +0 -1
  42. package/dist/evaluators/llm-judge.d.ts +0 -3
  43. package/dist/evaluators/llm-judge.d.ts.map +0 -1
  44. package/dist/evaluators/llm-judge.js +0 -125
  45. package/dist/evaluators/llm-judge.js.map +0 -1
  46. package/dist/evaluators/output-contains.d.ts +0 -3
  47. package/dist/evaluators/output-contains.d.ts.map +0 -1
  48. package/dist/evaluators/output-contains.js +0 -52
  49. package/dist/evaluators/output-contains.js.map +0 -1
  50. package/dist/evaluators/output-schema.d.ts +0 -3
  51. package/dist/evaluators/output-schema.d.ts.map +0 -1
  52. package/dist/evaluators/output-schema.js +0 -58
  53. package/dist/evaluators/output-schema.js.map +0 -1
  54. package/dist/evaluators/token-budget.d.ts +0 -3
  55. package/dist/evaluators/token-budget.d.ts.map +0 -1
  56. package/dist/evaluators/token-budget.js +0 -45
  57. package/dist/evaluators/token-budget.js.map +0 -1
  58. package/dist/evaluators/types.d.ts +0 -104
  59. package/dist/evaluators/types.d.ts.map +0 -1
  60. package/dist/evaluators/types.js +0 -6
  61. package/dist/evaluators/types.js.map +0 -1
  62. package/dist/test-group/cli.d.ts +0 -8
  63. package/dist/test-group/cli.d.ts.map +0 -1
  64. package/dist/test-group/cli.js +0 -162
  65. package/dist/test-group/cli.js.map +0 -1
  66. package/dist/test-group/git-context.d.ts +0 -3
  67. package/dist/test-group/git-context.d.ts.map +0 -1
  68. package/dist/test-group/git-context.js +0 -59
  69. package/dist/test-group/git-context.js.map +0 -1
  70. package/dist/test-group/reporter.d.ts +0 -4
  71. package/dist/test-group/reporter.d.ts.map +0 -1
  72. package/dist/test-group/reporter.js +0 -54
  73. package/dist/test-group/reporter.js.map +0 -1
  74. package/dist/test-group/runner.d.ts +0 -18
  75. package/dist/test-group/runner.d.ts.map +0 -1
  76. package/dist/test-group/runner.js +0 -234
  77. package/dist/test-group/runner.js.map +0 -1
  78. package/dist/tracing-universal.d.ts +0 -13
  79. package/dist/tracing-universal.d.ts.map +0 -1
  80. package/dist/tracing-universal.js +0 -33
  81. package/dist/tracing-universal.js.map +0 -1
  82. package/docs/backend_rerun_alignment.md +0 -291
  83. package/docs/backend_traceid_update.md +0 -141
  84. package/docs/observability_backend_contract.md +0 -577
  85. package/docs/observability_rerun_backend_plan.md +0 -596
@@ -0,0 +1,94 @@
1
+ /**
2
+ * Global registry for tools defined via edTool().
3
+ *
4
+ * Tools registered here are discoverable for reruns regardless of which
5
+ * module they live in. The helper also applies wrapTool() so telemetry
6
+ * fires automatically — symmetric with the Python @ed_tool decorator.
7
+ */
8
+
9
+ import { wrapTool } from './interceptors/tool.js'
10
+
11
/**
 * One entry in the global tool registry, as recorded by edTool().
 */
export interface RegisteredTool {
  /** Registry key; the name the tool was registered under. */
  name: string
  /** The function exactly as it was handed to edTool(). */
  fn: (...args: unknown[]) => unknown | Promise<unknown>
  /** The result of passing `fn` through wrapTool(). */
  wrapped: (...args: unknown[]) => Promise<unknown>
  /** True when `fn` was declared with the `async` keyword. */
  isAsync: boolean
  /** Parameter list recovered from the function's source text, e.g. `(query, limit)`. */
  signature: string
  /** File that called edTool(), or null when it could not be determined. */
  sourceFile: string | null
  /** Line of the edTool() call in `sourceFile`, or null when unknown. */
  lineNumber: number | null
}

/** Module-wide store of registered tools, keyed by tool name. */
const _registry = new Map<string, RegisteredTool>()

/**
 * List every registered tool.
 *
 * @returns A new array on each call; mutating it does not affect the registry.
 */
export function getRegisteredTools(): RegisteredTool[] {
  return [..._registry.values()]
}

/**
 * Look up a single tool by its registered name.
 *
 * @param name Name the tool was registered under.
 * @returns The matching entry, or `undefined` when no tool has that name.
 */
export function getRegisteredTool(name: string): RegisteredTool | undefined {
  const entry = _registry.get(name)
  return entry
}

/** Remove every entry from the registry. */
export function clearToolRegistry(): void {
  _registry.clear()
}
34
+
35
/**
 * Turn the location text of a stack frame into a plain filesystem path.
 * `file://` URLs are percent-decoded and the slash in front of a Windows
 * drive letter is dropped; anything else is returned untouched.
 */
function normalizeFrameFile(raw: string): string {
  if (!raw.startsWith('file://')) return raw
  let path = raw.slice('file://'.length)
  try {
    path = decodeURIComponent(path)
  } catch {
    // Malformed percent-escape: keep the undecoded text rather than fail.
  }
  return /^\/[A-Za-z]:[\\/]/.test(path) ? path.slice(1) : path
}

/**
 * Extract `file` and `line` from one V8-style stack frame line.
 * Returns null for frames that carry no `file:line:column` location.
 */
function parseStackFrame(frame: string): { file: string; line: number | null } | null {
  // The parenthesised form (`at fn (file:line:col)`) must be tried first and
  // on its own: a single unanchored alternation lets the bare form match from
  // the leading `at`, which captures `fn (file` as the file name.
  const m =
    /\((.+?):(\d+):\d+\)\s*$/.exec(frame) ??
    /^\s*at\s+(?:async\s+)?(.+?):(\d+):\d+\s*$/.exec(frame)
  if (!m) return null
  const rawFile = m[1]
  if (!rawFile) return null
  const line = Number.parseInt(m[2] ?? '', 10)
  return { file: normalizeFrameFile(rawFile), line: Number.isNaN(line) ? null : line }
}

/**
 * Best-effort lookup of the source location that called into this module.
 *
 * Walks the current stack trace and returns the first frame whose file is not
 * this registry module itself.
 *
 * @returns The caller's file path and line number; both are null when no
 *   stack is available or no frame outside this module can be parsed.
 */
function inferCallerLocation(): { file: string | null; line: number | null } {
  const stack = new Error().stack
  if (!stack) return { file: null, line: null }

  const frames = stack.split('\n')
  // frames[0] is the "Error" header and frames[1] is this function's own frame.
  for (let i = 2; i < frames.length; i++) {
    const parsed = parseStackFrame(frames[i] ?? '')
    if (!parsed) continue
    // Compare the file name only, so a project directory that merely contains
    // "tool-registry" in its path is not mistaken for this module.
    const baseName = parsed.file.split(/[\\/]/).pop() ?? ''
    if (baseName.startsWith('tool-registry.')) continue
    return { file: parsed.file, line: parsed.line }
  }
  return { file: null, line: null }
}
50
+
51
/**
 * Recover a human-readable parameter list from a function's source text.
 *
 * Default values are dropped, rest parameters keep their `...` prefix and
 * destructuring patterns are kept whole. Handles paren-less single-parameter
 * arrow functions and defaults that themselves contain parentheses, commas or
 * string literals.
 *
 * @param fn Function whose `toString()` output is inspected.
 * @returns The parameter names formatted as `(a, b, c)`; `()` when there are
 *   none or the source cannot be parsed.
 */
function inferSignature(fn: Function): string {
  const src = fn.toString()

  // `x => ...` / `async x => ...`: the parameter list has no parentheses, so
  // the first `(` in the source would belong to the body.
  const bareArrow = /^\s*(?:async\s+)?([A-Za-z_$][\w$]*)\s*=>/.exec(src)
  if (bareArrow) return `(${bareArrow[1]})`

  const open = src.indexOf('(')
  if (open === -1) return '()'

  const names: string[] = []
  let current = ''
  let depth = 0 // nesting of (), [] and {} inside the parameter list
  let quote = '' // active string delimiter, or '' when outside a string
  let inDefault = false // true between a top-level `=` and the next parameter
  let closed = false

  const flush = (): void => {
    let name = current.trim().replace(/\s+/g, ' ')
    // For plain identifiers, cut at whitespace or a type-annotation colon.
    if (name && !name.startsWith('{') && !name.startsWith('[')) {
      name = name.split(/[\s:]/)[0] ?? ''
    }
    if (name) names.push(name)
    current = ''
    inDefault = false
  }

  for (let i = open + 1; i < src.length && !closed; i++) {
    const ch = src.charAt(i)

    if (quote) {
      if (!inDefault) current += ch
      if (ch === '\\') {
        // Skip the escaped character so an escaped quote does not end the string.
        i++
        if (!inDefault) current += src.charAt(i)
      } else if (ch === quote) {
        quote = ''
      }
      continue
    }

    if (ch === '"' || ch === "'" || ch === '`') {
      quote = ch
    } else if (ch === '(' || ch === '[' || ch === '{') {
      depth++
    } else if (ch === ')' || ch === ']' || ch === '}') {
      if (depth === 0) {
        // The parenthesis that closes the parameter list itself.
        flush()
        closed = true
        continue
      }
      depth--
    } else if (depth === 0 && ch === ',') {
      flush()
      continue
    } else if (depth === 0 && ch === '=') {
      inDefault = true
      continue
    }

    if (!inDefault) current += ch
  }

  return closed ? `(${names.join(', ')})` : '()'
}
61
+
62
/**
 * Register a function as a rerunnable tool and return its traced wrapper.
 *
 *   export const myTool = edTool('my_tool', async (query: string) => { ... })
 *
 * The function is passed through wrapTool() and the result is what callers
 * receive, so ordinary call sites go through the wrapper. Both the original
 * and the wrapped function are stored in the global registry together with
 * the inferred signature and call-site location, which lets consumers such as
 * the `run-tool` command find the tool by name. Registering a name that is
 * already present replaces the earlier entry.
 *
 * @param name Name the tool is registered and looked up under.
 * @param fn   The tool implementation; may be sync or async.
 * @returns The wrapTool()-wrapped version of `fn`.
 */
export function edTool<Args extends unknown[], R>(
  name: string,
  fn: (...args: Args) => R | Promise<R>,
): (...args: Args) => Promise<R> {
  // Functions declared with `async` are instances of the AsyncFunction constructor.
  const declaredAsync = fn.constructor.name === 'AsyncFunction'
  const paramList = inferSignature(fn)
  const callSite = inferCallerLocation()

  const traced = wrapTool(
    name,
    fn as (...args: unknown[]) => Promise<R>,
  ) as unknown as (...args: Args) => Promise<R>

  const entry: RegisteredTool = {
    name,
    fn: fn as RegisteredTool['fn'],
    wrapped: traced as RegisteredTool['wrapped'],
    isAsync: declaredAsync,
    signature: paramList,
    sourceFile: callSite.file,
    lineNumber: callSite.line,
  }
  _registry.set(name, entry)

  return traced
}

/** Alias of edTool(). */
export const defineTool = edTool
@@ -205,9 +205,24 @@ async function main() {
205
205
  originalExit(1)
206
206
  return
207
207
  }
208
- const fn = mod[toolName]
208
+
209
+ // Registry first: covers tools defined via edTool() anywhere in the project,
210
+ // as long as their containing module is reachable from toolsModulePath's
211
+ // import graph. Falls back to ed_tools-style module export lookup.
212
+ let fn: ((...a: unknown[]) => unknown) | undefined
213
+ try {
214
+ const reg = await import('./tool-registry.js')
215
+ const registered = reg.getRegisteredTool(toolName)
216
+ if (registered) fn = registered.wrapped
217
+ } catch {
218
+ // Registry module not available (older SDK build); fall through to export lookup.
219
+ }
220
+ if (!fn) {
221
+ const exported = mod[toolName]
222
+ if (typeof exported === 'function') fn = exported
223
+ }
209
224
  if (typeof fn !== 'function') {
210
- await writeResult({ ok: false, error: `"${toolName}" is not an exported function in the module.` })
225
+ await writeResult({ ok: false, error: `"${toolName}" not found via edTool() registry or as an exported function in the module.` })
211
226
  originalExit(1)
212
227
  return
213
228
  }
@@ -1,8 +1,8 @@
1
1
  const DEBUG_KEY = 'ELASTICDASH_DEBUG'
2
2
 
3
- /** Log only when ELASTICDASH_DEBUG=1 is set. Drop-in replacement for console.log in interceptors. */
3
+ /** Log only when ELASTICDASH_DEBUG=1 is set. Writes to stderr so callers parsing stdout (e.g. `elasticdash run-tool`) get a clean JSON channel. */
4
4
  export function debugLog(...args: unknown[]): void {
5
5
  if (typeof process !== 'undefined' && process.env?.[DEBUG_KEY] === '1') {
6
- console.log(...args)
6
+ console.error(...args)
7
7
  }
8
8
  }
@@ -1,34 +0,0 @@
1
- /**
2
- * Cloud API client — communicates with the ElasticDash backend.
3
- * Used by the test-group CLI to fetch definitions and push results.
4
- */
5
- import type { TestGroup, TestGroupRunResult } from './evaluators/types.js';
6
- export interface CloudClientConfig {
7
- apiUrl: string;
8
- apiKey: string;
9
- projectId: number;
10
- }
11
- export interface FetchFilters {
12
- workflowName?: string;
13
- tags?: string[];
14
- }
15
- export declare function fetchTestGroups(config: CloudClientConfig, filters?: FetchFilters): Promise<TestGroup[]>;
16
- export declare function pushRunResult(config: CloudClientConfig, testGroupId: number, result: TestGroupRunResult): Promise<{
17
- id: number;
18
- }>;
19
- export declare function pushBatchResult(config: CloudClientConfig, batch: {
20
- testGroupRunIds: number[];
21
- gitBranch?: string;
22
- gitCommit?: string;
23
- passed?: boolean;
24
- summary?: string;
25
- }): Promise<{
26
- id: number;
27
- }>;
28
- export declare function exportTestGroups(config: CloudClientConfig): Promise<TestGroup[]>;
29
- export declare function resolveCloudConfig(options: {
30
- apiUrl?: string;
31
- apiKey?: string;
32
- projectId?: string | number;
33
- }): CloudClientConfig;
34
- //# sourceMappingURL=cloud-client.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"cloud-client.d.ts","sourceRoot":"","sources":["../src/cloud-client.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,SAAS,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAA;AAE1E,MAAM,WAAW,iBAAiB;IAChC,MAAM,EAAE,MAAM,CAAA;IACd,MAAM,EAAE,MAAM,CAAA;IACd,SAAS,EAAE,MAAM,CAAA;CAClB;AAED,MAAM,WAAW,YAAY;IAC3B,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,IAAI,CAAC,EAAE,MAAM,EAAE,CAAA;CAChB;AAyED,wBAAsB,eAAe,CACnC,MAAM,EAAE,iBAAiB,EACzB,OAAO,CAAC,EAAE,YAAY,GACrB,OAAO,CAAC,SAAS,EAAE,CAAC,CAKtB;AAED,wBAAsB,aAAa,CACjC,MAAM,EAAE,iBAAiB,EACzB,WAAW,EAAE,MAAM,EACnB,MAAM,EAAE,kBAAkB,GACzB,OAAO,CAAC;IAAE,EAAE,EAAE,MAAM,CAAA;CAAE,CAAC,CAMzB;AAED,wBAAsB,eAAe,CACnC,MAAM,EAAE,iBAAiB,EACzB,KAAK,EAAE;IACL,eAAe,EAAE,MAAM,EAAE,CAAA;IACzB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,MAAM,CAAC,EAAE,OAAO,CAAA;IAChB,OAAO,CAAC,EAAE,MAAM,CAAA;CACjB,GACA,OAAO,CAAC;IAAE,EAAE,EAAE,MAAM,CAAA;CAAE,CAAC,CAMzB;AAED,wBAAsB,gBAAgB,CACpC,MAAM,EAAE,iBAAiB,GACxB,OAAO,CAAC,SAAS,EAAE,CAAC,CAItB;AAED,wBAAgB,kBAAkB,CAAC,OAAO,EAAE;IAC1C,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,SAAS,CAAC,EAAE,MAAM,GAAG,MAAM,CAAA;CAC5B,GAAG,iBAAiB,CAyBpB"}
@@ -1,103 +0,0 @@
1
- /**
2
- * Cloud API client — communicates with the ElasticDash backend.
3
- * Used by the test-group CLI to fetch definitions and push results.
4
- */
5
- class CloudClientError extends Error {
6
- statusCode;
7
- responseBody;
8
- constructor(message, statusCode, responseBody) {
9
- super(message);
10
- this.statusCode = statusCode;
11
- this.responseBody = responseBody;
12
- this.name = 'CloudClientError';
13
- }
14
- }
15
- async function request(url, apiKey, options = {}) {
16
- const { method = 'GET', body, retries = 2 } = options;
17
- let lastError;
18
- for (let attempt = 0; attempt <= retries; attempt++) {
19
- if (attempt > 0) {
20
- const delayMs = Math.min(1000 * 2 ** (attempt - 1), 5000);
21
- await new Promise((r) => setTimeout(r, delayMs));
22
- }
23
- try {
24
- const res = await fetch(url, {
25
- method,
26
- headers: {
27
- 'Authorization': `Bearer ${apiKey}`,
28
- 'Content-Type': 'application/json',
29
- },
30
- body: body ? JSON.stringify(body) : undefined,
31
- signal: AbortSignal.timeout(30_000),
32
- });
33
- if (res.status === 401) {
34
- throw new CloudClientError('Authentication failed. Check that your ELASTICDASH_API_KEY is valid and active.', 401);
35
- }
36
- if (!res.ok) {
37
- const text = await res.text().catch(() => '');
38
- throw new CloudClientError(`API request failed: ${res.status} ${res.statusText}`, res.status, text);
39
- }
40
- const json = await res.json();
41
- // The backend wraps responses in { status, data } via generalApiResponseSender
42
- return (json.data !== undefined ? json.data : json);
43
- }
44
- catch (err) {
45
- lastError = err;
46
- // Don't retry auth errors or client errors
47
- if (err instanceof CloudClientError && err.statusCode && err.statusCode < 500) {
48
- throw err;
49
- }
50
- // Retry on network/timeout/5xx errors
51
- if (attempt === retries) {
52
- throw err;
53
- }
54
- }
55
- }
56
- throw lastError ?? new Error('Request failed');
57
- }
58
- export async function fetchTestGroups(config, filters) {
59
- const url = new URL(`${config.apiUrl}/api/testgroups/by-project/${encodeURIComponent(String(config.projectId))}`);
60
- if (filters?.workflowName)
61
- url.searchParams.set('workflowName', filters.workflowName);
62
- if (filters?.tags?.length)
63
- url.searchParams.set('tags', filters.tags.join(','));
64
- return request(url.toString(), config.apiKey);
65
- }
66
- export async function pushRunResult(config, testGroupId, result) {
67
- const url = `${config.apiUrl}/api/testgroups/${encodeURIComponent(String(testGroupId))}/runs`;
68
- return request(url, config.apiKey, {
69
- method: 'POST',
70
- body: result,
71
- });
72
- }
73
- export async function pushBatchResult(config, batch) {
74
- const url = `${config.apiUrl}/api/testgroups/batches`;
75
- return request(url, config.apiKey, {
76
- method: 'POST',
77
- body: batch,
78
- });
79
- }
80
- export async function exportTestGroups(config) {
81
- const url = new URL(`${config.apiUrl}/api/testgroups/export`);
82
- url.searchParams.set('projectId', String(config.projectId));
83
- return request(url.toString(), config.apiKey);
84
- }
85
- export function resolveCloudConfig(options) {
86
- const apiUrl = options.apiUrl ||
87
- process.env.ELASTICDASH_API_URL ||
88
- '';
89
- const apiKey = options.apiKey ||
90
- process.env.ELASTICDASH_API_KEY ||
91
- '';
92
- const projectId = Number(options.projectId ||
93
- process.env.ELASTICDASH_PROJECT_ID ||
94
- '1');
95
- if (!apiUrl) {
96
- throw new CloudClientError('Missing API URL. Set --api-url or ELASTICDASH_API_URL.');
97
- }
98
- if (!apiKey) {
99
- throw new CloudClientError('Missing API key. Set --api-key or ELASTICDASH_API_KEY.');
100
- }
101
- return { apiUrl: apiUrl.replace(/\/+$/, ''), apiKey, projectId };
102
- }
103
- //# sourceMappingURL=cloud-client.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"cloud-client.js","sourceRoot":"","sources":["../src/cloud-client.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAeH,MAAM,gBAAiB,SAAQ,KAAK;IAGzB;IACA;IAHT,YACE,OAAe,EACR,UAAmB,EACnB,YAAqB;QAE5B,KAAK,CAAC,OAAO,CAAC,CAAA;QAHP,eAAU,GAAV,UAAU,CAAS;QACnB,iBAAY,GAAZ,YAAY,CAAS;QAG5B,IAAI,CAAC,IAAI,GAAG,kBAAkB,CAAA;IAChC,CAAC;CACF;AAED,KAAK,UAAU,OAAO,CACpB,GAAW,EACX,MAAc,EACd,UAAiE,EAAE;IAEnE,MAAM,EAAE,MAAM,GAAG,KAAK,EAAE,IAAI,EAAE,OAAO,GAAG,CAAC,EAAE,GAAG,OAAO,CAAA;IACrD,IAAI,SAA4B,CAAA;IAEhC,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,OAAO,EAAE,OAAO,EAAE,EAAE,CAAC;QACpD,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;YAChB,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,IAAI,CAAC,OAAO,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,CAAA;YACzD,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAA;QAClD,CAAC;QAED,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAC3B,MAAM;gBACN,OAAO,EAAE;oBACP,eAAe,EAAE,UAAU,MAAM,EAAE;oBACnC,cAAc,EAAE,kBAAkB;iBACnC;gBACD,IAAI,EAAE,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS;gBAC7C,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,MAAM,CAAC;aACpC,CAAC,CAAA;YAEF,IAAI,GAAG,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;gBACvB,MAAM,IAAI,gBAAgB,CACxB,iFAAiF,EACjF,GAAG,CACJ,CAAA;YACH,CAAC;YAED,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;gBACZ,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,CAAA;gBAC7C,MAAM,IAAI,gBAAgB,CACxB,uBAAuB,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,EAAE,EACrD,GAAG,CAAC,MAAM,EACV,IAAI,CACL,CAAA;YACH,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAA6B,CAAA;YACxD,+EAA+E;YAC/E,OAAO,CAAC,IAAI,CAAC,IAAI,KAAK,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAM,CAAA;QAC1D,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,SAAS,GAAG,GAAY,CAAA;YACxB,2CAA2C;YAC3C,IAAI,GAAG,YAAY,gBAAgB,IAAI,GAAG,CAAC,UAAU,IAAI,GAAG,CAAC,UAAU,GAAG,GAAG,EAAE,CAAC;gBAC9E,MAAM,GAAG,CAAA;YACX,CAAC;YACD,sCAAsC;YACtC,IAAI,OAAO,KAAK,OAAO,EAAE,CAAC;gBACxB,MAAM,GAAG,CAAA;YACX,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,SAAS,IAAI,IAAI,KAAK,CAAC,gBAAgB,CAAC,CA
AA;AAChD,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,MAAyB,EACzB,OAAsB;IAEtB,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,GAAG,MAAM,CAAC,MAAM,8BAA8B,kBAAkB,CAAC,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,EAAE,CAAC,CAAA;IACjH,IAAI,OAAO,EAAE,YAAY;QAAE,GAAG,CAAC,YAAY,CAAC,GAAG,CAAC,cAAc,EAAE,OAAO,CAAC,YAAY,CAAC,CAAA;IACrF,IAAI,OAAO,EAAE,IAAI,EAAE,MAAM;QAAE,GAAG,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,EAAE,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;IAC/E,OAAO,OAAO,CAAc,GAAG,CAAC,QAAQ,EAAE,EAAE,MAAM,CAAC,MAAM,CAAC,CAAA;AAC5D,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,MAAyB,EACzB,WAAmB,EACnB,MAA0B;IAE1B,MAAM,GAAG,GAAG,GAAG,MAAM,CAAC,MAAM,mBAAmB,kBAAkB,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,OAAO,CAAA;IAC7F,OAAO,OAAO,CAAiB,GAAG,EAAE,MAAM,CAAC,MAAM,EAAE;QACjD,MAAM,EAAE,MAAM;QACd,IAAI,EAAE,MAAM;KACb,CAAC,CAAA;AACJ,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,MAAyB,EACzB,KAMC;IAED,MAAM,GAAG,GAAG,GAAG,MAAM,CAAC,MAAM,yBAAyB,CAAA;IACrD,OAAO,OAAO,CAAiB,GAAG,EAAE,MAAM,CAAC,MAAM,EAAE;QACjD,MAAM,EAAE,MAAM;QACd,IAAI,EAAE,KAAK;KACZ,CAAC,CAAA;AACJ,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,MAAyB;IAEzB,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,GAAG,MAAM,CAAC,MAAM,wBAAwB,CAAC,CAAA;IAC7D,GAAG,CAAC,YAAY,CAAC,GAAG,CAAC,WAAW,EAAE,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAA;IAC3D,OAAO,OAAO,CAAc,GAAG,CAAC,QAAQ,EAAE,EAAE,MAAM,CAAC,MAAM,CAAC,CAAA;AAC5D,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,OAIlC;IACC,MAAM,MAAM,GACV,OAAO,CAAC,MAAM;QACd,OAAO,CAAC,GAAG,CAAC,mBAAmB;QAC/B,EAAE,CAAA;IAEJ,MAAM,MAAM,GACV,OAAO,CAAC,MAAM;QACd,OAAO,CAAC,GAAG,CAAC,mBAAmB;QAC/B,EAAE,CAAA;IAEJ,MAAM,SAAS,GAAG,MAAM,CACtB,OAAO,CAAC,SAAS;QACjB,OAAO,CAAC,GAAG,CAAC,sBAAsB;QAClC,GAAG,CACJ,CAAA;IAED,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,gBAAgB,CAAC,wDAAwD,CAAC,CAAA;IACtF,CAAC;IACD,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,gBAAgB,CAAC,wDAAwD,CAAC,CAAA;IACtF,CAAC;IAED,OAAO,EAAE,MAAM,EAAE,MAAM,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,MAAM,EAAE,SAAS,EAAE,CAAA;AAClE,CAAC"}
@@ -1,3 +0,0 @@
1
- import type { Expectation, SingleRunData, ExpectationResult } from './types.js';
2
- export declare function evaluateDeterminism(expectation: Expectation, runs: SingleRunData[]): Promise<ExpectationResult>;
3
- //# sourceMappingURL=determinism.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"determinism.d.ts","sourceRoot":"","sources":["../../src/evaluators/determinism.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,aAAa,EAAE,iBAAiB,EAAgB,MAAM,YAAY,CAAA;AA+D7F,wBAAsB,mBAAmB,CACvC,WAAW,EAAE,WAAW,EACxB,IAAI,EAAE,aAAa,EAAE,GACpB,OAAO,CAAC,iBAAiB,CAAC,CA4D5B"}
@@ -1,116 +0,0 @@
1
- function stringify(value) {
2
- if (typeof value === 'string')
3
- return value;
4
- if (value == null)
5
- return '';
6
- return JSON.stringify(value);
7
- }
8
- function stringSimilarity(a, b) {
9
- if (a === b)
10
- return 1.0;
11
- if (a.length === 0 || b.length === 0)
12
- return 0.0;
13
- // Simple character-level Jaccard similarity for fast local comparison
14
- const setA = new Set(a.split(' '));
15
- const setB = new Set(b.split(' '));
16
- const intersection = new Set([...setA].filter((x) => setB.has(x)));
17
- const union = new Set([...setA, ...setB]);
18
- return union.size > 0 ? intersection.size / union.size : 0;
19
- }
20
- async function llmSimilarity(a, b) {
21
- const apiKey = process.env.OPENAI_API_KEY;
22
- if (!apiKey) {
23
- // Fallback to string similarity if no API key
24
- return stringSimilarity(a, b);
25
- }
26
- try {
27
- const res = await fetch('https://api.openai.com/v1/chat/completions', {
28
- method: 'POST',
29
- headers: {
30
- 'Authorization': `Bearer ${apiKey}`,
31
- 'Content-Type': 'application/json',
32
- },
33
- body: JSON.stringify({
34
- model: 'gpt-4o-mini',
35
- messages: [
36
- {
37
- role: 'system',
38
- content: 'You compare two outputs for semantic similarity. Respond with ONLY a number between 0.0 and 1.0 where 1.0 means identical meaning and 0.0 means completely different.',
39
- },
40
- {
41
- role: 'user',
42
- content: `Output A:\n${a}\n\nOutput B:\n${b}\n\nSimilarity score (0.0-1.0):`,
43
- },
44
- ],
45
- max_tokens: 10,
46
- temperature: 0,
47
- }),
48
- signal: AbortSignal.timeout(15_000),
49
- });
50
- if (!res.ok)
51
- return stringSimilarity(a, b);
52
- const json = await res.json();
53
- const content = json.choices?.[0]?.message?.content ?? '';
54
- const score = parseFloat(content.trim());
55
- return isNaN(score) ? stringSimilarity(a, b) : Math.max(0, Math.min(1, score));
56
- }
57
- catch {
58
- return stringSimilarity(a, b);
59
- }
60
- }
61
- export async function evaluateDeterminism(expectation, runs) {
62
- const threshold = expectation.similarityThreshold ?? 0.8;
63
- if (runs.length < 2) {
64
- return {
65
- expectationId: expectation.id,
66
- type: 'determinism',
67
- passed: true,
68
- detail: 'Only 1 run, determinism check skipped',
69
- };
70
- }
71
- const outputs = runs.map((r) => stringify(r.output));
72
- // Pairwise comparison (for N ≤ 5, compare all pairs; for larger N, sample)
73
- const pairs = [];
74
- if (runs.length <= 5) {
75
- for (let i = 0; i < runs.length; i++) {
76
- for (let j = i + 1; j < runs.length; j++) {
77
- pairs.push([i, j]);
78
- }
79
- }
80
- }
81
- else {
82
- // Sample: compare each run against the first run
83
- for (let i = 1; i < runs.length; i++) {
84
- pairs.push([0, i]);
85
- }
86
- }
87
- const scores = [];
88
- for (const [i, j] of pairs) {
89
- const score = await llmSimilarity(outputs[i], outputs[j]);
90
- scores.push(score);
91
- }
92
- const avgScore = scores.reduce((a, b) => a + b, 0) / scores.length;
93
- const allAboveThreshold = scores.every((s) => s >= threshold);
94
- const perRun = runs.map((run, i) => {
95
- // For each run, report its average similarity to other runs
96
- const relevantScores = pairs
97
- .map(([a, b], idx) => (a === i || b === i ? scores[idx] : null))
98
- .filter((s) => s !== null);
99
- const runAvg = relevantScores.length > 0
100
- ? relevantScores.reduce((a, b) => a + b, 0) / relevantScores.length
101
- : 1.0;
102
- return {
103
- runIndex: run.runIndex,
104
- passed: relevantScores.every((s) => s >= threshold),
105
- value: parseFloat(runAvg.toFixed(3)),
106
- };
107
- });
108
- return {
109
- expectationId: expectation.id,
110
- type: 'determinism',
111
- passed: allAboveThreshold,
112
- detail: `avg similarity ${avgScore.toFixed(3)} (threshold: ${threshold}), ${scores.filter((s) => s >= threshold).length}/${scores.length} pairs pass`,
113
- perRun,
114
- };
115
- }
116
- //# sourceMappingURL=determinism.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"determinism.js","sourceRoot":"","sources":["../../src/evaluators/determinism.ts"],"names":[],"mappings":"AAEA,SAAS,SAAS,CAAC,KAAc;IAC/B,IAAI,OAAO,KAAK,KAAK,QAAQ;QAAE,OAAO,KAAK,CAAA;IAC3C,IAAI,KAAK,IAAI,IAAI;QAAE,OAAO,EAAE,CAAA;IAC5B,OAAO,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAA;AAC9B,CAAC;AAED,SAAS,gBAAgB,CAAC,CAAS,EAAE,CAAS;IAC5C,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,GAAG,CAAA;IACvB,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,GAAG,CAAA;IAEhD,sEAAsE;IACtE,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAA;IAClC,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAA;IAClC,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;IAClE,MAAM,KAAK,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,IAAI,EAAE,GAAG,IAAI,CAAC,CAAC,CAAA;IACzC,OAAO,KAAK,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;AAC5D,CAAC;AAED,KAAK,UAAU,aAAa,CAAC,CAAS,EAAE,CAAS;IAC/C,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,cAAc,CAAA;IACzC,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,8CAA8C;QAC9C,OAAO,gBAAgB,CAAC,CAAC,EAAE,CAAC,CAAC,CAAA;IAC/B,CAAC;IAED,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,4CAA4C,EAAE;YACpE,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,eAAe,EAAE,UAAU,MAAM,EAAE;gBACnC,cAAc,EAAE,kBAAkB;aACnC;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;gBACnB,KAAK,EAAE,aAAa;gBACpB,QAAQ,EAAE;oBACR;wBACE,IAAI,EAAE,QAAQ;wBACd,OAAO,EAAE,uKAAuK;qBACjL;oBACD;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE,cAAc,CAAC,kBAAkB,CAAC,iCAAiC;qBAC7E;iBACF;gBACD,UAAU,EAAE,EAAE;gBACd,WAAW,EAAE,CAAC;aACf,CAAC;YACF,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,MAAM,CAAC;SACpC,CAAC,CAAA;QAEF,IAAI,CAAC,GAAG,CAAC,EAAE;YAAE,OAAO,gBAAgB,CAAC,CAAC,EAAE,CAAC,CAAC,CAAA;QAE1C,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAAS,CAAA;QACpC,MAAM,OAAO,GAAW,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,IAAI,EAAE,CAAA;QACjE,MAAM,KAAK,GAAG,UAAU,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAA;QACxC,OAAO,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,gBAAgB,C
AAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,CAAA;IAChF,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,gBAAgB,CAAC,CAAC,EAAE,CAAC,CAAC,CAAA;IAC/B,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,mBAAmB,CACvC,WAAwB,EACxB,IAAqB;IAErB,MAAM,SAAS,GAAG,WAAW,CAAC,mBAAmB,IAAI,GAAG,CAAA;IAExD,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACpB,OAAO;YACL,aAAa,EAAE,WAAW,CAAC,EAAE;YAC7B,IAAI,EAAE,aAAa;YACnB,MAAM,EAAE,IAAI;YACZ,MAAM,EAAE,uCAAuC;SAChD,CAAA;IACH,CAAC;IAED,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAA;IAEpD,2EAA2E;IAC3E,MAAM,KAAK,GAA4B,EAAE,CAAA;IACzC,IAAI,IAAI,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QACrB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACzC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;YACpB,CAAC;QACH,CAAC;IACH,CAAC;SAAM,CAAC;QACN,iDAAiD;QACjD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;QACpB,CAAC;IACH,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAA;IAC3B,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,KAAK,EAAE,CAAC;QAC3B,MAAM,KAAK,GAAG,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAA;QACzD,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IACpB,CAAC;IAED,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,MAAM,CAAA;IAClE,MAAM,iBAAiB,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,IAAI,SAAS,CAAC,CAAA;IAE7D,MAAM,MAAM,GAAmB,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE;QACjD,4DAA4D;QAC5D,MAAM,cAAc,GAAG,KAAK;aACzB,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;aAC/D,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,CAAC,KAAK,IAAI,CAAC,CAAA;QACzC,MAAM,MAAM,GAAG,cAAc,CAAC,MAAM
,GAAG,CAAC;YACtC,CAAC,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,cAAc,CAAC,MAAM;YACnE,CAAC,CAAC,GAAG,CAAA;QACP,OAAO;YACL,QAAQ,EAAE,GAAG,CAAC,QAAQ;YACtB,MAAM,EAAE,cAAc,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,IAAI,SAAS,CAAC;YACnD,KAAK,EAAE,UAAU,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;SACrC,CAAA;IACH,CAAC,CAAC,CAAA;IAEF,OAAO;QACL,aAAa,EAAE,WAAW,CAAC,EAAE;QAC7B,IAAI,EAAE,aAAa;QACnB,MAAM,EAAE,iBAAiB;QACzB,MAAM,EAAE,kBAAkB,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,gBAAgB,SAAS,MAAM,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,IAAI,SAAS,CAAC,CAAC,MAAM,IAAI,MAAM,CAAC,MAAM,aAAa;QACrJ,MAAM;KACP,CAAA;AACH,CAAC"}
@@ -1,4 +0,0 @@
1
- import type { Expectation, SingleRunData, EvaluationResult } from './types.js';
2
- export { type Expectation, type SingleRunData, type ExpectationResult, type EvaluationResult } from './types.js';
3
- export declare function evaluateExpectations(expectations: Expectation[], runs: SingleRunData[], passThreshold?: 'all' | number): Promise<EvaluationResult>;
4
- //# sourceMappingURL=index.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/evaluators/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,aAAa,EAAqB,gBAAgB,EAAE,MAAM,YAAY,CAAA;AAQjG,OAAO,EAAE,KAAK,WAAW,EAAE,KAAK,aAAa,EAAE,KAAK,iBAAiB,EAAE,KAAK,gBAAgB,EAAE,MAAM,YAAY,CAAA;AA6BhH,wBAAsB,oBAAoB,CACxC,YAAY,EAAE,WAAW,EAAE,EAC3B,IAAI,EAAE,aAAa,EAAE,EACrB,aAAa,GAAE,KAAK,GAAG,MAAc,GACpC,OAAO,CAAC,gBAAgB,CAAC,CAmC3B"}
@@ -1,61 +0,0 @@
1
- import { evaluateTokenBudget } from './token-budget.js';
2
- import { evaluateLatencyBudget } from './latency-budget.js';
3
- import { evaluateOutputContains } from './output-contains.js';
4
- import { evaluateOutputSchema } from './output-schema.js';
5
- import { evaluateLLMJudge } from './llm-judge.js';
6
- import { evaluateDeterminism } from './determinism.js';
7
- async function evaluateOne(expectation, runs) {
8
- switch (expectation.type) {
9
- case 'token-budget':
10
- return evaluateTokenBudget(expectation, runs);
11
- case 'latency-budget':
12
- return evaluateLatencyBudget(expectation, runs);
13
- case 'output-contains':
14
- return evaluateOutputContains(expectation, runs);
15
- case 'output-schema':
16
- return evaluateOutputSchema(expectation, runs);
17
- case 'llm-judge':
18
- return evaluateLLMJudge(expectation, runs);
19
- case 'determinism':
20
- return evaluateDeterminism(expectation, runs);
21
- default:
22
- return {
23
- expectationId: expectation.id,
24
- type: expectation.type,
25
- passed: false,
26
- detail: `Unknown expectation type: ${expectation.type}`,
27
- };
28
- }
29
- }
30
- export async function evaluateExpectations(expectations, runs, passThreshold = 'all') {
31
- const results = [];
32
- for (const expectation of expectations) {
33
- const result = await evaluateOne(expectation, runs);
34
- results.push(result);
35
- }
36
- // Determine overall pass/fail
37
- const passedExpectations = results.filter((r) => r.passed).length;
38
- const totalExpectations = results.length;
39
- let passed;
40
- if (passThreshold === 'all') {
41
- passed = results.every((r) => r.passed);
42
- }
43
- else {
44
- passed = passedExpectations >= passThreshold;
45
- }
46
- // Build summary
47
- const parts = [];
48
- parts.push(`${passedExpectations}/${totalExpectations} expectations passed`);
49
- const failedResults = results.filter((r) => !r.passed);
50
- if (failedResults.length > 0) {
51
- for (const fr of failedResults) {
52
- parts.push(` ${fr.type}: ${fr.detail}`);
53
- }
54
- }
55
- return {
56
- passed,
57
- summary: parts[0],
58
- results,
59
- };
60
- }
61
- //# sourceMappingURL=index.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/evaluators/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,mBAAmB,EAAE,MAAM,mBAAmB,CAAA;AACvD,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAA;AAC3D,OAAO,EAAE,sBAAsB,EAAE,MAAM,sBAAsB,CAAA;AAC7D,OAAO,EAAE,oBAAoB,EAAE,MAAM,oBAAoB,CAAA;AACzD,OAAO,EAAE,gBAAgB,EAAE,MAAM,gBAAgB,CAAA;AACjD,OAAO,EAAE,mBAAmB,EAAE,MAAM,kBAAkB,CAAA;AAItD,KAAK,UAAU,WAAW,CACxB,WAAwB,EACxB,IAAqB;IAErB,QAAQ,WAAW,CAAC,IAAI,EAAE,CAAC;QACzB,KAAK,cAAc;YACjB,OAAO,mBAAmB,CAAC,WAAW,EAAE,IAAI,CAAC,CAAA;QAC/C,KAAK,gBAAgB;YACnB,OAAO,qBAAqB,CAAC,WAAW,EAAE,IAAI,CAAC,CAAA;QACjD,KAAK,iBAAiB;YACpB,OAAO,sBAAsB,CAAC,WAAW,EAAE,IAAI,CAAC,CAAA;QAClD,KAAK,eAAe;YAClB,OAAO,oBAAoB,CAAC,WAAW,EAAE,IAAI,CAAC,CAAA;QAChD,KAAK,WAAW;YACd,OAAO,gBAAgB,CAAC,WAAW,EAAE,IAAI,CAAC,CAAA;QAC5C,KAAK,aAAa;YAChB,OAAO,mBAAmB,CAAC,WAAW,EAAE,IAAI,CAAC,CAAA;QAC/C;YACE,OAAO;gBACL,aAAa,EAAE,WAAW,CAAC,EAAE;gBAC7B,IAAI,EAAE,WAAW,CAAC,IAAI;gBACtB,MAAM,EAAE,KAAK;gBACb,MAAM,EAAE,6BAA6B,WAAW,CAAC,IAAI,EAAE;aACxD,CAAA;IACL,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,YAA2B,EAC3B,IAAqB,EACrB,gBAAgC,KAAK;IAErC,MAAM,OAAO,GAAwB,EAAE,CAAA;IAEvC,KAAK,MAAM,WAAW,IAAI,YAAY,EAAE,CAAC;QACvC,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC,WAAW,EAAE,IAAI,CAAC,CAAA;QACnD,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IACtB,CAAC;IAED,8BAA8B;IAC9B,MAAM,kBAAkB,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAA;IACjE,MAAM,iBAAiB,GAAG,OAAO,CAAC,MAAM,CAAA;IACxC,IAAI,MAAe,CAAA;IAEnB,IAAI,aAAa,KAAK,KAAK,EAAE,CAAC;QAC5B,MAAM,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAA;IACzC,CAAC;SAAM,CAAC;QACN,MAAM,GAAG,kBAAkB,IAAI,aAAa,CAAA;IAC9C,CAAC;IAED,gBAAgB;IAChB,MAAM,KAAK,GAAa,EAAE,CAAA;IAC1B,KAAK,CAAC,IAAI,CAAC,GAAG,kBAAkB,IAAI,iBAAiB,sBAAsB,CAAC,CAAA;IAE5E,MAAM,aAAa,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAA;IACtD,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7B,KAAK,MAAM,EAAE,IAAI,aAAa,EAAE,CAAC;YAC/B,KAAK,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC,IAAI,KAA
K,EAAE,CAAC,MAAM,EAAE,CAAC,CAAA;QAC1C,CAAC;IACH,CAAC;IAED,OAAO;QACL,MAAM;QACN,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC;QACjB,OAAO;KACR,CAAA;AACH,CAAC"}
@@ -1,3 +0,0 @@
1
- import type { Expectation, SingleRunData, ExpectationResult } from './types.js';
2
- export declare function evaluateLatencyBudget(expectation: Expectation, runs: SingleRunData[]): ExpectationResult;
3
- //# sourceMappingURL=latency-budget.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"latency-budget.d.ts","sourceRoot":"","sources":["../../src/evaluators/latency-budget.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,aAAa,EAAE,iBAAiB,EAAgB,MAAM,YAAY,CAAA;AAE7F,wBAAgB,qBAAqB,CACnC,WAAW,EAAE,WAAW,EACxB,IAAI,EAAE,aAAa,EAAE,GACpB,iBAAiB,CAgDnB"}
@@ -1,45 +0,0 @@
1
- export function evaluateLatencyBudget(expectation, runs) {
2
- const perRun = [];
3
- let allPassed = true;
4
- // Per-run check
5
- if (expectation.maxDurationMs != null) {
6
- for (const run of runs) {
7
- const passed = run.durationMs <= expectation.maxDurationMs;
8
- if (!passed)
9
- allPassed = false;
10
- perRun.push({ runIndex: run.runIndex, passed, value: run.durationMs });
11
- }
12
- }
13
- // Total check
14
- const totalDuration = runs.reduce((sum, r) => sum + r.durationMs, 0);
15
- if (expectation.maxTotalDurationMs != null && totalDuration > expectation.maxTotalDurationMs) {
16
- allPassed = false;
17
- }
18
- // If neither constraint is set, pass by default
19
- if (expectation.maxDurationMs == null && expectation.maxTotalDurationMs == null) {
20
- for (const run of runs) {
21
- perRun.push({ runIndex: run.runIndex, passed: true, value: run.durationMs });
22
- }
23
- }
24
- // Build detail
25
- const avg = runs.length > 0 ? Math.round(totalDuration / runs.length) : 0;
26
- const formatMs = (ms) => ms >= 1000 ? `${(ms / 1000).toFixed(1)}s` : `${ms}ms`;
27
- const parts = [];
28
- if (expectation.maxDurationMs != null) {
29
- const passedCount = perRun.filter((r) => r.passed).length;
30
- parts.push(`${passedCount}/${runs.length} under ${formatMs(expectation.maxDurationMs)}`);
31
- }
32
- if (expectation.maxTotalDurationMs != null) {
33
- const totalPassed = totalDuration <= expectation.maxTotalDurationMs;
34
- parts.push(`total ${formatMs(totalDuration)} ${totalPassed ? 'within' : 'exceeds'} ${formatMs(expectation.maxTotalDurationMs)} budget`);
35
- }
36
- parts.push(`avg ${formatMs(avg)}`);
37
- return {
38
- expectationId: expectation.id,
39
- type: 'latency-budget',
40
- passed: allPassed,
41
- detail: parts.join(', '),
42
- perRun: perRun.length > 0 ? perRun : undefined,
43
- };
44
- }
45
- //# sourceMappingURL=latency-budget.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"latency-budget.js","sourceRoot":"","sources":["../../src/evaluators/latency-budget.ts"],"names":[],"mappings":"AAEA,MAAM,UAAU,qBAAqB,CACnC,WAAwB,EACxB,IAAqB;IAErB,MAAM,MAAM,GAAmB,EAAE,CAAA;IACjC,IAAI,SAAS,GAAG,IAAI,CAAA;IAEpB,gBAAgB;IAChB,IAAI,WAAW,CAAC,aAAa,IAAI,IAAI,EAAE,CAAC;QACtC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,MAAM,MAAM,GAAG,GAAG,CAAC,UAAU,IAAI,WAAW,CAAC,aAAa,CAAA;YAC1D,IAAI,CAAC,MAAM;gBAAE,SAAS,GAAG,KAAK,CAAA;YAC9B,MAAM,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,GAAG,CAAC,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,GAAG,CAAC,UAAU,EAAE,CAAC,CAAA;QACxE,CAAC;IACH,CAAC;IAED,cAAc;IACd,MAAM,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,CAAA;IACpE,IAAI,WAAW,CAAC,kBAAkB,IAAI,IAAI,IAAI,aAAa,GAAG,WAAW,CAAC,kBAAkB,EAAE,CAAC;QAC7F,SAAS,GAAG,KAAK,CAAA;IACnB,CAAC;IAED,gDAAgD;IAChD,IAAI,WAAW,CAAC,aAAa,IAAI,IAAI,IAAI,WAAW,CAAC,kBAAkB,IAAI,IAAI,EAAE,CAAC;QAChF,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,MAAM,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,GAAG,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,CAAC,UAAU,EAAE,CAAC,CAAA;QAC9E,CAAC;IACH,CAAC;IAED,eAAe;IACf,MAAM,GAAG,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;IACzE,MAAM,QAAQ,GAAG,CAAC,EAAU,EAAE,EAAE,CAAC,EAAE,IAAI,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,EAAE,IAAI,CAAA;IACtF,MAAM,KAAK,GAAa,EAAE,CAAA;IAE1B,IAAI,WAAW,CAAC,aAAa,IAAI,IAAI,EAAE,CAAC;QACtC,MAAM,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAA;QACzD,KAAK,CAAC,IAAI,CAAC,GAAG,WAAW,IAAI,IAAI,CAAC,MAAM,UAAU,QAAQ,CAAC,WAAW,CAAC,aAAa,CAAC,EAAE,CAAC,CAAA;IAC1F,CAAC;IACD,IAAI,WAAW,CAAC,kBAAkB,IAAI,IAAI,EAAE,CAAC;QAC3C,MAAM,WAAW,GAAG,aAAa,IAAI,WAAW,CAAC,kBAAkB,CAAA;QACnE,KAAK,CAAC,IAAI,CAAC,SAAS,QAAQ,CAAC,aAAa,CAAC,IAAI,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,IAAI,QAAQ,CAAC,WAAW,CAAC,kBAAkB,CAAC,SAAS,CAAC,CAAA;IACzI,CAAC;IACD,KAAK,CAAC,IAAI,CA
AC,OAAO,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,CAAA;IAElC,OAAO;QACL,aAAa,EAAE,WAAW,CAAC,EAAE;QAC7B,IAAI,EAAE,gBAAgB;QACtB,MAAM,EAAE,SAAS;QACjB,MAAM,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC;QACxB,MAAM,EAAE,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS;KAC/C,CAAA;AACH,CAAC"}
@@ -1,3 +0,0 @@
1
- import type { Expectation, SingleRunData, ExpectationResult } from './types.js';
2
- export declare function evaluateLLMJudge(expectation: Expectation, runs: SingleRunData[]): Promise<ExpectationResult>;
3
- //# sourceMappingURL=llm-judge.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"llm-judge.d.ts","sourceRoot":"","sources":["../../src/evaluators/llm-judge.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,aAAa,EAAE,iBAAiB,EAAgB,MAAM,YAAY,CAAA;AAqG7F,wBAAsB,gBAAgB,CACpC,WAAW,EAAE,WAAW,EACxB,IAAI,EAAE,aAAa,EAAE,GACpB,OAAO,CAAC,iBAAiB,CAAC,CA+C5B"}