elasticdash-test 0.1.20-alpha-21 → 0.1.20-alpha-22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ci/api-client.d.ts +15 -0
- package/dist/ci/api-client.d.ts.map +1 -1
- package/dist/ci/api-client.js +10 -0
- package/dist/ci/api-client.js.map +1 -1
- package/dist/ci/benchmark.d.ts +5 -1
- package/dist/ci/benchmark.d.ts.map +1 -1
- package/dist/ci/benchmark.js +41 -3
- package/dist/ci/benchmark.js.map +1 -1
- package/dist/ci/ed-runner.d.ts.map +1 -1
- package/dist/ci/ed-runner.js +22 -4
- package/dist/ci/ed-runner.js.map +1 -1
- package/dist/index.cjs +118 -66
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -1
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/src/ci/api-client.ts +24 -0
- package/src/ci/benchmark.ts +47 -2
- package/src/ci/ed-runner.ts +25 -4
- package/src/index.ts +1 -1
package/dist/ci/api-client.d.ts
CHANGED
|
@@ -20,4 +20,19 @@ export declare function submitTestRun(serverUrl: string, apiKey: string, testGro
|
|
|
20
20
|
export declare function createBatch(serverUrl: string, apiKey: string, payload: Record<string, unknown>): Promise<{
|
|
21
21
|
id: number;
|
|
22
22
|
}>;
|
|
23
|
+
/**
|
|
24
|
+
* Resolved evaluator configuration from the backend.
|
|
25
|
+
* Provider/model/apiKey may be null if the user has not configured an evaluator.
|
|
26
|
+
*/
|
|
27
|
+
export interface EvaluatorConfig {
|
|
28
|
+
provider: string | null;
|
|
29
|
+
model: string | null;
|
|
30
|
+
apiKey: string | null;
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* Fetch the project's evaluator config (with user-level fallback).
|
|
34
|
+
* Used by ed-test llm_judge benchmarks when judge_provider/judge_model
|
|
35
|
+
* are not specified in the test definition.
|
|
36
|
+
*/
|
|
37
|
+
export declare function fetchEvaluatorConfig(serverUrl: string, apiKey: string): Promise<EvaluatorConfig>;
|
|
23
38
|
//# sourceMappingURL=api-client.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"api-client.d.ts","sourceRoot":"","sources":["../../src/ci/api-client.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AA0C9C;;;GAGG;AACH,wBAAsB,eAAe,CACnC,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,MAAM,EACd,OAAO,CAAC,EAAE;IAAE,YAAY,CAAC,EAAE,MAAM,CAAC;IAAC,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAAC,MAAM,CAAC,EAAE,MAAM,CAAA;CAAE,GACpE,OAAO,CAAC,YAAY,EAAE,CAAC,CAUzB;AAED;;GAEG;AACH,wBAAsB,aAAa,CACjC,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,MAAM,EACd,WAAW,EAAE,MAAM,EACnB,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAC/B,OAAO,CAAC;IAAE,EAAE,EAAE,MAAM,CAAA;CAAE,CAAC,CAOzB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAC/B,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAC/B,OAAO,CAAC;IAAE,EAAE,EAAE,MAAM,CAAA;CAAE,CAAC,CAOzB"}
|
|
1
|
+
{"version":3,"file":"api-client.d.ts","sourceRoot":"","sources":["../../src/ci/api-client.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AA0C9C;;;GAGG;AACH,wBAAsB,eAAe,CACnC,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,MAAM,EACd,OAAO,CAAC,EAAE;IAAE,YAAY,CAAC,EAAE,MAAM,CAAC;IAAC,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAAC,MAAM,CAAC,EAAE,MAAM,CAAA;CAAE,GACpE,OAAO,CAAC,YAAY,EAAE,CAAC,CAUzB;AAED;;GAEG;AACH,wBAAsB,aAAa,CACjC,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,MAAM,EACd,WAAW,EAAE,MAAM,EACnB,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAC/B,OAAO,CAAC;IAAE,EAAE,EAAE,MAAM,CAAA;CAAE,CAAC,CAOzB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAC/B,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAC/B,OAAO,CAAC;IAAE,EAAE,EAAE,MAAM,CAAA;CAAE,CAAC,CAOzB;AAED;;;GAGG;AACH,MAAM,WAAW,eAAe;IAC9B,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAA;IACvB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAA;IACpB,MAAM,EAAE,MAAM,GAAG,IAAI,CAAA;CACtB;AAED;;;;GAIG;AACH,wBAAsB,oBAAoB,CACxC,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,MAAM,GACb,OAAO,CAAC,eAAe,CAAC,CAI1B"}
|
package/dist/ci/api-client.js
CHANGED
|
@@ -68,4 +68,14 @@ export async function createBatch(serverUrl, apiKey, payload) {
|
|
|
68
68
|
body: JSON.stringify(payload),
|
|
69
69
|
});
|
|
70
70
|
}
|
|
71
|
+
/**
|
|
72
|
+
* Fetch the project's evaluator config (with user-level fallback).
|
|
73
|
+
* Used by ed-test llm_judge benchmarks when judge_provider/judge_model
|
|
74
|
+
* are not specified in the test definition.
|
|
75
|
+
*/
|
|
76
|
+
export async function fetchEvaluatorConfig(serverUrl, apiKey) {
|
|
77
|
+
const base = normalizeBase(serverUrl);
|
|
78
|
+
const url = `${base}/api/test-runs/evaluator-config`;
|
|
79
|
+
return apiRequest(url, apiKey);
|
|
80
|
+
}
|
|
71
81
|
//# sourceMappingURL=api-client.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"api-client.js","sourceRoot":"","sources":["../../src/ci/api-client.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAA;AACxC,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAA;AAG1D,gEAAgE;AAChE,sDAAsD;AAEtD,iGAAiG;AACjG,SAAS,aAAa,CAAC,SAAiB;IACtC,OAAO,SAAS,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAA;AAC5D,CAAC;AAED,SAAS,OAAO,CAAC,MAAc;IAC7B,OAAO;QACL,cAAc,EAAE,kBAAkB;QAClC,SAAS,EAAE,MAAM,IAAI,EAAE;QACvB,kBAAkB,EAAE,UAAU,EAAE;KACjC,CAAA;AACH,CAAC;AAED,KAAK,UAAU,UAAU,CACvB,GAAW,EACX,MAAc,EACd,UAAuB,EAAE;IAEzB,MAAM,MAAM,GAAG,CAAC,OAAO,CAAC,MAAM,IAAI,KAAK,CAAC,CAAC,WAAW,EAAE,CAAA;IACtD,OAAO,CAAC,GAAG,CAAC,oBAAoB,MAAM,IAAI,GAAG,EAAE,CAAC,CAAA;IAEhD,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC,GAAG,EAAE;QACxC,GAAG,OAAO;QACV,OAAO,EAAE,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,EAAE,GAAG,CAAC,OAAO,CAAC,OAAiC,IAAI,EAAE,CAAC,EAAE;KACtF,CAAC,CAAA;IAEF,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;QACZ,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,CAAA;QAC7C,OAAO,CAAC,GAAG,CAAC,oBAAoB,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,MAAM,IAAI,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAA;QAC1F,MAAM,IAAI,KAAK,CAAC,OAAO,GAAG,CAAC,MAAM,KAAK,IAAI,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC,CAAA;IACjE,CAAC;IAED,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAA8B,CAAA;IACzD,kFAAkF;IAClF,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,IAAI,IAAI,IAAI,CAAM,CAAA;AAChD,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,SAAiB,EACjB,MAAc,EACd,OAAqE;IAErE,MAAM,IAAI,GAAG,aAAa,CAAC,SAAS,CAAC,CAAA;IACrC,MAAM,MAAM,GAAG,IAAI,eAAe,EAAE,CAAA;IACpC,IAAI,OAAO,EAAE,YAAY;QAAE,MAAM,CAAC,GAAG,CAAC,cAAc,EAAE,OAAO,CAAC,YAAY,CAAC,CAAA;IAC3E,IAAI,OAAO,EAAE,IAAI,EAAE,MAAM;QAAE,MAAM,CAAC,GAAG,CAAC,MAAM,EAAE,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;IACrE,IAAI,OAAO,EAAE,MAAM;QAAE,MAAM,CAAC,GAAG,CAAC,QAAQ,EAAE,OAAO,CAAC,MAAM,CAAC,CAAA;IAEzD,MAAM,EAAE,GAAG,MAAM,CAAC,QAAQ,EAAE,CAAA;IAC5B,MAAM,GAAG,GAAG,GAAG,IAAI,6BAA6B,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAA;IACpE
,OAAO,UAAU,CAAiB,GAAG,EAAE,MAAM,CAAC,CAAA;AAChD,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,SAAiB,EACjB,MAAc,EACd,WAAmB,EACnB,OAAgC;IAEhC,MAAM,IAAI,GAAG,aAAa,CAAC,SAAS,CAAC,CAAA;IACrC,MAAM,GAAG,GAAG,GAAG,IAAI,mBAAmB,WAAW,OAAO,CAAA;IACxD,OAAO,UAAU,CAAiB,GAAG,EAAE,MAAM,EAAE;QAC7C,MAAM,EAAE,MAAM;QACd,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC;KAC9B,CAAC,CAAA;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,SAAiB,EACjB,MAAc,EACd,OAAgC;IAEhC,MAAM,IAAI,GAAG,aAAa,CAAC,SAAS,CAAC,CAAA;IACrC,MAAM,GAAG,GAAG,GAAG,IAAI,yBAAyB,CAAA;IAC5C,OAAO,UAAU,CAAiB,GAAG,EAAE,MAAM,EAAE;QAC7C,MAAM,EAAE,MAAM;QACd,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC;KAC9B,CAAC,CAAA;AACJ,CAAC"}
|
|
1
|
+
{"version":3,"file":"api-client.js","sourceRoot":"","sources":["../../src/ci/api-client.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAA;AACxC,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAA;AAG1D,gEAAgE;AAChE,sDAAsD;AAEtD,iGAAiG;AACjG,SAAS,aAAa,CAAC,SAAiB;IACtC,OAAO,SAAS,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAA;AAC5D,CAAC;AAED,SAAS,OAAO,CAAC,MAAc;IAC7B,OAAO;QACL,cAAc,EAAE,kBAAkB;QAClC,SAAS,EAAE,MAAM,IAAI,EAAE;QACvB,kBAAkB,EAAE,UAAU,EAAE;KACjC,CAAA;AACH,CAAC;AAED,KAAK,UAAU,UAAU,CACvB,GAAW,EACX,MAAc,EACd,UAAuB,EAAE;IAEzB,MAAM,MAAM,GAAG,CAAC,OAAO,CAAC,MAAM,IAAI,KAAK,CAAC,CAAC,WAAW,EAAE,CAAA;IACtD,OAAO,CAAC,GAAG,CAAC,oBAAoB,MAAM,IAAI,GAAG,EAAE,CAAC,CAAA;IAEhD,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC,GAAG,EAAE;QACxC,GAAG,OAAO;QACV,OAAO,EAAE,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,EAAE,GAAG,CAAC,OAAO,CAAC,OAAiC,IAAI,EAAE,CAAC,EAAE;KACtF,CAAC,CAAA;IAEF,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;QACZ,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,CAAA;QAC7C,OAAO,CAAC,GAAG,CAAC,oBAAoB,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,MAAM,IAAI,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAA;QAC1F,MAAM,IAAI,KAAK,CAAC,OAAO,GAAG,CAAC,MAAM,KAAK,IAAI,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC,CAAA;IACjE,CAAC;IAED,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAA8B,CAAA;IACzD,kFAAkF;IAClF,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,IAAI,IAAI,IAAI,CAAM,CAAA;AAChD,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,SAAiB,EACjB,MAAc,EACd,OAAqE;IAErE,MAAM,IAAI,GAAG,aAAa,CAAC,SAAS,CAAC,CAAA;IACrC,MAAM,MAAM,GAAG,IAAI,eAAe,EAAE,CAAA;IACpC,IAAI,OAAO,EAAE,YAAY;QAAE,MAAM,CAAC,GAAG,CAAC,cAAc,EAAE,OAAO,CAAC,YAAY,CAAC,CAAA;IAC3E,IAAI,OAAO,EAAE,IAAI,EAAE,MAAM;QAAE,MAAM,CAAC,GAAG,CAAC,MAAM,EAAE,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;IACrE,IAAI,OAAO,EAAE,MAAM;QAAE,MAAM,CAAC,GAAG,CAAC,QAAQ,EAAE,OAAO,CAAC,MAAM,CAAC,CAAA;IAEzD,MAAM,EAAE,GAAG,MAAM,CAAC,QAAQ,EAAE,CAAA;IAC5B,MAAM,GAAG,GAAG,GAAG,IAAI,6BAA6B,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAA;IACpE
,OAAO,UAAU,CAAiB,GAAG,EAAE,MAAM,CAAC,CAAA;AAChD,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,SAAiB,EACjB,MAAc,EACd,WAAmB,EACnB,OAAgC;IAEhC,MAAM,IAAI,GAAG,aAAa,CAAC,SAAS,CAAC,CAAA;IACrC,MAAM,GAAG,GAAG,GAAG,IAAI,mBAAmB,WAAW,OAAO,CAAA;IACxD,OAAO,UAAU,CAAiB,GAAG,EAAE,MAAM,EAAE;QAC7C,MAAM,EAAE,MAAM;QACd,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC;KAC9B,CAAC,CAAA;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,SAAiB,EACjB,MAAc,EACd,OAAgC;IAEhC,MAAM,IAAI,GAAG,aAAa,CAAC,SAAS,CAAC,CAAA;IACrC,MAAM,GAAG,GAAG,GAAG,IAAI,yBAAyB,CAAA;IAC5C,OAAO,UAAU,CAAiB,GAAG,EAAE,MAAM,EAAE;QAC7C,MAAM,EAAE,MAAM;QACd,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC;KAC9B,CAAC,CAAA;AACJ,CAAC;AAYD;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,SAAiB,EACjB,MAAc;IAEd,MAAM,IAAI,GAAG,aAAa,CAAC,SAAS,CAAC,CAAA;IACrC,MAAM,GAAG,GAAG,GAAG,IAAI,iCAAiC,CAAA;IACpD,OAAO,UAAU,CAAkB,GAAG,EAAE,MAAM,CAAC,CAAA;AACjD,CAAC"}
|
package/dist/ci/benchmark.d.ts
CHANGED
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
*/
|
|
9
9
|
import type { TestMeasurement } from './measurement.js';
|
|
10
10
|
import type { TestBenchmarks } from './test-registry.js';
|
|
11
|
+
import type { EvaluatorConfig } from './api-client.js';
|
|
11
12
|
export type MetricName = 'duration_ms' | 'tokens_total' | 'output_contains' | 'output_not_contains' | 'llm_judge';
|
|
12
13
|
export interface MetricResult {
|
|
13
14
|
name: MetricName;
|
|
@@ -24,6 +25,9 @@ export interface BenchmarkResult {
|
|
|
24
25
|
/**
|
|
25
26
|
* Compare a measurement against benchmarks. Async because llm_judge requires
|
|
26
27
|
* an LLM call. The step's output is needed for output_contains/llm_judge checks.
|
|
28
|
+
*
|
|
29
|
+
* @param evaluatorConfig - Optional backend evaluator config used as fallback
|
|
30
|
+
* when the test does not specify judge_provider/judge_model.
|
|
27
31
|
*/
|
|
28
|
-
export declare function compareBenchmarks(measurement: TestMeasurement, benchmarks: TestBenchmarks, stepOutput?: unknown): Promise<BenchmarkResult>;
|
|
32
|
+
export declare function compareBenchmarks(measurement: TestMeasurement, benchmarks: TestBenchmarks, stepOutput?: unknown, evaluatorConfig?: EvaluatorConfig | null): Promise<BenchmarkResult>;
|
|
29
33
|
//# sourceMappingURL=benchmark.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"benchmark.d.ts","sourceRoot":"","sources":["../../src/ci/benchmark.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAGH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAA;AACvD,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAA;
|
|
1
|
+
{"version":3,"file":"benchmark.d.ts","sourceRoot":"","sources":["../../src/ci/benchmark.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAGH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAA;AACvD,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAA;AACxD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAA;AAEtD,MAAM,MAAM,UAAU,GAAG,aAAa,GAAG,cAAc,GAAG,iBAAiB,GAAG,qBAAqB,GAAG,WAAW,CAAA;AAEjH,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,UAAU,CAAA;IAChB,KAAK,EAAE,MAAM,CAAA;IACb,SAAS,EAAE,MAAM,CAAA;IACjB,MAAM,EAAE,OAAO,CAAA;IACf,MAAM,CAAC,EAAE,MAAM,CAAA;CAChB;AAED,MAAM,WAAW,eAAe;IAC9B,MAAM,EAAE,OAAO,CAAA;IACf,cAAc,CAAC,EAAE,MAAM,CAAA;IACvB,OAAO,EAAE,YAAY,EAAE,CAAA;CACxB;AAaD;;;;;;GAMG;AACH,wBAAsB,iBAAiB,CACrC,WAAW,EAAE,eAAe,EAC5B,UAAU,EAAE,cAAc,EAC1B,UAAU,CAAC,EAAE,OAAO,EACpB,eAAe,CAAC,EAAE,eAAe,GAAG,IAAI,GACvC,OAAO,CAAC,eAAe,CAAC,CA4J1B"}
|
package/dist/ci/benchmark.js
CHANGED
|
@@ -7,11 +7,23 @@
|
|
|
7
7
|
* Generated/updated on 2026-04-20.
|
|
8
8
|
*/
|
|
9
9
|
import { callProviderLLM } from '../matchers/index.js';
|
|
10
|
+
/** Maps backend provider names to SDK provider names used by callProviderLLM. */
|
|
11
|
+
const PROVIDER_NAME_MAP = {
|
|
12
|
+
anthropic: 'claude',
|
|
13
|
+
moonshot: 'kimi',
|
|
14
|
+
};
|
|
15
|
+
/** Normalize provider name from backend format to SDK format. */
|
|
16
|
+
function normalizeSdkProvider(provider) {
|
|
17
|
+
return PROVIDER_NAME_MAP[provider] ?? provider;
|
|
18
|
+
}
|
|
10
19
|
/**
|
|
11
20
|
* Compare a measurement against benchmarks. Async because llm_judge requires
|
|
12
21
|
* an LLM call. The step's output is needed for output_contains/llm_judge checks.
|
|
22
|
+
*
|
|
23
|
+
* @param evaluatorConfig - Optional backend evaluator config used as fallback
|
|
24
|
+
* when the test does not specify judge_provider/judge_model.
|
|
13
25
|
*/
|
|
14
|
-
export async function compareBenchmarks(measurement, benchmarks, stepOutput) {
|
|
26
|
+
export async function compareBenchmarks(measurement, benchmarks, stepOutput, evaluatorConfig) {
|
|
15
27
|
const metrics = [];
|
|
16
28
|
let firstFailure;
|
|
17
29
|
if (benchmarks.max_duration_ms !== undefined) {
|
|
@@ -74,10 +86,35 @@ export async function compareBenchmarks(measurement, benchmarks, stepOutput) {
|
|
|
74
86
|
const judge = benchmarks.llm_judge;
|
|
75
87
|
const outputStr = stringifyOutput(stepOutput);
|
|
76
88
|
const threshold = judge.judge_score_threshold ?? 7;
|
|
89
|
+
// Resolve provider/model: test definition takes priority, then backend
|
|
90
|
+
// evaluator config, then fall back to 'openai' default.
|
|
91
|
+
const resolvedProvider = normalizeSdkProvider(judge.judge_provider ?? evaluatorConfig?.provider ?? 'openai');
|
|
92
|
+
const resolvedModel = judge.judge_model ?? evaluatorConfig?.model ?? undefined;
|
|
93
|
+
// If the backend provided an API key and we're using its provider,
|
|
94
|
+
// set it in the environment so callProviderLLM can pick it up.
|
|
95
|
+
const envKeyMap = {
|
|
96
|
+
openai: 'OPENAI_API_KEY',
|
|
97
|
+
claude: 'ANTHROPIC_API_KEY',
|
|
98
|
+
gemini: 'GEMINI_API_KEY',
|
|
99
|
+
grok: 'GROK_API_KEY',
|
|
100
|
+
kimi: 'KIMI_API_KEY',
|
|
101
|
+
};
|
|
102
|
+
const envKey = envKeyMap[resolvedProvider];
|
|
103
|
+
let restoreEnv;
|
|
104
|
+
if (evaluatorConfig?.apiKey && envKey && !judge.judge_provider && !process.env[envKey]) {
|
|
105
|
+
const prev = process.env[envKey];
|
|
106
|
+
process.env[envKey] = evaluatorConfig.apiKey;
|
|
107
|
+
restoreEnv = () => {
|
|
108
|
+
if (prev === undefined)
|
|
109
|
+
delete process.env[envKey];
|
|
110
|
+
else
|
|
111
|
+
process.env[envKey] = prev;
|
|
112
|
+
};
|
|
113
|
+
}
|
|
77
114
|
try {
|
|
78
|
-
const provider = judge.judge_provider ?? 'openai';
|
|
79
115
|
const evalPrompt = `${judge.judge_prompt}\n\nOutput to evaluate:\n${outputStr}\n\nScore this output on a scale of 0-10. Respond with only the number.`;
|
|
80
|
-
const result = await callProviderLLM(evalPrompt, { provider, model:
|
|
116
|
+
const result = await callProviderLLM(evalPrompt, { provider: resolvedProvider, model: resolvedModel }, 'You are an expert test judge. Return only a number between 0 and 10.', 16, 0);
|
|
117
|
+
restoreEnv?.();
|
|
81
118
|
const score = parseFloat(result.content.match(/-?\d+(?:\.\d+)?/)?.[0] ?? '');
|
|
82
119
|
if (isNaN(score)) {
|
|
83
120
|
metrics.push({
|
|
@@ -106,6 +143,7 @@ export async function compareBenchmarks(measurement, benchmarks, stepOutput) {
|
|
|
106
143
|
}
|
|
107
144
|
}
|
|
108
145
|
catch (err) {
|
|
146
|
+
restoreEnv?.();
|
|
109
147
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
110
148
|
metrics.push({
|
|
111
149
|
name: 'llm_judge',
|
package/dist/ci/benchmark.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"benchmark.js","sourceRoot":"","sources":["../../src/ci/benchmark.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAA;
|
|
1
|
+
{"version":3,"file":"benchmark.js","sourceRoot":"","sources":["../../src/ci/benchmark.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAA;AAqBtD,iFAAiF;AACjF,MAAM,iBAAiB,GAA2B;IAChD,SAAS,EAAE,QAAQ;IACnB,QAAQ,EAAE,MAAM;CACjB,CAAA;AAED,iEAAiE;AACjE,SAAS,oBAAoB,CAAC,QAAgB;IAC5C,OAAO,iBAAiB,CAAC,QAAQ,CAAC,IAAI,QAAQ,CAAA;AAChD,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,WAA4B,EAC5B,UAA0B,EAC1B,UAAoB,EACpB,eAAwC;IAExC,MAAM,OAAO,GAAmB,EAAE,CAAA;IAClC,IAAI,YAAgC,CAAA;IAEpC,IAAI,UAAU,CAAC,eAAe,KAAK,SAAS,EAAE,CAAC;QAC7C,MAAM,MAAM,GAAG,WAAW,CAAC,WAAW,IAAI,UAAU,CAAC,eAAe,CAAA;QACpE,OAAO,CAAC,IAAI,CAAC;YACX,IAAI,EAAE,aAAa;YACnB,KAAK,EAAE,WAAW,CAAC,WAAW;YAC9B,SAAS,EAAE,UAAU,CAAC,eAAe;YACrC,MAAM;SACP,CAAC,CAAA;QACF,IAAI,CAAC,MAAM,IAAI,CAAC,YAAY,EAAE,CAAC;YAC7B,YAAY,GAAG,gBAAgB,WAAW,CAAC,WAAW,6BAA6B,UAAU,CAAC,eAAe,GAAG,CAAA;QAClH,CAAC;IACH,CAAC;IAED,IAAI,UAAU,CAAC,gBAAgB,KAAK,SAAS,EAAE,CAAC;QAC9C,MAAM,KAAK,GAAG,WAAW,CAAC,YAAY,IAAI,CAAC,CAAA;QAC3C,MAAM,MAAM,GAAG,KAAK,IAAI,UAAU,CAAC,gBAAgB,CAAA;QACnD,OAAO,CAAC,IAAI,CAAC;YACX,IAAI,EAAE,cAAc;YACpB,KAAK;YACL,SAAS,EAAE,UAAU,CAAC,gBAAgB;YACtC,MAAM;SACP,CAAC,CAAA;QACF,IAAI,CAAC,MAAM,IAAI,CAAC,YAAY,EAAE,CAAC;YAC7B,YAAY,GAAG,iBAAiB,KAAK,6BAA6B,UAAU,CAAC,gBAAgB,GAAG,CAAA;QAClG,CAAC;IACH,CAAC;IAED,+DAA+D;IAC/D,IAAI,UAAU,CAAC,eAAe,KAAK,SAAS,EAAE,CAAC;QAC7C,MAAM,SAAS,GAAG,eAAe,CAAC,UAAU,CAAC,CAAA;QAC7C,MAAM,MAAM,GAAG,SAAS,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,UAAU,CAAC,eAAe,CAAC,WAAW,EAAE,CAAC,CAAA;QACzF,OAAO,CAAC,IAAI,CAAC;YACX,IAAI,EAAE,iBAAiB;YACvB,KAAK,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACrB,SAAS,EAAE,CAAC;YACZ,MAAM;YACN,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,4BAA4B,UAAU,CAAC,eAAe,GAAG;SACvF,CAAC,CAAA;QACF,IAAI,CAAC,MAAM,IAAI,CAAC,YAAY,EAAE,CAAC;YAC7B,YAAY,GAAG,4BAA4B,UAAU,CAAC,eAAe,GAAG,CAAA;QAC1E,CAAC;IACH,CAAC;IAED,+DAA+D;IAC/D,IAAI,UAAU,CAAC,mBAAmB,KAAK,SAAS,EAAE,CAAC;QACjD,MAAM,SAAS,GAAG,eAAe,CAAC,UAAU,CAAC,CAAA;QAC7C,MAAM,MAAM,GAAG,CAAC,SAAS,CAAC,WAAW,EAAE,CAAC,QAAQ,CAA
C,UAAU,CAAC,mBAAmB,CAAC,WAAW,EAAE,CAAC,CAAA;QAC9F,OAAO,CAAC,IAAI,CAAC;YACX,IAAI,EAAE,qBAAqB;YAC3B,KAAK,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACrB,SAAS,EAAE,CAAC;YACZ,MAAM;YACN,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,iCAAiC,UAAU,CAAC,mBAAmB,GAAG;SAChG,CAAC,CAAA;QACF,IAAI,CAAC,MAAM,IAAI,CAAC,YAAY,EAAE,CAAC;YAC7B,YAAY,GAAG,iCAAiC,UAAU,CAAC,mBAAmB,GAAG,CAAA;QACnF,CAAC;IACH,CAAC;IAED,8DAA8D;IAC9D,IAAI,UAAU,CAAC,SAAS,EAAE,CAAC;QACzB,MAAM,KAAK,GAAG,UAAU,CAAC,SAAS,CAAA;QAClC,MAAM,SAAS,GAAG,eAAe,CAAC,UAAU,CAAC,CAAA;QAC7C,MAAM,SAAS,GAAG,KAAK,CAAC,qBAAqB,IAAI,CAAC,CAAA;QAElD,uEAAuE;QACvE,wDAAwD;QACxD,MAAM,gBAAgB,GAAG,oBAAoB,CAC3C,KAAK,CAAC,cAAc,IAAI,eAAe,EAAE,QAAQ,IAAI,QAAQ,CAC9D,CAAA;QACD,MAAM,aAAa,GAAG,KAAK,CAAC,WAAW,IAAI,eAAe,EAAE,KAAK,IAAI,SAAS,CAAA;QAE9E,mEAAmE;QACnE,+DAA+D;QAC/D,MAAM,SAAS,GAA2B;YACxC,MAAM,EAAE,gBAAgB;YACxB,MAAM,EAAE,mBAAmB;YAC3B,MAAM,EAAE,gBAAgB;YACxB,IAAI,EAAE,cAAc;YACpB,IAAI,EAAE,cAAc;SACrB,CAAA;QACD,MAAM,MAAM,GAAG,SAAS,CAAC,gBAAgB,CAAC,CAAA;QAC1C,IAAI,UAAoC,CAAA;QACxC,IAAI,eAAe,EAAE,MAAM,IAAI,MAAM,IAAI,CAAC,KAAK,CAAC,cAAc,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC;YACvF,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;YAChC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,eAAe,CAAC,MAAM,CAAA;YAC5C,UAAU,GAAG,GAAG,EAAE;gBAChB,IAAI,IAAI,KAAK,SAAS;oBAAE,OAAO,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;;oBAC7C,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,CAAA;YACjC,CAAC,CAAA;QACH,CAAC;QAED,IAAI,CAAC;YACH,MAAM,UAAU,GAAG,GAAG,KAAK,CAAC,YAAY,4BAA4B,SAAS,yEAAyE,CAAA;YAEtJ,MAAM,MAAM,GAAG,MAAM,eAAe,CAClC,UAAU,EACV,EAAE,QAAQ,EAAE,gBAAoE,EAAE,KAAK,EAAE,aAAa,EAAE,EACxG,sEAAsE,EACtE,EAAE,EACF,CAAC,CACF,CAAA;YAED,UAAU,EAAE,EAAE,CAAA;YAEd,MAAM,KAAK,GAAG,UAAU,CAAC,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,iBAAiB,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAA;YAC5E,IAAI,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;gBACjB,OAAO,CAAC,IAAI,CAAC;oBACX,IAAI,EAAE,WAAW;oBACjB,KAAK,EAAE,CAAC;oBACR,SAAS;oBACT,MAAM,EAAE,KAAK;oBACb,MAAM,EAAE,6CAA6C,MAAM,CAAC,OAAO,GAAG;iBACvE,CAAC,CAAA;gBACF,IAAI,CAAC,YAAY,E
AAE,CAAC;oBAClB,YAAY,GAAG,gDAAgD,CAAA;gBACjE,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,MAAM,MAAM,GAAG,KAAK,IAAI,SAAS,CAAA;gBACjC,OAAO,CAAC,IAAI,CAAC;oBACX,IAAI,EAAE,WAAW;oBACjB,KAAK,EAAE,KAAK;oBACZ,SAAS;oBACT,MAAM;oBACN,MAAM,EAAE,UAAU,KAAK,IAAI,SAAS,EAAE;iBACvC,CAAC,CAAA;gBACF,IAAI,CAAC,MAAM,IAAI,CAAC,YAAY,EAAE,CAAC;oBAC7B,YAAY,GAAG,oBAAoB,KAAK,sBAAsB,SAAS,GAAG,CAAA;gBAC5E,CAAC;YACH,CAAC;QACH,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,UAAU,EAAE,EAAE,CAAA;YACd,MAAM,MAAM,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAA;YAC/D,OAAO,CAAC,IAAI,CAAC;gBACX,IAAI,EAAE,WAAW;gBACjB,KAAK,EAAE,CAAC;gBACR,SAAS;gBACT,MAAM,EAAE,KAAK;gBACb,MAAM,EAAE,oBAAoB,MAAM,EAAE;aACrC,CAAC,CAAA;YACF,IAAI,CAAC,YAAY,EAAE,CAAC;gBAClB,YAAY,GAAG,oBAAoB,MAAM,EAAE,CAAA;YAC7C,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAA;IAC9C,OAAO;QACL,MAAM,EAAE,SAAS;QACjB,cAAc,EAAE,YAAY;QAC5B,OAAO;KACR,CAAA;AACH,CAAC;AAED,kEAAkE;AAClE,SAAS,eAAe,CAAC,MAAe;IACtC,IAAI,MAAM,KAAK,IAAI,IAAI,MAAM,KAAK,SAAS;QAAE,OAAO,EAAE,CAAA;IACtD,IAAI,OAAO,MAAM,KAAK,QAAQ;QAAE,OAAO,MAAM,CAAA;IAC7C,OAAO,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAA;AAC/B,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ed-runner.d.ts","sourceRoot":"","sources":["../../src/ci/ed-runner.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"ed-runner.d.ts","sourceRoot":"","sources":["../../src/ci/ed-runner.ts"],"names":[],"mappings":"AAQA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAA;AACvD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAA;AAOrD,MAAM,WAAW,gBAAgB;IAC/B,GAAG,CAAC,EAAE,MAAM,CAAA;IACZ,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,QAAQ,CAAC,EAAE,OAAO,CAAA;IAClB,QAAQ,CAAC,EAAE,OAAO,CAAA;IAClB,QAAQ,CAAC,EAAE,SAAS,GAAG,MAAM,GAAG,OAAO,CAAA;IACvC,mFAAmF;IACnF,IAAI,CAAC,EAAE,MAAM,CAAA;CACd;AAED,MAAM,WAAW,iBAAiB;IAChC,MAAM,EAAE,MAAM,GAAG,MAAM,CAAA;IACvB,aAAa,CAAC,EAAE,MAAM,CAAA;IACtB,WAAW,CAAC,EAAE,eAAe,CAAA;IAC7B,eAAe,CAAC,EAAE,eAAe,CAAA;IACjC,MAAM,CAAC,EAAE,OAAO,CAAA;IAChB,UAAU,EAAE,MAAM,CAAA;IAClB,SAAS,EAAE,MAAM,CAAA;IACjB,UAAU,EAAE,MAAM,CAAA;CACnB;AAED,MAAM,WAAW,YAAY;IAC3B,MAAM,EAAE,MAAM,CAAA;IACd,QAAQ,EAAE,MAAM,CAAA;IAChB,MAAM,EAAE,MAAM,GAAG,MAAM,CAAA;IACvB,aAAa,CAAC,EAAE,MAAM,CAAA;IACtB,WAAW,CAAC,EAAE,eAAe,CAAA;IAC7B,eAAe,CAAC,EAAE,eAAe,CAAA;IACjC,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,MAAM,CAAC,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAA;IAC1C,KAAK,CAAC,EAAE,OAAO,CAAA;IACf,MAAM,CAAC,EAAE,OAAO,CAAA;IAChB,UAAU,EAAE,MAAM,CAAA;IAClB,iDAAiD;IACjD,UAAU,CAAC,EAAE,iBAAiB,EAAE,CAAA;CACjC;AAED,MAAM,WAAW,eAAe;IAC9B,KAAK,EAAE,MAAM,CAAA;IACb,SAAS,EAAE,MAAM,CAAA;IACjB,UAAU,EAAE,MAAM,CAAA;IAClB,OAAO,EAAE,YAAY,EAAE,CAAA;IACvB,UAAU,EAAE,MAAM,CAAA;CACnB;AAID,wBAAsB,UAAU,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,CAAC,CAyGrF"}
|
package/dist/ci/ed-runner.js
CHANGED
|
@@ -4,6 +4,7 @@ import { createReplayContext, installReplay, uninstallReplay, ReplayMissError }
|
|
|
4
4
|
import { collectMeasurement } from './measurement.js';
|
|
5
5
|
import { SDK_VERSION } from './trace-schema.js';
|
|
6
6
|
import { compareBenchmarks } from './benchmark.js';
|
|
7
|
+
import { fetchEvaluatorConfig } from './api-client.js';
|
|
7
8
|
// ─── Runner ─────────────────────────────────────────────────
|
|
8
9
|
export async function runEdTests(options) {
|
|
9
10
|
const cwd = options?.cwd ?? process.cwd();
|
|
@@ -29,12 +30,29 @@ export async function runEdTests(options) {
|
|
|
29
30
|
testsToRun = tests.filter(t => matchGlob(t.name, pattern));
|
|
30
31
|
}
|
|
31
32
|
const maxRuns = Math.max(1, options?.runs ?? 1);
|
|
33
|
+
// Fetch evaluator config from backend if any test uses llm_judge without
|
|
34
|
+
// explicit provider/model. Cached for the entire run to avoid repeated calls.
|
|
35
|
+
let evaluatorConfig = null;
|
|
36
|
+
const needsEvaluatorConfig = testsToRun.some(t => t.benchmarks.llm_judge && (!t.benchmarks.llm_judge.judge_provider || !t.benchmarks.llm_judge.judge_model));
|
|
37
|
+
if (needsEvaluatorConfig) {
|
|
38
|
+
const serverUrl = process.env.ELASTICDASH_API_URL ?? process.env.ELASTICDASH_SERVER ?? '';
|
|
39
|
+
const apiKey = process.env.ELASTICDASH_API_KEY ?? '';
|
|
40
|
+
if (serverUrl && apiKey) {
|
|
41
|
+
try {
|
|
42
|
+
evaluatorConfig = await fetchEvaluatorConfig(serverUrl, apiKey);
|
|
43
|
+
console.log(`[ed-test] Evaluator config: provider=${evaluatorConfig.provider}, model=${evaluatorConfig.model}, hasKey=${!!evaluatorConfig.apiKey}`);
|
|
44
|
+
}
|
|
45
|
+
catch (err) {
|
|
46
|
+
console.warn(`[ed-test] Could not fetch evaluator config: ${err instanceof Error ? err.message : String(err)}`);
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
}
|
|
32
50
|
for (const test of testsToRun) {
|
|
33
51
|
const allRuns = [];
|
|
34
52
|
let bestResult = null;
|
|
35
53
|
for (let attempt = 1; attempt <= maxRuns; attempt++) {
|
|
36
54
|
const runStartedAt = new Date().toISOString();
|
|
37
|
-
const result = await runSingleTest(test);
|
|
55
|
+
const result = await runSingleTest(test, evaluatorConfig);
|
|
38
56
|
const runFinishedAt = new Date().toISOString();
|
|
39
57
|
if (attempt > 1) {
|
|
40
58
|
console.log(` [ed-test] ${test.name}: run ${attempt}/${maxRuns} — ${result.status}`);
|
|
@@ -81,7 +99,7 @@ export async function runEdTests(options) {
|
|
|
81
99
|
async function resolveCustomInput(input) {
|
|
82
100
|
return typeof input === 'function' ? await input() : input;
|
|
83
101
|
}
|
|
84
|
-
async function runSingleTest(test) {
|
|
102
|
+
async function runSingleTest(test, evaluatorConfig) {
|
|
85
103
|
const startMs = Date.now();
|
|
86
104
|
const targetStep = test.traceData.steps.find(s => s.step_id === test.target.step_id);
|
|
87
105
|
const resolvedInput = test.input !== undefined
|
|
@@ -139,7 +157,7 @@ async function runSingleTest(test) {
|
|
|
139
157
|
};
|
|
140
158
|
}
|
|
141
159
|
// Compare against benchmarks (async to support llm_judge)
|
|
142
|
-
const benchmarkResult = await compareBenchmarks(measurement, test.benchmarks, targetStep?.output);
|
|
160
|
+
const benchmarkResult = await compareBenchmarks(measurement, test.benchmarks, targetStep?.output, evaluatorConfig);
|
|
143
161
|
return {
|
|
144
162
|
...base,
|
|
145
163
|
testId: test.name,
|
|
@@ -158,7 +176,7 @@ async function runSingleTest(test) {
|
|
|
158
176
|
const traceMeasurement = extractMeasurementFromTrace(test);
|
|
159
177
|
if (traceMeasurement) {
|
|
160
178
|
console.log(` [ed-test] ${test.name}: run() failed (${err instanceof Error ? err.message : String(err)}), using trace measurement fallback`);
|
|
161
|
-
const benchmarkResult = await compareBenchmarks(traceMeasurement, test.benchmarks, targetStep?.output);
|
|
179
|
+
const benchmarkResult = await compareBenchmarks(traceMeasurement, test.benchmarks, targetStep?.output, evaluatorConfig);
|
|
162
180
|
return {
|
|
163
181
|
...base,
|
|
164
182
|
testId: test.name,
|
package/dist/ci/ed-runner.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ed-runner.js","sourceRoot":"","sources":["../../src/ci/ed-runner.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAA;AACxC,OAAO,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAA;AAC5C,OAAO,EAAE,mBAAmB,EAAE,aAAa,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,aAAa,CAAA;AAClG,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AACrD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAA;AAC/C,OAAO,EAAE,iBAAiB,EAAE,MAAM,gBAAgB,CAAA;
|
|
1
|
+
{"version":3,"file":"ed-runner.js","sourceRoot":"","sources":["../../src/ci/ed-runner.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAA;AACxC,OAAO,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAA;AAC5C,OAAO,EAAE,mBAAmB,EAAE,aAAa,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,aAAa,CAAA;AAClG,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AACrD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAA;AAC/C,OAAO,EAAE,iBAAiB,EAAE,MAAM,gBAAgB,CAAA;AAClD,OAAO,EAAE,oBAAoB,EAAE,MAAM,iBAAiB,CAAA;AAuDtD,+DAA+D;AAE/D,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,OAA0B;IACzD,MAAM,GAAG,GAAG,OAAO,EAAE,GAAG,IAAI,OAAO,CAAC,GAAG,EAAE,CAAA;IACzC,MAAM,KAAK,GAAG,UAAU,EAAE,CAAA;IAC1B,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAA;IAC1C,MAAM,OAAO,GAAmB,EAAE,CAAA;IAElC,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,MAAM,SAAS,CAAC,EAAE,GAAG,EAAE,CAAC,CAAA;IAElD,2CAA2C;IAC3C,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;QACzB,OAAO,CAAC,IAAI,CAAC;YACX,MAAM,EAAE,GAAG,CAAC,QAAQ,IAAI,SAAS;YACjC,QAAQ,EAAE,GAAG,CAAC,QAAQ,IAAI,SAAS;YACnC,MAAM,EAAE,MAAM;YACd,aAAa,EAAE,qBAAqB,GAAG,CAAC,OAAO,EAAE;YACjD,UAAU,EAAE,CAAC;YACb,UAAU,EAAE,EAAE;SACf,CAAC,CAAA;IACJ,CAAC;IAED,mCAAmC;IACnC,IAAI,UAAU,GAAoB,KAAK,CAAA;IACvC,IAAI,OAAO,EAAE,MAAM,EAAE,CAAC;QACpB,MAAM,OAAO,GAAG,OAAO,CAAC,MAAM,CAAA;QAC9B,UAAU,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,CAAA;IAC5D,CAAC;IAED,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,EAAE,IAAI,IAAI,CAAC,CAAC,CAAA;IAE/C,yEAAyE;IACzE,8EAA8E;IAC9E,IAAI,eAAe,GAA2B,IAAI,CAAA;IAClD,MAAM,oBAAoB,GAAG,UAAU,CAAC,IAAI,CAC1C,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,IAAI,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,cAAc,IAAI,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,WAAW,CAAC,CAC/G,CAAA;IACD,IAAI,oBAAoB,EAAE,CAAC;QACzB,MAAM,SAAS,GAAG,OAAO,CAAC,GAAG,CAAC,mBAAmB,IAAI,OAAO,CAAC,GAAG,CAAC,kBAAkB,IAAI,EAAE,CAAA;QACzF,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,mBAAmB,IAAI,EAAE,CAAA;QACpD,IAAI,SAAS,IAAI,MAAM,EAAE,CAAC;YACxB,IAAI,CAAC;gBACH,eAAe,GAAG,MAAM,oBAAoB,CAAC,SAAS,EAAE,MAAM,CAAC,CAAA;gBAC/D,OAAO,CAAC,GAAG,CAAC,wCAAwC,eAAe,CAAC,QAAQ,WAAW,eAAe,
CAAC,KAAK,YAAY,CAAC,CAAC,eAAe,CAAC,MAAM,EAAE,CAAC,CAAA;YACrJ,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,OAAO,CAAC,IAAI,CAAC,+CAA+C,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAA;YACjH,CAAC;QACH,CAAC;IACH,CAAC;IAED,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;QAC9B,MAAM,OAAO,GAAwB,EAAE,CAAA;QACvC,IAAI,UAAU,GAAwB,IAAI,CAAA;QAE1C,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,OAAO,EAAE,OAAO,EAAE,EAAE,CAAC;YACpD,MAAM,YAAY,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAA;YAC7C,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,IAAI,EAAE,eAAe,CAAC,CAAA;YACzD,MAAM,aAAa,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAA;YAE9C,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;gBAChB,OAAO,CAAC,GAAG,CAAC,eAAe,IAAI,CAAC,IAAI,SAAS,OAAO,IAAI,OAAO,MAAM,MAAM,CAAC,MAAM,EAAE,CAAC,CAAA;YACvF,CAAC;YAED,+BAA+B;YAC/B,OAAO,CAAC,IAAI,CAAC;gBACX,MAAM,EAAE,MAAM,CAAC,MAAM;gBACrB,aAAa,EAAE,MAAM,CAAC,aAAa;gBACnC,WAAW,EAAE,MAAM,CAAC,WAAW;gBAC/B,eAAe,EAAE,MAAM,CAAC,eAAe;gBACvC,MAAM,EAAE,MAAM,CAAC,MAAM;gBACrB,UAAU,EAAE,MAAM,CAAC,UAAU;gBAC7B,SAAS,EAAE,YAAY;gBACvB,UAAU,EAAE,aAAa;aAC1B,CAAC,CAAA;YAEF,4EAA4E;YAC5E,IAAI,CAAC,UAAU,IAAI,MAAM,CAAC,MAAM,KAAK,MAAM,EAAE,CAAC;gBAC5C,UAAU,GAAG,MAAM,CAAA;YACrB,CAAC;QAEP,CAAC;QAEG,oCAAoC;QACpC,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,CAAA;QACxD,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,CAAA;QAExD,OAAO,CAAC,IAAI,CAAC;YACX,GAAG,UAAW;YACd,MAAM,EAAE,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;YACnC,aAAa,EAAE,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,EAAE,aAAa,IAAI,UAAW,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,SAAS;YAC9F,UAAU,EAAE,OAAO;SACpB,CAAC,CAAA;QAEF,IAAI,OAAO,EAAE,QAAQ,IAAI,SAAS,EAAE,CAAC;YACnC,MAAK;QACP,CAAC;IACH,CAAC;IAED,MAAM,UAAU,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAA;IAE3C,OAAO;QACL,KAAK;QACL,SAAS;QACT,UAAU;QACV,OAAO;QACP,UAAU,EAAE,WAAW;KACxB,CAAA;AACH,CAAC;AAED,+DAA+D;AAE/D,KAAK,UAAU,kBAAkB,CAAC,KAAmD;IACnF,OAAO,OAAO,KAAK,KAAK,UAAU,CAAC,CAAC,CAAC,MAAO,KAA0C,EAAE,CAAC,CAAC,CAAC,KAAK,CAAA;AAClG,CAAC;AAED,KAAK,UAAU,aAAa,CAAC,IAA
mB,EAAE,eAAwC;IACxF,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;IAC1B,MAAM,UAAU,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;IAEpF,MAAM,aAAa,GAAG,IAAI,CAAC,KAAK,KAAK,SAAS;QAC5C,CAAC,CAAC,MAAM,kBAAkB,CAAC,IAAI,CAAC,KAAK,CAAC;QACtC,CAAC,CAAC,UAAU,EAAE,KAAK,CAAA;IAErB,MAAM,IAAI,GAA0B;QAClC,MAAM,EAAE,IAAI,CAAC,IAAI;QACjB,QAAQ,EAAE,IAAI,CAAC,IAAI;QACnB,QAAQ,EAAE,IAAI,CAAC,KAAK;QACpB,MAAM,EAAE,EAAE,IAAI,EAAE,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,OAAO,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE;QAChE,KAAK,EAAE,aAAa;QACpB,MAAM,EAAE,UAAU,EAAE,MAAM;KAC3B,CAAA;IAED,4BAA4B;IAC5B,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,OAAO,IAAI,CAAC,GAAG,KAAK,UAAU,EAAE,CAAC;QAChD,OAAO;YACL,GAAG,IAAI;YACP,MAAM,EAAE,IAAI,CAAC,IAAI;YACjB,QAAQ,EAAE,IAAI,CAAC,IAAI;YACnB,MAAM,EAAE,MAAM;YACd,aAAa,EAAE,0BAA0B;YACzC,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,OAAO;SACjC,CAAA;IACH,CAAC;IAED,MAAM,SAAS,GAAG,mBAAmB,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;IAC1E,aAAa,CAAC,SAAS,CAAC,CAAA;IAExB,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,UAAU,IAAI,KAAK,CAAA;QAE1C,MAAM,OAAO,CAAC,IAAI,CAAC;YACjB,IAAI,CAAC,GAAG,EAAE;YACV,IAAI,OAAO,CAAQ,CAAC,CAAC,EAAE,MAAM,EAAE,EAAE,CAC/B,UAAU,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,IAAI,YAAY,CAAC,SAAS,CAAC,CAAC,EAAE,SAAS,CAAC,CACjE;SACF,CAAC,CAAA;QAEF,4CAA4C;QAC5C,gEAAgE;QAChE,sEAAsE;QACtE,qEAAqE;QACrE,sEAAsE;QACtE,uEAAuE;QACvE,2CAA2C;QAC3C,IAAI,WAAW,GAAG,kBAAkB,CAAC,SAAS,CAAC,CAAA;QAC/C,IAAI,CAAC,WAAW,EAAE,CAAC;YACjB,WAAW,GAAG,2BAA2B,CAAC,IAAI,CAAC,IAAI,IAAI,CAAA;YACvD,IAAI,WAAW,EAAE,CAAC;gBAChB,OAAO,CAAC,GAAG,CAAC,eAAe,IAAI,CAAC,IAAI,yDAAyD,CAAC,CAAA;YAChG,CAAC;QACH,CAAC;QACD,IAAI,CAAC,WAAW,EAAE,CAAC;YACjB,OAAO;gBACL,GAAG,IAAI;gBACP,MAAM,EAAE,IAAI,CAAC,IAAI;gBACjB,QAAQ,EAAE,IAAI,CAAC,IAAI;gBACnB,MAAM,EAAE,MAAM;gBACd,aAAa,EAAE,gBAAgB,IAAI,CAAC,MAAM,CAAC,OAAO,qCAAqC;gBACvF,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,OAAO;aACjC,CAAA;QACH,CAAC;QAED,0DAA0D;QAC1D,MAAM,eAAe,GAAG,MAAM,iBAAiB,CAAC,WAAW,EAAE,IAAI,CAAC,UAAU,EAAE,UAAU,EAAE,MAAM,EAAE,eAAe,CAAC
,CAAA;QAElH,OAAO;YACL,GAAG,IAAI;YACP,MAAM,EAAE,IAAI,CAAC,IAAI;YACjB,QAAQ,EAAE,IAAI,CAAC,IAAI;YACnB,MAAM,EAAE,eAAe,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;YAChD,aAAa,EAAE,eAAe,CAAC,cAAc;YAC7C,WAAW;YACX,eAAe;YACf,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,OAAO;SACjC,CAAA;IACH,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,4EAA4E;QAC5E,4EAA4E;QAC5E,iEAAiE;QACjE,MAAM,gBAAgB,GAAG,2BAA2B,CAAC,IAAI,CAAC,CAAA;QAC1D,IAAI,gBAAgB,EAAE,CAAC;YACrB,OAAO,CAAC,GAAG,CAAC,eAAe,IAAI,CAAC,IAAI,mBAAmB,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,qCAAqC,CAAC,CAAA;YAC7I,MAAM,eAAe,GAAG,MAAM,iBAAiB,CAAC,gBAAgB,EAAE,IAAI,CAAC,UAAU,EAAE,UAAU,EAAE,MAAM,EAAE,eAAe,CAAC,CAAA;YACvH,OAAO;gBACL,GAAG,IAAI;gBACP,MAAM,EAAE,IAAI,CAAC,IAAI;gBACjB,QAAQ,EAAE,IAAI,CAAC,IAAI;gBACnB,MAAM,EAAE,eAAe,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;gBAChD,aAAa,EAAE,eAAe,CAAC,cAAc;gBAC7C,WAAW,EAAE,gBAAgB;gBAC7B,eAAe;gBACf,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,OAAO;aACjC,CAAA;QACH,CAAC;QAED,IAAI,GAAG,YAAY,eAAe,EAAE,CAAC;YACnC,OAAO;gBACL,GAAG,IAAI;gBACP,MAAM,EAAE,IAAI,CAAC,IAAI;gBACjB,QAAQ,EAAE,IAAI,CAAC,IAAI;gBACnB,MAAM,EAAE,MAAM;gBACd,aAAa,EAAE,gBAAgB,GAAG,CAAC,QAAQ,KAAK,GAAG,CAAC,QAAQ,EAAE;gBAC9D,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,OAAO;aACjC,CAAA;QACH,CAAC;QACD,IAAI,GAAG,YAAY,YAAY,EAAE,CAAC;YAChC,OAAO;gBACL,GAAG,IAAI;gBACP,MAAM,EAAE,IAAI,CAAC,IAAI;gBACjB,QAAQ,EAAE,IAAI,CAAC,IAAI;gBACnB,MAAM,EAAE,MAAM;gBACd,aAAa,EAAE,wBAAwB,GAAG,CAAC,SAAS,IAAI;gBACxD,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,OAAO;aACjC,CAAA;QACH,CAAC;QACD,OAAO;YACL,GAAG,IAAI;YACP,MAAM,EAAE,IAAI,CAAC,IAAI;YACjB,QAAQ,EAAE,IAAI,CAAC,IAAI;YACnB,MAAM,EAAE,MAAM;YACd,aAAa,EAAE,oBAAoB,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE;YACrF,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,OAAO;SACjC,CAAA;IACH,CAAC;YAAS,CAAC;QACT,eAAe,EAAE,CAAA;IACnB,CAAC;AACH,CAAC;AAED,+DAA+D;AAE/D;;;;;;GAMG;AACH,SAAS,2BAA2B,CAAC,IAAmB;IACtD,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,
IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;IAC9E,IAAI,CAAC,IAAI;QAAE,OAAO,SAAS,CAAA;IAE3B,MAAM,MAAM,GAAoB;QAC9B,WAAW,EAAE,IAAI,CAAC,WAAW;KAC9B,CAAA;IAED,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;QAChB,MAAM,CAAC,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,KAAK,CAAA;QACvC,MAAM,CAAC,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,CAAA;QACzC,MAAM,CAAC,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,KAAK,CAAA;IACzC,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED,+DAA+D;AAE/D,MAAM,YAAa,SAAQ,KAAK;IACX;IAAnB,YAAmB,SAAiB;QAClC,KAAK,CAAC,wBAAwB,SAAS,IAAI,CAAC,CAAA;QAD3B,cAAS,GAAT,SAAS,CAAQ;QAElC,IAAI,CAAC,IAAI,GAAG,cAAc,CAAA;IAC5B,CAAC;CACF;AAED,SAAS,SAAS,CAAC,IAAY,EAAE,OAAe;IAC9C,0CAA0C;IAC1C,MAAM,KAAK,GAAG,IAAI,MAAM,CACtB,GAAG,GAAG,OAAO,CAAC,OAAO,CAAC,mBAAmB,EAAE,MAAM,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,GAAG,GAAG,CAClG,CAAA;IACD,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;AACzB,CAAC"}
|
package/dist/index.cjs
CHANGED
|
@@ -4392,6 +4392,7 @@ __export(index_exports, {
|
|
|
4392
4392
|
expect: () => import_expect.expect,
|
|
4393
4393
|
extractTaskOutputs: () => extractTaskOutputs,
|
|
4394
4394
|
fetchCapturedTrace: () => fetchCapturedTrace,
|
|
4395
|
+
fetchEvaluatorConfig: () => fetchEvaluatorConfig,
|
|
4395
4396
|
fetchTestGroups: () => fetchTestGroups,
|
|
4396
4397
|
getCaptureContext: () => getCaptureContext,
|
|
4397
4398
|
getCurrentTrace: () => getCurrentTrace,
|
|
@@ -6291,7 +6292,7 @@ async function loadTests(options) {
|
|
|
6291
6292
|
}
|
|
6292
6293
|
|
|
6293
6294
|
// src/ci/ed-runner.ts
|
|
6294
|
-
var
|
|
6295
|
+
var import_node_crypto9 = require("node:crypto");
|
|
6295
6296
|
|
|
6296
6297
|
// src/ci/measurement.ts
|
|
6297
6298
|
function collectMeasurement(ctx) {
|
|
@@ -6313,7 +6314,14 @@ init_trace_schema();
|
|
|
6313
6314
|
|
|
6314
6315
|
// src/ci/benchmark.ts
|
|
6315
6316
|
init_matchers();
|
|
6316
|
-
|
|
6317
|
+
var PROVIDER_NAME_MAP = {
|
|
6318
|
+
anthropic: "claude",
|
|
6319
|
+
moonshot: "kimi"
|
|
6320
|
+
};
|
|
6321
|
+
function normalizeSdkProvider(provider) {
|
|
6322
|
+
return PROVIDER_NAME_MAP[provider] ?? provider;
|
|
6323
|
+
}
|
|
6324
|
+
async function compareBenchmarks(measurement, benchmarks, stepOutput, evaluatorConfig) {
|
|
6317
6325
|
const metrics = [];
|
|
6318
6326
|
let firstFailure;
|
|
6319
6327
|
if (benchmarks.max_duration_ms !== void 0) {
|
|
@@ -6373,8 +6381,28 @@ async function compareBenchmarks(measurement, benchmarks, stepOutput) {
|
|
|
6373
6381
|
const judge = benchmarks.llm_judge;
|
|
6374
6382
|
const outputStr = stringifyOutput(stepOutput);
|
|
6375
6383
|
const threshold = judge.judge_score_threshold ?? 7;
|
|
6384
|
+
const resolvedProvider = normalizeSdkProvider(
|
|
6385
|
+
judge.judge_provider ?? evaluatorConfig?.provider ?? "openai"
|
|
6386
|
+
);
|
|
6387
|
+
const resolvedModel = judge.judge_model ?? evaluatorConfig?.model ?? void 0;
|
|
6388
|
+
const envKeyMap = {
|
|
6389
|
+
openai: "OPENAI_API_KEY",
|
|
6390
|
+
claude: "ANTHROPIC_API_KEY",
|
|
6391
|
+
gemini: "GEMINI_API_KEY",
|
|
6392
|
+
grok: "GROK_API_KEY",
|
|
6393
|
+
kimi: "KIMI_API_KEY"
|
|
6394
|
+
};
|
|
6395
|
+
const envKey = envKeyMap[resolvedProvider];
|
|
6396
|
+
let restoreEnv;
|
|
6397
|
+
if (evaluatorConfig?.apiKey && envKey && !judge.judge_provider && !process.env[envKey]) {
|
|
6398
|
+
const prev = process.env[envKey];
|
|
6399
|
+
process.env[envKey] = evaluatorConfig.apiKey;
|
|
6400
|
+
restoreEnv = () => {
|
|
6401
|
+
if (prev === void 0) delete process.env[envKey];
|
|
6402
|
+
else process.env[envKey] = prev;
|
|
6403
|
+
};
|
|
6404
|
+
}
|
|
6376
6405
|
try {
|
|
6377
|
-
const provider = judge.judge_provider ?? "openai";
|
|
6378
6406
|
const evalPrompt = `${judge.judge_prompt}
|
|
6379
6407
|
|
|
6380
6408
|
Output to evaluate:
|
|
@@ -6383,11 +6411,12 @@ ${outputStr}
|
|
|
6383
6411
|
Score this output on a scale of 0-10. Respond with only the number.`;
|
|
6384
6412
|
const result = await callProviderLLM(
|
|
6385
6413
|
evalPrompt,
|
|
6386
|
-
{ provider, model:
|
|
6414
|
+
{ provider: resolvedProvider, model: resolvedModel },
|
|
6387
6415
|
"You are an expert test judge. Return only a number between 0 and 10.",
|
|
6388
6416
|
16,
|
|
6389
6417
|
0
|
|
6390
6418
|
);
|
|
6419
|
+
restoreEnv?.();
|
|
6391
6420
|
const score = parseFloat(result.content.match(/-?\d+(?:\.\d+)?/)?.[0] ?? "");
|
|
6392
6421
|
if (isNaN(score)) {
|
|
6393
6422
|
metrics.push({
|
|
@@ -6414,6 +6443,7 @@ Score this output on a scale of 0-10. Respond with only the number.`;
|
|
|
6414
6443
|
}
|
|
6415
6444
|
}
|
|
6416
6445
|
} catch (err) {
|
|
6446
|
+
restoreEnv?.();
|
|
6417
6447
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
6418
6448
|
metrics.push({
|
|
6419
6449
|
name: "llm_judge",
|
|
@@ -6440,10 +6470,70 @@ function stringifyOutput(output) {
|
|
|
6440
6470
|
return JSON.stringify(output);
|
|
6441
6471
|
}
|
|
6442
6472
|
|
|
6473
|
+
// src/ci/api-client.ts
|
|
6474
|
+
var import_node_crypto8 = require("node:crypto");
|
|
6475
|
+
init_http();
|
|
6476
|
+
function normalizeBase(serverUrl) {
|
|
6477
|
+
return serverUrl.replace(/\/+$/, "").replace(/\/api$/, "");
|
|
6478
|
+
}
|
|
6479
|
+
function headers(apiKey) {
|
|
6480
|
+
return {
|
|
6481
|
+
"Content-Type": "application/json",
|
|
6482
|
+
"api-key": apiKey || "",
|
|
6483
|
+
"X-Correlation-ID": (0, import_node_crypto8.randomUUID)()
|
|
6484
|
+
};
|
|
6485
|
+
}
|
|
6486
|
+
async function apiRequest(url, apiKey, options = {}) {
|
|
6487
|
+
const method = (options.method || "GET").toUpperCase();
|
|
6488
|
+
console.log(`[elasticdash ci] ${method} ${url}`);
|
|
6489
|
+
const res = await getOriginalFetch()(url, {
|
|
6490
|
+
...options,
|
|
6491
|
+
headers: { ...headers(apiKey), ...options.headers ?? {} }
|
|
6492
|
+
});
|
|
6493
|
+
if (!res.ok) {
|
|
6494
|
+
const text = await res.text().catch(() => "");
|
|
6495
|
+
console.log(`[elasticdash ci] ${method} ${url} \u2192 ${res.status} ${text.substring(0, 200)}`);
|
|
6496
|
+
throw new Error(`API ${res.status}: ${text || res.statusText}`);
|
|
6497
|
+
}
|
|
6498
|
+
const json = await res.json();
|
|
6499
|
+
return json.result ?? json.data ?? json;
|
|
6500
|
+
}
|
|
6501
|
+
async function fetchTestGroups(serverUrl, apiKey, filters) {
|
|
6502
|
+
const base = normalizeBase(serverUrl);
|
|
6503
|
+
const params = new URLSearchParams();
|
|
6504
|
+
if (filters?.workflowName) params.set("workflowName", filters.workflowName);
|
|
6505
|
+
if (filters?.tags?.length) params.set("tags", filters.tags.join(","));
|
|
6506
|
+
if (filters?.status) params.set("status", filters.status);
|
|
6507
|
+
const qs = params.toString();
|
|
6508
|
+
const url = `${base}/api/testgroups/by-project${qs ? `?${qs}` : ""}`;
|
|
6509
|
+
return apiRequest(url, apiKey);
|
|
6510
|
+
}
|
|
6511
|
+
async function submitTestRun(serverUrl, apiKey, testGroupId, payload) {
|
|
6512
|
+
const base = normalizeBase(serverUrl);
|
|
6513
|
+
const url = `${base}/api/testgroups/${testGroupId}/runs`;
|
|
6514
|
+
return apiRequest(url, apiKey, {
|
|
6515
|
+
method: "POST",
|
|
6516
|
+
body: JSON.stringify(payload)
|
|
6517
|
+
});
|
|
6518
|
+
}
|
|
6519
|
+
async function createBatch(serverUrl, apiKey, payload) {
|
|
6520
|
+
const base = normalizeBase(serverUrl);
|
|
6521
|
+
const url = `${base}/api/testgroups/batches`;
|
|
6522
|
+
return apiRequest(url, apiKey, {
|
|
6523
|
+
method: "POST",
|
|
6524
|
+
body: JSON.stringify(payload)
|
|
6525
|
+
});
|
|
6526
|
+
}
|
|
6527
|
+
async function fetchEvaluatorConfig(serverUrl, apiKey) {
|
|
6528
|
+
const base = normalizeBase(serverUrl);
|
|
6529
|
+
const url = `${base}/api/test-runs/evaluator-config`;
|
|
6530
|
+
return apiRequest(url, apiKey);
|
|
6531
|
+
}
|
|
6532
|
+
|
|
6443
6533
|
// src/ci/ed-runner.ts
|
|
6444
6534
|
async function runEdTests(options) {
|
|
6445
6535
|
const cwd = options?.cwd ?? process.cwd();
|
|
6446
|
-
const runId = (0,
|
|
6536
|
+
const runId = (0, import_node_crypto9.randomUUID)();
|
|
6447
6537
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
6448
6538
|
const results = [];
|
|
6449
6539
|
const { tests, errors } = await loadTests({ cwd });
|
|
@@ -6463,12 +6553,28 @@ async function runEdTests(options) {
|
|
|
6463
6553
|
testsToRun = tests.filter((t) => matchGlob(t.name, pattern));
|
|
6464
6554
|
}
|
|
6465
6555
|
const maxRuns = Math.max(1, options?.runs ?? 1);
|
|
6556
|
+
let evaluatorConfig = null;
|
|
6557
|
+
const needsEvaluatorConfig = testsToRun.some(
|
|
6558
|
+
(t) => t.benchmarks.llm_judge && (!t.benchmarks.llm_judge.judge_provider || !t.benchmarks.llm_judge.judge_model)
|
|
6559
|
+
);
|
|
6560
|
+
if (needsEvaluatorConfig) {
|
|
6561
|
+
const serverUrl = process.env.ELASTICDASH_API_URL ?? process.env.ELASTICDASH_SERVER ?? "";
|
|
6562
|
+
const apiKey = process.env.ELASTICDASH_API_KEY ?? "";
|
|
6563
|
+
if (serverUrl && apiKey) {
|
|
6564
|
+
try {
|
|
6565
|
+
evaluatorConfig = await fetchEvaluatorConfig(serverUrl, apiKey);
|
|
6566
|
+
console.log(`[ed-test] Evaluator config: provider=${evaluatorConfig.provider}, model=${evaluatorConfig.model}, hasKey=${!!evaluatorConfig.apiKey}`);
|
|
6567
|
+
} catch (err) {
|
|
6568
|
+
console.warn(`[ed-test] Could not fetch evaluator config: ${err instanceof Error ? err.message : String(err)}`);
|
|
6569
|
+
}
|
|
6570
|
+
}
|
|
6571
|
+
}
|
|
6466
6572
|
for (const test of testsToRun) {
|
|
6467
6573
|
const allRuns = [];
|
|
6468
6574
|
let bestResult = null;
|
|
6469
6575
|
for (let attempt = 1; attempt <= maxRuns; attempt++) {
|
|
6470
6576
|
const runStartedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
6471
|
-
const result = await runSingleTest(test);
|
|
6577
|
+
const result = await runSingleTest(test, evaluatorConfig);
|
|
6472
6578
|
const runFinishedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
6473
6579
|
if (attempt > 1) {
|
|
6474
6580
|
console.log(` [ed-test] ${test.name}: run ${attempt}/${maxRuns} \u2014 ${result.status}`);
|
|
@@ -6511,7 +6617,7 @@ async function runEdTests(options) {
|
|
|
6511
6617
|
async function resolveCustomInput(input) {
|
|
6512
6618
|
return typeof input === "function" ? await input() : input;
|
|
6513
6619
|
}
|
|
6514
|
-
async function runSingleTest(test) {
|
|
6620
|
+
async function runSingleTest(test, evaluatorConfig) {
|
|
6515
6621
|
const startMs = Date.now();
|
|
6516
6622
|
const targetStep = test.traceData.steps.find((s) => s.step_id === test.target.step_id);
|
|
6517
6623
|
const resolvedInput = test.input !== void 0 ? await resolveCustomInput(test.input) : targetStep?.input;
|
|
@@ -6560,7 +6666,7 @@ async function runSingleTest(test) {
|
|
|
6560
6666
|
durationMs: Date.now() - startMs
|
|
6561
6667
|
};
|
|
6562
6668
|
}
|
|
6563
|
-
const benchmarkResult = await compareBenchmarks(measurement, test.benchmarks, targetStep?.output);
|
|
6669
|
+
const benchmarkResult = await compareBenchmarks(measurement, test.benchmarks, targetStep?.output, evaluatorConfig);
|
|
6564
6670
|
return {
|
|
6565
6671
|
...base,
|
|
6566
6672
|
testId: test.name,
|
|
@@ -6575,7 +6681,7 @@ async function runSingleTest(test) {
|
|
|
6575
6681
|
const traceMeasurement = extractMeasurementFromTrace(test);
|
|
6576
6682
|
if (traceMeasurement) {
|
|
6577
6683
|
console.log(` [ed-test] ${test.name}: run() failed (${err instanceof Error ? err.message : String(err)}), using trace measurement fallback`);
|
|
6578
|
-
const benchmarkResult = await compareBenchmarks(traceMeasurement, test.benchmarks, targetStep?.output);
|
|
6684
|
+
const benchmarkResult = await compareBenchmarks(traceMeasurement, test.benchmarks, targetStep?.output, evaluatorConfig);
|
|
6579
6685
|
return {
|
|
6580
6686
|
...base,
|
|
6581
6687
|
testId: test.name,
|
|
@@ -6647,7 +6753,7 @@ function matchGlob(name, pattern) {
|
|
|
6647
6753
|
}
|
|
6648
6754
|
|
|
6649
6755
|
// src/ci/upload-client.ts
|
|
6650
|
-
var
|
|
6756
|
+
var import_node_crypto10 = require("node:crypto");
|
|
6651
6757
|
init_http();
|
|
6652
6758
|
init_git_info();
|
|
6653
6759
|
function buildUploadPayload(runResult) {
|
|
@@ -6775,7 +6881,7 @@ async function uploadResults(payload, options) {
|
|
|
6775
6881
|
const headers2 = {
|
|
6776
6882
|
"Content-Type": "application/json",
|
|
6777
6883
|
"api-key": options.apiKey,
|
|
6778
|
-
"X-Correlation-ID": (0,
|
|
6884
|
+
"X-Correlation-ID": (0, import_node_crypto10.randomUUID)()
|
|
6779
6885
|
};
|
|
6780
6886
|
const body = JSON.stringify(payload);
|
|
6781
6887
|
console.log(`[elasticdash] Uploading to ${url}, api-key=${options.apiKey ? options.apiKey.slice(0, 10) + "..." : "(none)"}`);
|
|
@@ -6811,61 +6917,6 @@ async function uploadResults(payload, options) {
|
|
|
6811
6917
|
// src/ci/runner.ts
|
|
6812
6918
|
var import_chalk2 = __toESM(require("chalk"), 1);
|
|
6813
6919
|
|
|
6814
|
-
// src/ci/api-client.ts
|
|
6815
|
-
var import_node_crypto10 = require("node:crypto");
|
|
6816
|
-
init_http();
|
|
6817
|
-
function normalizeBase(serverUrl) {
|
|
6818
|
-
return serverUrl.replace(/\/+$/, "").replace(/\/api$/, "");
|
|
6819
|
-
}
|
|
6820
|
-
function headers(apiKey) {
|
|
6821
|
-
return {
|
|
6822
|
-
"Content-Type": "application/json",
|
|
6823
|
-
"api-key": apiKey || "",
|
|
6824
|
-
"X-Correlation-ID": (0, import_node_crypto10.randomUUID)()
|
|
6825
|
-
};
|
|
6826
|
-
}
|
|
6827
|
-
async function apiRequest(url, apiKey, options = {}) {
|
|
6828
|
-
const method = (options.method || "GET").toUpperCase();
|
|
6829
|
-
console.log(`[elasticdash ci] ${method} ${url}`);
|
|
6830
|
-
const res = await getOriginalFetch()(url, {
|
|
6831
|
-
...options,
|
|
6832
|
-
headers: { ...headers(apiKey), ...options.headers ?? {} }
|
|
6833
|
-
});
|
|
6834
|
-
if (!res.ok) {
|
|
6835
|
-
const text = await res.text().catch(() => "");
|
|
6836
|
-
console.log(`[elasticdash ci] ${method} ${url} \u2192 ${res.status} ${text.substring(0, 200)}`);
|
|
6837
|
-
throw new Error(`API ${res.status}: ${text || res.statusText}`);
|
|
6838
|
-
}
|
|
6839
|
-
const json = await res.json();
|
|
6840
|
-
return json.result ?? json.data ?? json;
|
|
6841
|
-
}
|
|
6842
|
-
async function fetchTestGroups(serverUrl, apiKey, filters) {
|
|
6843
|
-
const base = normalizeBase(serverUrl);
|
|
6844
|
-
const params = new URLSearchParams();
|
|
6845
|
-
if (filters?.workflowName) params.set("workflowName", filters.workflowName);
|
|
6846
|
-
if (filters?.tags?.length) params.set("tags", filters.tags.join(","));
|
|
6847
|
-
if (filters?.status) params.set("status", filters.status);
|
|
6848
|
-
const qs = params.toString();
|
|
6849
|
-
const url = `${base}/api/testgroups/by-project${qs ? `?${qs}` : ""}`;
|
|
6850
|
-
return apiRequest(url, apiKey);
|
|
6851
|
-
}
|
|
6852
|
-
async function submitTestRun(serverUrl, apiKey, testGroupId, payload) {
|
|
6853
|
-
const base = normalizeBase(serverUrl);
|
|
6854
|
-
const url = `${base}/api/testgroups/${testGroupId}/runs`;
|
|
6855
|
-
return apiRequest(url, apiKey, {
|
|
6856
|
-
method: "POST",
|
|
6857
|
-
body: JSON.stringify(payload)
|
|
6858
|
-
});
|
|
6859
|
-
}
|
|
6860
|
-
async function createBatch(serverUrl, apiKey, payload) {
|
|
6861
|
-
const base = normalizeBase(serverUrl);
|
|
6862
|
-
const url = `${base}/api/testgroups/batches`;
|
|
6863
|
-
return apiRequest(url, apiKey, {
|
|
6864
|
-
method: "POST",
|
|
6865
|
-
body: JSON.stringify(payload)
|
|
6866
|
-
});
|
|
6867
|
-
}
|
|
6868
|
-
|
|
6869
6920
|
// src/ci/executor.ts
|
|
6870
6921
|
init_portal_executor();
|
|
6871
6922
|
init_tool_runner();
|
|
@@ -7593,6 +7644,7 @@ tryAutoInitHttpContext().catch(() => {
|
|
|
7593
7644
|
expect,
|
|
7594
7645
|
extractTaskOutputs,
|
|
7595
7646
|
fetchCapturedTrace,
|
|
7647
|
+
fetchEvaluatorConfig,
|
|
7596
7648
|
fetchTestGroups,
|
|
7597
7649
|
getCaptureContext,
|
|
7598
7650
|
getCurrentTrace,
|
package/dist/index.d.ts
CHANGED
|
@@ -52,7 +52,7 @@ export type { EdTestRunOptions, EdTestResult, EdTestRunResult } from './ci/ed-ru
|
|
|
52
52
|
export { uploadResults, buildUploadPayload } from './ci/upload-client.js';
|
|
53
53
|
export type { UploadPayload, UploadTestResult } from './ci/upload-client.js';
|
|
54
54
|
export { runCI } from './ci/runner.js';
|
|
55
|
-
export { fetchTestGroups, submitTestRun, createBatch } from './ci/api-client.js';
|
|
55
|
+
export { fetchTestGroups, submitTestRun, createBatch, fetchEvaluatorConfig } from './ci/api-client.js';
|
|
56
56
|
export { detectGitInfo } from './ci/git-info.js';
|
|
57
57
|
export type { CIRunConfig, CIRunSummary, CITestResult, CISingleRunResult, CIExpectationResult } from './ci/types.js';
|
|
58
58
|
export type { GitInfo } from './ci/git-info.js';
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,UAAU,EAAE,SAAS,EAAE,WAAW,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAA;AACnH,OAAO,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAA;AACtC,OAAO,EAAE,aAAa,EAAE,MAAM,eAAe,CAAA;AAC7C,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAA;AACtD,OAAO,EAAE,MAAM,EAAE,MAAM,qBAAqB,CAAA;AAC5C,YAAY,EAAE,UAAU,EAAE,UAAU,EAAE,aAAa,EAAE,MAAM,aAAa,CAAA;AAGxE,OAAO,EAAE,iBAAiB,EAAE,iBAAiB,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,4BAA4B,CAAA;AACnH,YAAY,EAAE,aAAa,EAAE,WAAW,EAAE,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,SAAS,EAAE,WAAW,EAAE,MAAM,4BAA4B,CAAA;AAGnI,OAAO,EAAE,aAAa,EAAE,iBAAiB,EAAE,iBAAiB,EAAE,MAAM,uBAAuB,CAAA;AAC3F,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAA;AACtD,YAAY,EAAE,aAAa,EAAE,iBAAiB,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAA;AACzF,YAAY,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAA;AAG3D,OAAO,EAAE,QAAQ,EAAE,MAAM,wBAAwB,CAAA;AACjD,OAAO,EAAE,MAAM,EAAE,MAAM,+BAA+B,CAAA;AAGtD,OAAO,EACL,iBAAiB,EACjB,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,sBAAsB,EACtB,gBAAgB,EAChB,6BAA6B,EAC7B,oBAAoB,EACpB,sBAAsB,GACvB,MAAM,kCAAkC,CAAA;AACzC,YAAY,EAAE,cAAc,EAAE,MAAM,kCAAkC,CAAA;AAGtE,OAAO,EACL,WAAW,EACX,aAAa,EACb,iBAAiB,EACjB,qBAAqB,EACrB,eAAe,EACf,oBAAoB,EACpB,sBAAsB,GACvB,MAAM,8BAA8B,CAAA;AACrC,YAAY,EACV,WAAW,EACX,YAAY,EACZ,mBAAmB,EACnB,oBAAoB,EACpB,qBAAqB,EACrB,sBAAsB,GACvB,MAAM,8BAA8B,CAAA;AAGrC,OAAO,EAAE,MAAM,EAAE,YAAY,EAAE,QAAQ,EAAE,mBAAmB,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAA;AAC3G,OAAO,EAAE,wBAAwB,EAAE,0BAA0B,EAAE,MAAM,2BAA2B,CAAA;AAGhG,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAA;AACzF,YAAY,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAA;AAGlE,OAAO,EAAE,eAAe,EAAE,aAAa,EAAE,gBAAgB,EAAE,cAAc,EAAE,MAAM,gCAAgC,CAAA;AAGjH,OAAO,EAAE,oBAAoB,EAAE,sBAAsB,EAAE,MAAM,kCAAkC,CAAA;AAG/F,OAAO,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,UAAU,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAA;AACjH,YAAY,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,MAAM,oBAAoB,CAAA;AACnF,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAA;AACzD,YAAY,EAAE,uBAAuB,EAAE,aAAa,EAAE,WAAW,EAAE,MAAM,wBAAwB,CAAA;AA
CjG,YAAY,EAAE,oBAAoB,EAAE,MAAM,kCAAkC,CAAA;AAC5E,OAAO,EAAE,qBAAqB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAA;AACjF,YAAY,EAAE,kBAAkB,EAAE,MAAM,sBAAsB,CAAA;AAG9D,OAAO,EAAE,gBAAgB,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAA;AAC/E,YAAY,EAAE,sBAAsB,EAAE,MAAM,uBAAuB,CAAA;AAGnE,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAA;AAG1E,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAA;AAC7C,OAAO,EAAE,kBAAkB,EAAE,QAAQ,EAAE,MAAM,qCAAqC,CAAA;AAGlF,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAA;AAClD,YAAY,EAAE,kBAAkB,EAAE,iBAAiB,EAAE,MAAM,sBAAsB,CAAA;AAGjF,OAAO,EAAE,mBAAmB,EAAE,qBAAqB,EAAE,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAA;AACxH,YAAY,EAAE,SAAS,EAAE,SAAS,EAAE,UAAU,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAA;AAG1G,OAAO,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAA;AACtD,OAAO,EAAE,iBAAiB,EAAE,MAAM,sBAAsB,CAAA;AACxD,YAAY,EAAE,UAAU,EAAE,gBAAgB,EAAE,mBAAmB,EAAE,kBAAkB,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAA;AAG5H,OAAO,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAA;AAC1D,OAAO,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAA;AAClD,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAA;AAC/C,YAAY,EAAE,cAAc,EAAE,UAAU,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAA;AACvF,YAAY,EAAE,aAAa,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAA;AACzE,YAAY,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAA;AAGpE,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAA;AAC9C,YAAY,EAAE,gBAAgB,EAAE,YAAY,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAA;AACxF,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAA;AACzE,YAAY,EAAE,aAAa,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAA;AAG5E,OAAO,EAAE,KAAK,EAAE,MAAM,gBAAgB,CAAA;AACtC,OAAO,EAAE,eAAe,EAAE,aAAa,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAA;
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,UAAU,EAAE,SAAS,EAAE,WAAW,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAA;AACnH,OAAO,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAA;AACtC,OAAO,EAAE,aAAa,EAAE,MAAM,eAAe,CAAA;AAC7C,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAA;AACtD,OAAO,EAAE,MAAM,EAAE,MAAM,qBAAqB,CAAA;AAC5C,YAAY,EAAE,UAAU,EAAE,UAAU,EAAE,aAAa,EAAE,MAAM,aAAa,CAAA;AAGxE,OAAO,EAAE,iBAAiB,EAAE,iBAAiB,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,4BAA4B,CAAA;AACnH,YAAY,EAAE,aAAa,EAAE,WAAW,EAAE,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,SAAS,EAAE,WAAW,EAAE,MAAM,4BAA4B,CAAA;AAGnI,OAAO,EAAE,aAAa,EAAE,iBAAiB,EAAE,iBAAiB,EAAE,MAAM,uBAAuB,CAAA;AAC3F,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAA;AACtD,YAAY,EAAE,aAAa,EAAE,iBAAiB,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAA;AACzF,YAAY,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAA;AAG3D,OAAO,EAAE,QAAQ,EAAE,MAAM,wBAAwB,CAAA;AACjD,OAAO,EAAE,MAAM,EAAE,MAAM,+BAA+B,CAAA;AAGtD,OAAO,EACL,iBAAiB,EACjB,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,sBAAsB,EACtB,gBAAgB,EAChB,6BAA6B,EAC7B,oBAAoB,EACpB,sBAAsB,GACvB,MAAM,kCAAkC,CAAA;AACzC,YAAY,EAAE,cAAc,EAAE,MAAM,kCAAkC,CAAA;AAGtE,OAAO,EACL,WAAW,EACX,aAAa,EACb,iBAAiB,EACjB,qBAAqB,EACrB,eAAe,EACf,oBAAoB,EACpB,sBAAsB,GACvB,MAAM,8BAA8B,CAAA;AACrC,YAAY,EACV,WAAW,EACX,YAAY,EACZ,mBAAmB,EACnB,oBAAoB,EACpB,qBAAqB,EACrB,sBAAsB,GACvB,MAAM,8BAA8B,CAAA;AAGrC,OAAO,EAAE,MAAM,EAAE,YAAY,EAAE,QAAQ,EAAE,mBAAmB,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAA;AAC3G,OAAO,EAAE,wBAAwB,EAAE,0BAA0B,EAAE,MAAM,2BAA2B,CAAA;AAGhG,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAA;AACzF,YAAY,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAA;AAGlE,OAAO,EAAE,eAAe,EAAE,aAAa,EAAE,gBAAgB,EAAE,cAAc,EAAE,MAAM,gCAAgC,CAAA;AAGjH,OAAO,EAAE,oBAAoB,EAAE,sBAAsB,EAAE,MAAM,kCAAkC,CAAA;AAG/F,OAAO,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,UAAU,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAA;AACjH,YAAY,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,MAAM,oBAAoB,CAAA;AACnF,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAA;AACzD,YAAY,EAAE,uBAAuB,EAAE,aAAa,EAAE,WAAW,EAAE,MAAM,wBAAwB,CAAA;AA
CjG,YAAY,EAAE,oBAAoB,EAAE,MAAM,kCAAkC,CAAA;AAC5E,OAAO,EAAE,qBAAqB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAA;AACjF,YAAY,EAAE,kBAAkB,EAAE,MAAM,sBAAsB,CAAA;AAG9D,OAAO,EAAE,gBAAgB,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAA;AAC/E,YAAY,EAAE,sBAAsB,EAAE,MAAM,uBAAuB,CAAA;AAGnE,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAA;AAG1E,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAA;AAC7C,OAAO,EAAE,kBAAkB,EAAE,QAAQ,EAAE,MAAM,qCAAqC,CAAA;AAGlF,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAA;AAClD,YAAY,EAAE,kBAAkB,EAAE,iBAAiB,EAAE,MAAM,sBAAsB,CAAA;AAGjF,OAAO,EAAE,mBAAmB,EAAE,qBAAqB,EAAE,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAA;AACxH,YAAY,EAAE,SAAS,EAAE,SAAS,EAAE,UAAU,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAA;AAG1G,OAAO,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAA;AACtD,OAAO,EAAE,iBAAiB,EAAE,MAAM,sBAAsB,CAAA;AACxD,YAAY,EAAE,UAAU,EAAE,gBAAgB,EAAE,mBAAmB,EAAE,kBAAkB,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAA;AAG5H,OAAO,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAA;AAC1D,OAAO,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAA;AAClD,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAA;AAC/C,YAAY,EAAE,cAAc,EAAE,UAAU,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAA;AACvF,YAAY,EAAE,aAAa,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAA;AACzE,YAAY,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAA;AAGpE,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAA;AAC9C,YAAY,EAAE,gBAAgB,EAAE,YAAY,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAA;AACxF,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAA;AACzE,YAAY,EAAE,aAAa,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAA;AAG5E,OAAO,EAAE,KAAK,EAAE,MAAM,gBAAgB,CAAA;AACtC,OAAO,EAAE,eAAe,EAAE,aAAa,EAAE,WAAW,EAAE,oBAAoB,EAAE,MAAM,oBAAoB,CAAA;AACtG,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAA;AAChD,YAAY,EAAE,WAAW,EAAE,YAAY,EAAE,YAAY,EAAE,iBAAiB,EAAE,mBAAmB,EAAE,MAAM,eAAe,CAAA;AACpH,YAAY,EAAE,OAAO,EAAE,MAAM,kBAAkB,CAAA"}
|
package/dist/index.js
CHANGED
|
@@ -53,7 +53,7 @@ export { runEdTests } from './ci/ed-runner.js';
|
|
|
53
53
|
export { uploadResults, buildUploadPayload } from './ci/upload-client.js';
|
|
54
54
|
// CI runner (programmatic API)
|
|
55
55
|
export { runCI } from './ci/runner.js';
|
|
56
|
-
export { fetchTestGroups, submitTestRun, createBatch } from './ci/api-client.js';
|
|
56
|
+
export { fetchTestGroups, submitTestRun, createBatch, fetchEvaluatorConfig } from './ci/api-client.js';
|
|
57
57
|
export { detectGitInfo } from './ci/git-info.js';
|
|
58
58
|
// ─── Eager auto-init ────────────────────────────────────────
|
|
59
59
|
// When ELASTICDASH_API_KEY is set, automatically initialise observability mode
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,0CAA0C;AAE1C,iBAAiB;AACjB,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,UAAU,EAAE,SAAS,EAAE,WAAW,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAA;AACnH,OAAO,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAA;AACtC,OAAO,EAAE,aAAa,EAAE,MAAM,eAAe,CAAA;AAC7C,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAA;AACtD,OAAO,EAAE,MAAM,EAAE,MAAM,qBAAqB,CAAA;AAG5C,gBAAgB;AAChB,OAAO,EAAE,iBAAiB,EAAE,iBAAiB,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,4BAA4B,CAAA;AAGnH,4BAA4B;AAC5B,OAAO,EAAE,aAAa,EAAE,iBAAiB,EAAE,iBAAiB,EAAE,MAAM,uBAAuB,CAAA;AAC3F,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAA;AAItD,yBAAyB;AACzB,OAAO,EAAE,QAAQ,EAAE,MAAM,wBAAwB,CAAA;AACjD,OAAO,EAAE,MAAM,EAAE,MAAM,+BAA+B,CAAA;AAEtD,oEAAoE;AACpE,OAAO,EACL,iBAAiB,EACjB,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,sBAAsB,EACtB,gBAAgB,EAChB,6BAA6B,EAC7B,oBAAoB,EACpB,sBAAsB,GACvB,MAAM,kCAAkC,CAAA;AAGzC,4DAA4D;AAC5D,OAAO,EACL,WAAW,EACX,aAAa,EACb,iBAAiB,EACjB,qBAAqB,EACrB,eAAe,EACf,oBAAoB,EACpB,sBAAsB,GACvB,MAAM,8BAA8B,CAAA;AAUrC,wBAAwB;AACxB,OAAO,EAAE,MAAM,EAAE,YAAY,EAAE,QAAQ,EAAE,mBAAmB,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAA;AAC3G,OAAO,EAAE,wBAAwB,EAAE,0BAA0B,EAAE,MAAM,2BAA2B,CAAA;AAEhG,4BAA4B;AAC5B,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAA;AAGzF,2BAA2B;AAC3B,OAAO,EAAE,eAAe,EAAE,aAAa,EAAE,gBAAgB,EAAE,cAAc,EAAE,MAAM,gCAAgC,CAAA;AAEjH,sCAAsC;AACtC,OAAO,EAAE,oBAAoB,EAAE,sBAAsB,EAAE,MAAM,kCAAkC,CAAA;AAE/F,gBAAgB;AAChB,OAAO,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,UAAU,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAA;AAEjH,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAA;AAGzD,OAAO,EAAE,qBAAqB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAA;AAGjF,qDAAqD;AACrD,OAAO,EAAE,gBAAgB,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAA;AAG/E,YAAY;AACZ,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAA;AAE1E,UAAU;AACV,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAA;AAC7C,OAAO,EAAE,kBAAkB,EAAE,QAAQ,EAAE,MAAM,qCAAqC,CAAA;AAElF,kBAAkB;AAClB,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAA;AAGlD,yBAAyB;AACzB,OAAO,EAAE,mBAAmB,EAAE,qBAAqB,EAAE,kBAAkB,EAAE,gBAAgB,EAAE,MAAM
,uBAAuB,CAAA;AAGxH,8BAA8B;AAC9B,OAAO,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAA;AACtD,OAAO,EAAE,iBAAiB,EAAE,MAAM,sBAAsB,CAAA;AAGxD,kDAAkD;AAClD,OAAO,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAA;AAC1D,OAAO,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAA;AAClD,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAA;AAK/C,iCAAiC;AACjC,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAA;AAE9C,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAA;AAGzE,+BAA+B;AAC/B,OAAO,EAAE,KAAK,EAAE,MAAM,gBAAgB,CAAA;AACtC,OAAO,EAAE,eAAe,EAAE,aAAa,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAA;
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,0CAA0C;AAE1C,iBAAiB;AACjB,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,UAAU,EAAE,SAAS,EAAE,WAAW,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAA;AACnH,OAAO,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAA;AACtC,OAAO,EAAE,aAAa,EAAE,MAAM,eAAe,CAAA;AAC7C,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAA;AACtD,OAAO,EAAE,MAAM,EAAE,MAAM,qBAAqB,CAAA;AAG5C,gBAAgB;AAChB,OAAO,EAAE,iBAAiB,EAAE,iBAAiB,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,4BAA4B,CAAA;AAGnH,4BAA4B;AAC5B,OAAO,EAAE,aAAa,EAAE,iBAAiB,EAAE,iBAAiB,EAAE,MAAM,uBAAuB,CAAA;AAC3F,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAA;AAItD,yBAAyB;AACzB,OAAO,EAAE,QAAQ,EAAE,MAAM,wBAAwB,CAAA;AACjD,OAAO,EAAE,MAAM,EAAE,MAAM,+BAA+B,CAAA;AAEtD,oEAAoE;AACpE,OAAO,EACL,iBAAiB,EACjB,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,sBAAsB,EACtB,gBAAgB,EAChB,6BAA6B,EAC7B,oBAAoB,EACpB,sBAAsB,GACvB,MAAM,kCAAkC,CAAA;AAGzC,4DAA4D;AAC5D,OAAO,EACL,WAAW,EACX,aAAa,EACb,iBAAiB,EACjB,qBAAqB,EACrB,eAAe,EACf,oBAAoB,EACpB,sBAAsB,GACvB,MAAM,8BAA8B,CAAA;AAUrC,wBAAwB;AACxB,OAAO,EAAE,MAAM,EAAE,YAAY,EAAE,QAAQ,EAAE,mBAAmB,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAA;AAC3G,OAAO,EAAE,wBAAwB,EAAE,0BAA0B,EAAE,MAAM,2BAA2B,CAAA;AAEhG,4BAA4B;AAC5B,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAA;AAGzF,2BAA2B;AAC3B,OAAO,EAAE,eAAe,EAAE,aAAa,EAAE,gBAAgB,EAAE,cAAc,EAAE,MAAM,gCAAgC,CAAA;AAEjH,sCAAsC;AACtC,OAAO,EAAE,oBAAoB,EAAE,sBAAsB,EAAE,MAAM,kCAAkC,CAAA;AAE/F,gBAAgB;AAChB,OAAO,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,UAAU,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAA;AAEjH,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAA;AAGzD,OAAO,EAAE,qBAAqB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAA;AAGjF,qDAAqD;AACrD,OAAO,EAAE,gBAAgB,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAA;AAG/E,YAAY;AACZ,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAA;AAE1E,UAAU;AACV,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAA;AAC7C,OAAO,EAAE,kBAAkB,EAAE,QAAQ,EAAE,MAAM,qCAAqC,CAAA;AAElF,kBAAkB;AAClB,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAA;AAGlD,yBAAyB;AACzB,OAAO,EAAE,mBAAmB,EAAE,qBAAqB,EAAE,kBAAkB,EAAE,gBAAgB,EAAE,MAAM
,uBAAuB,CAAA;AAGxH,8BAA8B;AAC9B,OAAO,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAA;AACtD,OAAO,EAAE,iBAAiB,EAAE,MAAM,sBAAsB,CAAA;AAGxD,kDAAkD;AAClD,OAAO,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAA;AAC1D,OAAO,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAA;AAClD,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAA;AAK/C,iCAAiC;AACjC,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAA;AAE9C,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAA;AAGzE,+BAA+B;AAC/B,OAAO,EAAE,KAAK,EAAE,MAAM,gBAAgB,CAAA;AACtC,OAAO,EAAE,eAAe,EAAE,aAAa,EAAE,WAAW,EAAE,oBAAoB,EAAE,MAAM,oBAAoB,CAAA;AACtG,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAA;AAIhD,+DAA+D;AAC/D,+EAA+E;AAC/E,+EAA+E;AAC/E,+EAA+E;AAC/E,iFAAiF;AACjF,OAAO,EAAE,sBAAsB,EAAE,MAAM,kCAAkC,CAAA;AACzE,sBAAsB,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAA"}
|
package/package.json
CHANGED
package/src/ci/api-client.ts
CHANGED
|
@@ -94,3 +94,27 @@ export async function createBatch(
|
|
|
94
94
|
body: JSON.stringify(payload),
|
|
95
95
|
})
|
|
96
96
|
}
|
|
97
|
+
|
|
98
|
+
/**
 * Evaluator settings as resolved by the backend.
 *
 * Any of the three fields may be `null` when the user has not configured
 * an evaluator.
 */
export interface EvaluatorConfig {
  /** Provider name in backend format, or `null` when not configured. */
  provider: string | null
  /** Model identifier, or `null` when not configured. */
  model: string | null
  /** API key for the provider, or `null` when not configured. */
  apiKey: string | null
}
|
|
107
|
+
|
|
108
|
+
/**
 * Retrieve the project's evaluator config (with user-level fallback) from
 * the backend.
 *
 * ed-test llm_judge benchmarks rely on this when the test definition does
 * not specify judge_provider/judge_model.
 *
 * @param serverUrl - Server URL; passed through `normalizeBase` before the
 *   endpoint path is appended.
 * @param apiKey - API key forwarded to `apiRequest`.
 * @returns The evaluator config returned by the request.
 */
export async function fetchEvaluatorConfig(
  serverUrl: string,
  apiKey: string,
): Promise<EvaluatorConfig> {
  const endpoint = `${normalizeBase(serverUrl)}/api/test-runs/evaluator-config`
  return apiRequest<EvaluatorConfig>(endpoint, apiKey)
}
|
package/src/ci/benchmark.ts
CHANGED
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
import { callProviderLLM } from '../matchers/index.js'
|
|
11
11
|
import type { TestMeasurement } from './measurement.js'
|
|
12
12
|
import type { TestBenchmarks } from './test-registry.js'
|
|
13
|
+
import type { EvaluatorConfig } from './api-client.js'
|
|
13
14
|
|
|
14
15
|
export type MetricName = 'duration_ms' | 'tokens_total' | 'output_contains' | 'output_not_contains' | 'llm_judge'
|
|
15
16
|
|
|
@@ -27,14 +28,29 @@ export interface BenchmarkResult {
|
|
|
27
28
|
metrics: MetricResult[]
|
|
28
29
|
}
|
|
29
30
|
|
|
31
|
+
/** Maps backend provider names to SDK provider names used by callProviderLLM. */
const PROVIDER_NAME_MAP: Record<string, string> = {
  anthropic: 'claude',
  moonshot: 'kimi',
}

/**
 * Normalize provider name from backend format to SDK format.
 *
 * Names without an entry in PROVIDER_NAME_MAP are returned unchanged.
 *
 * @param provider - Provider name as supplied by the test definition or backend.
 * @returns The mapped SDK provider name, or `provider` itself when unmapped.
 */
function normalizeSdkProvider(provider: string): string {
  // Consult own keys only: a bare index lookup on a plain object would also
  // resolve inherited Object.prototype members (e.g. "constructor",
  // "toString") and hand back a function instead of a provider name.
  const mapped = Object.prototype.hasOwnProperty.call(PROVIDER_NAME_MAP, provider)
    ? PROVIDER_NAME_MAP[provider]
    : undefined
  return mapped ?? provider
}
|
|
41
|
+
|
|
30
42
|
/**
|
|
31
43
|
* Compare a measurement against benchmarks. Async because llm_judge requires
|
|
32
44
|
* an LLM call. The step's output is needed for output_contains/llm_judge checks.
|
|
45
|
+
*
|
|
46
|
+
* @param evaluatorConfig - Optional backend evaluator config used as fallback
|
|
47
|
+
* when the test does not specify judge_provider/judge_model.
|
|
33
48
|
*/
|
|
34
49
|
export async function compareBenchmarks(
|
|
35
50
|
measurement: TestMeasurement,
|
|
36
51
|
benchmarks: TestBenchmarks,
|
|
37
52
|
stepOutput?: unknown,
|
|
53
|
+
evaluatorConfig?: EvaluatorConfig | null,
|
|
38
54
|
): Promise<BenchmarkResult> {
|
|
39
55
|
const metrics: MetricResult[] = []
|
|
40
56
|
let firstFailure: string | undefined
|
|
@@ -104,18 +120,46 @@ export async function compareBenchmarks(
|
|
|
104
120
|
const outputStr = stringifyOutput(stepOutput)
|
|
105
121
|
const threshold = judge.judge_score_threshold ?? 7
|
|
106
122
|
|
|
123
|
+
// Resolve provider/model: test definition takes priority, then backend
|
|
124
|
+
// evaluator config, then fall back to 'openai' default.
|
|
125
|
+
const resolvedProvider = normalizeSdkProvider(
|
|
126
|
+
judge.judge_provider ?? evaluatorConfig?.provider ?? 'openai'
|
|
127
|
+
)
|
|
128
|
+
const resolvedModel = judge.judge_model ?? evaluatorConfig?.model ?? undefined
|
|
129
|
+
|
|
130
|
+
// If the backend provided an API key and we're using its provider,
|
|
131
|
+
// set it in the environment so callProviderLLM can pick it up.
|
|
132
|
+
const envKeyMap: Record<string, string> = {
|
|
133
|
+
openai: 'OPENAI_API_KEY',
|
|
134
|
+
claude: 'ANTHROPIC_API_KEY',
|
|
135
|
+
gemini: 'GEMINI_API_KEY',
|
|
136
|
+
grok: 'GROK_API_KEY',
|
|
137
|
+
kimi: 'KIMI_API_KEY',
|
|
138
|
+
}
|
|
139
|
+
const envKey = envKeyMap[resolvedProvider]
|
|
140
|
+
let restoreEnv: (() => void) | undefined
|
|
141
|
+
if (evaluatorConfig?.apiKey && envKey && !judge.judge_provider && !process.env[envKey]) {
|
|
142
|
+
const prev = process.env[envKey]
|
|
143
|
+
process.env[envKey] = evaluatorConfig.apiKey
|
|
144
|
+
restoreEnv = () => {
|
|
145
|
+
if (prev === undefined) delete process.env[envKey]
|
|
146
|
+
else process.env[envKey] = prev
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
|
|
107
150
|
try {
|
|
108
|
-
const provider = judge.judge_provider ?? 'openai'
|
|
109
151
|
const evalPrompt = `${judge.judge_prompt}\n\nOutput to evaluate:\n${outputStr}\n\nScore this output on a scale of 0-10. Respond with only the number.`
|
|
110
152
|
|
|
111
153
|
const result = await callProviderLLM(
|
|
112
154
|
evalPrompt,
|
|
113
|
-
{ provider, model:
|
|
155
|
+
{ provider: resolvedProvider as 'openai' | 'claude' | 'gemini' | 'grok' | 'kimi', model: resolvedModel },
|
|
114
156
|
'You are an expert test judge. Return only a number between 0 and 10.',
|
|
115
157
|
16,
|
|
116
158
|
0,
|
|
117
159
|
)
|
|
118
160
|
|
|
161
|
+
restoreEnv?.()
|
|
162
|
+
|
|
119
163
|
const score = parseFloat(result.content.match(/-?\d+(?:\.\d+)?/)?.[0] ?? '')
|
|
120
164
|
if (isNaN(score)) {
|
|
121
165
|
metrics.push({
|
|
@@ -142,6 +186,7 @@ export async function compareBenchmarks(
|
|
|
142
186
|
}
|
|
143
187
|
}
|
|
144
188
|
} catch (err) {
|
|
189
|
+
restoreEnv?.()
|
|
145
190
|
const errMsg = err instanceof Error ? err.message : String(err)
|
|
146
191
|
metrics.push({
|
|
147
192
|
name: 'llm_judge',
|
package/src/ci/ed-runner.ts
CHANGED
|
@@ -4,6 +4,8 @@ import { createReplayContext, installReplay, uninstallReplay, ReplayMissError }
|
|
|
4
4
|
import { collectMeasurement } from './measurement.js'
|
|
5
5
|
import { SDK_VERSION } from './trace-schema.js'
|
|
6
6
|
import { compareBenchmarks } from './benchmark.js'
|
|
7
|
+
import { fetchEvaluatorConfig } from './api-client.js'
|
|
8
|
+
import type { EvaluatorConfig } from './api-client.js'
|
|
7
9
|
import type { TestMeasurement } from './measurement.js'
|
|
8
10
|
import type { BenchmarkResult } from './benchmark.js'
|
|
9
11
|
import type { ValidatedTest } from './test-loader.js'
|
|
@@ -88,13 +90,32 @@ export async function runEdTests(options?: EdTestRunOptions): Promise<EdTestRunR
|
|
|
88
90
|
|
|
89
91
|
const maxRuns = Math.max(1, options?.runs ?? 1)
|
|
90
92
|
|
|
93
|
+
// Fetch evaluator config from backend if any test uses llm_judge without
|
|
94
|
+
// explicit provider/model. Cached for the entire run to avoid repeated calls.
|
|
95
|
+
let evaluatorConfig: EvaluatorConfig | null = null
|
|
96
|
+
const needsEvaluatorConfig = testsToRun.some(
|
|
97
|
+
t => t.benchmarks.llm_judge && (!t.benchmarks.llm_judge.judge_provider || !t.benchmarks.llm_judge.judge_model)
|
|
98
|
+
)
|
|
99
|
+
if (needsEvaluatorConfig) {
|
|
100
|
+
const serverUrl = process.env.ELASTICDASH_API_URL ?? process.env.ELASTICDASH_SERVER ?? ''
|
|
101
|
+
const apiKey = process.env.ELASTICDASH_API_KEY ?? ''
|
|
102
|
+
if (serverUrl && apiKey) {
|
|
103
|
+
try {
|
|
104
|
+
evaluatorConfig = await fetchEvaluatorConfig(serverUrl, apiKey)
|
|
105
|
+
console.log(`[ed-test] Evaluator config: provider=${evaluatorConfig.provider}, model=${evaluatorConfig.model}, hasKey=${!!evaluatorConfig.apiKey}`)
|
|
106
|
+
} catch (err) {
|
|
107
|
+
console.warn(`[ed-test] Could not fetch evaluator config: ${err instanceof Error ? err.message : String(err)}`)
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
|
|
91
112
|
for (const test of testsToRun) {
|
|
92
113
|
const allRuns: EdSingleRunResult[] = []
|
|
93
114
|
let bestResult: EdTestResult | null = null
|
|
94
115
|
|
|
95
116
|
for (let attempt = 1; attempt <= maxRuns; attempt++) {
|
|
96
117
|
const runStartedAt = new Date().toISOString()
|
|
97
|
-
const result = await runSingleTest(test)
|
|
118
|
+
const result = await runSingleTest(test, evaluatorConfig)
|
|
98
119
|
const runFinishedAt = new Date().toISOString()
|
|
99
120
|
|
|
100
121
|
if (attempt > 1) {
|
|
@@ -153,7 +174,7 @@ async function resolveCustomInput(input: unknown | (() => Promise<unknown> | unk
|
|
|
153
174
|
return typeof input === 'function' ? await (input as () => Promise<unknown> | unknown)() : input
|
|
154
175
|
}
|
|
155
176
|
|
|
156
|
-
async function runSingleTest(test: ValidatedTest): Promise<EdTestResult> {
|
|
177
|
+
async function runSingleTest(test: ValidatedTest, evaluatorConfig?: EvaluatorConfig | null): Promise<EdTestResult> {
|
|
157
178
|
const startMs = Date.now()
|
|
158
179
|
const targetStep = test.traceData.steps.find(s => s.step_id === test.target.step_id)
|
|
159
180
|
|
|
@@ -221,7 +242,7 @@ async function runSingleTest(test: ValidatedTest): Promise<EdTestResult> {
|
|
|
221
242
|
}
|
|
222
243
|
|
|
223
244
|
// Compare against benchmarks (async to support llm_judge)
|
|
224
|
-
const benchmarkResult = await compareBenchmarks(measurement, test.benchmarks, targetStep?.output)
|
|
245
|
+
const benchmarkResult = await compareBenchmarks(measurement, test.benchmarks, targetStep?.output, evaluatorConfig)
|
|
225
246
|
|
|
226
247
|
return {
|
|
227
248
|
...base,
|
|
@@ -240,7 +261,7 @@ async function runSingleTest(test: ValidatedTest): Promise<EdTestResult> {
|
|
|
240
261
|
const traceMeasurement = extractMeasurementFromTrace(test)
|
|
241
262
|
if (traceMeasurement) {
|
|
242
263
|
console.log(` [ed-test] ${test.name}: run() failed (${err instanceof Error ? err.message : String(err)}), using trace measurement fallback`)
|
|
243
|
-
const benchmarkResult = await compareBenchmarks(traceMeasurement, test.benchmarks, targetStep?.output)
|
|
264
|
+
const benchmarkResult = await compareBenchmarks(traceMeasurement, test.benchmarks, targetStep?.output, evaluatorConfig)
|
|
244
265
|
return {
|
|
245
266
|
...base,
|
|
246
267
|
testId: test.name,
|
package/src/index.ts
CHANGED
|
@@ -120,7 +120,7 @@ export type { UploadPayload, UploadTestResult } from './ci/upload-client.js'
|
|
|
120
120
|
|
|
121
121
|
// CI runner (programmatic API)
|
|
122
122
|
export { runCI } from './ci/runner.js'
|
|
123
|
-
export { fetchTestGroups, submitTestRun, createBatch } from './ci/api-client.js'
|
|
123
|
+
export { fetchTestGroups, submitTestRun, createBatch, fetchEvaluatorConfig } from './ci/api-client.js'
|
|
124
124
|
export { detectGitInfo } from './ci/git-info.js'
|
|
125
125
|
export type { CIRunConfig, CIRunSummary, CITestResult, CISingleRunResult, CIExpectationResult } from './ci/types.js'
|
|
126
126
|
export type { GitInfo } from './ci/git-info.js'
|