vieval 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +219 -109
- package/dist/bin/vieval.mjs +1 -1
- package/dist/cli/index.mjs +1 -1
- package/dist/{cli-DayPXzHX.mjs → cli-ImxGpoYQ.mjs} +1447 -195
- package/dist/cli-ImxGpoYQ.mjs.map +1 -0
- package/dist/config.d.mts +2 -2
- package/dist/config.mjs +1 -1
- package/dist/core/assertions/index.d.mts +1 -1
- package/dist/core/inference-executors/index.d.mts +1 -1
- package/dist/core/inference-executors/index.mjs +1 -1
- package/dist/core/processors/results/index.d.mts +1 -1
- package/dist/core/runner/index.d.mts +3 -2
- package/dist/core/runner/index.mjs +3 -2
- package/dist/core/runner/index.mjs.map +1 -1
- package/dist/core/scheduler/index.d.mts +2 -0
- package/dist/core/scheduler/index.mjs +188 -0
- package/dist/core/scheduler/index.mjs.map +1 -0
- package/dist/{env-BFSjny07.mjs → env--94B0UtW.mjs} +1 -1
- package/dist/{env-BFSjny07.mjs.map → env--94B0UtW.mjs.map} +1 -1
- package/dist/{env-BTq3dV7C.d.mts → env-BeHv_5mo.d.mts} +1 -1
- package/dist/{expect-extensions-QLXESWjn.mjs → expect-extensions-DCSqlneN.mjs} +1 -1
- package/dist/{expect-extensions-QLXESWjn.mjs.map → expect-extensions-DCSqlneN.mjs.map} +1 -1
- package/dist/expect.mjs +1 -1
- package/dist/{index-OEdqjQSe.d.mts → index-5R1_k2nv.d.mts} +195 -3
- package/dist/index-fakXoZEe.d.mts +147 -0
- package/dist/index.d.mts +120 -13
- package/dist/index.mjs +286 -54
- package/dist/index.mjs.map +1 -1
- package/dist/{models-D_MsBtYw.mjs → models-DIGdOUpJ.mjs} +1 -1
- package/dist/models-DIGdOUpJ.mjs.map +1 -0
- package/dist/plugins/chat-models/index.d.mts +27 -1
- package/dist/plugins/chat-models/index.mjs +29 -1
- package/dist/plugins/chat-models/index.mjs.map +1 -1
- package/dist/queue-DsZQkZO_.mjs +21 -0
- package/dist/queue-DsZQkZO_.mjs.map +1 -0
- package/dist/{registry-CwcMMjnZ.mjs → registry-BHGMxjpA.mjs} +164 -6
- package/dist/registry-BHGMxjpA.mjs.map +1 -0
- package/dist/testing/expect-extensions.mjs +1 -1
- package/package.json +8 -1
- package/dist/cli-DayPXzHX.mjs.map +0 -1
- package/dist/models-D_MsBtYw.mjs.map +0 -1
- package/dist/registry-CwcMMjnZ.mjs.map +0 -1
package/dist/index.d.mts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { a as requiredEnvFrom } from "./env-
|
|
1
|
+
import { D as TaskRunContext, I as TaskExecutionContext, O as TaskRunOutput, S as TaskConcurrencyConfig, Y as RunResult, Z as RunScoreKind, et as InferenceExecutor, f as MatrixDefinition, it as ScheduledTask, k as TelemetryAttributeValue, o as CliReportingConfig, p as MatrixLayer, t as ConfigHookPlugin, w as TaskExecutionPolicy, z as ModelDefinition } from "./index-5R1_k2nv.mjs";
|
|
2
|
+
import { a as requiredEnvFrom } from "./env-BeHv_5mo.mjs";
|
|
3
3
|
import { expect } from "./expect.mjs";
|
|
4
4
|
import * as _$c12 from "c12";
|
|
5
5
|
|
|
@@ -66,6 +66,41 @@ type VievalVitestCompatReporterReference = VievalVitestCompatReporterValue | rea
|
|
|
66
66
|
* CLI plugin shape bound to the full CLI config object.
|
|
67
67
|
*/
|
|
68
68
|
type CliConfigPlugin = ConfigHookPlugin<CliConfig>;
|
|
69
|
+
/**
 * Per-scope concurrency caps that CLI-facing config may declare.
 *
 * Use when:
 * - the CLI needs separate caps for the workspace, project, task, attempt, or case scheduling scopes
 * - config authors want to declare concurrency without wiring runtime execution yet
 *
 * Expects:
 * - every provided value to be a positive integer chosen by the caller
 *
 * Returns:
 * - one partial concurrency descriptor keyed by scheduling scope
 */
interface CliConcurrencyConfig {
  /** Workspace-level concurrency cap. */
  workspace?: number;
  /** Project-level concurrency cap. */
  project?: number;
  /** Task-level concurrency cap. */
  task?: number;
  /** Attempt-level concurrency cap. */
  attempt?: number;
  /** Case-level concurrency cap. */
  case?: number;
}
|
|
69
104
|
/**
|
|
70
105
|
* Defines one project block for `vieval run`.
|
|
71
106
|
*/
|
|
@@ -115,6 +150,12 @@ interface CliProjectConfig {
|
|
|
115
150
|
* Optional eval-time matrix dimensions.
|
|
116
151
|
*/
|
|
117
152
|
evalMatrix?: MatrixDefinition | MatrixLayer;
|
|
153
|
+
/**
|
|
154
|
+
* Optional project-scoped concurrency overrides.
|
|
155
|
+
*
|
|
156
|
+
* @default inherited from top-level or CLI execution settings
|
|
157
|
+
*/
|
|
158
|
+
concurrency?: Omit<CliConcurrencyConfig, 'workspace'>;
|
|
118
159
|
/**
|
|
119
160
|
* Optional task executor.
|
|
120
161
|
*
|
|
@@ -218,9 +259,13 @@ interface CliComparisonConfig {
|
|
|
218
259
|
* Expects:
|
|
219
260
|
* - `model` resolves configured models for the current task
|
|
220
261
|
* - `reporterHooks` follows `TaskRunContext['reporterHooks']`
|
|
262
|
+
* - `telemetry` follows `TaskRunContext['telemetry']`
|
|
263
|
+
* - `runtimeConcurrency` follows `TaskRunContext['runtimeConcurrency']`
|
|
221
264
|
*/
|
|
222
265
|
interface CliProjectExecutorContext extends TaskExecutionContext {
  /** Reporter hooks, typed exactly like `TaskRunContext['reporterHooks']`. */
  reporterHooks?: TaskRunContext['reporterHooks'];
  /** Telemetry runtime, typed exactly like `TaskRunContext['telemetry']`. */
  telemetry?: TaskRunContext['telemetry'];
  /** Runtime concurrency overrides, typed exactly like `TaskRunContext['runtimeConcurrency']`. */
  runtimeConcurrency?: TaskRunContext['runtimeConcurrency'];
}
|
|
225
270
|
/**
|
|
226
271
|
* Top-level CLI config loaded from `vieval.config.*`.
|
|
@@ -232,6 +277,19 @@ interface CliConfigBase {
|
|
|
232
277
|
* @default []
|
|
233
278
|
*/
|
|
234
279
|
models?: ModelDefinition[];
|
|
280
|
+
/**
|
|
281
|
+
* Global concurrency defaults inherited by projects and tasks.
|
|
282
|
+
*
|
|
283
|
+
* Use when:
|
|
284
|
+
* - config authors want one shared concurrency policy across workspace, project, task, attempt, and case scopes
|
|
285
|
+
* - project-local overrides should start from a top-level baseline
|
|
286
|
+
*
|
|
287
|
+
* Expects:
|
|
288
|
+
* - each provided value to be a positive integer chosen by the caller
|
|
289
|
+
*
|
|
290
|
+
* @default undefined
|
|
291
|
+
*/
|
|
292
|
+
concurrency?: CliConcurrencyConfig;
|
|
235
293
|
/**
|
|
236
294
|
* Global config plugins.
|
|
237
295
|
*
|
|
@@ -254,6 +312,12 @@ interface CliConfigBase {
|
|
|
254
312
|
* @default {}
|
|
255
313
|
*/
|
|
256
314
|
env?: NodeJS.ProcessEnv;
|
|
315
|
+
/**
|
|
316
|
+
* Optional reporting integrations shared by CLI run orchestration.
|
|
317
|
+
*
|
|
318
|
+
* @default undefined
|
|
319
|
+
*/
|
|
320
|
+
reporting?: CliReportingConfig;
|
|
257
321
|
}
|
|
258
322
|
/**
|
|
259
323
|
* Project mode config for `vieval run`.
|
|
@@ -344,12 +408,44 @@ interface CaseRunContext<TInput> extends TaskRunContext {
|
|
|
344
408
|
* - `name` to be a stable metric identifier
|
|
345
409
|
* - `value` to be JSON-serializable
|
|
346
410
|
*/
|
|
347
|
-
metric: (name: string, value:
|
|
411
|
+
metric: (name: string, value: TelemetryAttributeValue) => void;
|
|
412
|
+
/**
|
|
413
|
+
* Cooperative abort signal for the current case execution.
|
|
414
|
+
*/
|
|
415
|
+
signal: AbortSignal;
|
|
348
416
|
}
|
|
349
417
|
/**
|
|
350
418
|
* Callback for one task case.
|
|
351
419
|
*/
|
|
352
|
-
type CaseRunner<TInput> = (context: CaseRunContext<TInput>) => Promise<
|
|
420
|
+
// A case runner receives the per-case context and may return its result either directly or as a promise.
type CaseRunner<TInput> = (context: CaseRunContext<TInput>) => Promise<unknown> | unknown;
|
|
421
|
+
/**
 * Options applied to the whole group of cases created by one `casesFromInputs` call.
 *
 * Use when:
 * - one generated case group should run with a lower case concurrency than the task default
 * - a task should keep a broader task-level cap while one expensive case family stays bounded
 *
 * Expects:
 * - `concurrency` to be a positive integer when provided
 *
 * Returns:
 * - one partial case-group execution descriptor
 */
interface CasesFromInputsOptions extends TaskExecutionPolicy {
  /** Case-level concurrency cap for cases registered by one `casesFromInputs(...)` call. */
  concurrency?: number;
}
|
|
440
|
+
/**
 * Per-case registration options for `caseOf`.
 *
 * Combines the execution-policy fields inherited from `TaskExecutionPolicy`
 * with the input payload handed to the case runner.
 */
interface CaseRegistrationOptions<TInput> extends TaskExecutionPolicy {
  /**
   * Case input payload.
   *
   * Required whenever an options object is passed; use the two-argument
   * `caseOf(name, run)` overload for cases without input.
   */
  input: TInput;
}
|
|
353
449
|
/**
|
|
354
450
|
* Builder callbacks passed into `describeTask`.
|
|
355
451
|
*/
|
|
@@ -359,35 +455,44 @@ interface DescribeTaskBuilder {
|
|
|
359
455
|
*/
|
|
360
456
|
caseOf: {
|
|
361
457
|
(name: string, run: CaseRunner<undefined>): void;
|
|
362
|
-
<TInput>(name: string, run: CaseRunner<TInput>, options:
|
|
363
|
-
input: TInput;
|
|
364
|
-
}): void;
|
|
458
|
+
<TInput>(name: string, run: CaseRunner<TInput>, options: CaseRegistrationOptions<TInput>): void;
|
|
365
459
|
};
|
|
366
460
|
/**
|
|
367
461
|
* Registers multiple cases from input list.
|
|
368
462
|
*/
|
|
369
|
-
casesFromInputs: <TInput>(namePrefix: string, inputs: readonly TInput[], run: CaseRunner<TInput
|
|
463
|
+
casesFromInputs: <TInput>(namePrefix: string, inputs: readonly TInput[], run: CaseRunner<TInput>, options?: CasesFromInputsOptions) => void;
|
|
370
464
|
}
|
|
371
465
|
/**
|
|
372
466
|
* Options for `describeTask`.
|
|
373
467
|
*/
|
|
374
|
-
interface DescribeTaskOptions {
|
|
468
|
+
interface DescribeTaskOptions extends TaskExecutionPolicy {
  /** Optional description override. */
  description?: string;
  /**
   * Task-local concurrency overrides (optional).
   *
   * Use when:
   * - one task should cap attempt fan-out independently from the surrounding project
   * - one task should cap case fan-out without changing global scheduling defaults
   *
   * Expects:
   * - each provided value to be a positive integer
   *
   * @default inherited from project or CLI concurrency settings
   */
  concurrency?: TaskConcurrencyConfig;
}
|
|
380
487
|
/**
|
|
381
488
|
* Registers one case in the currently active task scope.
|
|
382
489
|
*/
|
|
383
490
|
// Overload without options: the runner is typed `CaseRunner<undefined>`, i.e. the case has no input payload.
declare function caseOf(name: string, run: CaseRunner<undefined>): void;
|
|
384
|
-
declare function caseOf<TInput>(name: string, run: CaseRunner<TInput>, options:
|
|
385
|
-
input: TInput;
|
|
386
|
-
}): void;
|
|
491
|
+
// Overload with registration options: `TInput` ties the runner's input type to the `options` argument.
declare function caseOf<TInput>(name: string, run: CaseRunner<TInput>, options: CaseRegistrationOptions<TInput>): void;
|
|
387
492
|
/**
|
|
388
493
|
* Registers multiple cases in the currently active task scope.
|
|
389
494
|
*/
|
|
390
|
-
declare function casesFromInputs<TInput>(namePrefix: string, inputs: readonly TInput[], run: CaseRunner<TInput
|
|
495
|
+
// `options` is optional; when given it applies to every case generated from `inputs` by this call.
declare function casesFromInputs<TInput>(namePrefix: string, inputs: readonly TInput[], run: CaseRunner<TInput>, options?: CasesFromInputsOptions): void;
|
|
391
496
|
/**
|
|
392
497
|
* Defines one eval task with task/case semantics similar to Vitest.
|
|
393
498
|
*
|
|
@@ -399,6 +504,8 @@ declare function describeTask(name: string, build: ((builder: DescribeTaskBuilde
|
|
|
399
504
|
readonly description: string;
|
|
400
505
|
readonly name: string;
|
|
401
506
|
readonly task: {
|
|
507
|
+
readonly concurrency: TaskConcurrencyConfig | undefined;
|
|
508
|
+
readonly executionPolicy: TaskExecutionPolicy | undefined;
|
|
402
509
|
readonly id: string;
|
|
403
510
|
readonly run: (context: TaskRunContext) => Promise<TaskRunOutput>;
|
|
404
511
|
};
|
package/dist/index.mjs
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
1
|
+
import { i as registerEvalDefinition, l as loadEnv, o as createNoopTelemetryRuntime, s as defineConfig } from "./registry-BHGMxjpA.mjs";
|
|
2
|
+
import { t as createSchedulerQueue } from "./queue-DsZQkZO_.mjs";
|
|
3
|
+
import { n as requiredEnvFrom } from "./env--94B0UtW.mjs";
|
|
3
4
|
import { defineEval, defineTask } from "./config.mjs";
|
|
4
5
|
import { expect } from "./expect.mjs";
|
|
5
|
-
import { errorMessageFrom } from "@moeru/std";
|
|
6
|
+
import { errorMessageFrom, sleep } from "@moeru/std";
|
|
6
7
|
//#region src/dsl/task.ts
|
|
7
8
|
function cloneCaseMatrix(matrix) {
|
|
8
9
|
return {
|
|
@@ -14,9 +15,36 @@ function cloneCaseMatrix(matrix) {
|
|
|
14
15
|
/**
 * Builds the reporter-facing identifier of one task case.
 *
 * The identifier is the case index, a colon, and the percent-encoded case name.
 */
function createTaskCaseReporterId(index, name) {
  const encodedName = encodeURIComponent(name);
  return "".concat(index, ":", encodedName);
}
|
|
18
|
+
/**
 * Tells whether a value is a boolean, number, or string primitive.
 */
function isTelemetryAttributeScalar(value) {
  switch (typeof value) {
    case "boolean":
    case "number":
    case "string":
      return true;
    default:
      return false;
  }
}
|
|
21
|
+
/**
 * Tells whether every element of an array is a telemetry attribute scalar.
 *
 * Expects:
 * - `value` to be an array (callers check `Array.isArray` first)
 */
function isTelemetryAttributeArray(value) {
  const hasNonScalar = value.some((entry) => !isTelemetryAttributeScalar(entry));
  return !hasNonScalar;
}
|
|
24
|
+
/**
 * Decides whether a metric value may be attached as a telemetry attribute.
 *
 * Accepts a single scalar, or an array made up only of scalars.
 */
function canAttachMetricAsAttribute(value) {
  return isTelemetryAttributeScalar(value) || (Array.isArray(value) && isTelemetryAttributeArray(value));
}
|
|
17
28
|
function assertValidScore(score) {
|
|
18
29
|
if (!Number.isFinite(score) || score < 0 || score > 1) throw new Error(`Case score must be a finite number in range 0..1, got "${score}".`);
|
|
19
30
|
}
|
|
31
|
+
function assertNonNegativeInteger(value, label) {
|
|
32
|
+
if (!Number.isFinite(value) || !Number.isInteger(value) || value < 0) throw new Error(`Invalid ${label}: ${String(value)}`);
|
|
33
|
+
}
|
|
34
|
+
function assertNonNegativeNumber(value, label) {
|
|
35
|
+
if (!Number.isFinite(value) || value < 0) throw new Error(`Invalid ${label}: ${String(value)}`);
|
|
36
|
+
}
|
|
37
|
+
/**
 * Validates that a value is an integer strictly greater than zero.
 *
 * `Number.isInteger` already rejects NaN, ±Infinity and non-number values, so no
 * separate finiteness check is needed.
 *
 * Expects:
 * - `label` to name the validated setting; it is embedded in the error message
 *
 * Throws:
 * - `Error` with message `Invalid <label>: <value>` when validation fails
 */
function assertPositiveInteger(value, label) {
  if (!Number.isInteger(value) || value <= 0) throw new Error(`Invalid ${label}: ${String(value)}`);
}
|
|
40
|
+
/**
 * Computes the default exponential back-off delay before a retry.
 *
 * The delay doubles with every retry: `baseDelayMs` for retry 1, twice that for
 * retry 2, four times for retry 3, and so on.
 *
 * Expects:
 * - `retryIndex` to be the 1-based retry number
 * - `baseDelayMs` to be the delay used for the first retry; defaults to 500
 *
 * Returns:
 * - the delay in milliseconds
 */
function autoRetryDelayMs(retryIndex, baseDelayMs = 500) {
  return baseDelayMs * 2 ** (retryIndex - 1);
}
|
|
43
|
+
/**
 * Resolves the delay to wait before one retry.
 *
 * Resolution:
 * - no `autoRetryDelay` configured: fall back to `autoRetryDelayMs(retryIndex)`
 * - numeric `autoRetryDelay`: use it as-is
 * - otherwise: call `autoRetryDelay(retryIndex)` and use its result
 */
function resolveAutoRetryDelay(policy, retryIndex) {
  const { autoRetryDelay } = policy;
  if (autoRetryDelay === null || autoRetryDelay === undefined) return autoRetryDelayMs(retryIndex);
  if (typeof autoRetryDelay === "number") return autoRetryDelay;
  return autoRetryDelay(retryIndex);
}
|
|
20
48
|
function emitCaseStart(hooks, payload) {
|
|
21
49
|
try {
|
|
22
50
|
hooks?.onCaseStart?.(payload);
|
|
@@ -27,9 +55,174 @@ function emitCaseEnd(hooks, payload) {
|
|
|
27
55
|
hooks?.onCaseEnd?.(payload);
|
|
28
56
|
} catch {}
|
|
29
57
|
}
|
|
58
|
+
function emitReporterEvent(hooks, payload) {
|
|
59
|
+
try {
|
|
60
|
+
hooks?.onEvent?.(payload);
|
|
61
|
+
} catch {}
|
|
62
|
+
}
|
|
63
|
+
/**
 * Creates the error used to signal that a case exceeded its timeout.
 *
 * Returns:
 * - an `Error` named `TimeoutError` whose message embeds the timeout in milliseconds
 */
function createCaseTimeoutError(timeout) {
  return Object.assign(new Error(`Case timed out after ${timeout}ms.`), { name: "TimeoutError" });
}
|
|
68
|
+
/**
 * Validates an execution policy and reduces it to its four known fields.
 *
 * Expects:
 * - `autoAttempt` and `autoRetry` to be non-negative integers when provided
 * - a numeric `autoRetryDelay` to be a non-negative number (other forms are not checked here)
 * - `timeout` to be a positive integer when provided
 * - `label` to prefix the setting name in validation error messages
 *
 * Returns:
 * - an object with `autoAttempt`, `autoRetry`, `autoRetryDelay` and `timeout`
 * - `undefined` when `policy` is nullish or none of the four fields is set
 */
function normalizeExecutionPolicy(policy, label) {
  if (policy === null || policy === undefined) return undefined;
  const { autoAttempt, autoRetry, autoRetryDelay, timeout } = policy;
  if (autoAttempt != null) assertNonNegativeInteger(autoAttempt, `${label} autoAttempt`);
  if (autoRetry != null) assertNonNegativeInteger(autoRetry, `${label} autoRetry`);
  if (typeof autoRetryDelay === "number") assertNonNegativeNumber(autoRetryDelay, `${label} autoRetryDelay`);
  if (timeout != null) assertPositiveInteger(timeout, `${label} timeout`);
  const hasAnySetting = [autoAttempt, autoRetry, autoRetryDelay, timeout].some((setting) => setting != null);
  if (!hasAnySetting) return undefined;
  return {
    autoAttempt,
    autoRetry,
    autoRetryDelay,
    timeout
  };
}
|
|
82
|
+
/**
 * Merges the case-level execution policy with the task-level one.
 *
 * For every field the case value wins, then the task value. `autoAttempt` and
 * `autoRetry` finally default to `0`; `autoRetryDelay` and `timeout` stay
 * `undefined` when neither level sets them.
 */
function resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy) {
  const casePolicy = taskCase.executionPolicy;
  const pick = (field) => casePolicy?.[field] ?? taskExecutionPolicy?.[field];
  return {
    autoAttempt: pick("autoAttempt") ?? 0,
    autoRetry: pick("autoRetry") ?? 0,
    autoRetryDelay: pick("autoRetryDelay"),
    timeout: pick("timeout")
  };
}
|
|
90
|
+
/**
 * Runs one registered case a single time inside a `vieval.case` telemetry span.
 *
 * Behavior:
 * - the runner receives the task context plus a per-case `matrix`, `metric`, `score` and abort `signal`
 * - `metric` and `score` calls are ignored once the run has settled or the signal is aborted
 * - when `timeout` is set, the run is raced against a timer that aborts the signal and rejects
 * - the timer is always cleared before returning
 *
 * Returns:
 * - `{ output, scoresByKind, state: "passed" }` when the runner completes
 * - `{ errorMessage, scoresByKind, state }` with `state` `"timeout"` or `"failed"` when the span rejects
 */
async function runCaseOnce(context, taskCase, index, timeout) {
  const scoresByKind = /* @__PURE__ */ new Map();
  const aborter = new AbortController();
  const telemetry = context.telemetry ?? createNoopTelemetryRuntime();
  const caseId = createTaskCaseReporterId(index, taskCase.name);
  const status = {
    settled: false,
    timedOut: false
  };
  let timer;

  // Late calls from a runner that outlived its timeout (or already settled) must be dropped.
  const isClosed = () => aborter.signal.aborted || status.settled;

  const buildCaseContext = () => ({
    ...context,
    matrix: {
      ...cloneCaseMatrix(context.task.matrix),
      inputs: taskCase.input
    },
    metric(name, value) {
      if (isClosed()) return;
      emitReporterEvent(context.reporterHooks, {
        caseId,
        data: {
          name,
          value
        },
        event: "task.case.metric"
      });
      telemetry.addEvent("vieval.case.metric", {
        name,
        value
      });
      if (canAttachMetricAsAttribute(value)) telemetry.setAttributes({ [name]: value });
    },
    score(score, kind = "exact") {
      if (isClosed()) return;
      assertValidScore(score);
      scoresByKind.set(kind, score);
      telemetry.addEvent("vieval.case.score", {
        "vieval.score.kind": kind,
        "vieval.score.value": score
      });
      emitReporterEvent(context.reporterHooks, {
        caseId,
        data: {
          kind,
          score
        },
        event: "task.case.score"
      });
    },
    signal: aborter.signal
  });

  // Races the pending run against a timer that flags the timeout, aborts the signal, then rejects.
  const raceAgainstTimeout = (pending) => {
    const expiry = new Promise((_, reject) => {
      timer = setTimeout(() => {
        status.timedOut = true;
        aborter.abort(createCaseTimeoutError(timeout));
        reject(createCaseTimeoutError(timeout));
      }, timeout);
    });
    return Promise.race([pending, expiry]);
  };

  const spanAttributes = {
    "vieval.case.id": caseId,
    "vieval.case.name": taskCase.name,
    "vieval.task.id": context.task.id,
    "vieval.task.name": context.task.entry.name
  };

  try {
    return await telemetry.withSpan("vieval.case", spanAttributes, async () => {
      const pending = Promise.resolve(taskCase.run(buildCaseContext()));
      const output = await (timeout != null ? raceAgainstTimeout(pending) : pending);
      status.settled = true;
      return {
        output,
        scoresByKind,
        state: "passed"
      };
    });
  } catch (error) {
    status.settled = true;
    const fallbackMessage = status.timedOut && timeout != null ? `Case timed out after ${timeout}ms.` : "Unknown case failure.";
    return {
      errorMessage: errorMessageFrom(error) ?? fallbackMessage,
      scoresByKind,
      state: status.timedOut ? "timeout" : "failed"
    };
  } finally {
    if (timer != null) clearTimeout(timer);
  }
}
|
|
181
|
+
/**
 * Executes one registered case, retrying it according to its resolved policy.
 *
 * Behavior:
 * - runs the case up to `autoRetry + 1` times and stops at the first `"passed"` outcome
 * - before every retry, waits for the resolved retry delay (validated as a non-negative number)
 * - emits a case-start reporter event before every run; retry metadata is included only when `autoRetry > 0`
 *
 * Returns:
 * - the passing outcome, or otherwise the outcome of the last run
 * - a generic `"failed"` outcome if no run took place
 */
async function executeRegisteredCase(context, taskCase, index, totalCases, taskExecutionPolicy) {
  const policy = resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy);
  let outcome;
  let retryIndex = 0;
  while (retryIndex <= policy.autoRetry) {
    if (retryIndex > 0) {
      const delayMs = resolveAutoRetryDelay(policy, retryIndex);
      assertNonNegativeNumber(delayMs, "autoRetryDelay result");
      if (delayMs > 0) await sleep(delayMs);
    }
    const retryFields = policy.autoRetry > 0 ? {
      autoRetry: policy.autoRetry,
      retryIndex
    } : {};
    const inputField = taskCase.input === void 0 ? {} : { input: taskCase.input };
    emitCaseStart(context.reporterHooks, {
      ...retryFields,
      index,
      ...inputField,
      name: taskCase.name,
      total: totalCases
    });
    outcome = await runCaseOnce(context, taskCase, index, policy.timeout);
    if (outcome.state === "passed") break;
    retryIndex += 1;
  }
  if (outcome != null) return outcome;
  return {
    errorMessage: "Unknown case failure.",
    scoresByKind: /* @__PURE__ */ new Map(),
    state: "failed"
  };
}
|
|
209
|
+
function collectCaseOutcomeScores(outcome, scoreBucketsByKind) {
|
|
210
|
+
if (outcome.state !== "passed") {
|
|
211
|
+
scoreBucketsByKind.exact.push(0);
|
|
212
|
+
return;
|
|
213
|
+
}
|
|
214
|
+
if (outcome.scoresByKind.size === 0) {
|
|
215
|
+
scoreBucketsByKind.exact.push(1);
|
|
216
|
+
return;
|
|
217
|
+
}
|
|
218
|
+
scoreBucketsByKind.exact.push(outcome.scoresByKind.get("exact") ?? 1);
|
|
219
|
+
const judgeScore = outcome.scoresByKind.get("judge");
|
|
220
|
+
if (judgeScore != null) scoreBucketsByKind.judge.push(judgeScore);
|
|
221
|
+
}
|
|
30
222
|
function createCaseBuilder(registeredCases) {
|
|
31
223
|
function registerCase(name, run, options) {
|
|
32
224
|
registeredCases.push({
|
|
225
|
+
executionPolicy: normalizeExecutionPolicy(options, "task case"),
|
|
33
226
|
input: options?.input,
|
|
34
227
|
name,
|
|
35
228
|
run
|
|
@@ -37,11 +230,15 @@ function createCaseBuilder(registeredCases) {
|
|
|
37
230
|
}
|
|
38
231
|
return {
|
|
39
232
|
caseOf: registerCase,
|
|
40
|
-
casesFromInputs(namePrefix, inputs, run) {
|
|
233
|
+
casesFromInputs(namePrefix, inputs, run, options) {
|
|
234
|
+
const queueKey = options?.concurrency == null ? void 0 : {};
|
|
41
235
|
inputs.forEach((input, index) => {
|
|
42
236
|
registeredCases.push({
|
|
237
|
+
concurrency: options?.concurrency,
|
|
238
|
+
executionPolicy: normalizeExecutionPolicy(options, "casesFromInputs"),
|
|
43
239
|
input,
|
|
44
240
|
name: `${namePrefix} #${index + 1}`,
|
|
241
|
+
queueKey,
|
|
45
242
|
run
|
|
46
243
|
});
|
|
47
244
|
});
|
|
@@ -64,6 +261,7 @@ function getActiveCases() {
|
|
|
64
261
|
}
|
|
65
262
|
function caseOf(name, run, options) {
|
|
66
263
|
getActiveCases().push({
|
|
264
|
+
executionPolicy: normalizeExecutionPolicy(options, "task case"),
|
|
67
265
|
input: options?.input,
|
|
68
266
|
name,
|
|
69
267
|
run
|
|
@@ -72,16 +270,40 @@ function caseOf(name, run, options) {
|
|
|
72
270
|
/**
|
|
73
271
|
* Registers multiple cases in the currently active task scope.
|
|
74
272
|
*/
|
|
75
|
-
function casesFromInputs(namePrefix, inputs, run) {
|
|
273
|
+
function casesFromInputs(namePrefix, inputs, run, options) {
  const concurrency = options?.concurrency;
  // One fresh object per call: cases from the same call share a queue key whenever a concurrency is set.
  const queueKey = concurrency == null ? void 0 : {};
  inputs.forEach((input, position) => {
    const activeCases = getActiveCases();
    activeCases.push({
      concurrency,
      executionPolicy: normalizeExecutionPolicy(options, "casesFromInputs"),
      input,
      name: `${namePrefix} #${position + 1}`,
      queueKey,
      run
    });
  });
}
|
|
84
286
|
/**
 * Resolves the effective case concurrency for one registered task case.
 *
 * Precedence (the first defined value wins):
 * 1. `runtimeConcurrency.case`
 * 2. `taskCase.concurrency`
 * 3. `taskConcurrency.case`
 *
 * Before:
 * - registered case override `2`, task default `4`, no runtime override
 * - registered case override `undefined`, task default `3`, no runtime override
 * - runtime override `1`, registered case override `2`, task default `4`
 *
 * After:
 * - `2`
 * - `3`
 * - `1`
 *
 * Returns:
 * - the resolved positive integer, or `undefined` when no level defines a cap
 *
 * Throws:
 * - `Error` when the resolved value is not a positive integer
 */
function resolveCaseConcurrency(taskCase, taskConcurrency, runtimeConcurrency) {
  const concurrency = runtimeConcurrency?.case ?? taskCase.concurrency ?? taskConcurrency?.case;
  if (concurrency == null) return;
  // Number.isInteger already rejects NaN, ±Infinity and non-number values.
  if (!Number.isInteger(concurrency) || concurrency <= 0) throw new Error(`Invalid task case concurrency: ${String(concurrency)}`);
  return concurrency;
}
|
|
303
|
+
/**
 * Picks the queue key for one registered case.
 *
 * Returns the case's own `queueKey` when it has one, otherwise `defaultQueueKey`.
 */
function resolveCaseQueueKey(taskCase, defaultQueueKey) {
  const { queueKey } = taskCase;
  return queueKey === null || queueKey === undefined ? defaultQueueKey : queueKey;
}
|
|
306
|
+
/**
|
|
85
307
|
* Defines one eval task with task/case semantics similar to Vitest.
|
|
86
308
|
*
|
|
87
309
|
* Use when:
|
|
@@ -98,10 +320,14 @@ function describeTask(name, build, options = {}) {
|
|
|
98
320
|
}
|
|
99
321
|
build();
|
|
100
322
|
});
|
|
323
|
+
const description = options.description ?? name;
|
|
324
|
+
const taskExecutionPolicy = normalizeExecutionPolicy(options, "describeTask");
|
|
101
325
|
const definition = defineEval({
|
|
102
|
-
description
|
|
326
|
+
description,
|
|
103
327
|
name,
|
|
104
328
|
task: defineTask({
|
|
329
|
+
concurrency: options.concurrency,
|
|
330
|
+
executionPolicy: taskExecutionPolicy,
|
|
105
331
|
id: name,
|
|
106
332
|
async run(context) {
|
|
107
333
|
if (registeredCases.length === 0) return { scores: [{
|
|
@@ -113,62 +339,68 @@ function describeTask(name, build, options = {}) {
|
|
|
113
339
|
exact: [],
|
|
114
340
|
judge: []
|
|
115
341
|
};
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
let errorMessage;
|
|
124
|
-
const caseId = createTaskCaseReporterId(index, taskCase.name);
|
|
125
|
-
const customScoresByKind = /* @__PURE__ */ new Map();
|
|
126
|
-
try {
|
|
127
|
-
await taskCase.run({
|
|
128
|
-
...context,
|
|
129
|
-
matrix: {
|
|
130
|
-
...cloneCaseMatrix(context.task.matrix),
|
|
131
|
-
inputs: taskCase.input
|
|
132
|
-
},
|
|
133
|
-
metric(name, value) {
|
|
134
|
-
context.reporterHooks?.onEvent?.({
|
|
135
|
-
caseId,
|
|
136
|
-
data: {
|
|
137
|
-
name,
|
|
138
|
-
value
|
|
139
|
-
},
|
|
140
|
-
event: "task.case.metric"
|
|
141
|
-
});
|
|
142
|
-
},
|
|
143
|
-
score(score, kind = "exact") {
|
|
144
|
-
assertValidScore(score);
|
|
145
|
-
customScoresByKind.set(kind, score);
|
|
146
|
-
}
|
|
147
|
-
});
|
|
148
|
-
} catch (error) {
|
|
149
|
-
state = "failed";
|
|
150
|
-
errorMessage = errorMessageFrom(error) ?? "Unknown case failure.";
|
|
151
|
-
} finally {
|
|
342
|
+
const defaultCaseQueueKey = {};
|
|
343
|
+
const caseQueues = /* @__PURE__ */ new Map();
|
|
344
|
+
const hasAutoAttempt = registeredCases.some((taskCase) => resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy).autoAttempt > 0);
|
|
345
|
+
const runtimeTaskConcurrency = context.task.entry.task?.concurrency ?? options.concurrency;
|
|
346
|
+
if (!hasAutoAttempt) await Promise.all(registeredCases.map(async (taskCase, index) => {
|
|
347
|
+
const executeCase = async () => {
|
|
348
|
+
const outcome = await executeRegisteredCase(context, taskCase, index, totalCases, taskExecutionPolicy);
|
|
152
349
|
emitCaseEnd(context.reporterHooks, {
|
|
153
|
-
...errorMessage == null ? {} : { errorMessage },
|
|
350
|
+
...outcome.errorMessage == null ? {} : { errorMessage: outcome.errorMessage },
|
|
154
351
|
index,
|
|
155
|
-
|
|
352
|
+
...outcome.output === void 0 ? {} : { output: outcome.output },
|
|
353
|
+
state: outcome.state,
|
|
156
354
|
name: taskCase.name,
|
|
157
355
|
total: totalCases
|
|
158
356
|
});
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
if (customScoresByKind.size === 0) {
|
|
165
|
-
scoreBucketsByKind.exact.push(1);
|
|
357
|
+
collectCaseOutcomeScores(outcome, scoreBucketsByKind);
|
|
358
|
+
};
|
|
359
|
+
const concurrency = resolveCaseConcurrency(taskCase, runtimeTaskConcurrency, context.runtimeConcurrency);
|
|
360
|
+
if (concurrency == null) {
|
|
361
|
+
await executeCase();
|
|
166
362
|
return;
|
|
167
363
|
}
|
|
168
|
-
|
|
169
|
-
const
|
|
170
|
-
|
|
364
|
+
const queueKey = resolveCaseQueueKey(taskCase, defaultCaseQueueKey);
|
|
365
|
+
const queue = caseQueues.get(queueKey) ?? createSchedulerQueue(concurrency);
|
|
366
|
+
caseQueues.set(queueKey, queue);
|
|
367
|
+
await queue.run(executeCase);
|
|
171
368
|
}));
|
|
369
|
+
else {
|
|
370
|
+
let finalOutcomes = [];
|
|
371
|
+
let attemptIndex = 0;
|
|
372
|
+
for (;;) {
|
|
373
|
+
finalOutcomes = await Promise.all(registeredCases.map(async (taskCase, index) => {
|
|
374
|
+
const executeCase = async () => await executeRegisteredCase(context, taskCase, index, totalCases, taskExecutionPolicy);
|
|
375
|
+
const concurrency = resolveCaseConcurrency(taskCase, runtimeTaskConcurrency, context.runtimeConcurrency);
|
|
376
|
+
if (concurrency == null) return await executeCase();
|
|
377
|
+
const queueKey = resolveCaseQueueKey(taskCase, defaultCaseQueueKey);
|
|
378
|
+
const queue = caseQueues.get(queueKey) ?? createSchedulerQueue(concurrency);
|
|
379
|
+
caseQueues.set(queueKey, queue);
|
|
380
|
+
return await queue.run(executeCase);
|
|
381
|
+
}));
|
|
382
|
+
if (!finalOutcomes.some((outcome, index) => {
|
|
383
|
+
if (outcome.state === "passed") return false;
|
|
384
|
+
const taskCase = registeredCases[index];
|
|
385
|
+
if (taskCase == null) return false;
|
|
386
|
+
return attemptIndex < resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy).autoAttempt;
|
|
387
|
+
})) break;
|
|
388
|
+
attemptIndex += 1;
|
|
389
|
+
}
|
|
390
|
+
finalOutcomes.forEach((outcome, index) => {
|
|
391
|
+
const taskCase = registeredCases[index];
|
|
392
|
+
if (taskCase == null) return;
|
|
393
|
+
emitCaseEnd(context.reporterHooks, {
|
|
394
|
+
...outcome.errorMessage == null ? {} : { errorMessage: outcome.errorMessage },
|
|
395
|
+
index,
|
|
396
|
+
...outcome.output === void 0 ? {} : { output: outcome.output },
|
|
397
|
+
state: outcome.state,
|
|
398
|
+
name: taskCase.name,
|
|
399
|
+
total: totalCases
|
|
400
|
+
});
|
|
401
|
+
collectCaseOutcomeScores(outcome, scoreBucketsByKind);
|
|
402
|
+
});
|
|
403
|
+
}
|
|
172
404
|
return { scores: Object.keys(scoreBucketsByKind).filter((kind) => scoreBucketsByKind[kind].length > 0).map((kind) => {
|
|
173
405
|
const values = scoreBucketsByKind[kind];
|
|
174
406
|
return {
|