vieval 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. package/README.md +219 -109
  2. package/dist/bin/vieval.mjs +1 -1
  3. package/dist/cli/index.mjs +1 -1
  4. package/dist/{cli-DayPXzHX.mjs → cli-ImxGpoYQ.mjs} +1447 -195
  5. package/dist/cli-ImxGpoYQ.mjs.map +1 -0
  6. package/dist/config.d.mts +2 -2
  7. package/dist/config.mjs +1 -1
  8. package/dist/core/assertions/index.d.mts +1 -1
  9. package/dist/core/inference-executors/index.d.mts +1 -1
  10. package/dist/core/inference-executors/index.mjs +1 -1
  11. package/dist/core/processors/results/index.d.mts +1 -1
  12. package/dist/core/runner/index.d.mts +3 -2
  13. package/dist/core/runner/index.mjs +3 -2
  14. package/dist/core/runner/index.mjs.map +1 -1
  15. package/dist/core/scheduler/index.d.mts +2 -0
  16. package/dist/core/scheduler/index.mjs +188 -0
  17. package/dist/core/scheduler/index.mjs.map +1 -0
  18. package/dist/{env-BFSjny07.mjs → env--94B0UtW.mjs} +1 -1
  19. package/dist/{env-BFSjny07.mjs.map → env--94B0UtW.mjs.map} +1 -1
  20. package/dist/{env-BTq3dV7C.d.mts → env-BeHv_5mo.d.mts} +1 -1
  21. package/dist/{expect-extensions-QLXESWjn.mjs → expect-extensions-DCSqlneN.mjs} +1 -1
  22. package/dist/{expect-extensions-QLXESWjn.mjs.map → expect-extensions-DCSqlneN.mjs.map} +1 -1
  23. package/dist/expect.mjs +1 -1
  24. package/dist/{index-OEdqjQSe.d.mts → index-5R1_k2nv.d.mts} +195 -3
  25. package/dist/index-fakXoZEe.d.mts +147 -0
  26. package/dist/index.d.mts +120 -13
  27. package/dist/index.mjs +286 -54
  28. package/dist/index.mjs.map +1 -1
  29. package/dist/{models-D_MsBtYw.mjs → models-DIGdOUpJ.mjs} +1 -1
  30. package/dist/models-DIGdOUpJ.mjs.map +1 -0
  31. package/dist/plugins/chat-models/index.d.mts +27 -1
  32. package/dist/plugins/chat-models/index.mjs +29 -1
  33. package/dist/plugins/chat-models/index.mjs.map +1 -1
  34. package/dist/queue-DsZQkZO_.mjs +21 -0
  35. package/dist/queue-DsZQkZO_.mjs.map +1 -0
  36. package/dist/{registry-CwcMMjnZ.mjs → registry-BHGMxjpA.mjs} +164 -6
  37. package/dist/registry-BHGMxjpA.mjs.map +1 -0
  38. package/dist/testing/expect-extensions.mjs +1 -1
  39. package/package.json +8 -1
  40. package/dist/cli-DayPXzHX.mjs.map +0 -1
  41. package/dist/models-D_MsBtYw.mjs.map +0 -1
  42. package/dist/registry-CwcMMjnZ.mjs.map +0 -1
package/dist/index.d.mts CHANGED
@@ -1,5 +1,5 @@
1
- import { H as RunResult, M as ModelDefinition, S as TaskRunOutput, W as RunScoreKind, Z as ScheduledTask, k as TaskExecutionContext, l as MatrixDefinition, q as InferenceExecutor, t as ConfigHookPlugin, u as MatrixLayer, x as TaskRunContext } from "./index-OEdqjQSe.mjs";
2
- import { a as requiredEnvFrom } from "./env-BTq3dV7C.mjs";
1
+ import { D as TaskRunContext, I as TaskExecutionContext, O as TaskRunOutput, S as TaskConcurrencyConfig, Y as RunResult, Z as RunScoreKind, et as InferenceExecutor, f as MatrixDefinition, it as ScheduledTask, k as TelemetryAttributeValue, o as CliReportingConfig, p as MatrixLayer, t as ConfigHookPlugin, w as TaskExecutionPolicy, z as ModelDefinition } from "./index-5R1_k2nv.mjs";
2
+ import { a as requiredEnvFrom } from "./env-BeHv_5mo.mjs";
3
3
  import { expect } from "./expect.mjs";
4
4
  import * as _$c12 from "c12";
5
5
 
@@ -66,6 +66,41 @@ type VievalVitestCompatReporterReference = VievalVitestCompatReporterValue | rea
66
66
  * CLI plugin shape bound to the full CLI config object.
67
67
  */
68
68
  type CliConfigPlugin = ConfigHookPlugin<CliConfig>;
69
+ /**
70
+ * Concurrency limits that can be declared in CLI-facing config.
71
+ *
72
+ * Use when:
73
+ * - the CLI needs independent caps for workspace, project, task, attempt, or case scheduling scopes
74
+ * - config authors want to define concurrency without wiring runtime execution yet
75
+ *
76
+ * Expects:
77
+ * - each provided value to be a positive integer chosen by the caller
78
+ *
79
+ * Returns:
80
+ * - one partial concurrency descriptor keyed by scheduling scope
81
+ */
82
+ interface CliConcurrencyConfig {
83
+ /**
84
+ * Workspace-level concurrency cap.
85
+ */
86
+ workspace?: number;
87
+ /**
88
+ * Project-level concurrency cap.
89
+ */
90
+ project?: number;
91
+ /**
92
+ * Task-level concurrency cap.
93
+ */
94
+ task?: number;
95
+ /**
96
+ * Attempt-level concurrency cap.
97
+ */
98
+ attempt?: number;
99
+ /**
100
+ * Case-level concurrency cap.
101
+ */
102
+ case?: number;
103
+ }
69
104
  /**
70
105
  * Defines one project block for `vieval run`.
71
106
  */
@@ -115,6 +150,12 @@ interface CliProjectConfig {
115
150
  * Optional eval-time matrix dimensions.
116
151
  */
117
152
  evalMatrix?: MatrixDefinition | MatrixLayer;
153
+ /**
154
+ * Optional project-scoped concurrency overrides.
155
+ *
156
+ * @default inherited from top-level or CLI execution settings
157
+ */
158
+ concurrency?: Omit<CliConcurrencyConfig, 'workspace'>;
118
159
  /**
119
160
  * Optional task executor.
120
161
  *
@@ -218,9 +259,13 @@ interface CliComparisonConfig {
218
259
  * Expects:
219
260
  * - `model` resolves configured models for the current task
220
261
  * - `reporterHooks` follows `TaskRunContext['reporterHooks']`
262
+ * - `telemetry` follows `TaskRunContext['telemetry']`
263
+ * - `runtimeConcurrency` follows `TaskRunContext['runtimeConcurrency']`
221
264
  */
222
265
  interface CliProjectExecutorContext extends TaskExecutionContext {
223
266
  reporterHooks?: TaskRunContext['reporterHooks'];
267
+ telemetry?: TaskRunContext['telemetry'];
268
+ runtimeConcurrency?: TaskRunContext['runtimeConcurrency'];
224
269
  }
225
270
  /**
226
271
  * Top-level CLI config loaded from `vieval.config.*`.
@@ -232,6 +277,19 @@ interface CliConfigBase {
232
277
  * @default []
233
278
  */
234
279
  models?: ModelDefinition[];
280
+ /**
281
+ * Global concurrency defaults inherited by projects and tasks.
282
+ *
283
+ * Use when:
284
+ * - config authors want one shared concurrency policy across workspace, project, task, attempt, and case scopes
285
+ * - project-local overrides should start from a top-level baseline
286
+ *
287
+ * Expects:
288
+ * - each provided value to be a positive integer chosen by the caller
289
+ *
290
+ * @default undefined
291
+ */
292
+ concurrency?: CliConcurrencyConfig;
235
293
  /**
236
294
  * Global config plugins.
237
295
  *
@@ -254,6 +312,12 @@ interface CliConfigBase {
254
312
  * @default {}
255
313
  */
256
314
  env?: NodeJS.ProcessEnv;
315
+ /**
316
+ * Optional reporting integrations shared by CLI run orchestration.
317
+ *
318
+ * @default undefined
319
+ */
320
+ reporting?: CliReportingConfig;
257
321
  }
258
322
  /**
259
323
  * Project mode config for `vieval run`.
@@ -344,12 +408,44 @@ interface CaseRunContext<TInput> extends TaskRunContext {
344
408
  * - `name` to be a stable metric identifier
345
409
  * - `value` to be JSON-serializable
346
410
  */
347
- metric: (name: string, value: boolean | number | string | null) => void;
411
+ metric: (name: string, value: TelemetryAttributeValue) => void;
412
+ /**
413
+ * Cooperative abort signal for the current case execution.
414
+ */
415
+ signal: AbortSignal;
348
416
  }
349
417
  /**
350
418
  * Callback for one task case.
351
419
  */
352
- type CaseRunner<TInput> = (context: CaseRunContext<TInput>) => Promise<void> | void;
420
+ type CaseRunner<TInput> = (context: CaseRunContext<TInput>) => Promise<unknown> | unknown;
421
+ /**
422
+ * Per-group options for `casesFromInputs`.
423
+ *
424
+ * Use when:
425
+ * - one generated case group should run with a lower case concurrency than the task default
426
+ * - a task should keep a broader task-level cap while one expensive case family stays bounded
427
+ *
428
+ * Expects:
429
+ * - `concurrency` to be a positive integer when provided
430
+ *
431
+ * Returns:
432
+ * - one partial case-group execution descriptor
433
+ */
434
+ interface CasesFromInputsOptions extends TaskExecutionPolicy {
435
+ /**
436
+ * Case-level concurrency cap for cases registered by one `casesFromInputs(...)` call.
437
+ */
438
+ concurrency?: number;
439
+ }
440
+ /**
441
+ * Per-case registration options for `caseOf`.
442
+ */
443
+ interface CaseRegistrationOptions<TInput> extends TaskExecutionPolicy {
444
+ /**
445
+ * Optional case input payload.
446
+ */
447
+ input: TInput;
448
+ }
353
449
  /**
354
450
  * Builder callbacks passed into `describeTask`.
355
451
  */
@@ -359,35 +455,44 @@ interface DescribeTaskBuilder {
359
455
  */
360
456
  caseOf: {
361
457
  (name: string, run: CaseRunner<undefined>): void;
362
- <TInput>(name: string, run: CaseRunner<TInput>, options: {
363
- input: TInput;
364
- }): void;
458
+ <TInput>(name: string, run: CaseRunner<TInput>, options: CaseRegistrationOptions<TInput>): void;
365
459
  };
366
460
  /**
367
461
  * Registers multiple cases from input list.
368
462
  */
369
- casesFromInputs: <TInput>(namePrefix: string, inputs: readonly TInput[], run: CaseRunner<TInput>) => void;
463
+ casesFromInputs: <TInput>(namePrefix: string, inputs: readonly TInput[], run: CaseRunner<TInput>, options?: CasesFromInputsOptions) => void;
370
464
  }
371
465
  /**
372
466
  * Options for `describeTask`.
373
467
  */
374
- interface DescribeTaskOptions {
468
+ interface DescribeTaskOptions extends TaskExecutionPolicy {
375
469
  /**
376
470
  * Optional description override.
377
471
  */
378
472
  description?: string;
473
+ /**
474
+ * Optional task-local concurrency overrides.
475
+ *
476
+ * Use when:
477
+ * - one task should cap attempt fan-out independently from the surrounding project
478
+ * - one task should cap case fan-out without changing global scheduling defaults
479
+ *
480
+ * Expects:
481
+ * - each provided value to be a positive integer
482
+ *
483
+ * @default inherited from project or CLI concurrency settings
484
+ */
485
+ concurrency?: TaskConcurrencyConfig;
379
486
  }
380
487
  /**
381
488
  * Registers one case in the currently active task scope.
382
489
  */
383
490
  declare function caseOf(name: string, run: CaseRunner<undefined>): void;
384
- declare function caseOf<TInput>(name: string, run: CaseRunner<TInput>, options: {
385
- input: TInput;
386
- }): void;
491
+ declare function caseOf<TInput>(name: string, run: CaseRunner<TInput>, options: CaseRegistrationOptions<TInput>): void;
387
492
  /**
388
493
  * Registers multiple cases in the currently active task scope.
389
494
  */
390
- declare function casesFromInputs<TInput>(namePrefix: string, inputs: readonly TInput[], run: CaseRunner<TInput>): void;
495
+ declare function casesFromInputs<TInput>(namePrefix: string, inputs: readonly TInput[], run: CaseRunner<TInput>, options?: CasesFromInputsOptions): void;
391
496
  /**
392
497
  * Defines one eval task with task/case semantics similar to Vitest.
393
498
  *
@@ -399,6 +504,8 @@ declare function describeTask(name: string, build: ((builder: DescribeTaskBuilde
399
504
  readonly description: string;
400
505
  readonly name: string;
401
506
  readonly task: {
507
+ readonly concurrency: TaskConcurrencyConfig | undefined;
508
+ readonly executionPolicy: TaskExecutionPolicy | undefined;
402
509
  readonly id: string;
403
510
  readonly run: (context: TaskRunContext) => Promise<TaskRunOutput>;
404
511
  };
package/dist/index.mjs CHANGED
@@ -1,8 +1,9 @@
1
- import { a as defineConfig, i as registerEvalDefinition, s as loadEnv } from "./registry-CwcMMjnZ.mjs";
2
- import { n as requiredEnvFrom } from "./env-BFSjny07.mjs";
1
+ import { i as registerEvalDefinition, l as loadEnv, o as createNoopTelemetryRuntime, s as defineConfig } from "./registry-BHGMxjpA.mjs";
2
+ import { t as createSchedulerQueue } from "./queue-DsZQkZO_.mjs";
3
+ import { n as requiredEnvFrom } from "./env--94B0UtW.mjs";
3
4
  import { defineEval, defineTask } from "./config.mjs";
4
5
  import { expect } from "./expect.mjs";
5
- import { errorMessageFrom } from "@moeru/std";
6
+ import { errorMessageFrom, sleep } from "@moeru/std";
6
7
  //#region src/dsl/task.ts
7
8
  function cloneCaseMatrix(matrix) {
8
9
  return {
@@ -14,9 +15,36 @@ function cloneCaseMatrix(matrix) {
14
15
  function createTaskCaseReporterId(index, name) {
15
16
  return `${index}:${encodeURIComponent(name)}`;
16
17
  }
18
+ function isTelemetryAttributeScalar(value) {
19
+ return typeof value === "boolean" || typeof value === "number" || typeof value === "string";
20
+ }
21
+ function isTelemetryAttributeArray(value) {
22
+ return value.every(isTelemetryAttributeScalar);
23
+ }
24
+ function canAttachMetricAsAttribute(value) {
25
+ if (isTelemetryAttributeScalar(value)) return true;
26
+ return Array.isArray(value) && isTelemetryAttributeArray(value);
27
+ }
17
28
  function assertValidScore(score) {
18
29
  if (!Number.isFinite(score) || score < 0 || score > 1) throw new Error(`Case score must be a finite number in range 0..1, got "${score}".`);
19
30
  }
31
+ function assertNonNegativeInteger(value, label) {
32
+ if (!Number.isFinite(value) || !Number.isInteger(value) || value < 0) throw new Error(`Invalid ${label}: ${String(value)}`);
33
+ }
34
+ function assertNonNegativeNumber(value, label) {
35
+ if (!Number.isFinite(value) || value < 0) throw new Error(`Invalid ${label}: ${String(value)}`);
36
+ }
37
+ function assertPositiveInteger(value, label) {
38
+ if (!Number.isFinite(value) || !Number.isInteger(value) || value <= 0) throw new Error(`Invalid ${label}: ${String(value)}`);
39
+ }
40
+ function autoRetryDelayMs(retryIndex) {
41
+ return 500 * 2 ** (retryIndex - 1);
42
+ }
43
+ function resolveAutoRetryDelay(policy, retryIndex) {
44
+ const delay = policy.autoRetryDelay;
45
+ if (delay == null) return autoRetryDelayMs(retryIndex);
46
+ return typeof delay === "number" ? delay : delay(retryIndex);
47
+ }
20
48
  function emitCaseStart(hooks, payload) {
21
49
  try {
22
50
  hooks?.onCaseStart?.(payload);
@@ -27,9 +55,174 @@ function emitCaseEnd(hooks, payload) {
27
55
  hooks?.onCaseEnd?.(payload);
28
56
  } catch {}
29
57
  }
58
+ function emitReporterEvent(hooks, payload) {
59
+ try {
60
+ hooks?.onEvent?.(payload);
61
+ } catch {}
62
+ }
63
+ function createCaseTimeoutError(timeout) {
64
+ const error = /* @__PURE__ */ new Error(`Case timed out after ${timeout}ms.`);
65
+ error.name = "TimeoutError";
66
+ return error;
67
+ }
68
+ function normalizeExecutionPolicy(policy, label) {
69
+ if (policy == null) return;
70
+ if (policy.autoAttempt != null) assertNonNegativeInteger(policy.autoAttempt, `${label} autoAttempt`);
71
+ if (policy.autoRetry != null) assertNonNegativeInteger(policy.autoRetry, `${label} autoRetry`);
72
+ if (typeof policy.autoRetryDelay === "number") assertNonNegativeNumber(policy.autoRetryDelay, `${label} autoRetryDelay`);
73
+ if (policy.timeout != null) assertPositiveInteger(policy.timeout, `${label} timeout`);
74
+ const normalized = {
75
+ autoAttempt: policy.autoAttempt,
76
+ autoRetry: policy.autoRetry,
77
+ autoRetryDelay: policy.autoRetryDelay,
78
+ timeout: policy.timeout
79
+ };
80
+ return Object.values(normalized).some((value) => value != null) ? normalized : void 0;
81
+ }
82
+ function resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy) {
83
+ return {
84
+ autoAttempt: taskCase.executionPolicy?.autoAttempt ?? taskExecutionPolicy?.autoAttempt ?? 0,
85
+ autoRetry: taskCase.executionPolicy?.autoRetry ?? taskExecutionPolicy?.autoRetry ?? 0,
86
+ autoRetryDelay: taskCase.executionPolicy?.autoRetryDelay ?? taskExecutionPolicy?.autoRetryDelay,
87
+ timeout: taskCase.executionPolicy?.timeout ?? taskExecutionPolicy?.timeout
88
+ };
89
+ }
90
+ async function runCaseOnce(context, taskCase, index, timeout) {
91
+ const customScoresByKind = /* @__PURE__ */ new Map();
92
+ const abortController = new AbortController();
93
+ const telemetry = context.telemetry ?? createNoopTelemetryRuntime();
94
+ const caseId = createTaskCaseReporterId(index, taskCase.name);
95
+ let timeoutHandle;
96
+ let timedOut = false;
97
+ let settled = false;
98
+ try {
99
+ return await telemetry.withSpan("vieval.case", {
100
+ "vieval.case.id": caseId,
101
+ "vieval.case.name": taskCase.name,
102
+ "vieval.task.id": context.task.id,
103
+ "vieval.task.name": context.task.entry.name
104
+ }, async () => {
105
+ const runPromise = Promise.resolve(taskCase.run({
106
+ ...context,
107
+ matrix: {
108
+ ...cloneCaseMatrix(context.task.matrix),
109
+ inputs: taskCase.input
110
+ },
111
+ metric(name, value) {
112
+ if (abortController.signal.aborted || settled) return;
113
+ emitReporterEvent(context.reporterHooks, {
114
+ caseId,
115
+ data: {
116
+ name,
117
+ value
118
+ },
119
+ event: "task.case.metric"
120
+ });
121
+ telemetry.addEvent("vieval.case.metric", {
122
+ name,
123
+ value
124
+ });
125
+ if (canAttachMetricAsAttribute(value)) telemetry.setAttributes({ [name]: value });
126
+ },
127
+ score(score, kind = "exact") {
128
+ if (abortController.signal.aborted || settled) return;
129
+ assertValidScore(score);
130
+ customScoresByKind.set(kind, score);
131
+ telemetry.addEvent("vieval.case.score", {
132
+ "vieval.score.kind": kind,
133
+ "vieval.score.value": score
134
+ });
135
+ emitReporterEvent(context.reporterHooks, {
136
+ caseId,
137
+ data: {
138
+ kind,
139
+ score
140
+ },
141
+ event: "task.case.score"
142
+ });
143
+ },
144
+ signal: abortController.signal
145
+ }));
146
+ if (timeout != null) {
147
+ const timeoutPromise = new Promise((_, reject) => {
148
+ timeoutHandle = setTimeout(() => {
149
+ timedOut = true;
150
+ abortController.abort(createCaseTimeoutError(timeout));
151
+ reject(createCaseTimeoutError(timeout));
152
+ }, timeout);
153
+ });
154
+ const output = await Promise.race([runPromise, timeoutPromise]);
155
+ settled = true;
156
+ return {
157
+ output,
158
+ scoresByKind: customScoresByKind,
159
+ state: "passed"
160
+ };
161
+ }
162
+ const output = await runPromise;
163
+ settled = true;
164
+ return {
165
+ output,
166
+ scoresByKind: customScoresByKind,
167
+ state: "passed"
168
+ };
169
+ });
170
+ } catch (error) {
171
+ settled = true;
172
+ return {
173
+ errorMessage: errorMessageFrom(error) ?? (timedOut && timeout != null ? `Case timed out after ${timeout}ms.` : "Unknown case failure."),
174
+ scoresByKind: customScoresByKind,
175
+ state: timedOut ? "timeout" : "failed"
176
+ };
177
+ } finally {
178
+ if (timeoutHandle != null) clearTimeout(timeoutHandle);
179
+ }
180
+ }
181
+ async function executeRegisteredCase(context, taskCase, index, totalCases, taskExecutionPolicy) {
182
+ const resolvedPolicy = resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy);
183
+ let lastOutcome;
184
+ for (let retryIndex = 0; retryIndex <= resolvedPolicy.autoRetry; retryIndex += 1) {
185
+ if (retryIndex > 0) {
186
+ const retryDelayMs = resolveAutoRetryDelay(resolvedPolicy, retryIndex);
187
+ assertNonNegativeNumber(retryDelayMs, "autoRetryDelay result");
188
+ if (retryDelayMs > 0) await sleep(retryDelayMs);
189
+ }
190
+ emitCaseStart(context.reporterHooks, {
191
+ ...resolvedPolicy.autoRetry > 0 ? {
192
+ autoRetry: resolvedPolicy.autoRetry,
193
+ retryIndex
194
+ } : {},
195
+ index,
196
+ ...taskCase.input === void 0 ? {} : { input: taskCase.input },
197
+ name: taskCase.name,
198
+ total: totalCases
199
+ });
200
+ lastOutcome = await runCaseOnce(context, taskCase, index, resolvedPolicy.timeout);
201
+ if (lastOutcome.state === "passed") return lastOutcome;
202
+ }
203
+ return lastOutcome ?? {
204
+ errorMessage: "Unknown case failure.",
205
+ scoresByKind: /* @__PURE__ */ new Map(),
206
+ state: "failed"
207
+ };
208
+ }
209
+ function collectCaseOutcomeScores(outcome, scoreBucketsByKind) {
210
+ if (outcome.state !== "passed") {
211
+ scoreBucketsByKind.exact.push(0);
212
+ return;
213
+ }
214
+ if (outcome.scoresByKind.size === 0) {
215
+ scoreBucketsByKind.exact.push(1);
216
+ return;
217
+ }
218
+ scoreBucketsByKind.exact.push(outcome.scoresByKind.get("exact") ?? 1);
219
+ const judgeScore = outcome.scoresByKind.get("judge");
220
+ if (judgeScore != null) scoreBucketsByKind.judge.push(judgeScore);
221
+ }
30
222
  function createCaseBuilder(registeredCases) {
31
223
  function registerCase(name, run, options) {
32
224
  registeredCases.push({
225
+ executionPolicy: normalizeExecutionPolicy(options, "task case"),
33
226
  input: options?.input,
34
227
  name,
35
228
  run
@@ -37,11 +230,15 @@ function createCaseBuilder(registeredCases) {
37
230
  }
38
231
  return {
39
232
  caseOf: registerCase,
40
- casesFromInputs(namePrefix, inputs, run) {
233
+ casesFromInputs(namePrefix, inputs, run, options) {
234
+ const queueKey = options?.concurrency == null ? void 0 : {};
41
235
  inputs.forEach((input, index) => {
42
236
  registeredCases.push({
237
+ concurrency: options?.concurrency,
238
+ executionPolicy: normalizeExecutionPolicy(options, "casesFromInputs"),
43
239
  input,
44
240
  name: `${namePrefix} #${index + 1}`,
241
+ queueKey,
45
242
  run
46
243
  });
47
244
  });
@@ -64,6 +261,7 @@ function getActiveCases() {
64
261
  }
65
262
  function caseOf(name, run, options) {
66
263
  getActiveCases().push({
264
+ executionPolicy: normalizeExecutionPolicy(options, "task case"),
67
265
  input: options?.input,
68
266
  name,
69
267
  run
@@ -72,16 +270,40 @@ function caseOf(name, run, options) {
72
270
  /**
73
271
  * Registers multiple cases in the currently active task scope.
74
272
  */
75
- function casesFromInputs(namePrefix, inputs, run) {
273
+ function casesFromInputs(namePrefix, inputs, run, options) {
274
+ const queueKey = options?.concurrency == null ? void 0 : {};
76
275
  inputs.forEach((input, index) => {
77
276
  getActiveCases().push({
277
+ concurrency: options?.concurrency,
278
+ executionPolicy: normalizeExecutionPolicy(options, "casesFromInputs"),
78
279
  input,
79
280
  name: `${namePrefix} #${index + 1}`,
281
+ queueKey,
80
282
  run
81
283
  });
82
284
  });
83
285
  }
84
286
  /**
287
+ * Resolves the effective case concurrency for one registered task case.
288
+ *
289
+ * Before:
290
+ * - registered case override `2`, task default `4`
291
+ * - registered case override `undefined`, task default `3`
292
+ *
293
+ * After:
294
+ * - `2`
295
+ * - `3`
296
+ */
297
+ function resolveCaseConcurrency(taskCase, taskConcurrency, runtimeConcurrency) {
298
+ const concurrency = runtimeConcurrency?.case ?? taskCase.concurrency ?? taskConcurrency?.case;
299
+ if (concurrency == null) return;
300
+ if (!Number.isFinite(concurrency) || !Number.isInteger(concurrency) || concurrency <= 0) throw new Error(`Invalid task case concurrency: ${String(concurrency)}`);
301
+ return concurrency;
302
+ }
303
+ function resolveCaseQueueKey(taskCase, defaultQueueKey) {
304
+ return taskCase.queueKey ?? defaultQueueKey;
305
+ }
306
+ /**
85
307
  * Defines one eval task with task/case semantics similar to Vitest.
86
308
  *
87
309
  * Use when:
@@ -98,10 +320,14 @@ function describeTask(name, build, options = {}) {
98
320
  }
99
321
  build();
100
322
  });
323
+ const description = options.description ?? name;
324
+ const taskExecutionPolicy = normalizeExecutionPolicy(options, "describeTask");
101
325
  const definition = defineEval({
102
- description: options.description ?? name,
326
+ description,
103
327
  name,
104
328
  task: defineTask({
329
+ concurrency: options.concurrency,
330
+ executionPolicy: taskExecutionPolicy,
105
331
  id: name,
106
332
  async run(context) {
107
333
  if (registeredCases.length === 0) return { scores: [{
@@ -113,62 +339,68 @@ function describeTask(name, build, options = {}) {
113
339
  exact: [],
114
340
  judge: []
115
341
  };
116
- await Promise.all(registeredCases.map(async (taskCase, index) => {
117
- emitCaseStart(context.reporterHooks, {
118
- index,
119
- name: taskCase.name,
120
- total: totalCases
121
- });
122
- let state = "passed";
123
- let errorMessage;
124
- const caseId = createTaskCaseReporterId(index, taskCase.name);
125
- const customScoresByKind = /* @__PURE__ */ new Map();
126
- try {
127
- await taskCase.run({
128
- ...context,
129
- matrix: {
130
- ...cloneCaseMatrix(context.task.matrix),
131
- inputs: taskCase.input
132
- },
133
- metric(name, value) {
134
- context.reporterHooks?.onEvent?.({
135
- caseId,
136
- data: {
137
- name,
138
- value
139
- },
140
- event: "task.case.metric"
141
- });
142
- },
143
- score(score, kind = "exact") {
144
- assertValidScore(score);
145
- customScoresByKind.set(kind, score);
146
- }
147
- });
148
- } catch (error) {
149
- state = "failed";
150
- errorMessage = errorMessageFrom(error) ?? "Unknown case failure.";
151
- } finally {
342
+ const defaultCaseQueueKey = {};
343
+ const caseQueues = /* @__PURE__ */ new Map();
344
+ const hasAutoAttempt = registeredCases.some((taskCase) => resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy).autoAttempt > 0);
345
+ const runtimeTaskConcurrency = context.task.entry.task?.concurrency ?? options.concurrency;
346
+ if (!hasAutoAttempt) await Promise.all(registeredCases.map(async (taskCase, index) => {
347
+ const executeCase = async () => {
348
+ const outcome = await executeRegisteredCase(context, taskCase, index, totalCases, taskExecutionPolicy);
152
349
  emitCaseEnd(context.reporterHooks, {
153
- ...errorMessage == null ? {} : { errorMessage },
350
+ ...outcome.errorMessage == null ? {} : { errorMessage: outcome.errorMessage },
154
351
  index,
155
- state,
352
+ ...outcome.output === void 0 ? {} : { output: outcome.output },
353
+ state: outcome.state,
156
354
  name: taskCase.name,
157
355
  total: totalCases
158
356
  });
159
- }
160
- if (state === "failed") {
161
- scoreBucketsByKind.exact.push(0);
162
- return;
163
- }
164
- if (customScoresByKind.size === 0) {
165
- scoreBucketsByKind.exact.push(1);
357
+ collectCaseOutcomeScores(outcome, scoreBucketsByKind);
358
+ };
359
+ const concurrency = resolveCaseConcurrency(taskCase, runtimeTaskConcurrency, context.runtimeConcurrency);
360
+ if (concurrency == null) {
361
+ await executeCase();
166
362
  return;
167
363
  }
168
- scoreBucketsByKind.exact.push(customScoresByKind.get("exact") ?? 1);
169
- const judgeScore = customScoresByKind.get("judge");
170
- if (judgeScore != null) scoreBucketsByKind.judge.push(judgeScore);
364
+ const queueKey = resolveCaseQueueKey(taskCase, defaultCaseQueueKey);
365
+ const queue = caseQueues.get(queueKey) ?? createSchedulerQueue(concurrency);
366
+ caseQueues.set(queueKey, queue);
367
+ await queue.run(executeCase);
171
368
  }));
369
+ else {
370
+ let finalOutcomes = [];
371
+ let attemptIndex = 0;
372
+ for (;;) {
373
+ finalOutcomes = await Promise.all(registeredCases.map(async (taskCase, index) => {
374
+ const executeCase = async () => await executeRegisteredCase(context, taskCase, index, totalCases, taskExecutionPolicy);
375
+ const concurrency = resolveCaseConcurrency(taskCase, runtimeTaskConcurrency, context.runtimeConcurrency);
376
+ if (concurrency == null) return await executeCase();
377
+ const queueKey = resolveCaseQueueKey(taskCase, defaultCaseQueueKey);
378
+ const queue = caseQueues.get(queueKey) ?? createSchedulerQueue(concurrency);
379
+ caseQueues.set(queueKey, queue);
380
+ return await queue.run(executeCase);
381
+ }));
382
+ if (!finalOutcomes.some((outcome, index) => {
383
+ if (outcome.state === "passed") return false;
384
+ const taskCase = registeredCases[index];
385
+ if (taskCase == null) return false;
386
+ return attemptIndex < resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy).autoAttempt;
387
+ })) break;
388
+ attemptIndex += 1;
389
+ }
390
+ finalOutcomes.forEach((outcome, index) => {
391
+ const taskCase = registeredCases[index];
392
+ if (taskCase == null) return;
393
+ emitCaseEnd(context.reporterHooks, {
394
+ ...outcome.errorMessage == null ? {} : { errorMessage: outcome.errorMessage },
395
+ index,
396
+ ...outcome.output === void 0 ? {} : { output: outcome.output },
397
+ state: outcome.state,
398
+ name: taskCase.name,
399
+ total: totalCases
400
+ });
401
+ collectCaseOutcomeScores(outcome, scoreBucketsByKind);
402
+ });
403
+ }
172
404
  return { scores: Object.keys(scoreBucketsByKind).filter((kind) => scoreBucketsByKind[kind].length > 0).map((kind) => {
173
405
  const values = scoreBucketsByKind[kind];
174
406
  return {