vieval 0.0.6 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -495,15 +495,6 @@ interface ModelDefinition {
495
495
  declare function resolveModelByName(models: readonly ModelDefinition[], name: string): ModelDefinition | undefined;
496
496
  //#endregion
497
497
  //#region src/core/runner/task-context.d.ts
498
- /**
499
- * Options for selecting a model from the execution context.
500
- */
501
- interface TaskModelSelectionOptions {
502
- /**
503
- * Model id or alias name.
504
- */
505
- name: string;
506
- }
507
498
  /**
508
499
  * Task-scoped execution context exposed to runner executors.
509
500
  */
@@ -513,13 +504,9 @@ interface TaskExecutionContext {
513
504
  */
514
505
  cache: TaskCacheRuntime;
515
506
  /**
516
- * Resolves model configuration for the current task.
517
- *
518
- * Use when:
519
- * - no arguments are provided to use the model selected by run matrix/inferenceExecutor
520
- * - `name` is provided to resolve a specific model id or alias
507
+ * Configured model registrations available to model plugins.
521
508
  */
522
- model: (selection?: string | TaskModelSelectionOptions) => ModelDefinition;
509
+ models: readonly ModelDefinition[];
523
510
  }
524
511
  /**
525
512
  * Inputs used to build task execution context.
@@ -530,14 +517,13 @@ interface CreateTaskExecutionContextOptions {
530
517
  task: ScheduledTask;
531
518
  }
532
519
  /**
533
- * Creates task-scoped model resolver context for runner execution.
520
+ * Creates task-scoped context data for runner execution.
534
521
  *
535
522
  * Call stack:
536
523
  *
537
524
  * {@link runScheduledTasks}
538
525
  * -> {@link createTaskExecutionContext}
539
- * -> {@link resolveModelByName}
540
- * -> `task.model()` / `task.model({ name })`
526
+ * -> `TaskExecutionContext`
541
527
  */
542
528
  declare function createTaskExecutionContext(options: CreateTaskExecutionContextOptions): TaskExecutionContext;
543
529
  //#endregion
@@ -581,7 +567,7 @@ interface RunScheduledTasksOptions {
581
567
  * Creates per-task execution context.
582
568
  *
583
569
  * Use when:
584
- * - executor code needs per-task model resolution or other task-scoped data
570
+ * - executor code needs per-task models, cache, or other task-scoped data
585
571
  */
586
572
  createExecutionContext?: (task: ScheduledTask) => TaskExecutionContext;
587
573
  /**
@@ -646,7 +632,39 @@ declare class RunnerExecutionError extends Error {
646
632
  */
647
633
  declare function runScheduledTasks(tasks: readonly ScheduledTask[], executor: ScheduledTaskExecutor, options?: RunScheduledTasksOptions): Promise<AggregatedRunResults>;
648
634
  //#endregion
635
+ //#region src/core/telemetry/types.d.ts
636
+ /** JSON-compatible scalar values accepted as telemetry attributes. */
637
+ type TelemetryAttributeValue = boolean | number | string | null | readonly TelemetryAttributeValue[];
638
+ /** Attribute map shared by local report projection and OpenTelemetry span calls. */
639
+ type TelemetryAttributes = Record<string, TelemetryAttributeValue | undefined>;
640
+ /**
641
+ * Internal Vieval telemetry runtime.
642
+ *
643
+ * Use when:
644
+ * - runner code needs one execution path for disabled and enabled telemetry
645
+ * - case code should run inside an active OpenTelemetry span when configured
646
+ *
647
+ * Expects:
648
+ * - attributes are JSON-compatible and stable enough for report filtering
649
+ * - callbacks are awaited by the caller
650
+ *
651
+ * Returns:
652
+ * - callback result, preserving thrown errors after telemetry records them
653
+ */
654
+ interface TelemetryRuntime {
655
+ withSpan: <T>(name: string, attributes: TelemetryAttributes, callback: () => Promise<T>) => Promise<T>;
656
+ addEvent: (name: string, attributes?: TelemetryAttributes) => void;
657
+ setAttributes: (attributes: TelemetryAttributes) => void;
658
+ recordException: (error: unknown) => void;
659
+ }
660
+ //#endregion
649
661
  //#region src/config/types.d.ts
662
+ /**
663
+ * Value that can be returned directly or through a promise.
664
+ *
665
+ * @param T - Resolved value type.
666
+ */
667
+ type Awaitable<T> = Promise<T> | T;
650
668
  /**
651
669
  * Primitive value allowed in one matrix cell.
652
670
  *
@@ -862,6 +880,12 @@ interface TaskRunOutput {
862
880
  */
863
881
  scores: readonly RunScore[];
864
882
  }
883
+ /**
884
+ * Delay policy for retries within one task case attempt.
885
+ *
886
+ * @param retryIndex Retry number where `1` is the first retry after the initial failure.
887
+ */
888
+ type TaskAutoRetryDelay = number | ((retryIndex: number) => number);
865
889
  /**
866
890
  * Execution policy applied to task and case callbacks.
867
891
  *
@@ -883,6 +907,15 @@ interface TaskExecutionPolicy {
883
907
  * @default 0
884
908
  */
885
909
  autoRetry?: number;
910
+ /**
911
+ * Delay in milliseconds before a case auto retry starts.
912
+ *
913
+ * A number applies the same delay to every retry. A function receives the
914
+ * retry index where `1` is the first retry after the initial failure.
915
+ *
916
+ * @default retryIndex => 500 * 2 ** (retryIndex - 1)
917
+ */
918
+ autoRetryDelay?: TaskAutoRetryDelay;
886
919
  /**
887
920
  * Additional full task attempts allowed after the current attempt settles.
888
921
  *
@@ -917,6 +950,30 @@ interface TaskConcurrencyConfig {
917
950
  */
918
951
  case?: number;
919
952
  }
953
+ /**
954
+ * Reporting configuration for local artifacts and optional OpenTelemetry integration.
955
+ */
956
+ interface CliReportingConfig {
957
+ /**
958
+ * Optional OpenTelemetry API integration.
959
+ */
960
+ openTelemetry?: CliOpenTelemetryReportingConfig;
961
+ }
962
+ /**
963
+ * OpenTelemetry reporting configuration managed by user config setup.
964
+ */
965
+ interface CliOpenTelemetryReportingConfig {
966
+ /**
967
+ * Enables Vieval active span wrapping through `@opentelemetry/api`.
968
+ *
969
+ * @default false
970
+ */
971
+ enabled?: boolean;
972
+ /**
973
+ * Called after all telemetry events and local report artifacts have been emitted.
974
+ */
975
+ onRunEnd?: () => Awaitable<void>;
976
+ }
920
977
  /**
921
978
  * Runtime context passed into eval task `run`.
922
979
  */
@@ -964,24 +1021,13 @@ interface TaskRunContext {
964
1021
  */
965
1022
  task: ScheduledTask;
966
1023
  /**
967
- * Matrix-scoped model resolver.
968
- *
969
- * Runtime impact:
970
- * - `context.model()` uses `context.task.matrix.run.model` first when present
971
- * - then falls back to inferenceExecutor-id match
972
- * - then falls back to first configured model
1024
+ * Configured model registrations available to model plugins.
973
1025
  *
974
- * @example
975
- * ```ts
976
- * // matrix.run.model = 'gpt-4.1-mini'
977
- * const defaultModel = context.model()
978
- * // resolves the configured model whose id/model/alias matches 'gpt-4.1-mini'
979
- *
980
- * const judgeModel = context.model({ name: 'judge-large' })
981
- * // explicit lookup bypasses matrix default
982
- * ```
1026
+ * Use when:
1027
+ * - a plugin owns model selection semantics and needs access to registered models
1028
+ * - eval code resolves matrix-selected model axes through plugin helpers
983
1029
  */
984
- model: TaskExecutionContext['model'];
1030
+ models: TaskExecutionContext['models'];
985
1031
  /**
986
1032
  * Optional reporter lifecycle hooks for task-local case events.
987
1033
  *
@@ -992,6 +1038,17 @@ interface TaskRunContext {
992
1038
  * - hooks are best-effort observers and should not affect task scoring
993
1039
  */
994
1040
  reporterHooks?: TaskReporterHooks;
1041
+ /**
1042
+ * Optional telemetry runtime shared by runner, DSL, and reporter integrations.
1043
+ *
1044
+ * Use when:
1045
+ * - task execution should emit events to the currently active telemetry runtime
1046
+ * - enabled and disabled telemetry should keep the same execution path
1047
+ *
1048
+ * Expects:
1049
+ * - callers inject a no-op runtime when telemetry is disabled
1050
+ */
1051
+ telemetry?: TelemetryRuntime;
995
1052
  /**
996
1053
  * Optional runtime scheduling overrides supplied by CLI or host execution.
997
1054
  *
@@ -1036,6 +1093,10 @@ interface TaskCaseReporterPayload {
1036
1093
  * Maximum retry count configured for this case.
1037
1094
  */
1038
1095
  autoRetry?: number;
1096
+ /**
1097
+ * Optional case input payload registered by the task DSL.
1098
+ */
1099
+ input?: unknown;
1039
1100
  /**
1040
1101
  * Declared case label.
1041
1102
  */
@@ -1066,6 +1127,10 @@ interface TaskCaseReporterPayload {
1066
1127
  * - `state` describes the final case result
1067
1128
  */
1068
1129
  interface TaskCaseReporterEndPayload extends TaskCaseReporterPayload {
1130
+ /**
1131
+ * Optional case output returned by the task case callback.
1132
+ */
1133
+ output?: unknown;
1069
1134
  /**
1070
1135
  * Final case state.
1071
1136
  */
@@ -1288,5 +1353,5 @@ interface ConfigHookPlugin<TConfig> {
1288
1353
  configVievalResolved?: (config: TConfig) => void | Promise<void>;
1289
1354
  }
1290
1355
  //#endregion
1291
- export { ScheduledTask as $, CreateTaskExecutionContextOptions as A, createRunnerRuntimeContext as B, TaskRunContext as C, RunnerTaskState as D, RunnerExecutionError as E, resolveModelByName as F, RunScore as G, AggregatedRunResults as H, asProjectRelativePath as I, CreateRunnerScheduleOptions as J, RunScoreKind as K, collectEvalEntries as L, TaskModelSelectionOptions as M, createTaskExecutionContext as N, ScheduledTaskExecutor as O, ModelDefinition as P, RunnerMatrixSelection as Q, CreateVievalRunnerRuntimeContextOptions as R, TaskReporterHooks as S, RunScheduledTasksOptions as T, AggregatedRunSummary as U, AggregatedProviderSummary as V, RunResult as W, RunnerMatrixDefinition as X, InferenceExecutor as Y, RunnerMatrixInput as Z, TaskCaseState as _, EvalDefinition as a, normalizeCacheFilePathSegments as at, TaskExecutionPolicy as b, MatrixAxisValues as c, CacheNamespace as ct, MatrixPrimitive as d, ScheduledTaskMatrix as et, MatrixRow as f, TaskCaseReporterPayload as g, TaskCaseReporterEndPayload as h, CollectedEvalEntry as i, createFilesystemTaskCacheRuntime as it, TaskExecutionContext as j, runScheduledTasks as k, MatrixDefinition as l, TaskCacheRuntime as lt, ScopedMatrices as m, defineEval as n, createRunnerSchedule as nt, EvalModule as o, CacheFileHandle as ot, MatrixValue as p, aggregateRunResults as q, defineTask as r, CreateFilesystemTaskCacheRuntimeOptions as rt, EvalModuleMap as s, CacheFileOptions as st, ConfigHookPlugin as t, ScheduledTaskMatrixMeta as tt, MatrixLayer as u, TaskConcurrencyConfig as v, TaskRunOutput as w, TaskReporterEventPayload as x, TaskDefinition as y, RunnerRuntimeContext as z };
1292
- //# sourceMappingURL=index-DBZKkpBe.d.mts.map
1356
+ export { InferenceExecutor as $, RunScheduledTasksOptions as A, asProjectRelativePath as B, TaskDefinition as C, TaskRunContext as D, TaskReporterHooks as E, CreateTaskExecutionContextOptions as F, AggregatedProviderSummary as G, CreateVievalRunnerRuntimeContextOptions as H, TaskExecutionContext as I, RunResult as J, AggregatedRunResults as K, createTaskExecutionContext as L, RunnerTaskState as M, ScheduledTaskExecutor as N, TaskRunOutput as O, runScheduledTasks as P, CreateRunnerScheduleOptions as Q, ModelDefinition as R, TaskConcurrencyConfig as S, TaskReporterEventPayload as T, RunnerRuntimeContext as U, collectEvalEntries as V, createRunnerRuntimeContext as W, RunScoreKind as X, RunScore as Y, aggregateRunResults as Z, ScopedMatrices as _, CliOpenTelemetryReportingConfig as a, ScheduledTaskMatrixMeta as at, TaskCaseReporterPayload as b, EvalDefinition as c, createFilesystemTaskCacheRuntime as ct, MatrixAxisValues as d, CacheFileOptions as dt, RunnerMatrixDefinition as et, MatrixDefinition as f, CacheNamespace as ft, MatrixValue as g, MatrixRow as h, Awaitable as i, ScheduledTaskMatrix as it, RunnerExecutionError as j, TelemetryAttributeValue as k, EvalModule as l, normalizeCacheFilePathSegments as lt, MatrixPrimitive as m, defineEval as n, RunnerMatrixSelection as nt, CliReportingConfig as o, createRunnerSchedule as ot, MatrixLayer as p, TaskCacheRuntime as pt, AggregatedRunSummary as q, defineTask as r, ScheduledTask as rt, CollectedEvalEntry as s, CreateFilesystemTaskCacheRuntimeOptions as st, ConfigHookPlugin as t, RunnerMatrixInput as tt, EvalModuleMap as u, CacheFileHandle as ut, TaskAutoRetryDelay as v, TaskExecutionPolicy as w, TaskCaseState as x, TaskCaseReporterEndPayload as y, resolveModelByName as z };
1357
+ //# sourceMappingURL=index-BkjyCInx.d.mts.map
package/dist/index.d.mts CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as ScheduledTask, C as TaskRunContext, K as RunScoreKind, P as ModelDefinition, W as RunResult, Y as InferenceExecutor, b as TaskExecutionPolicy, j as TaskExecutionContext, l as MatrixDefinition, t as ConfigHookPlugin, u as MatrixLayer, v as TaskConcurrencyConfig, w as TaskRunOutput } from "./index-DBZKkpBe.mjs";
1
+ import { $ as InferenceExecutor, D as TaskRunContext, I as TaskExecutionContext, J as RunResult, O as TaskRunOutput, R as ModelDefinition, S as TaskConcurrencyConfig, X as RunScoreKind, f as MatrixDefinition, k as TelemetryAttributeValue, o as CliReportingConfig, p as MatrixLayer, rt as ScheduledTask, t as ConfigHookPlugin, w as TaskExecutionPolicy } from "./index-BkjyCInx.mjs";
2
2
  import { a as requiredEnvFrom } from "./env-BeHv_5mo.mjs";
3
3
  import { expect } from "./expect.mjs";
4
4
  import * as _$c12 from "c12";
@@ -137,7 +137,7 @@ interface CliProjectConfig {
137
137
  * Model definitions available to project runtime execution.
138
138
  *
139
139
  * Inference executors control schedule fan-out, while models provide
140
- * runtime lookup metadata for `context.model(...)` during task execution.
140
+ * runtime lookup metadata for model plugin helpers during task execution.
141
141
  *
142
142
  * @default inherited from top-level config models
143
143
  */
@@ -253,16 +253,18 @@ interface CliComparisonConfig {
253
253
  * Execution context exposed to project-level `executor` implementations.
254
254
  *
255
255
  * Use when:
256
- * - a project executor needs the task-scoped model resolver plus case reporter hooks
256
+ * - a project executor needs task-scoped models plus case reporter hooks
257
257
  * - custom scheduling logic wants the same hook shape as `TaskRunContext`
258
258
  *
259
259
  * Expects:
260
- * - `model` resolves configured models for the current task
260
+ * - `models` exposes configured model registrations for plugin helpers
261
261
  * - `reporterHooks` follows `TaskRunContext['reporterHooks']`
262
+ * - `telemetry` follows `TaskRunContext['telemetry']`
262
263
  * - `runtimeConcurrency` follows `TaskRunContext['runtimeConcurrency']`
263
264
  */
264
265
  interface CliProjectExecutorContext extends TaskExecutionContext {
265
266
  reporterHooks?: TaskRunContext['reporterHooks'];
267
+ telemetry?: TaskRunContext['telemetry'];
266
268
  runtimeConcurrency?: TaskRunContext['runtimeConcurrency'];
267
269
  }
268
270
  /**
@@ -310,6 +312,12 @@ interface CliConfigBase {
310
312
  * @default {}
311
313
  */
312
314
  env?: NodeJS.ProcessEnv;
315
+ /**
316
+ * Optional reporting integrations shared by CLI run orchestration.
317
+ *
318
+ * @default undefined
319
+ */
320
+ reporting?: CliReportingConfig;
313
321
  }
314
322
  /**
315
323
  * Project mode config for `vieval run`.
@@ -400,7 +408,7 @@ interface CaseRunContext<TInput> extends TaskRunContext {
400
408
  * - `name` to be a stable metric identifier
401
409
  * - `value` to be JSON-serializable
402
410
  */
403
- metric: (name: string, value: boolean | number | string | null) => void;
411
+ metric: (name: string, value: TelemetryAttributeValue) => void;
404
412
  /**
405
413
  * Cooperative abort signal for the current case execution.
406
414
  */
@@ -409,7 +417,7 @@ interface CaseRunContext<TInput> extends TaskRunContext {
409
417
  /**
410
418
  * Callback for one task case.
411
419
  */
412
- type CaseRunner<TInput> = (context: CaseRunContext<TInput>) => Promise<void> | void;
420
+ type CaseRunner<TInput> = (context: CaseRunContext<TInput>) => Promise<unknown> | unknown;
413
421
  /**
414
422
  * Per-group options for `casesFromInputs`.
415
423
  *
package/dist/index.mjs CHANGED
@@ -1,9 +1,9 @@
1
- import { a as defineConfig, i as registerEvalDefinition, s as loadEnv } from "./registry-CcKZqDJY.mjs";
1
+ import { i as registerEvalDefinition, l as loadEnv, o as createNoopTelemetryRuntime, s as defineConfig } from "./registry-BHGMxjpA.mjs";
2
2
  import { t as createSchedulerQueue } from "./queue-DsZQkZO_.mjs";
3
- import { n as requiredEnvFrom } from "./env--94B0UtW.mjs";
3
+ import { n as requiredEnvFrom } from "./env-BFSjny07.mjs";
4
4
  import { defineEval, defineTask } from "./config.mjs";
5
5
  import { expect } from "./expect.mjs";
6
- import { errorMessageFrom } from "@moeru/std";
6
+ import { errorMessageFrom, sleep } from "@moeru/std";
7
7
  //#region src/dsl/task.ts
8
8
  function cloneCaseMatrix(matrix) {
9
9
  return {
@@ -15,15 +15,36 @@ function cloneCaseMatrix(matrix) {
15
15
  function createTaskCaseReporterId(index, name) {
16
16
  return `${index}:${encodeURIComponent(name)}`;
17
17
  }
18
+ function isTelemetryAttributeScalar(value) {
19
+ return typeof value === "boolean" || typeof value === "number" || typeof value === "string";
20
+ }
21
+ function isTelemetryAttributeArray(value) {
22
+ return value.every(isTelemetryAttributeScalar);
23
+ }
24
+ function canAttachMetricAsAttribute(value) {
25
+ if (isTelemetryAttributeScalar(value)) return true;
26
+ return Array.isArray(value) && isTelemetryAttributeArray(value);
27
+ }
18
28
  function assertValidScore(score) {
19
29
  if (!Number.isFinite(score) || score < 0 || score > 1) throw new Error(`Case score must be a finite number in range 0..1, got "${score}".`);
20
30
  }
21
31
  function assertNonNegativeInteger(value, label) {
22
32
  if (!Number.isFinite(value) || !Number.isInteger(value) || value < 0) throw new Error(`Invalid ${label}: ${String(value)}`);
23
33
  }
34
+ function assertNonNegativeNumber(value, label) {
35
+ if (!Number.isFinite(value) || value < 0) throw new Error(`Invalid ${label}: ${String(value)}`);
36
+ }
24
37
  function assertPositiveInteger(value, label) {
25
38
  if (!Number.isFinite(value) || !Number.isInteger(value) || value <= 0) throw new Error(`Invalid ${label}: ${String(value)}`);
26
39
  }
40
+ function autoRetryDelayMs(retryIndex) {
41
+ return 500 * 2 ** (retryIndex - 1);
42
+ }
43
+ function resolveAutoRetryDelay(policy, retryIndex) {
44
+ const delay = policy.autoRetryDelay;
45
+ if (delay == null) return autoRetryDelayMs(retryIndex);
46
+ return typeof delay === "number" ? delay : delay(retryIndex);
47
+ }
27
48
  function emitCaseStart(hooks, payload) {
28
49
  try {
29
50
  hooks?.onCaseStart?.(payload);
@@ -34,6 +55,11 @@ function emitCaseEnd(hooks, payload) {
34
55
  hooks?.onCaseEnd?.(payload);
35
56
  } catch {}
36
57
  }
58
+ function emitReporterEvent(hooks, payload) {
59
+ try {
60
+ hooks?.onEvent?.(payload);
61
+ } catch {}
62
+ }
37
63
  function createCaseTimeoutError(timeout) {
38
64
  const error = /* @__PURE__ */ new Error(`Case timed out after ${timeout}ms.`);
39
65
  error.name = "TimeoutError";
@@ -43,10 +69,12 @@ function normalizeExecutionPolicy(policy, label) {
43
69
  if (policy == null) return;
44
70
  if (policy.autoAttempt != null) assertNonNegativeInteger(policy.autoAttempt, `${label} autoAttempt`);
45
71
  if (policy.autoRetry != null) assertNonNegativeInteger(policy.autoRetry, `${label} autoRetry`);
72
+ if (typeof policy.autoRetryDelay === "number") assertNonNegativeNumber(policy.autoRetryDelay, `${label} autoRetryDelay`);
46
73
  if (policy.timeout != null) assertPositiveInteger(policy.timeout, `${label} timeout`);
47
74
  const normalized = {
48
75
  autoAttempt: policy.autoAttempt,
49
76
  autoRetry: policy.autoRetry,
77
+ autoRetryDelay: policy.autoRetryDelay,
50
78
  timeout: policy.timeout
51
79
  };
52
80
  return Object.values(normalized).some((value) => value != null) ? normalized : void 0;
@@ -55,55 +83,90 @@ function resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy) {
55
83
  return {
56
84
  autoAttempt: taskCase.executionPolicy?.autoAttempt ?? taskExecutionPolicy?.autoAttempt ?? 0,
57
85
  autoRetry: taskCase.executionPolicy?.autoRetry ?? taskExecutionPolicy?.autoRetry ?? 0,
86
+ autoRetryDelay: taskCase.executionPolicy?.autoRetryDelay ?? taskExecutionPolicy?.autoRetryDelay,
58
87
  timeout: taskCase.executionPolicy?.timeout ?? taskExecutionPolicy?.timeout
59
88
  };
60
89
  }
61
90
  async function runCaseOnce(context, taskCase, index, timeout) {
62
91
  const customScoresByKind = /* @__PURE__ */ new Map();
63
92
  const abortController = new AbortController();
93
+ const telemetry = context.telemetry ?? createNoopTelemetryRuntime();
94
+ const caseId = createTaskCaseReporterId(index, taskCase.name);
64
95
  let timeoutHandle;
65
96
  let timedOut = false;
66
97
  let settled = false;
67
98
  try {
68
- const runPromise = Promise.resolve(taskCase.run({
69
- ...context,
70
- matrix: {
71
- ...cloneCaseMatrix(context.task.matrix),
72
- inputs: taskCase.input
73
- },
74
- metric(name, value) {
75
- if (abortController.signal.aborted || settled) return;
76
- context.reporterHooks?.onEvent?.({
77
- caseId: createTaskCaseReporterId(index, taskCase.name),
78
- data: {
99
+ return await telemetry.withSpan("vieval.case", {
100
+ "vieval.case.id": caseId,
101
+ "vieval.case.name": taskCase.name,
102
+ "vieval.task.id": context.task.id,
103
+ "vieval.task.name": context.task.entry.name
104
+ }, async () => {
105
+ const runPromise = Promise.resolve(taskCase.run({
106
+ ...context,
107
+ matrix: {
108
+ ...cloneCaseMatrix(context.task.matrix),
109
+ inputs: taskCase.input
110
+ },
111
+ metric(name, value) {
112
+ if (abortController.signal.aborted || settled) return;
113
+ emitReporterEvent(context.reporterHooks, {
114
+ caseId,
115
+ data: {
116
+ name,
117
+ value
118
+ },
119
+ event: "task.case.metric"
120
+ });
121
+ telemetry.addEvent("vieval.case.metric", {
79
122
  name,
80
123
  value
81
- },
82
- event: "task.case.metric"
124
+ });
125
+ if (canAttachMetricAsAttribute(value)) telemetry.setAttributes({ [name]: value });
126
+ },
127
+ score(score, kind = "exact") {
128
+ if (abortController.signal.aborted || settled) return;
129
+ assertValidScore(score);
130
+ customScoresByKind.set(kind, score);
131
+ telemetry.addEvent("vieval.case.score", {
132
+ "vieval.score.kind": kind,
133
+ "vieval.score.value": score
134
+ });
135
+ emitReporterEvent(context.reporterHooks, {
136
+ caseId,
137
+ data: {
138
+ kind,
139
+ score
140
+ },
141
+ event: "task.case.score"
142
+ });
143
+ },
144
+ signal: abortController.signal
145
+ }));
146
+ if (timeout != null) {
147
+ const timeoutPromise = new Promise((_, reject) => {
148
+ timeoutHandle = setTimeout(() => {
149
+ timedOut = true;
150
+ abortController.abort(createCaseTimeoutError(timeout));
151
+ reject(createCaseTimeoutError(timeout));
152
+ }, timeout);
83
153
  });
84
- },
85
- score(score, kind = "exact") {
86
- if (abortController.signal.aborted || settled) return;
87
- assertValidScore(score);
88
- customScoresByKind.set(kind, score);
89
- },
90
- signal: abortController.signal
91
- }));
92
- if (timeout != null) {
93
- const timeoutPromise = new Promise((_, reject) => {
94
- timeoutHandle = setTimeout(() => {
95
- timedOut = true;
96
- abortController.abort(createCaseTimeoutError(timeout));
97
- reject(createCaseTimeoutError(timeout));
98
- }, timeout);
99
- });
100
- await Promise.race([runPromise, timeoutPromise]);
101
- } else await runPromise;
102
- settled = true;
103
- return {
104
- scoresByKind: customScoresByKind,
105
- state: "passed"
106
- };
154
+ const output = await Promise.race([runPromise, timeoutPromise]);
155
+ settled = true;
156
+ return {
157
+ output,
158
+ scoresByKind: customScoresByKind,
159
+ state: "passed"
160
+ };
161
+ }
162
+ const output = await runPromise;
163
+ settled = true;
164
+ return {
165
+ output,
166
+ scoresByKind: customScoresByKind,
167
+ state: "passed"
168
+ };
169
+ });
107
170
  } catch (error) {
108
171
  settled = true;
109
172
  return {
@@ -119,12 +182,18 @@ async function executeRegisteredCase(context, taskCase, index, totalCases, taskE
119
182
  const resolvedPolicy = resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy);
120
183
  let lastOutcome;
121
184
  for (let retryIndex = 0; retryIndex <= resolvedPolicy.autoRetry; retryIndex += 1) {
185
+ if (retryIndex > 0) {
186
+ const retryDelayMs = resolveAutoRetryDelay(resolvedPolicy, retryIndex);
187
+ assertNonNegativeNumber(retryDelayMs, "autoRetryDelay result");
188
+ if (retryDelayMs > 0) await sleep(retryDelayMs);
189
+ }
122
190
  emitCaseStart(context.reporterHooks, {
123
191
  ...resolvedPolicy.autoRetry > 0 ? {
124
192
  autoRetry: resolvedPolicy.autoRetry,
125
193
  retryIndex
126
194
  } : {},
127
195
  index,
196
+ ...taskCase.input === void 0 ? {} : { input: taskCase.input },
128
197
  name: taskCase.name,
129
198
  total: totalCases
130
199
  });
@@ -280,6 +349,7 @@ function describeTask(name, build, options = {}) {
280
349
  emitCaseEnd(context.reporterHooks, {
281
350
  ...outcome.errorMessage == null ? {} : { errorMessage: outcome.errorMessage },
282
351
  index,
352
+ ...outcome.output === void 0 ? {} : { output: outcome.output },
283
353
  state: outcome.state,
284
354
  name: taskCase.name,
285
355
  total: totalCases
@@ -323,6 +393,7 @@ function describeTask(name, build, options = {}) {
323
393
  emitCaseEnd(context.reporterHooks, {
324
394
  ...outcome.errorMessage == null ? {} : { errorMessage: outcome.errorMessage },
325
395
  index,
396
+ ...outcome.output === void 0 ? {} : { output: outcome.output },
326
397
  state: outcome.state,
327
398
  name: taskCase.name,
328
399
  total: totalCases