vieval 0.0.6 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +219 -109
- package/dist/bin/vieval.mjs +1 -1
- package/dist/cli/index.mjs +1 -1
- package/dist/{cli-sanbKtQq.mjs → cli-Dao25VxV.mjs} +1186 -162
- package/dist/cli-Dao25VxV.mjs.map +1 -0
- package/dist/config.d.mts +2 -2
- package/dist/config.mjs +1 -1
- package/dist/core/assertions/index.d.mts +1 -1
- package/dist/core/inference-executors/index.mjs +1 -1
- package/dist/core/processors/results/index.d.mts +1 -1
- package/dist/core/runner/index.d.mts +2 -2
- package/dist/core/runner/index.mjs +6 -40
- package/dist/core/runner/index.mjs.map +1 -1
- package/dist/{env--94B0UtW.mjs → env-BFSjny07.mjs} +1 -1
- package/dist/{env--94B0UtW.mjs.map → env-BFSjny07.mjs.map} +1 -1
- package/dist/{index-DBZKkpBe.d.mts → index-BkjyCInx.d.mts} +102 -37
- package/dist/index.d.mts +14 -6
- package/dist/index.mjs +110 -39
- package/dist/index.mjs.map +1 -1
- package/dist/{models-DIGdOUpJ.mjs → models-pBSRUZhY.mjs} +1 -1
- package/dist/{models-DIGdOUpJ.mjs.map → models-pBSRUZhY.mjs.map} +1 -1
- package/dist/plugins/chat-models/index.d.mts +69 -6
- package/dist/plugins/chat-models/index.mjs +62 -6
- package/dist/plugins/chat-models/index.mjs.map +1 -1
- package/dist/{registry-CcKZqDJY.mjs → registry-BHGMxjpA.mjs} +140 -4
- package/dist/registry-BHGMxjpA.mjs.map +1 -0
- package/package.json +2 -1
- package/dist/cli-sanbKtQq.mjs.map +0 -1
- package/dist/registry-CcKZqDJY.mjs.map +0 -1
|
@@ -495,15 +495,6 @@ interface ModelDefinition {
|
|
|
495
495
|
declare function resolveModelByName(models: readonly ModelDefinition[], name: string): ModelDefinition | undefined;
|
|
496
496
|
//#endregion
|
|
497
497
|
//#region src/core/runner/task-context.d.ts
|
|
498
|
-
/**
|
|
499
|
-
* Options for selecting a model from the execution context.
|
|
500
|
-
*/
|
|
501
|
-
interface TaskModelSelectionOptions {
|
|
502
|
-
/**
|
|
503
|
-
* Model id or alias name.
|
|
504
|
-
*/
|
|
505
|
-
name: string;
|
|
506
|
-
}
|
|
507
498
|
/**
|
|
508
499
|
* Task-scoped execution context exposed to runner executors.
|
|
509
500
|
*/
|
|
@@ -513,13 +504,9 @@ interface TaskExecutionContext {
|
|
|
513
504
|
*/
|
|
514
505
|
cache: TaskCacheRuntime;
|
|
515
506
|
/**
|
|
516
|
-
*
|
|
517
|
-
*
|
|
518
|
-
* Use when:
|
|
519
|
-
* - no arguments are provided to use the model selected by run matrix/inferenceExecutor
|
|
520
|
-
* - `name` is provided to resolve a specific model id or alias
|
|
507
|
+
* Configured model registrations available to model plugins.
|
|
521
508
|
*/
|
|
522
|
-
|
|
509
|
+
models: readonly ModelDefinition[];
|
|
523
510
|
}
|
|
524
511
|
/**
|
|
525
512
|
* Inputs used to build task execution context.
|
|
@@ -530,14 +517,13 @@ interface CreateTaskExecutionContextOptions {
|
|
|
530
517
|
task: ScheduledTask;
|
|
531
518
|
}
|
|
532
519
|
/**
|
|
533
|
-
* Creates task-scoped
|
|
520
|
+
* Creates task-scoped context data for runner execution.
|
|
534
521
|
*
|
|
535
522
|
* Call stack:
|
|
536
523
|
*
|
|
537
524
|
* {@link runScheduledTasks}
|
|
538
525
|
* -> {@link createTaskExecutionContext}
|
|
539
|
-
* ->
|
|
540
|
-
* -> `task.model()` / `task.model({ name })`
|
|
526
|
+
* -> `TaskExecutionContext`
|
|
541
527
|
*/
|
|
542
528
|
declare function createTaskExecutionContext(options: CreateTaskExecutionContextOptions): TaskExecutionContext;
|
|
543
529
|
//#endregion
|
|
@@ -581,7 +567,7 @@ interface RunScheduledTasksOptions {
|
|
|
581
567
|
* Creates per-task execution context.
|
|
582
568
|
*
|
|
583
569
|
* Use when:
|
|
584
|
-
* - executor code needs per-task
|
|
570
|
+
* - executor code needs per-task models, cache, or other task-scoped data
|
|
585
571
|
*/
|
|
586
572
|
createExecutionContext?: (task: ScheduledTask) => TaskExecutionContext;
|
|
587
573
|
/**
|
|
@@ -646,7 +632,39 @@ declare class RunnerExecutionError extends Error {
|
|
|
646
632
|
*/
|
|
647
633
|
declare function runScheduledTasks(tasks: readonly ScheduledTask[], executor: ScheduledTaskExecutor, options?: RunScheduledTasksOptions): Promise<AggregatedRunResults>;
|
|
648
634
|
//#endregion
|
|
635
|
+
//#region src/core/telemetry/types.d.ts
|
|
636
|
+
/** JSON-compatible scalar values accepted as telemetry attributes. */
|
|
637
|
+
type TelemetryAttributeValue = boolean | number | string | null | readonly TelemetryAttributeValue[];
|
|
638
|
+
/** Attribute map shared by local report projection and OpenTelemetry span calls. */
|
|
639
|
+
type TelemetryAttributes = Record<string, TelemetryAttributeValue | undefined>;
|
|
640
|
+
/**
|
|
641
|
+
* Internal Vieval telemetry runtime.
|
|
642
|
+
*
|
|
643
|
+
* Use when:
|
|
644
|
+
* - runner code needs one execution path for disabled and enabled telemetry
|
|
645
|
+
* - case code should run inside an active OpenTelemetry span when configured
|
|
646
|
+
*
|
|
647
|
+
* Expects:
|
|
648
|
+
* - attributes are JSON-compatible and stable enough for report filtering
|
|
649
|
+
* - callbacks are awaited by the caller
|
|
650
|
+
*
|
|
651
|
+
* Returns:
|
|
652
|
+
* - callback result, preserving thrown errors after telemetry records them
|
|
653
|
+
*/
|
|
654
|
+
interface TelemetryRuntime {
|
|
655
|
+
withSpan: <T>(name: string, attributes: TelemetryAttributes, callback: () => Promise<T>) => Promise<T>;
|
|
656
|
+
addEvent: (name: string, attributes?: TelemetryAttributes) => void;
|
|
657
|
+
setAttributes: (attributes: TelemetryAttributes) => void;
|
|
658
|
+
recordException: (error: unknown) => void;
|
|
659
|
+
}
|
|
660
|
+
//#endregion
|
|
649
661
|
//#region src/config/types.d.ts
|
|
662
|
+
/**
|
|
663
|
+
* Value that can be returned directly or through a promise.
|
|
664
|
+
*
|
|
665
|
+
* @param T - Resolved value type.
|
|
666
|
+
*/
|
|
667
|
+
type Awaitable<T> = Promise<T> | T;
|
|
650
668
|
/**
|
|
651
669
|
* Primitive value allowed in one matrix cell.
|
|
652
670
|
*
|
|
@@ -862,6 +880,12 @@ interface TaskRunOutput {
|
|
|
862
880
|
*/
|
|
863
881
|
scores: readonly RunScore[];
|
|
864
882
|
}
|
|
883
|
+
/**
|
|
884
|
+
* Delay policy for retries within one task case attempt.
|
|
885
|
+
*
|
|
886
|
+
* @param retryIndex Retry number where `1` is the first retry after the initial failure.
|
|
887
|
+
*/
|
|
888
|
+
type TaskAutoRetryDelay = number | ((retryIndex: number) => number);
|
|
865
889
|
/**
|
|
866
890
|
* Execution policy applied to task and case callbacks.
|
|
867
891
|
*
|
|
@@ -883,6 +907,15 @@ interface TaskExecutionPolicy {
|
|
|
883
907
|
* @default 0
|
|
884
908
|
*/
|
|
885
909
|
autoRetry?: number;
|
|
910
|
+
/**
|
|
911
|
+
* Delay in milliseconds before a case auto retry starts.
|
|
912
|
+
*
|
|
913
|
+
* A number applies the same delay to every retry. A function receives the
|
|
914
|
+
* retry index where `1` is the first retry after the initial failure.
|
|
915
|
+
*
|
|
916
|
+
* @default retryIndex => 500 * 2 ** (retryIndex - 1)
|
|
917
|
+
*/
|
|
918
|
+
autoRetryDelay?: TaskAutoRetryDelay;
|
|
886
919
|
/**
|
|
887
920
|
* Additional full task attempts allowed after the current attempt settles.
|
|
888
921
|
*
|
|
@@ -917,6 +950,30 @@ interface TaskConcurrencyConfig {
|
|
|
917
950
|
*/
|
|
918
951
|
case?: number;
|
|
919
952
|
}
|
|
953
|
+
/**
|
|
954
|
+
* Reporting configuration for local artifacts and optional OpenTelemetry integration.
|
|
955
|
+
*/
|
|
956
|
+
interface CliReportingConfig {
|
|
957
|
+
/**
|
|
958
|
+
* Optional OpenTelemetry API integration.
|
|
959
|
+
*/
|
|
960
|
+
openTelemetry?: CliOpenTelemetryReportingConfig;
|
|
961
|
+
}
|
|
962
|
+
/**
|
|
963
|
+
* OpenTelemetry reporting configuration managed by user config setup.
|
|
964
|
+
*/
|
|
965
|
+
interface CliOpenTelemetryReportingConfig {
|
|
966
|
+
/**
|
|
967
|
+
* Enables Vieval active span wrapping through `@opentelemetry/api`.
|
|
968
|
+
*
|
|
969
|
+
* @default false
|
|
970
|
+
*/
|
|
971
|
+
enabled?: boolean;
|
|
972
|
+
/**
|
|
973
|
+
* Called after all telemetry events and local report artifacts have been emitted.
|
|
974
|
+
*/
|
|
975
|
+
onRunEnd?: () => Awaitable<void>;
|
|
976
|
+
}
|
|
920
977
|
/**
|
|
921
978
|
* Runtime context passed into eval task `run`.
|
|
922
979
|
*/
|
|
@@ -964,24 +1021,13 @@ interface TaskRunContext {
|
|
|
964
1021
|
*/
|
|
965
1022
|
task: ScheduledTask;
|
|
966
1023
|
/**
|
|
967
|
-
*
|
|
968
|
-
*
|
|
969
|
-
* Runtime impact:
|
|
970
|
-
* - `context.model()` uses `context.task.matrix.run.model` first when present
|
|
971
|
-
* - then falls back to inferenceExecutor-id match
|
|
972
|
-
* - then falls back to first configured model
|
|
1024
|
+
* Configured model registrations available to model plugins.
|
|
973
1025
|
*
|
|
974
|
-
*
|
|
975
|
-
*
|
|
976
|
-
*
|
|
977
|
-
* const defaultModel = context.model()
|
|
978
|
-
* // resolves the configured model whose id/model/alias matches 'gpt-4.1-mini'
|
|
979
|
-
*
|
|
980
|
-
* const judgeModel = context.model({ name: 'judge-large' })
|
|
981
|
-
* // explicit lookup bypasses matrix default
|
|
982
|
-
* ```
|
|
1026
|
+
* Use when:
|
|
1027
|
+
* - a plugin owns model selection semantics and needs access to registered models
|
|
1028
|
+
* - eval code resolves matrix-selected model axes through plugin helpers
|
|
983
1029
|
*/
|
|
984
|
-
|
|
1030
|
+
models: TaskExecutionContext['models'];
|
|
985
1031
|
/**
|
|
986
1032
|
* Optional reporter lifecycle hooks for task-local case events.
|
|
987
1033
|
*
|
|
@@ -992,6 +1038,17 @@ interface TaskRunContext {
|
|
|
992
1038
|
* - hooks are best-effort observers and should not affect task scoring
|
|
993
1039
|
*/
|
|
994
1040
|
reporterHooks?: TaskReporterHooks;
|
|
1041
|
+
/**
|
|
1042
|
+
* Optional telemetry runtime shared by runner, DSL, and reporter integrations.
|
|
1043
|
+
*
|
|
1044
|
+
* Use when:
|
|
1045
|
+
* - task execution should emit events to the currently active telemetry runtime
|
|
1046
|
+
* - enabled and disabled telemetry should keep the same execution path
|
|
1047
|
+
*
|
|
1048
|
+
* Expects:
|
|
1049
|
+
* - callers inject a no-op runtime when telemetry is disabled
|
|
1050
|
+
*/
|
|
1051
|
+
telemetry?: TelemetryRuntime;
|
|
995
1052
|
/**
|
|
996
1053
|
* Optional runtime scheduling overrides supplied by CLI or host execution.
|
|
997
1054
|
*
|
|
@@ -1036,6 +1093,10 @@ interface TaskCaseReporterPayload {
|
|
|
1036
1093
|
* Maximum retry count configured for this case.
|
|
1037
1094
|
*/
|
|
1038
1095
|
autoRetry?: number;
|
|
1096
|
+
/**
|
|
1097
|
+
* Optional case input payload registered by the task DSL.
|
|
1098
|
+
*/
|
|
1099
|
+
input?: unknown;
|
|
1039
1100
|
/**
|
|
1040
1101
|
* Declared case label.
|
|
1041
1102
|
*/
|
|
@@ -1066,6 +1127,10 @@ interface TaskCaseReporterPayload {
|
|
|
1066
1127
|
* - `state` describes the final case result
|
|
1067
1128
|
*/
|
|
1068
1129
|
interface TaskCaseReporterEndPayload extends TaskCaseReporterPayload {
|
|
1130
|
+
/**
|
|
1131
|
+
* Optional case output returned by the task case callback.
|
|
1132
|
+
*/
|
|
1133
|
+
output?: unknown;
|
|
1069
1134
|
/**
|
|
1070
1135
|
* Final case state.
|
|
1071
1136
|
*/
|
|
@@ -1288,5 +1353,5 @@ interface ConfigHookPlugin<TConfig> {
|
|
|
1288
1353
|
configVievalResolved?: (config: TConfig) => void | Promise<void>;
|
|
1289
1354
|
}
|
|
1290
1355
|
//#endregion
|
|
1291
|
-
export {
|
|
1292
|
-
//# sourceMappingURL=index-
|
|
1356
|
+
export { InferenceExecutor as $, RunScheduledTasksOptions as A, asProjectRelativePath as B, TaskDefinition as C, TaskRunContext as D, TaskReporterHooks as E, CreateTaskExecutionContextOptions as F, AggregatedProviderSummary as G, CreateVievalRunnerRuntimeContextOptions as H, TaskExecutionContext as I, RunResult as J, AggregatedRunResults as K, createTaskExecutionContext as L, RunnerTaskState as M, ScheduledTaskExecutor as N, TaskRunOutput as O, runScheduledTasks as P, CreateRunnerScheduleOptions as Q, ModelDefinition as R, TaskConcurrencyConfig as S, TaskReporterEventPayload as T, RunnerRuntimeContext as U, collectEvalEntries as V, createRunnerRuntimeContext as W, RunScoreKind as X, RunScore as Y, aggregateRunResults as Z, ScopedMatrices as _, CliOpenTelemetryReportingConfig as a, ScheduledTaskMatrixMeta as at, TaskCaseReporterPayload as b, EvalDefinition as c, createFilesystemTaskCacheRuntime as ct, MatrixAxisValues as d, CacheFileOptions as dt, RunnerMatrixDefinition as et, MatrixDefinition as f, CacheNamespace as ft, MatrixValue as g, MatrixRow as h, Awaitable as i, ScheduledTaskMatrix as it, RunnerExecutionError as j, TelemetryAttributeValue as k, EvalModule as l, normalizeCacheFilePathSegments as lt, MatrixPrimitive as m, defineEval as n, RunnerMatrixSelection as nt, CliReportingConfig as o, createRunnerSchedule as ot, MatrixLayer as p, TaskCacheRuntime as pt, AggregatedRunSummary as q, defineTask as r, ScheduledTask as rt, CollectedEvalEntry as s, CreateFilesystemTaskCacheRuntimeOptions as st, ConfigHookPlugin as t, RunnerMatrixInput as tt, EvalModuleMap as u, CacheFileHandle as ut, TaskAutoRetryDelay as v, TaskExecutionPolicy as w, TaskCaseState as x, TaskCaseReporterEndPayload as y, resolveModelByName as z };
|
|
1357
|
+
//# sourceMappingURL=index-BkjyCInx.d.mts.map
|
package/dist/index.d.mts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as
|
|
1
|
+
import { $ as InferenceExecutor, D as TaskRunContext, I as TaskExecutionContext, J as RunResult, O as TaskRunOutput, R as ModelDefinition, S as TaskConcurrencyConfig, X as RunScoreKind, f as MatrixDefinition, k as TelemetryAttributeValue, o as CliReportingConfig, p as MatrixLayer, rt as ScheduledTask, t as ConfigHookPlugin, w as TaskExecutionPolicy } from "./index-BkjyCInx.mjs";
|
|
2
2
|
import { a as requiredEnvFrom } from "./env-BeHv_5mo.mjs";
|
|
3
3
|
import { expect } from "./expect.mjs";
|
|
4
4
|
import * as _$c12 from "c12";
|
|
@@ -137,7 +137,7 @@ interface CliProjectConfig {
|
|
|
137
137
|
* Model definitions available to project runtime execution.
|
|
138
138
|
*
|
|
139
139
|
* Inference executors control schedule fan-out, while models provide
|
|
140
|
-
* runtime lookup metadata for
|
|
140
|
+
* runtime lookup metadata for model plugin helpers during task execution.
|
|
141
141
|
*
|
|
142
142
|
* @default inherited from top-level config models
|
|
143
143
|
*/
|
|
@@ -253,16 +253,18 @@ interface CliComparisonConfig {
|
|
|
253
253
|
* Execution context exposed to project-level `executor` implementations.
|
|
254
254
|
*
|
|
255
255
|
* Use when:
|
|
256
|
-
* - a project executor needs
|
|
256
|
+
* - a project executor needs task-scoped models plus case reporter hooks
|
|
257
257
|
* - custom scheduling logic wants the same hook shape as `TaskRunContext`
|
|
258
258
|
*
|
|
259
259
|
* Expects:
|
|
260
|
-
* - `
|
|
260
|
+
* - `models` exposes configured model registrations for plugin helpers
|
|
261
261
|
* - `reporterHooks` follows `TaskRunContext['reporterHooks']`
|
|
262
|
+
* - `telemetry` follows `TaskRunContext['telemetry']`
|
|
262
263
|
* - `runtimeConcurrency` follows `TaskRunContext['runtimeConcurrency']`
|
|
263
264
|
*/
|
|
264
265
|
interface CliProjectExecutorContext extends TaskExecutionContext {
|
|
265
266
|
reporterHooks?: TaskRunContext['reporterHooks'];
|
|
267
|
+
telemetry?: TaskRunContext['telemetry'];
|
|
266
268
|
runtimeConcurrency?: TaskRunContext['runtimeConcurrency'];
|
|
267
269
|
}
|
|
268
270
|
/**
|
|
@@ -310,6 +312,12 @@ interface CliConfigBase {
|
|
|
310
312
|
* @default {}
|
|
311
313
|
*/
|
|
312
314
|
env?: NodeJS.ProcessEnv;
|
|
315
|
+
/**
|
|
316
|
+
* Optional reporting integrations shared by CLI run orchestration.
|
|
317
|
+
*
|
|
318
|
+
* @default undefined
|
|
319
|
+
*/
|
|
320
|
+
reporting?: CliReportingConfig;
|
|
313
321
|
}
|
|
314
322
|
/**
|
|
315
323
|
* Project mode config for `vieval run`.
|
|
@@ -400,7 +408,7 @@ interface CaseRunContext<TInput> extends TaskRunContext {
|
|
|
400
408
|
* - `name` to be a stable metric identifier
|
|
401
409
|
* - `value` to be JSON-serializable
|
|
402
410
|
*/
|
|
403
|
-
metric: (name: string, value:
|
|
411
|
+
metric: (name: string, value: TelemetryAttributeValue) => void;
|
|
404
412
|
/**
|
|
405
413
|
* Cooperative abort signal for the current case execution.
|
|
406
414
|
*/
|
|
@@ -409,7 +417,7 @@ interface CaseRunContext<TInput> extends TaskRunContext {
|
|
|
409
417
|
/**
|
|
410
418
|
* Callback for one task case.
|
|
411
419
|
*/
|
|
412
|
-
type CaseRunner<TInput> = (context: CaseRunContext<TInput>) => Promise<
|
|
420
|
+
type CaseRunner<TInput> = (context: CaseRunContext<TInput>) => Promise<unknown> | unknown;
|
|
413
421
|
/**
|
|
414
422
|
* Per-group options for `casesFromInputs`.
|
|
415
423
|
*
|
package/dist/index.mjs
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { i as registerEvalDefinition, l as loadEnv, o as createNoopTelemetryRuntime, s as defineConfig } from "./registry-BHGMxjpA.mjs";
|
|
2
2
|
import { t as createSchedulerQueue } from "./queue-DsZQkZO_.mjs";
|
|
3
|
-
import { n as requiredEnvFrom } from "./env
|
|
3
|
+
import { n as requiredEnvFrom } from "./env-BFSjny07.mjs";
|
|
4
4
|
import { defineEval, defineTask } from "./config.mjs";
|
|
5
5
|
import { expect } from "./expect.mjs";
|
|
6
|
-
import { errorMessageFrom } from "@moeru/std";
|
|
6
|
+
import { errorMessageFrom, sleep } from "@moeru/std";
|
|
7
7
|
//#region src/dsl/task.ts
|
|
8
8
|
function cloneCaseMatrix(matrix) {
|
|
9
9
|
return {
|
|
@@ -15,15 +15,36 @@ function cloneCaseMatrix(matrix) {
|
|
|
15
15
|
function createTaskCaseReporterId(index, name) {
|
|
16
16
|
return `${index}:${encodeURIComponent(name)}`;
|
|
17
17
|
}
|
|
18
|
+
function isTelemetryAttributeScalar(value) {
|
|
19
|
+
return typeof value === "boolean" || typeof value === "number" || typeof value === "string";
|
|
20
|
+
}
|
|
21
|
+
function isTelemetryAttributeArray(value) {
|
|
22
|
+
return value.every(isTelemetryAttributeScalar);
|
|
23
|
+
}
|
|
24
|
+
function canAttachMetricAsAttribute(value) {
|
|
25
|
+
if (isTelemetryAttributeScalar(value)) return true;
|
|
26
|
+
return Array.isArray(value) && isTelemetryAttributeArray(value);
|
|
27
|
+
}
|
|
18
28
|
function assertValidScore(score) {
|
|
19
29
|
if (!Number.isFinite(score) || score < 0 || score > 1) throw new Error(`Case score must be a finite number in range 0..1, got "${score}".`);
|
|
20
30
|
}
|
|
21
31
|
function assertNonNegativeInteger(value, label) {
|
|
22
32
|
if (!Number.isFinite(value) || !Number.isInteger(value) || value < 0) throw new Error(`Invalid ${label}: ${String(value)}`);
|
|
23
33
|
}
|
|
34
|
+
function assertNonNegativeNumber(value, label) {
|
|
35
|
+
if (!Number.isFinite(value) || value < 0) throw new Error(`Invalid ${label}: ${String(value)}`);
|
|
36
|
+
}
|
|
24
37
|
function assertPositiveInteger(value, label) {
|
|
25
38
|
if (!Number.isFinite(value) || !Number.isInteger(value) || value <= 0) throw new Error(`Invalid ${label}: ${String(value)}`);
|
|
26
39
|
}
|
|
40
|
+
function autoRetryDelayMs(retryIndex) {
|
|
41
|
+
return 500 * 2 ** (retryIndex - 1);
|
|
42
|
+
}
|
|
43
|
+
function resolveAutoRetryDelay(policy, retryIndex) {
|
|
44
|
+
const delay = policy.autoRetryDelay;
|
|
45
|
+
if (delay == null) return autoRetryDelayMs(retryIndex);
|
|
46
|
+
return typeof delay === "number" ? delay : delay(retryIndex);
|
|
47
|
+
}
|
|
27
48
|
function emitCaseStart(hooks, payload) {
|
|
28
49
|
try {
|
|
29
50
|
hooks?.onCaseStart?.(payload);
|
|
@@ -34,6 +55,11 @@ function emitCaseEnd(hooks, payload) {
|
|
|
34
55
|
hooks?.onCaseEnd?.(payload);
|
|
35
56
|
} catch {}
|
|
36
57
|
}
|
|
58
|
+
function emitReporterEvent(hooks, payload) {
|
|
59
|
+
try {
|
|
60
|
+
hooks?.onEvent?.(payload);
|
|
61
|
+
} catch {}
|
|
62
|
+
}
|
|
37
63
|
function createCaseTimeoutError(timeout) {
|
|
38
64
|
const error = /* @__PURE__ */ new Error(`Case timed out after ${timeout}ms.`);
|
|
39
65
|
error.name = "TimeoutError";
|
|
@@ -43,10 +69,12 @@ function normalizeExecutionPolicy(policy, label) {
|
|
|
43
69
|
if (policy == null) return;
|
|
44
70
|
if (policy.autoAttempt != null) assertNonNegativeInteger(policy.autoAttempt, `${label} autoAttempt`);
|
|
45
71
|
if (policy.autoRetry != null) assertNonNegativeInteger(policy.autoRetry, `${label} autoRetry`);
|
|
72
|
+
if (typeof policy.autoRetryDelay === "number") assertNonNegativeNumber(policy.autoRetryDelay, `${label} autoRetryDelay`);
|
|
46
73
|
if (policy.timeout != null) assertPositiveInteger(policy.timeout, `${label} timeout`);
|
|
47
74
|
const normalized = {
|
|
48
75
|
autoAttempt: policy.autoAttempt,
|
|
49
76
|
autoRetry: policy.autoRetry,
|
|
77
|
+
autoRetryDelay: policy.autoRetryDelay,
|
|
50
78
|
timeout: policy.timeout
|
|
51
79
|
};
|
|
52
80
|
return Object.values(normalized).some((value) => value != null) ? normalized : void 0;
|
|
@@ -55,55 +83,90 @@ function resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy) {
|
|
|
55
83
|
return {
|
|
56
84
|
autoAttempt: taskCase.executionPolicy?.autoAttempt ?? taskExecutionPolicy?.autoAttempt ?? 0,
|
|
57
85
|
autoRetry: taskCase.executionPolicy?.autoRetry ?? taskExecutionPolicy?.autoRetry ?? 0,
|
|
86
|
+
autoRetryDelay: taskCase.executionPolicy?.autoRetryDelay ?? taskExecutionPolicy?.autoRetryDelay,
|
|
58
87
|
timeout: taskCase.executionPolicy?.timeout ?? taskExecutionPolicy?.timeout
|
|
59
88
|
};
|
|
60
89
|
}
|
|
61
90
|
async function runCaseOnce(context, taskCase, index, timeout) {
|
|
62
91
|
const customScoresByKind = /* @__PURE__ */ new Map();
|
|
63
92
|
const abortController = new AbortController();
|
|
93
|
+
const telemetry = context.telemetry ?? createNoopTelemetryRuntime();
|
|
94
|
+
const caseId = createTaskCaseReporterId(index, taskCase.name);
|
|
64
95
|
let timeoutHandle;
|
|
65
96
|
let timedOut = false;
|
|
66
97
|
let settled = false;
|
|
67
98
|
try {
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
99
|
+
return await telemetry.withSpan("vieval.case", {
|
|
100
|
+
"vieval.case.id": caseId,
|
|
101
|
+
"vieval.case.name": taskCase.name,
|
|
102
|
+
"vieval.task.id": context.task.id,
|
|
103
|
+
"vieval.task.name": context.task.entry.name
|
|
104
|
+
}, async () => {
|
|
105
|
+
const runPromise = Promise.resolve(taskCase.run({
|
|
106
|
+
...context,
|
|
107
|
+
matrix: {
|
|
108
|
+
...cloneCaseMatrix(context.task.matrix),
|
|
109
|
+
inputs: taskCase.input
|
|
110
|
+
},
|
|
111
|
+
metric(name, value) {
|
|
112
|
+
if (abortController.signal.aborted || settled) return;
|
|
113
|
+
emitReporterEvent(context.reporterHooks, {
|
|
114
|
+
caseId,
|
|
115
|
+
data: {
|
|
116
|
+
name,
|
|
117
|
+
value
|
|
118
|
+
},
|
|
119
|
+
event: "task.case.metric"
|
|
120
|
+
});
|
|
121
|
+
telemetry.addEvent("vieval.case.metric", {
|
|
79
122
|
name,
|
|
80
123
|
value
|
|
81
|
-
}
|
|
82
|
-
|
|
124
|
+
});
|
|
125
|
+
if (canAttachMetricAsAttribute(value)) telemetry.setAttributes({ [name]: value });
|
|
126
|
+
},
|
|
127
|
+
score(score, kind = "exact") {
|
|
128
|
+
if (abortController.signal.aborted || settled) return;
|
|
129
|
+
assertValidScore(score);
|
|
130
|
+
customScoresByKind.set(kind, score);
|
|
131
|
+
telemetry.addEvent("vieval.case.score", {
|
|
132
|
+
"vieval.score.kind": kind,
|
|
133
|
+
"vieval.score.value": score
|
|
134
|
+
});
|
|
135
|
+
emitReporterEvent(context.reporterHooks, {
|
|
136
|
+
caseId,
|
|
137
|
+
data: {
|
|
138
|
+
kind,
|
|
139
|
+
score
|
|
140
|
+
},
|
|
141
|
+
event: "task.case.score"
|
|
142
|
+
});
|
|
143
|
+
},
|
|
144
|
+
signal: abortController.signal
|
|
145
|
+
}));
|
|
146
|
+
if (timeout != null) {
|
|
147
|
+
const timeoutPromise = new Promise((_, reject) => {
|
|
148
|
+
timeoutHandle = setTimeout(() => {
|
|
149
|
+
timedOut = true;
|
|
150
|
+
abortController.abort(createCaseTimeoutError(timeout));
|
|
151
|
+
reject(createCaseTimeoutError(timeout));
|
|
152
|
+
}, timeout);
|
|
83
153
|
});
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
await Promise.race([runPromise, timeoutPromise]);
|
|
101
|
-
} else await runPromise;
|
|
102
|
-
settled = true;
|
|
103
|
-
return {
|
|
104
|
-
scoresByKind: customScoresByKind,
|
|
105
|
-
state: "passed"
|
|
106
|
-
};
|
|
154
|
+
const output = await Promise.race([runPromise, timeoutPromise]);
|
|
155
|
+
settled = true;
|
|
156
|
+
return {
|
|
157
|
+
output,
|
|
158
|
+
scoresByKind: customScoresByKind,
|
|
159
|
+
state: "passed"
|
|
160
|
+
};
|
|
161
|
+
}
|
|
162
|
+
const output = await runPromise;
|
|
163
|
+
settled = true;
|
|
164
|
+
return {
|
|
165
|
+
output,
|
|
166
|
+
scoresByKind: customScoresByKind,
|
|
167
|
+
state: "passed"
|
|
168
|
+
};
|
|
169
|
+
});
|
|
107
170
|
} catch (error) {
|
|
108
171
|
settled = true;
|
|
109
172
|
return {
|
|
@@ -119,12 +182,18 @@ async function executeRegisteredCase(context, taskCase, index, totalCases, taskE
|
|
|
119
182
|
const resolvedPolicy = resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy);
|
|
120
183
|
let lastOutcome;
|
|
121
184
|
for (let retryIndex = 0; retryIndex <= resolvedPolicy.autoRetry; retryIndex += 1) {
|
|
185
|
+
if (retryIndex > 0) {
|
|
186
|
+
const retryDelayMs = resolveAutoRetryDelay(resolvedPolicy, retryIndex);
|
|
187
|
+
assertNonNegativeNumber(retryDelayMs, "autoRetryDelay result");
|
|
188
|
+
if (retryDelayMs > 0) await sleep(retryDelayMs);
|
|
189
|
+
}
|
|
122
190
|
emitCaseStart(context.reporterHooks, {
|
|
123
191
|
...resolvedPolicy.autoRetry > 0 ? {
|
|
124
192
|
autoRetry: resolvedPolicy.autoRetry,
|
|
125
193
|
retryIndex
|
|
126
194
|
} : {},
|
|
127
195
|
index,
|
|
196
|
+
...taskCase.input === void 0 ? {} : { input: taskCase.input },
|
|
128
197
|
name: taskCase.name,
|
|
129
198
|
total: totalCases
|
|
130
199
|
});
|
|
@@ -280,6 +349,7 @@ function describeTask(name, build, options = {}) {
|
|
|
280
349
|
emitCaseEnd(context.reporterHooks, {
|
|
281
350
|
...outcome.errorMessage == null ? {} : { errorMessage: outcome.errorMessage },
|
|
282
351
|
index,
|
|
352
|
+
...outcome.output === void 0 ? {} : { output: outcome.output },
|
|
283
353
|
state: outcome.state,
|
|
284
354
|
name: taskCase.name,
|
|
285
355
|
total: totalCases
|
|
@@ -323,6 +393,7 @@ function describeTask(name, build, options = {}) {
|
|
|
323
393
|
emitCaseEnd(context.reporterHooks, {
|
|
324
394
|
...outcome.errorMessage == null ? {} : { errorMessage: outcome.errorMessage },
|
|
325
395
|
index,
|
|
396
|
+
...outcome.output === void 0 ? {} : { output: outcome.output },
|
|
326
397
|
state: outcome.state,
|
|
327
398
|
name: taskCase.name,
|
|
328
399
|
total: totalCases
|