@agentv/core 2.7.1-next.5 → 2.8.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-6W5E3VR6.js → chunk-P2465XAH.js} +24 -49
- package/dist/chunk-P2465XAH.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +28 -58
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +21 -44
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +295 -220
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +44 -42
- package/dist/index.d.ts +44 -42
- package/dist/index.js +273 -173
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-6W5E3VR6.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -187,8 +187,6 @@ interface TargetDefinition {
|
|
|
187
187
|
readonly subagentRoot?: string | unknown | undefined;
|
|
188
188
|
readonly workspace_template?: string | unknown | undefined;
|
|
189
189
|
readonly workspaceTemplate?: string | unknown | undefined;
|
|
190
|
-
readonly command_template?: string | unknown | undefined;
|
|
191
|
-
readonly commandTemplate?: string | unknown | undefined;
|
|
192
190
|
readonly files_format?: string | unknown | undefined;
|
|
193
191
|
readonly filesFormat?: string | unknown | undefined;
|
|
194
192
|
readonly attachments_format?: string | unknown | undefined;
|
|
@@ -466,16 +464,18 @@ type TargetAccessConfig = {
|
|
|
466
464
|
readonly max_calls?: number;
|
|
467
465
|
};
|
|
468
466
|
/**
|
|
469
|
-
* Configuration for workspace lifecycle
|
|
470
|
-
*
|
|
467
|
+
* Configuration for workspace lifecycle commands (before_all, after_all, before_each, after_each).
|
|
468
|
+
* Commands are executed with workspace context passed via stdin.
|
|
471
469
|
*/
|
|
472
470
|
type WorkspaceScriptConfig = {
|
|
473
471
|
/** Command array to execute (e.g., ["bun", "run", "setup.ts"]) */
|
|
474
|
-
readonly
|
|
472
|
+
readonly command: readonly string[];
|
|
473
|
+
/** @deprecated Use `command` instead */
|
|
474
|
+
readonly script?: readonly string[];
|
|
475
475
|
/** Optional timeout in milliseconds (default: 60000 for setup, 30000 for teardown) */
|
|
476
476
|
readonly timeout_ms?: number;
|
|
477
477
|
readonly timeoutMs?: number;
|
|
478
|
-
/** Optional working directory for
|
|
478
|
+
/** Optional working directory for command execution */
|
|
479
479
|
readonly cwd?: string;
|
|
480
480
|
};
|
|
481
481
|
/**
|
|
@@ -493,19 +493,21 @@ type WorkspaceConfig = {
|
|
|
493
493
|
/** Template directory or .code-workspace file. Directories are copied to temp workspace.
|
|
494
494
|
* .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
|
|
495
495
|
readonly template?: string;
|
|
496
|
-
/**
|
|
496
|
+
/** Command to run once before first test (after workspace creation, before git baseline) */
|
|
497
497
|
readonly before_all?: WorkspaceScriptConfig;
|
|
498
|
-
/**
|
|
498
|
+
/** Command to run once after last test (before workspace cleanup) */
|
|
499
499
|
readonly after_all?: WorkspaceScriptConfig;
|
|
500
|
-
/**
|
|
500
|
+
/** Command to run before each test */
|
|
501
501
|
readonly before_each?: WorkspaceScriptConfig;
|
|
502
|
-
/**
|
|
502
|
+
/** Command to run after each test (e.g., git reset for workspace reuse) */
|
|
503
503
|
readonly after_each?: WorkspaceScriptConfig;
|
|
504
504
|
};
|
|
505
505
|
type CodeEvaluatorConfig = {
|
|
506
506
|
readonly name: string;
|
|
507
507
|
readonly type: 'code';
|
|
508
|
-
readonly
|
|
508
|
+
readonly command: readonly string[];
|
|
509
|
+
/** @deprecated Use `command` instead */
|
|
510
|
+
readonly script?: readonly string[];
|
|
509
511
|
readonly resolvedScriptPath?: string;
|
|
510
512
|
readonly cwd?: string;
|
|
511
513
|
readonly resolvedCwd?: string;
|
|
@@ -513,9 +515,9 @@ type CodeEvaluatorConfig = {
|
|
|
513
515
|
readonly required?: boolean | number;
|
|
514
516
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
515
517
|
readonly negate?: boolean;
|
|
516
|
-
/** Pass-through configuration for the code_judge
|
|
518
|
+
/** Pass-through configuration for the code_judge (any unrecognized YAML properties) */
|
|
517
519
|
readonly config?: JsonObject;
|
|
518
|
-
/** When present, enables target access
|
|
520
|
+
/** When present, enables target access via local proxy */
|
|
519
521
|
readonly target?: TargetAccessConfig;
|
|
520
522
|
};
|
|
521
523
|
/**
|
|
@@ -524,7 +526,9 @@ type CodeEvaluatorConfig = {
|
|
|
524
526
|
*/
|
|
525
527
|
type PromptScriptConfig = {
|
|
526
528
|
/** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
|
|
527
|
-
readonly
|
|
529
|
+
readonly command: readonly string[];
|
|
530
|
+
/** @deprecated Use `command` instead */
|
|
531
|
+
readonly script?: readonly string[];
|
|
528
532
|
/** Pass-through configuration for the prompt template */
|
|
529
533
|
readonly config?: Record<string, unknown>;
|
|
530
534
|
};
|
|
@@ -949,6 +953,8 @@ interface EvaluationResult {
|
|
|
949
953
|
readonly aggregation?: TrialAggregation;
|
|
950
954
|
/** Whether the trial loop was terminated early due to cost limit */
|
|
951
955
|
readonly costLimited?: boolean;
|
|
956
|
+
/** Whether the evaluation was skipped due to suite-level budget exhaustion */
|
|
957
|
+
readonly budgetExceeded?: boolean;
|
|
952
958
|
}
|
|
953
959
|
type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
|
|
954
960
|
interface EvaluatorResult {
|
|
@@ -1110,6 +1116,8 @@ type EvalSuiteResult = {
|
|
|
1110
1116
|
readonly cacheConfig?: CacheConfig;
|
|
1111
1117
|
/** Suite-level metadata (name, description, version, etc.) */
|
|
1112
1118
|
readonly metadata?: EvalMetadata;
|
|
1119
|
+
/** Suite-level total cost budget in USD */
|
|
1120
|
+
readonly totalBudgetUsd?: number;
|
|
1113
1121
|
};
|
|
1114
1122
|
/**
|
|
1115
1123
|
* Load tests and suite metadata from a single parse.
|
|
@@ -1178,7 +1186,7 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
|
|
|
1178
1186
|
* @example
|
|
1179
1187
|
* ```typescript
|
|
1180
1188
|
* const config: CliNormalizedConfig = {
|
|
1181
|
-
*
|
|
1189
|
+
* command: 'agent run {PROMPT}',
|
|
1182
1190
|
* timeoutMs: 120000,
|
|
1183
1191
|
* verbose: true,
|
|
1184
1192
|
* };
|
|
@@ -1186,72 +1194,62 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
|
|
|
1186
1194
|
* ```
|
|
1187
1195
|
*/
|
|
1188
1196
|
declare const CliTargetConfigSchema: z.ZodObject<{
|
|
1189
|
-
|
|
1197
|
+
command: z.ZodString;
|
|
1190
1198
|
filesFormat: z.ZodOptional<z.ZodString>;
|
|
1191
1199
|
cwd: z.ZodOptional<z.ZodString>;
|
|
1192
1200
|
workspaceTemplate: z.ZodOptional<z.ZodString>;
|
|
1193
1201
|
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1194
|
-
healthcheck: z.ZodOptional<z.
|
|
1195
|
-
type: z.ZodLiteral<"http">;
|
|
1202
|
+
healthcheck: z.ZodOptional<z.ZodUnion<[z.ZodObject<{
|
|
1196
1203
|
url: z.ZodString;
|
|
1197
1204
|
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1198
1205
|
}, "strict", z.ZodTypeAny, {
|
|
1199
|
-
type: "http";
|
|
1200
1206
|
url: string;
|
|
1201
1207
|
timeoutMs?: number | undefined;
|
|
1202
1208
|
}, {
|
|
1203
|
-
type: "http";
|
|
1204
1209
|
url: string;
|
|
1205
1210
|
timeoutMs?: number | undefined;
|
|
1206
1211
|
}>, z.ZodObject<{
|
|
1207
|
-
|
|
1208
|
-
commandTemplate: z.ZodString;
|
|
1212
|
+
command: z.ZodString;
|
|
1209
1213
|
cwd: z.ZodOptional<z.ZodString>;
|
|
1210
1214
|
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1211
1215
|
}, "strict", z.ZodTypeAny, {
|
|
1212
|
-
|
|
1213
|
-
commandTemplate: string;
|
|
1216
|
+
command: string;
|
|
1214
1217
|
cwd?: string | undefined;
|
|
1215
1218
|
timeoutMs?: number | undefined;
|
|
1216
1219
|
}, {
|
|
1217
|
-
|
|
1218
|
-
commandTemplate: string;
|
|
1220
|
+
command: string;
|
|
1219
1221
|
cwd?: string | undefined;
|
|
1220
1222
|
timeoutMs?: number | undefined;
|
|
1221
1223
|
}>]>>;
|
|
1222
1224
|
verbose: z.ZodOptional<z.ZodBoolean>;
|
|
1223
1225
|
keepTempFiles: z.ZodOptional<z.ZodBoolean>;
|
|
1224
1226
|
}, "strict", z.ZodTypeAny, {
|
|
1225
|
-
|
|
1227
|
+
command: string;
|
|
1226
1228
|
cwd?: string | undefined;
|
|
1227
1229
|
verbose?: boolean | undefined;
|
|
1228
1230
|
filesFormat?: string | undefined;
|
|
1229
1231
|
workspaceTemplate?: string | undefined;
|
|
1230
1232
|
healthcheck?: {
|
|
1231
|
-
type: "http";
|
|
1232
1233
|
url: string;
|
|
1233
1234
|
timeoutMs?: number | undefined;
|
|
1234
1235
|
} | {
|
|
1235
|
-
|
|
1236
|
-
commandTemplate: string;
|
|
1236
|
+
command: string;
|
|
1237
1237
|
cwd?: string | undefined;
|
|
1238
1238
|
timeoutMs?: number | undefined;
|
|
1239
1239
|
} | undefined;
|
|
1240
1240
|
keepTempFiles?: boolean | undefined;
|
|
1241
1241
|
timeoutMs?: number | undefined;
|
|
1242
1242
|
}, {
|
|
1243
|
-
|
|
1243
|
+
command: string;
|
|
1244
1244
|
cwd?: string | undefined;
|
|
1245
1245
|
verbose?: boolean | undefined;
|
|
1246
1246
|
filesFormat?: string | undefined;
|
|
1247
1247
|
workspaceTemplate?: string | undefined;
|
|
1248
1248
|
healthcheck?: {
|
|
1249
|
-
type: "http";
|
|
1250
1249
|
url: string;
|
|
1251
1250
|
timeoutMs?: number | undefined;
|
|
1252
1251
|
} | {
|
|
1253
|
-
|
|
1254
|
-
commandTemplate: string;
|
|
1252
|
+
command: string;
|
|
1255
1253
|
cwd?: string | undefined;
|
|
1256
1254
|
timeoutMs?: number | undefined;
|
|
1257
1255
|
} | undefined;
|
|
@@ -1698,17 +1696,19 @@ declare function deepEqual(a: unknown, b: unknown): boolean;
|
|
|
1698
1696
|
declare function negateScore(score: EvaluationScore): EvaluationScore;
|
|
1699
1697
|
|
|
1700
1698
|
interface CodeEvaluatorOptions {
|
|
1701
|
-
readonly
|
|
1699
|
+
readonly command: readonly string[];
|
|
1700
|
+
/** @deprecated Use `command` instead */
|
|
1701
|
+
readonly script?: readonly string[];
|
|
1702
1702
|
readonly cwd?: string;
|
|
1703
1703
|
readonly agentTimeoutMs?: number;
|
|
1704
1704
|
/** Pass-through configuration from YAML (any unrecognized properties) */
|
|
1705
1705
|
readonly config?: Record<string, unknown>;
|
|
1706
|
-
/** Target access config - when present, enables target invocation
|
|
1706
|
+
/** Target access config - when present, enables target invocation */
|
|
1707
1707
|
readonly target?: TargetAccessConfig;
|
|
1708
1708
|
}
|
|
1709
1709
|
declare class CodeEvaluator implements Evaluator {
|
|
1710
1710
|
readonly kind = "code";
|
|
1711
|
-
private readonly
|
|
1711
|
+
private readonly command;
|
|
1712
1712
|
private readonly cwd?;
|
|
1713
1713
|
private readonly agentTimeoutMs?;
|
|
1714
1714
|
private readonly config?;
|
|
@@ -2198,6 +2198,8 @@ interface RunEvaluationOptions {
|
|
|
2198
2198
|
readonly trials?: TrialsConfig;
|
|
2199
2199
|
/** Real-time observability callbacks passed to the provider */
|
|
2200
2200
|
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
2201
|
+
/** Suite-level total cost budget in USD (stops dispatching when exceeded) */
|
|
2202
|
+
readonly totalBudgetUsd?: number;
|
|
2201
2203
|
}
|
|
2202
2204
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
2203
2205
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
@@ -2634,13 +2636,13 @@ interface ScriptExecutionContext {
|
|
|
2634
2636
|
}
|
|
2635
2637
|
type ScriptFailureMode = 'fatal' | 'warn';
|
|
2636
2638
|
/**
|
|
2637
|
-
* Executes a workspace lifecycle
|
|
2639
|
+
* Executes a workspace lifecycle command (before_all, after_all, before_each, after_each).
|
|
2638
2640
|
*
|
|
2639
|
-
* @param config - Workspace
|
|
2640
|
-
* @param context - Context passed to
|
|
2641
|
+
* @param config - Workspace command configuration (command, timeout_ms, cwd)
|
|
2642
|
+
* @param context - Context passed to command via stdin (JSON)
|
|
2641
2643
|
* @param failureMode - 'fatal' throws on non-zero exit; 'warn' logs warning
|
|
2642
|
-
* @returns Captured stdout from the
|
|
2643
|
-
* @throws Error if
|
|
2644
|
+
* @returns Captured stdout from the command
|
|
2645
|
+
* @throws Error if command exits with non-zero code (fatal mode) or times out
|
|
2644
2646
|
*/
|
|
2645
2647
|
declare function executeWorkspaceScript(config: WorkspaceScriptConfig, context: ScriptExecutionContext, failureMode?: ScriptFailureMode): Promise<string>;
|
|
2646
2648
|
|
package/dist/index.d.ts
CHANGED
|
@@ -187,8 +187,6 @@ interface TargetDefinition {
|
|
|
187
187
|
readonly subagentRoot?: string | unknown | undefined;
|
|
188
188
|
readonly workspace_template?: string | unknown | undefined;
|
|
189
189
|
readonly workspaceTemplate?: string | unknown | undefined;
|
|
190
|
-
readonly command_template?: string | unknown | undefined;
|
|
191
|
-
readonly commandTemplate?: string | unknown | undefined;
|
|
192
190
|
readonly files_format?: string | unknown | undefined;
|
|
193
191
|
readonly filesFormat?: string | unknown | undefined;
|
|
194
192
|
readonly attachments_format?: string | unknown | undefined;
|
|
@@ -466,16 +464,18 @@ type TargetAccessConfig = {
|
|
|
466
464
|
readonly max_calls?: number;
|
|
467
465
|
};
|
|
468
466
|
/**
|
|
469
|
-
* Configuration for workspace lifecycle
|
|
470
|
-
*
|
|
467
|
+
* Configuration for workspace lifecycle commands (before_all, after_all, before_each, after_each).
|
|
468
|
+
* Commands are executed with workspace context passed via stdin.
|
|
471
469
|
*/
|
|
472
470
|
type WorkspaceScriptConfig = {
|
|
473
471
|
/** Command array to execute (e.g., ["bun", "run", "setup.ts"]) */
|
|
474
|
-
readonly
|
|
472
|
+
readonly command: readonly string[];
|
|
473
|
+
/** @deprecated Use `command` instead */
|
|
474
|
+
readonly script?: readonly string[];
|
|
475
475
|
/** Optional timeout in milliseconds (default: 60000 for setup, 30000 for teardown) */
|
|
476
476
|
readonly timeout_ms?: number;
|
|
477
477
|
readonly timeoutMs?: number;
|
|
478
|
-
/** Optional working directory for
|
|
478
|
+
/** Optional working directory for command execution */
|
|
479
479
|
readonly cwd?: string;
|
|
480
480
|
};
|
|
481
481
|
/**
|
|
@@ -493,19 +493,21 @@ type WorkspaceConfig = {
|
|
|
493
493
|
/** Template directory or .code-workspace file. Directories are copied to temp workspace.
|
|
494
494
|
* .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
|
|
495
495
|
readonly template?: string;
|
|
496
|
-
/**
|
|
496
|
+
/** Command to run once before first test (after workspace creation, before git baseline) */
|
|
497
497
|
readonly before_all?: WorkspaceScriptConfig;
|
|
498
|
-
/**
|
|
498
|
+
/** Command to run once after last test (before workspace cleanup) */
|
|
499
499
|
readonly after_all?: WorkspaceScriptConfig;
|
|
500
|
-
/**
|
|
500
|
+
/** Command to run before each test */
|
|
501
501
|
readonly before_each?: WorkspaceScriptConfig;
|
|
502
|
-
/**
|
|
502
|
+
/** Command to run after each test (e.g., git reset for workspace reuse) */
|
|
503
503
|
readonly after_each?: WorkspaceScriptConfig;
|
|
504
504
|
};
|
|
505
505
|
type CodeEvaluatorConfig = {
|
|
506
506
|
readonly name: string;
|
|
507
507
|
readonly type: 'code';
|
|
508
|
-
readonly
|
|
508
|
+
readonly command: readonly string[];
|
|
509
|
+
/** @deprecated Use `command` instead */
|
|
510
|
+
readonly script?: readonly string[];
|
|
509
511
|
readonly resolvedScriptPath?: string;
|
|
510
512
|
readonly cwd?: string;
|
|
511
513
|
readonly resolvedCwd?: string;
|
|
@@ -513,9 +515,9 @@ type CodeEvaluatorConfig = {
|
|
|
513
515
|
readonly required?: boolean | number;
|
|
514
516
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
515
517
|
readonly negate?: boolean;
|
|
516
|
-
/** Pass-through configuration for the code_judge
|
|
518
|
+
/** Pass-through configuration for the code_judge (any unrecognized YAML properties) */
|
|
517
519
|
readonly config?: JsonObject;
|
|
518
|
-
/** When present, enables target access
|
|
520
|
+
/** When present, enables target access via local proxy */
|
|
519
521
|
readonly target?: TargetAccessConfig;
|
|
520
522
|
};
|
|
521
523
|
/**
|
|
@@ -524,7 +526,9 @@ type CodeEvaluatorConfig = {
|
|
|
524
526
|
*/
|
|
525
527
|
type PromptScriptConfig = {
|
|
526
528
|
/** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
|
|
527
|
-
readonly
|
|
529
|
+
readonly command: readonly string[];
|
|
530
|
+
/** @deprecated Use `command` instead */
|
|
531
|
+
readonly script?: readonly string[];
|
|
528
532
|
/** Pass-through configuration for the prompt template */
|
|
529
533
|
readonly config?: Record<string, unknown>;
|
|
530
534
|
};
|
|
@@ -949,6 +953,8 @@ interface EvaluationResult {
|
|
|
949
953
|
readonly aggregation?: TrialAggregation;
|
|
950
954
|
/** Whether the trial loop was terminated early due to cost limit */
|
|
951
955
|
readonly costLimited?: boolean;
|
|
956
|
+
/** Whether the evaluation was skipped due to suite-level budget exhaustion */
|
|
957
|
+
readonly budgetExceeded?: boolean;
|
|
952
958
|
}
|
|
953
959
|
type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
|
|
954
960
|
interface EvaluatorResult {
|
|
@@ -1110,6 +1116,8 @@ type EvalSuiteResult = {
|
|
|
1110
1116
|
readonly cacheConfig?: CacheConfig;
|
|
1111
1117
|
/** Suite-level metadata (name, description, version, etc.) */
|
|
1112
1118
|
readonly metadata?: EvalMetadata;
|
|
1119
|
+
/** Suite-level total cost budget in USD */
|
|
1120
|
+
readonly totalBudgetUsd?: number;
|
|
1113
1121
|
};
|
|
1114
1122
|
/**
|
|
1115
1123
|
* Load tests and suite metadata from a single parse.
|
|
@@ -1178,7 +1186,7 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
|
|
|
1178
1186
|
* @example
|
|
1179
1187
|
* ```typescript
|
|
1180
1188
|
* const config: CliNormalizedConfig = {
|
|
1181
|
-
*
|
|
1189
|
+
* command: 'agent run {PROMPT}',
|
|
1182
1190
|
* timeoutMs: 120000,
|
|
1183
1191
|
* verbose: true,
|
|
1184
1192
|
* };
|
|
@@ -1186,72 +1194,62 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
|
|
|
1186
1194
|
* ```
|
|
1187
1195
|
*/
|
|
1188
1196
|
declare const CliTargetConfigSchema: z.ZodObject<{
|
|
1189
|
-
|
|
1197
|
+
command: z.ZodString;
|
|
1190
1198
|
filesFormat: z.ZodOptional<z.ZodString>;
|
|
1191
1199
|
cwd: z.ZodOptional<z.ZodString>;
|
|
1192
1200
|
workspaceTemplate: z.ZodOptional<z.ZodString>;
|
|
1193
1201
|
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1194
|
-
healthcheck: z.ZodOptional<z.
|
|
1195
|
-
type: z.ZodLiteral<"http">;
|
|
1202
|
+
healthcheck: z.ZodOptional<z.ZodUnion<[z.ZodObject<{
|
|
1196
1203
|
url: z.ZodString;
|
|
1197
1204
|
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1198
1205
|
}, "strict", z.ZodTypeAny, {
|
|
1199
|
-
type: "http";
|
|
1200
1206
|
url: string;
|
|
1201
1207
|
timeoutMs?: number | undefined;
|
|
1202
1208
|
}, {
|
|
1203
|
-
type: "http";
|
|
1204
1209
|
url: string;
|
|
1205
1210
|
timeoutMs?: number | undefined;
|
|
1206
1211
|
}>, z.ZodObject<{
|
|
1207
|
-
|
|
1208
|
-
commandTemplate: z.ZodString;
|
|
1212
|
+
command: z.ZodString;
|
|
1209
1213
|
cwd: z.ZodOptional<z.ZodString>;
|
|
1210
1214
|
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1211
1215
|
}, "strict", z.ZodTypeAny, {
|
|
1212
|
-
|
|
1213
|
-
commandTemplate: string;
|
|
1216
|
+
command: string;
|
|
1214
1217
|
cwd?: string | undefined;
|
|
1215
1218
|
timeoutMs?: number | undefined;
|
|
1216
1219
|
}, {
|
|
1217
|
-
|
|
1218
|
-
commandTemplate: string;
|
|
1220
|
+
command: string;
|
|
1219
1221
|
cwd?: string | undefined;
|
|
1220
1222
|
timeoutMs?: number | undefined;
|
|
1221
1223
|
}>]>>;
|
|
1222
1224
|
verbose: z.ZodOptional<z.ZodBoolean>;
|
|
1223
1225
|
keepTempFiles: z.ZodOptional<z.ZodBoolean>;
|
|
1224
1226
|
}, "strict", z.ZodTypeAny, {
|
|
1225
|
-
|
|
1227
|
+
command: string;
|
|
1226
1228
|
cwd?: string | undefined;
|
|
1227
1229
|
verbose?: boolean | undefined;
|
|
1228
1230
|
filesFormat?: string | undefined;
|
|
1229
1231
|
workspaceTemplate?: string | undefined;
|
|
1230
1232
|
healthcheck?: {
|
|
1231
|
-
type: "http";
|
|
1232
1233
|
url: string;
|
|
1233
1234
|
timeoutMs?: number | undefined;
|
|
1234
1235
|
} | {
|
|
1235
|
-
|
|
1236
|
-
commandTemplate: string;
|
|
1236
|
+
command: string;
|
|
1237
1237
|
cwd?: string | undefined;
|
|
1238
1238
|
timeoutMs?: number | undefined;
|
|
1239
1239
|
} | undefined;
|
|
1240
1240
|
keepTempFiles?: boolean | undefined;
|
|
1241
1241
|
timeoutMs?: number | undefined;
|
|
1242
1242
|
}, {
|
|
1243
|
-
|
|
1243
|
+
command: string;
|
|
1244
1244
|
cwd?: string | undefined;
|
|
1245
1245
|
verbose?: boolean | undefined;
|
|
1246
1246
|
filesFormat?: string | undefined;
|
|
1247
1247
|
workspaceTemplate?: string | undefined;
|
|
1248
1248
|
healthcheck?: {
|
|
1249
|
-
type: "http";
|
|
1250
1249
|
url: string;
|
|
1251
1250
|
timeoutMs?: number | undefined;
|
|
1252
1251
|
} | {
|
|
1253
|
-
|
|
1254
|
-
commandTemplate: string;
|
|
1252
|
+
command: string;
|
|
1255
1253
|
cwd?: string | undefined;
|
|
1256
1254
|
timeoutMs?: number | undefined;
|
|
1257
1255
|
} | undefined;
|
|
@@ -1698,17 +1696,19 @@ declare function deepEqual(a: unknown, b: unknown): boolean;
|
|
|
1698
1696
|
declare function negateScore(score: EvaluationScore): EvaluationScore;
|
|
1699
1697
|
|
|
1700
1698
|
interface CodeEvaluatorOptions {
|
|
1701
|
-
readonly
|
|
1699
|
+
readonly command: readonly string[];
|
|
1700
|
+
/** @deprecated Use `command` instead */
|
|
1701
|
+
readonly script?: readonly string[];
|
|
1702
1702
|
readonly cwd?: string;
|
|
1703
1703
|
readonly agentTimeoutMs?: number;
|
|
1704
1704
|
/** Pass-through configuration from YAML (any unrecognized properties) */
|
|
1705
1705
|
readonly config?: Record<string, unknown>;
|
|
1706
|
-
/** Target access config - when present, enables target invocation
|
|
1706
|
+
/** Target access config - when present, enables target invocation */
|
|
1707
1707
|
readonly target?: TargetAccessConfig;
|
|
1708
1708
|
}
|
|
1709
1709
|
declare class CodeEvaluator implements Evaluator {
|
|
1710
1710
|
readonly kind = "code";
|
|
1711
|
-
private readonly
|
|
1711
|
+
private readonly command;
|
|
1712
1712
|
private readonly cwd?;
|
|
1713
1713
|
private readonly agentTimeoutMs?;
|
|
1714
1714
|
private readonly config?;
|
|
@@ -2198,6 +2198,8 @@ interface RunEvaluationOptions {
|
|
|
2198
2198
|
readonly trials?: TrialsConfig;
|
|
2199
2199
|
/** Real-time observability callbacks passed to the provider */
|
|
2200
2200
|
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
2201
|
+
/** Suite-level total cost budget in USD (stops dispatching when exceeded) */
|
|
2202
|
+
readonly totalBudgetUsd?: number;
|
|
2201
2203
|
}
|
|
2202
2204
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
2203
2205
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
@@ -2634,13 +2636,13 @@ interface ScriptExecutionContext {
|
|
|
2634
2636
|
}
|
|
2635
2637
|
type ScriptFailureMode = 'fatal' | 'warn';
|
|
2636
2638
|
/**
|
|
2637
|
-
* Executes a workspace lifecycle
|
|
2639
|
+
* Executes a workspace lifecycle command (before_all, after_all, before_each, after_each).
|
|
2638
2640
|
*
|
|
2639
|
-
* @param config - Workspace
|
|
2640
|
-
* @param context - Context passed to
|
|
2641
|
+
* @param config - Workspace command configuration (command, timeout_ms, cwd)
|
|
2642
|
+
* @param context - Context passed to command via stdin (JSON)
|
|
2641
2643
|
* @param failureMode - 'fatal' throws on non-zero exit; 'warn' logs warning
|
|
2642
|
-
* @returns Captured stdout from the
|
|
2643
|
-
* @throws Error if
|
|
2644
|
+
* @returns Captured stdout from the command
|
|
2645
|
+
* @throws Error if command exits with non-zero code (fatal mode) or times out
|
|
2644
2646
|
*/
|
|
2645
2647
|
declare function executeWorkspaceScript(config: WorkspaceScriptConfig, context: ScriptExecutionContext, failureMode?: ScriptFailureMode): Promise<string>;
|
|
2646
2648
|
|