@agentv/core 2.16.0 → 2.17.1-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-E6AJPAXM.js → chunk-PSYFRPNT.js} +1 -1
- package/dist/chunk-PSYFRPNT.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +5 -5
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +6 -6
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +122 -62
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +10 -10
- package/dist/index.d.ts +10 -10
- package/dist/index.js +123 -63
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-E6AJPAXM.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -526,22 +526,16 @@ type WorkspaceHookConfig = {
|
|
|
526
526
|
readonly cwd?: string;
|
|
527
527
|
/** Optional reset policy for this hook */
|
|
528
528
|
readonly reset?: 'none' | 'fast' | 'strict';
|
|
529
|
-
/** Optional cleanup policy for this hook */
|
|
530
|
-
readonly clean?: 'always' | 'on_success' | 'on_failure' | 'never';
|
|
531
529
|
};
|
|
532
530
|
type WorkspaceHooksConfig = {
|
|
533
531
|
/** Runs once before first test in the workspace lifecycle */
|
|
534
|
-
readonly
|
|
532
|
+
readonly before_all?: WorkspaceHookConfig;
|
|
535
533
|
/** Runs before each test case */
|
|
536
|
-
readonly before_each_test?: WorkspaceHookConfig;
|
|
534
|
+
readonly before_each?: WorkspaceHookConfig;
|
|
537
535
|
/** Runs after each test case */
|
|
538
|
-
readonly after_each_test?: WorkspaceHookConfig;
|
|
536
|
+
readonly after_each?: WorkspaceHookConfig;
|
|
539
537
|
/** Runs once after final test in the workspace lifecycle */
|
|
540
|
-
readonly after_all_tests?: WorkspaceHookConfig;
|
|
541
|
-
/** Runs when reusing a pooled workspace slot */
|
|
542
|
-
readonly on_reuse?: WorkspaceHookConfig;
|
|
543
|
-
/** Runs/controls behavior when workspace lifecycle finishes */
|
|
544
|
-
readonly on_finish?: WorkspaceHookConfig;
|
|
538
|
+
readonly after_all?: WorkspaceHookConfig;
|
|
545
539
|
};
|
|
546
540
|
type WorkspaceConfig = {
|
|
547
541
|
/** Template directory or .code-workspace file. Directories are copied to temp workspace.
|
|
@@ -1172,6 +1166,12 @@ interface EvaluatorResult {
|
|
|
1172
1166
|
readonly details?: JsonObject;
|
|
1173
1167
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1174
1168
|
readonly tokenUsage?: TokenUsage;
|
|
1169
|
+
/** Wall-clock duration of this judge execution in milliseconds. */
|
|
1170
|
+
readonly durationMs?: number;
|
|
1171
|
+
/** ISO 8601 UTC timestamp when this judge started executing. */
|
|
1172
|
+
readonly startedAt?: string;
|
|
1173
|
+
/** ISO 8601 UTC timestamp when this judge finished executing. */
|
|
1174
|
+
readonly endedAt?: string;
|
|
1175
1175
|
}
|
|
1176
1176
|
/**
|
|
1177
1177
|
* Convenience accessor matching the Python hit_count property.
|
package/dist/index.d.ts
CHANGED
|
@@ -526,22 +526,16 @@ type WorkspaceHookConfig = {
|
|
|
526
526
|
readonly cwd?: string;
|
|
527
527
|
/** Optional reset policy for this hook */
|
|
528
528
|
readonly reset?: 'none' | 'fast' | 'strict';
|
|
529
|
-
/** Optional cleanup policy for this hook */
|
|
530
|
-
readonly clean?: 'always' | 'on_success' | 'on_failure' | 'never';
|
|
531
529
|
};
|
|
532
530
|
type WorkspaceHooksConfig = {
|
|
533
531
|
/** Runs once before first test in the workspace lifecycle */
|
|
534
|
-
readonly
|
|
532
|
+
readonly before_all?: WorkspaceHookConfig;
|
|
535
533
|
/** Runs before each test case */
|
|
536
|
-
readonly before_each_test?: WorkspaceHookConfig;
|
|
534
|
+
readonly before_each?: WorkspaceHookConfig;
|
|
537
535
|
/** Runs after each test case */
|
|
538
|
-
readonly after_each_test?: WorkspaceHookConfig;
|
|
536
|
+
readonly after_each?: WorkspaceHookConfig;
|
|
539
537
|
/** Runs once after final test in the workspace lifecycle */
|
|
540
|
-
readonly after_all_tests?: WorkspaceHookConfig;
|
|
541
|
-
/** Runs when reusing a pooled workspace slot */
|
|
542
|
-
readonly on_reuse?: WorkspaceHookConfig;
|
|
543
|
-
/** Runs/controls behavior when workspace lifecycle finishes */
|
|
544
|
-
readonly on_finish?: WorkspaceHookConfig;
|
|
538
|
+
readonly after_all?: WorkspaceHookConfig;
|
|
545
539
|
};
|
|
546
540
|
type WorkspaceConfig = {
|
|
547
541
|
/** Template directory or .code-workspace file. Directories are copied to temp workspace.
|
|
@@ -1172,6 +1166,12 @@ interface EvaluatorResult {
|
|
|
1172
1166
|
readonly details?: JsonObject;
|
|
1173
1167
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1174
1168
|
readonly tokenUsage?: TokenUsage;
|
|
1169
|
+
/** Wall-clock duration of this judge execution in milliseconds. */
|
|
1170
|
+
readonly durationMs?: number;
|
|
1171
|
+
/** ISO 8601 UTC timestamp when this judge started executing. */
|
|
1172
|
+
readonly startedAt?: string;
|
|
1173
|
+
/** ISO 8601 UTC timestamp when this judge finished executing. */
|
|
1174
|
+
readonly endedAt?: string;
|
|
1175
1175
|
}
|
|
1176
1176
|
/**
|
|
1177
1177
|
* Convenience accessor matching the Python hit_count property.
|
package/dist/index.js
CHANGED
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
readTextFile,
|
|
18
18
|
resolveFileReference,
|
|
19
19
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-E6AJPAXM.js";
|
|
20
|
+
} from "./chunk-PSYFRPNT.js";
|
|
21
21
|
import {
|
|
22
22
|
OtlpJsonFileExporter
|
|
23
23
|
} from "./chunk-HFSYZHGF.js";
|
|
@@ -151,6 +151,25 @@ import path8 from "node:path";
|
|
|
151
151
|
import micromatch3 from "micromatch";
|
|
152
152
|
import { parse as parse2 } from "yaml";
|
|
153
153
|
|
|
154
|
+
// src/evaluation/interpolation.ts
|
|
155
|
+
var ENV_VAR_PATTERN = /\$\{\{\s*([A-Za-z_][A-Za-z0-9_]*)\s*\}\}/g;
|
|
156
|
+
function interpolateEnv(value, env) {
|
|
157
|
+
if (typeof value === "string") {
|
|
158
|
+
return value.replace(ENV_VAR_PATTERN, (_, varName) => env[varName] ?? "");
|
|
159
|
+
}
|
|
160
|
+
if (Array.isArray(value)) {
|
|
161
|
+
return value.map((item) => interpolateEnv(item, env));
|
|
162
|
+
}
|
|
163
|
+
if (value !== null && typeof value === "object") {
|
|
164
|
+
const result = {};
|
|
165
|
+
for (const [key, val] of Object.entries(value)) {
|
|
166
|
+
result[key] = interpolateEnv(val, env);
|
|
167
|
+
}
|
|
168
|
+
return result;
|
|
169
|
+
}
|
|
170
|
+
return value;
|
|
171
|
+
}
|
|
172
|
+
|
|
154
173
|
// src/evaluation/loaders/case-file-loader.ts
|
|
155
174
|
import { readFile } from "node:fs/promises";
|
|
156
175
|
import path from "node:path";
|
|
@@ -169,7 +188,8 @@ function isGlobPattern(filePath) {
|
|
|
169
188
|
return filePath.includes("*") || filePath.includes("?") || filePath.includes("{");
|
|
170
189
|
}
|
|
171
190
|
function parseYamlCases(content, filePath) {
|
|
172
|
-
const parsed = parseYaml(content);
|
|
191
|
+
const raw = parseYaml(content);
|
|
192
|
+
const parsed = interpolateEnv(raw, process.env);
|
|
173
193
|
if (!Array.isArray(parsed)) {
|
|
174
194
|
throw new Error(
|
|
175
195
|
`External test file must contain a YAML array, got ${typeof parsed}: ${filePath}`
|
|
@@ -191,7 +211,8 @@ function parseJsonlCases(content, filePath) {
|
|
|
191
211
|
const line = lines[i].trim();
|
|
192
212
|
if (line === "") continue;
|
|
193
213
|
try {
|
|
194
|
-
const parsed = JSON.parse(line);
|
|
214
|
+
const raw = JSON.parse(line);
|
|
215
|
+
const parsed = interpolateEnv(raw, process.env);
|
|
195
216
|
if (!isJsonObject(parsed)) {
|
|
196
217
|
throw new Error("Expected JSON object");
|
|
197
218
|
}
|
|
@@ -2340,7 +2361,7 @@ async function loadSidecarMetadata(jsonlPath, verbose) {
|
|
|
2340
2361
|
}
|
|
2341
2362
|
try {
|
|
2342
2363
|
const content = await readFile5(sidecarPath, "utf8");
|
|
2343
|
-
const parsed = parseYaml2(content);
|
|
2364
|
+
const parsed = interpolateEnv(parseYaml2(content), process.env);
|
|
2344
2365
|
if (!isJsonObject(parsed)) {
|
|
2345
2366
|
logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
|
|
2346
2367
|
return {};
|
|
@@ -2363,7 +2384,8 @@ function parseJsonlContent(content, filePath) {
|
|
|
2363
2384
|
const line = lines[i].trim();
|
|
2364
2385
|
if (line === "") continue;
|
|
2365
2386
|
try {
|
|
2366
|
-
const parsed = JSON.parse(line);
|
|
2387
|
+
const raw = JSON.parse(line);
|
|
2388
|
+
const parsed = interpolateEnv(raw, process.env);
|
|
2367
2389
|
if (!isJsonObject(parsed)) {
|
|
2368
2390
|
throw new Error("Expected JSON object");
|
|
2369
2391
|
}
|
|
@@ -2420,9 +2442,10 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2420
2442
|
}
|
|
2421
2443
|
const inputMessages = resolveInputMessages(evalcase);
|
|
2422
2444
|
const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
|
|
2423
|
-
|
|
2445
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
|
|
2446
|
+
if (!id || !hasEvaluationSpec || !inputMessages || inputMessages.length === 0) {
|
|
2424
2447
|
logError(
|
|
2425
|
-
`Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id,
|
|
2448
|
+
`Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
|
|
2426
2449
|
);
|
|
2427
2450
|
continue;
|
|
2428
2451
|
}
|
|
@@ -2500,7 +2523,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2500
2523
|
guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
|
|
2501
2524
|
guideline_patterns: guidelinePatterns,
|
|
2502
2525
|
file_paths: allFilePaths,
|
|
2503
|
-
criteria: outcome,
|
|
2526
|
+
criteria: outcome ?? "",
|
|
2504
2527
|
evaluator: evalCaseEvaluatorKind,
|
|
2505
2528
|
evaluators
|
|
2506
2529
|
};
|
|
@@ -2813,7 +2836,7 @@ async function readTestSuiteMetadata(testFilePath) {
|
|
|
2813
2836
|
try {
|
|
2814
2837
|
const absolutePath = path8.resolve(testFilePath);
|
|
2815
2838
|
const content = await readFile7(absolutePath, "utf8");
|
|
2816
|
-
const parsed = parse2(content);
|
|
2839
|
+
const parsed = interpolateEnv(parse2(content), process.env);
|
|
2817
2840
|
if (!isJsonObject(parsed)) {
|
|
2818
2841
|
return {};
|
|
2819
2842
|
}
|
|
@@ -2863,11 +2886,11 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2863
2886
|
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
2864
2887
|
const guidelinePatterns = config?.guideline_patterns;
|
|
2865
2888
|
const rawFile = await readFile7(absoluteTestPath, "utf8");
|
|
2866
|
-
const
|
|
2867
|
-
if (!isJsonObject(
|
|
2889
|
+
const interpolated = interpolateEnv(parse2(rawFile), process.env);
|
|
2890
|
+
if (!isJsonObject(interpolated)) {
|
|
2868
2891
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
2869
2892
|
}
|
|
2870
|
-
const suite =
|
|
2893
|
+
const suite = interpolated;
|
|
2871
2894
|
const datasetNameFromSuite = asString6(suite.dataset)?.trim();
|
|
2872
2895
|
const fallbackDataset = path8.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
2873
2896
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
@@ -2911,9 +2934,10 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2911
2934
|
}
|
|
2912
2935
|
const testInputMessages = resolveInputMessages(evalcase);
|
|
2913
2936
|
const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
|
|
2914
|
-
|
|
2937
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
|
|
2938
|
+
if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
|
|
2915
2939
|
logError2(
|
|
2916
|
-
`Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id,
|
|
2940
|
+
`Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
|
|
2917
2941
|
);
|
|
2918
2942
|
continue;
|
|
2919
2943
|
}
|
|
@@ -3009,7 +3033,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
3009
3033
|
guideline_paths: guidelinePaths.map((guidelinePath) => path8.resolve(guidelinePath)),
|
|
3010
3034
|
guideline_patterns: guidelinePatterns,
|
|
3011
3035
|
file_paths: allFilePaths,
|
|
3012
|
-
criteria: outcome,
|
|
3036
|
+
criteria: outcome ?? "",
|
|
3013
3037
|
evaluator: evalCaseEvaluatorKind,
|
|
3014
3038
|
evaluators,
|
|
3015
3039
|
workspace: mergedWorkspace,
|
|
@@ -3119,30 +3143,24 @@ function parseWorkspaceHookConfig(raw, evalFileDir) {
|
|
|
3119
3143
|
const script = parseWorkspaceScriptConfig(raw, evalFileDir);
|
|
3120
3144
|
const obj = raw;
|
|
3121
3145
|
const reset = obj.reset === "none" || obj.reset === "fast" || obj.reset === "strict" ? obj.reset : void 0;
|
|
3122
|
-
|
|
3123
|
-
if (!script && !reset && !clean) return void 0;
|
|
3146
|
+
if (!script && !reset) return void 0;
|
|
3124
3147
|
return {
|
|
3125
3148
|
...script ?? {},
|
|
3126
|
-
...reset !== void 0 && { reset }
|
|
3127
|
-
...clean !== void 0 && { clean }
|
|
3149
|
+
...reset !== void 0 && { reset }
|
|
3128
3150
|
};
|
|
3129
3151
|
}
|
|
3130
3152
|
function parseWorkspaceHooksConfig(raw, evalFileDir) {
|
|
3131
3153
|
if (!isJsonObject(raw)) return void 0;
|
|
3132
3154
|
const obj = raw;
|
|
3133
|
-
const
|
|
3134
|
-
const
|
|
3135
|
-
const
|
|
3136
|
-
const
|
|
3137
|
-
const onReuse = parseWorkspaceHookConfig(obj.on_reuse, evalFileDir);
|
|
3138
|
-
const onFinish = parseWorkspaceHookConfig(obj.on_finish, evalFileDir);
|
|
3155
|
+
const beforeAll = parseWorkspaceHookConfig(obj.before_all, evalFileDir);
|
|
3156
|
+
const beforeEach = parseWorkspaceHookConfig(obj.before_each, evalFileDir);
|
|
3157
|
+
const afterEach = parseWorkspaceHookConfig(obj.after_each, evalFileDir);
|
|
3158
|
+
const afterAll = parseWorkspaceHookConfig(obj.after_all, evalFileDir);
|
|
3139
3159
|
const hooks = {
|
|
3140
|
-
...
|
|
3141
|
-
...
|
|
3142
|
-
...
|
|
3143
|
-
...
|
|
3144
|
-
...onReuse !== void 0 && { on_reuse: onReuse },
|
|
3145
|
-
...onFinish !== void 0 && { on_finish: onFinish }
|
|
3160
|
+
...beforeAll !== void 0 && { before_all: beforeAll },
|
|
3161
|
+
...beforeEach !== void 0 && { before_each: beforeEach },
|
|
3162
|
+
...afterEach !== void 0 && { after_each: afterEach },
|
|
3163
|
+
...afterAll !== void 0 && { after_all: afterAll }
|
|
3146
3164
|
};
|
|
3147
3165
|
return Object.keys(hooks).length > 0 ? hooks : void 0;
|
|
3148
3166
|
}
|
|
@@ -3155,7 +3173,7 @@ async function resolveWorkspaceConfig(raw, evalFileDir) {
|
|
|
3155
3173
|
} catch {
|
|
3156
3174
|
throw new Error(`Workspace file not found: ${raw} (resolved to ${workspaceFilePath})`);
|
|
3157
3175
|
}
|
|
3158
|
-
const parsed = parse2(content);
|
|
3176
|
+
const parsed = interpolateEnv(parse2(content), process.env);
|
|
3159
3177
|
if (!isJsonObject(parsed)) {
|
|
3160
3178
|
throw new Error(
|
|
3161
3179
|
`Invalid workspace file format: ${workspaceFilePath} (expected a YAML object)`
|
|
@@ -3203,18 +3221,10 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
|
|
|
3203
3221
|
};
|
|
3204
3222
|
};
|
|
3205
3223
|
const mergedHooks = {
|
|
3206
|
-
|
|
3207
|
-
|
|
3208
|
-
|
|
3209
|
-
)
|
|
3210
|
-
before_each_test: mergeHook(
|
|
3211
|
-
suiteLevel.hooks?.before_each_test,
|
|
3212
|
-
caseLevel.hooks?.before_each_test
|
|
3213
|
-
),
|
|
3214
|
-
after_each_test: mergeHook(suiteLevel.hooks?.after_each_test, caseLevel.hooks?.after_each_test),
|
|
3215
|
-
after_all_tests: mergeHook(suiteLevel.hooks?.after_all_tests, caseLevel.hooks?.after_all_tests),
|
|
3216
|
-
on_reuse: mergeHook(suiteLevel.hooks?.on_reuse, caseLevel.hooks?.on_reuse),
|
|
3217
|
-
on_finish: mergeHook(suiteLevel.hooks?.on_finish, caseLevel.hooks?.on_finish)
|
|
3224
|
+
before_all: mergeHook(suiteLevel.hooks?.before_all, caseLevel.hooks?.before_all),
|
|
3225
|
+
before_each: mergeHook(suiteLevel.hooks?.before_each, caseLevel.hooks?.before_each),
|
|
3226
|
+
after_each: mergeHook(suiteLevel.hooks?.after_each, caseLevel.hooks?.after_each),
|
|
3227
|
+
after_all: mergeHook(suiteLevel.hooks?.after_all, caseLevel.hooks?.after_all)
|
|
3218
3228
|
};
|
|
3219
3229
|
const hasHooks = Object.values(mergedHooks).some((hook) => hook !== void 0);
|
|
3220
3230
|
return {
|
|
@@ -5344,6 +5354,7 @@ var CopilotCliProvider = class {
|
|
|
5344
5354
|
const agentProcess = spawn(executable, args, {
|
|
5345
5355
|
stdio: ["pipe", "pipe", "inherit"]
|
|
5346
5356
|
});
|
|
5357
|
+
await waitForProcessSpawn(agentProcess, executable, this.targetName);
|
|
5347
5358
|
const toolCallsInProgress = /* @__PURE__ */ new Map();
|
|
5348
5359
|
const completedToolCalls = [];
|
|
5349
5360
|
let finalContent = "";
|
|
@@ -5623,6 +5634,47 @@ var CopilotCliProvider = class {
|
|
|
5623
5634
|
}
|
|
5624
5635
|
}
|
|
5625
5636
|
};
|
|
5637
|
+
async function waitForProcessSpawn(proc, executable, targetName) {
|
|
5638
|
+
if (proc.pid) {
|
|
5639
|
+
return;
|
|
5640
|
+
}
|
|
5641
|
+
await new Promise((resolve, reject) => {
|
|
5642
|
+
const onSpawn = () => {
|
|
5643
|
+
cleanup();
|
|
5644
|
+
resolve();
|
|
5645
|
+
};
|
|
5646
|
+
const onError = (error) => {
|
|
5647
|
+
cleanup();
|
|
5648
|
+
reject(new Error(formatCopilotSpawnError(error, executable, targetName)));
|
|
5649
|
+
};
|
|
5650
|
+
const cleanup = () => {
|
|
5651
|
+
proc.off("spawn", onSpawn);
|
|
5652
|
+
proc.off("error", onError);
|
|
5653
|
+
};
|
|
5654
|
+
proc.once("spawn", onSpawn);
|
|
5655
|
+
proc.once("error", onError);
|
|
5656
|
+
});
|
|
5657
|
+
}
|
|
5658
|
+
function formatCopilotSpawnError(error, executable, targetName) {
|
|
5659
|
+
const code = error.code;
|
|
5660
|
+
const base = `Failed to start Copilot CLI executable '${executable}' for target '${targetName}'. ${error.message}`;
|
|
5661
|
+
if (process.platform !== "win32") {
|
|
5662
|
+
return base;
|
|
5663
|
+
}
|
|
5664
|
+
if (code !== "ENOENT" && code !== "EINVAL") {
|
|
5665
|
+
return base;
|
|
5666
|
+
}
|
|
5667
|
+
return `${base}
|
|
5668
|
+
|
|
5669
|
+
On Windows, shell commands like 'copilot -h' can work via .ps1/.bat shims, but AgentV launches a subprocess that needs a directly spawnable executable path.
|
|
5670
|
+
|
|
5671
|
+
Fix options:
|
|
5672
|
+
1) Install native Copilot binary package:
|
|
5673
|
+
npm install -g @github/copilot-win32-x64
|
|
5674
|
+
2) Set explicit executable for Copilot targets:
|
|
5675
|
+
- In .env: COPILOT_EXE=C:\\Users\\<you>\\AppData\\Roaming\\npm\\node_modules\\@github\\copilot-win32-x64\\copilot.exe
|
|
5676
|
+
- In .agentv/targets.yaml: executable: \${{ COPILOT_EXE }}`;
|
|
5677
|
+
}
|
|
5626
5678
|
function summarizeAcpEvent(eventType, data) {
|
|
5627
5679
|
if (!data || typeof data !== "object") {
|
|
5628
5680
|
return eventType;
|
|
@@ -13557,9 +13609,8 @@ async function runEvaluation(options) {
|
|
|
13557
13609
|
const hasSharedWorkspace = !!(useStaticWorkspace || workspaceTemplate || suiteWorkspace?.hooks || suiteWorkspace?.repos?.length && !isPerTestIsolation);
|
|
13558
13610
|
const poolEnabled = configuredMode === "pooled" ? true : configuredMode === "ephemeral" || useStaticWorkspace ? false : suiteWorkspace?.pool ?? poolWorkspaces ?? true;
|
|
13559
13611
|
const usePool = poolEnabled !== false && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !useStaticWorkspace;
|
|
13560
|
-
const
|
|
13561
|
-
const
|
|
13562
|
-
const resolvedRetainOnFailure = (finishCleanPolicy === "always" || finishCleanPolicy === "on_failure" ? "cleanup" : finishCleanPolicy === "on_success" || finishCleanPolicy === "never" ? "keep" : void 0) ?? retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
|
|
13612
|
+
const resolvedRetainOnSuccess = retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup");
|
|
13613
|
+
const resolvedRetainOnFailure = retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
|
|
13563
13614
|
const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
|
|
13564
13615
|
const workers = hasSharedWorkspace && !usePool ? 1 : requestedWorkers;
|
|
13565
13616
|
setupLog(
|
|
@@ -13594,7 +13645,7 @@ async function runEvaluation(options) {
|
|
|
13594
13645
|
repos: suiteWorkspace.repos,
|
|
13595
13646
|
maxSlots: poolMaxSlots,
|
|
13596
13647
|
repoManager: poolRepoManager,
|
|
13597
|
-
poolReset: (workspaceClean === "full" ? "strict" : workspaceClean === "standard" ? "fast" : null) ??
|
|
13648
|
+
poolReset: (workspaceClean === "full" ? "strict" : workspaceClean === "standard" ? "fast" : null) ?? "fast"
|
|
13598
13649
|
});
|
|
13599
13650
|
poolSlots.push(slot);
|
|
13600
13651
|
setupLog(`pool slot ${i} acquired at: ${slot.path} (existing=${slot.isExisting})`);
|
|
@@ -13645,7 +13696,7 @@ async function runEvaluation(options) {
|
|
|
13645
13696
|
throw new Error(`Failed to materialize repos: ${message}`);
|
|
13646
13697
|
}
|
|
13647
13698
|
}
|
|
13648
|
-
const suiteBeforeAllHook = suiteWorkspace?.hooks?.
|
|
13699
|
+
const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
|
|
13649
13700
|
if (sharedWorkspacePath && hasHookCommand(suiteBeforeAllHook)) {
|
|
13650
13701
|
const beforeAllHook = suiteBeforeAllHook;
|
|
13651
13702
|
const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
|
|
@@ -13660,7 +13711,7 @@ async function runEvaluation(options) {
|
|
|
13660
13711
|
};
|
|
13661
13712
|
try {
|
|
13662
13713
|
beforeAllOutput = await executeWorkspaceScript(
|
|
13663
|
-
toScriptConfig(beforeAllHook, "
|
|
13714
|
+
toScriptConfig(beforeAllHook, "before_all", "suite workspace"),
|
|
13664
13715
|
scriptContext
|
|
13665
13716
|
);
|
|
13666
13717
|
setupLog("shared before_all completed");
|
|
@@ -13685,7 +13736,7 @@ async function runEvaluation(options) {
|
|
|
13685
13736
|
};
|
|
13686
13737
|
try {
|
|
13687
13738
|
const output = await executeWorkspaceScript(
|
|
13688
|
-
toScriptConfig(beforeAllHook, "
|
|
13739
|
+
toScriptConfig(beforeAllHook, "before_all", "suite workspace"),
|
|
13689
13740
|
scriptContext
|
|
13690
13741
|
);
|
|
13691
13742
|
if (!beforeAllOutput) beforeAllOutput = output;
|
|
@@ -13914,7 +13965,7 @@ async function runEvaluation(options) {
|
|
|
13914
13965
|
}
|
|
13915
13966
|
}
|
|
13916
13967
|
const afterAllWorkspaces = poolSlots.length > 1 ? poolSlots.map((s) => s.path) : sharedWorkspacePath ? [sharedWorkspacePath] : [];
|
|
13917
|
-
const suiteAfterAllHook = suiteWorkspace?.hooks?.after_all_tests;
|
|
13968
|
+
const suiteAfterAllHook = suiteWorkspace?.hooks?.after_all;
|
|
13918
13969
|
if (afterAllWorkspaces.length > 0 && hasHookCommand(suiteAfterAllHook)) {
|
|
13919
13970
|
const afterAllHook = suiteAfterAllHook;
|
|
13920
13971
|
for (const wsPath of afterAllWorkspaces) {
|
|
@@ -13926,7 +13977,7 @@ async function runEvaluation(options) {
|
|
|
13926
13977
|
};
|
|
13927
13978
|
try {
|
|
13928
13979
|
const afterAllOutput = await executeWorkspaceScript(
|
|
13929
|
-
toScriptConfig(afterAllHook, "
|
|
13980
|
+
toScriptConfig(afterAllHook, "after_all", "suite workspace"),
|
|
13930
13981
|
scriptContext,
|
|
13931
13982
|
"warn"
|
|
13932
13983
|
);
|
|
@@ -14223,7 +14274,7 @@ async function runEvalCase(options) {
|
|
|
14223
14274
|
);
|
|
14224
14275
|
}
|
|
14225
14276
|
}
|
|
14226
|
-
const caseBeforeAllHook = evalCase.workspace?.hooks?.
|
|
14277
|
+
const caseBeforeAllHook = evalCase.workspace?.hooks?.before_all;
|
|
14227
14278
|
if (workspacePath && hasHookCommand(caseBeforeAllHook)) {
|
|
14228
14279
|
const beforeAllHook = caseBeforeAllHook;
|
|
14229
14280
|
const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
|
|
@@ -14242,7 +14293,7 @@ async function runEvalCase(options) {
|
|
|
14242
14293
|
};
|
|
14243
14294
|
try {
|
|
14244
14295
|
beforeAllOutput = await executeWorkspaceScript(
|
|
14245
|
-
toScriptConfig(beforeAllHook, "
|
|
14296
|
+
toScriptConfig(beforeAllHook, "before_all", `test '${evalCase.id}'`),
|
|
14246
14297
|
scriptContext
|
|
14247
14298
|
);
|
|
14248
14299
|
if (setupDebug) {
|
|
@@ -14267,7 +14318,7 @@ async function runEvalCase(options) {
|
|
|
14267
14318
|
}
|
|
14268
14319
|
}
|
|
14269
14320
|
}
|
|
14270
|
-
const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each_test;
|
|
14321
|
+
const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each;
|
|
14271
14322
|
if (workspacePath && hasHookCommand(caseBeforeEachHook)) {
|
|
14272
14323
|
const beforeEachHook = caseBeforeEachHook;
|
|
14273
14324
|
const scriptContext = {
|
|
@@ -14280,7 +14331,7 @@ async function runEvalCase(options) {
|
|
|
14280
14331
|
};
|
|
14281
14332
|
try {
|
|
14282
14333
|
beforeEachOutput = await executeWorkspaceScript(
|
|
14283
|
-
toScriptConfig(beforeEachHook, "
|
|
14334
|
+
toScriptConfig(beforeEachHook, "before_each", `test '${evalCase.id}'`),
|
|
14284
14335
|
scriptContext
|
|
14285
14336
|
);
|
|
14286
14337
|
} catch (error) {
|
|
@@ -14397,17 +14448,17 @@ async function runEvalCase(options) {
|
|
|
14397
14448
|
}
|
|
14398
14449
|
}
|
|
14399
14450
|
const providerError = extractProviderError(providerResponse);
|
|
14400
|
-
if (repoManager && workspacePath && evalCase.workspace?.hooks?.
|
|
14451
|
+
if (repoManager && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none" && evalCase.workspace.repos) {
|
|
14401
14452
|
try {
|
|
14402
14453
|
await repoManager.reset(
|
|
14403
14454
|
evalCase.workspace.repos,
|
|
14404
14455
|
workspacePath,
|
|
14405
|
-
evalCase.workspace.hooks.
|
|
14456
|
+
evalCase.workspace.hooks.after_each.reset
|
|
14406
14457
|
);
|
|
14407
14458
|
} catch {
|
|
14408
14459
|
}
|
|
14409
14460
|
}
|
|
14410
|
-
const caseAfterEachHook = evalCase.workspace?.hooks?.after_each_test;
|
|
14461
|
+
const caseAfterEachHook = evalCase.workspace?.hooks?.after_each;
|
|
14411
14462
|
if (workspacePath && hasHookCommand(caseAfterEachHook)) {
|
|
14412
14463
|
const afterEachHook = caseAfterEachHook;
|
|
14413
14464
|
const scriptContext = {
|
|
@@ -14420,7 +14471,7 @@ async function runEvalCase(options) {
|
|
|
14420
14471
|
};
|
|
14421
14472
|
try {
|
|
14422
14473
|
afterEachOutput = await executeWorkspaceScript(
|
|
14423
|
-
toScriptConfig(afterEachHook, "
|
|
14474
|
+
toScriptConfig(afterEachHook, "after_each", `test '${evalCase.id}'`),
|
|
14424
14475
|
scriptContext,
|
|
14425
14476
|
"warn"
|
|
14426
14477
|
);
|
|
@@ -14820,9 +14871,11 @@ async function runEvaluatorList(options) {
|
|
|
14820
14871
|
registry: typeRegistry
|
|
14821
14872
|
};
|
|
14822
14873
|
for (const evaluatorConfig of evaluators ?? []) {
|
|
14874
|
+
const startedAt = /* @__PURE__ */ new Date();
|
|
14823
14875
|
try {
|
|
14824
14876
|
const evaluatorInstance = await typeRegistry.create(evaluatorConfig, dispatchContext);
|
|
14825
14877
|
const score2 = await evaluatorInstance.evaluate(evalContext);
|
|
14878
|
+
const endedAt = /* @__PURE__ */ new Date();
|
|
14826
14879
|
const weight = evaluatorConfig.weight ?? 1;
|
|
14827
14880
|
scored.push({
|
|
14828
14881
|
score: score2,
|
|
@@ -14843,9 +14896,13 @@ async function runEvaluatorList(options) {
|
|
|
14843
14896
|
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
14844
14897
|
details: score2.details,
|
|
14845
14898
|
scores: mapChildResults(score2.scores),
|
|
14846
|
-
tokenUsage: score2.tokenUsage
|
|
14899
|
+
tokenUsage: score2.tokenUsage,
|
|
14900
|
+
durationMs: endedAt.getTime() - startedAt.getTime(),
|
|
14901
|
+
startedAt: startedAt.toISOString(),
|
|
14902
|
+
endedAt: endedAt.toISOString()
|
|
14847
14903
|
});
|
|
14848
14904
|
} catch (error) {
|
|
14905
|
+
const endedAt = /* @__PURE__ */ new Date();
|
|
14849
14906
|
const message = error instanceof Error ? error.message : String(error);
|
|
14850
14907
|
const fallbackScore = {
|
|
14851
14908
|
score: 0,
|
|
@@ -14871,7 +14928,10 @@ async function runEvaluatorList(options) {
|
|
|
14871
14928
|
verdict: "fail",
|
|
14872
14929
|
hits: [],
|
|
14873
14930
|
misses: [`Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`],
|
|
14874
|
-
reasoning: message
|
|
14931
|
+
reasoning: message,
|
|
14932
|
+
durationMs: endedAt.getTime() - startedAt.getTime(),
|
|
14933
|
+
startedAt: startedAt.toISOString(),
|
|
14934
|
+
endedAt: endedAt.toISOString()
|
|
14875
14935
|
});
|
|
14876
14936
|
}
|
|
14877
14937
|
if (evaluatorConfig.negate === true && scored.length > 0) {
|