@agentv/core 3.9.0 → 3.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-PC5TLJF6.js → chunk-K7JCJIXA.js} +1 -1
- package/dist/chunk-K7JCJIXA.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +100 -40
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +17 -5
- package/dist/index.d.ts +17 -5
- package/dist/index.js +100 -41
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-PC5TLJF6.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1575,6 +1575,7 @@ __export(index_exports, {
|
|
|
1575
1575
|
extractTargetsFromSuite: () => extractTargetsFromSuite,
|
|
1576
1576
|
extractTargetsFromTestCase: () => extractTargetsFromTestCase,
|
|
1577
1577
|
extractTrialsConfig: () => extractTrialsConfig,
|
|
1578
|
+
extractWorkersFromSuite: () => extractWorkersFromSuite,
|
|
1578
1579
|
fileExists: () => fileExists2,
|
|
1579
1580
|
findGitRoot: () => findGitRoot,
|
|
1580
1581
|
freeformEvaluationSchema: () => freeformEvaluationSchema,
|
|
@@ -2256,6 +2257,17 @@ function extractTargetsFromSuite(suite) {
|
|
|
2256
2257
|
}
|
|
2257
2258
|
return void 0;
|
|
2258
2259
|
}
|
|
2260
|
+
function extractWorkersFromSuite(suite) {
|
|
2261
|
+
const execution = suite.execution;
|
|
2262
|
+
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
2263
|
+
return void 0;
|
|
2264
|
+
}
|
|
2265
|
+
const workers = execution.workers;
|
|
2266
|
+
if (typeof workers === "number" && Number.isInteger(workers) && workers >= 1 && workers <= 50) {
|
|
2267
|
+
return workers;
|
|
2268
|
+
}
|
|
2269
|
+
return void 0;
|
|
2270
|
+
}
|
|
2259
2271
|
function extractTargetsFromTestCase(testCase) {
|
|
2260
2272
|
const execution = testCase.execution;
|
|
2261
2273
|
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
@@ -4556,6 +4568,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
4556
4568
|
tests,
|
|
4557
4569
|
trials: extractTrialsConfig(parsed),
|
|
4558
4570
|
targets: extractTargetsFromSuite(parsed),
|
|
4571
|
+
workers: extractWorkersFromSuite(parsed),
|
|
4559
4572
|
cacheConfig: extractCacheConfig(parsed),
|
|
4560
4573
|
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
4561
4574
|
...metadata !== void 0 && { metadata },
|
|
@@ -13344,8 +13357,7 @@ ${context2.fileChanges}`;
|
|
|
13344
13357
|
}
|
|
13345
13358
|
const evaluatorRawRequest = {
|
|
13346
13359
|
userPrompt,
|
|
13347
|
-
systemPrompt
|
|
13348
|
-
target: graderProvider.targetName
|
|
13360
|
+
systemPrompt
|
|
13349
13361
|
};
|
|
13350
13362
|
try {
|
|
13351
13363
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -13363,6 +13375,7 @@ ${context2.fileChanges}`;
|
|
|
13363
13375
|
assertions,
|
|
13364
13376
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
13365
13377
|
evaluatorRawRequest,
|
|
13378
|
+
graderTarget: graderProvider.targetName,
|
|
13366
13379
|
tokenUsage
|
|
13367
13380
|
};
|
|
13368
13381
|
} catch (e) {
|
|
@@ -13374,7 +13387,8 @@ ${context2.fileChanges}`;
|
|
|
13374
13387
|
verdict: "skip",
|
|
13375
13388
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
13376
13389
|
expectedAspectCount: 1,
|
|
13377
|
-
evaluatorRawRequest
|
|
13390
|
+
evaluatorRawRequest,
|
|
13391
|
+
graderTarget: graderProvider.targetName
|
|
13378
13392
|
};
|
|
13379
13393
|
}
|
|
13380
13394
|
}
|
|
@@ -13392,8 +13406,7 @@ ${context2.fileChanges}`;
|
|
|
13392
13406
|
const systemPrompt = buildRubricOutputSchema();
|
|
13393
13407
|
const evaluatorRawRequest = {
|
|
13394
13408
|
userPrompt: prompt,
|
|
13395
|
-
systemPrompt
|
|
13396
|
-
target: graderProvider.targetName
|
|
13409
|
+
systemPrompt
|
|
13397
13410
|
};
|
|
13398
13411
|
try {
|
|
13399
13412
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -13410,6 +13423,7 @@ ${context2.fileChanges}`;
|
|
|
13410
13423
|
assertions,
|
|
13411
13424
|
expectedAspectCount: rubrics.length,
|
|
13412
13425
|
evaluatorRawRequest,
|
|
13426
|
+
graderTarget: graderProvider.targetName,
|
|
13413
13427
|
tokenUsage
|
|
13414
13428
|
};
|
|
13415
13429
|
} catch (e) {
|
|
@@ -13421,7 +13435,8 @@ ${context2.fileChanges}`;
|
|
|
13421
13435
|
verdict: "skip",
|
|
13422
13436
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
13423
13437
|
expectedAspectCount: rubrics.length,
|
|
13424
|
-
evaluatorRawRequest
|
|
13438
|
+
evaluatorRawRequest,
|
|
13439
|
+
graderTarget: graderProvider.targetName
|
|
13425
13440
|
};
|
|
13426
13441
|
}
|
|
13427
13442
|
}
|
|
@@ -13434,8 +13449,7 @@ ${context2.fileChanges}`;
|
|
|
13434
13449
|
const systemPrompt = buildScoreRangeOutputSchema();
|
|
13435
13450
|
const evaluatorRawRequest = {
|
|
13436
13451
|
userPrompt: prompt,
|
|
13437
|
-
systemPrompt
|
|
13438
|
-
target: graderProvider.targetName
|
|
13452
|
+
systemPrompt
|
|
13439
13453
|
};
|
|
13440
13454
|
try {
|
|
13441
13455
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -13452,6 +13466,7 @@ ${context2.fileChanges}`;
|
|
|
13452
13466
|
assertions,
|
|
13453
13467
|
expectedAspectCount: rubrics.length,
|
|
13454
13468
|
evaluatorRawRequest,
|
|
13469
|
+
graderTarget: graderProvider.targetName,
|
|
13455
13470
|
details,
|
|
13456
13471
|
tokenUsage
|
|
13457
13472
|
};
|
|
@@ -13464,7 +13479,8 @@ ${context2.fileChanges}`;
|
|
|
13464
13479
|
verdict: "skip",
|
|
13465
13480
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
13466
13481
|
expectedAspectCount: rubrics.length,
|
|
13467
|
-
evaluatorRawRequest
|
|
13482
|
+
evaluatorRawRequest,
|
|
13483
|
+
graderTarget: graderProvider.targetName
|
|
13468
13484
|
};
|
|
13469
13485
|
}
|
|
13470
13486
|
}
|
|
@@ -13496,7 +13512,6 @@ ${context2.fileChanges}`;
|
|
|
13496
13512
|
mode: "built-in",
|
|
13497
13513
|
systemPrompt,
|
|
13498
13514
|
userPrompt,
|
|
13499
|
-
target: graderProvider.targetName,
|
|
13500
13515
|
maxSteps: this.maxSteps
|
|
13501
13516
|
};
|
|
13502
13517
|
try {
|
|
@@ -13514,7 +13529,13 @@ ${context2.fileChanges}`;
|
|
|
13514
13529
|
steps: steps.length,
|
|
13515
13530
|
tool_calls: toolCallCount
|
|
13516
13531
|
};
|
|
13517
|
-
return this.parseAgentResult(
|
|
13532
|
+
return this.parseAgentResult(
|
|
13533
|
+
text,
|
|
13534
|
+
rubrics,
|
|
13535
|
+
evaluatorRawRequest,
|
|
13536
|
+
details,
|
|
13537
|
+
graderProvider.targetName
|
|
13538
|
+
);
|
|
13518
13539
|
} catch (error) {
|
|
13519
13540
|
const message = error instanceof Error ? error.message : String(error);
|
|
13520
13541
|
return {
|
|
@@ -13523,6 +13544,7 @@ ${context2.fileChanges}`;
|
|
|
13523
13544
|
assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
|
|
13524
13545
|
expectedAspectCount: 1,
|
|
13525
13546
|
evaluatorRawRequest,
|
|
13547
|
+
graderTarget: graderProvider.targetName,
|
|
13526
13548
|
details: { mode: "built-in", error: message }
|
|
13527
13549
|
};
|
|
13528
13550
|
}
|
|
@@ -13575,6 +13597,7 @@ ${context2.fileChanges}`;
|
|
|
13575
13597
|
],
|
|
13576
13598
|
expectedAspectCount: 1,
|
|
13577
13599
|
evaluatorRawRequest,
|
|
13600
|
+
graderTarget: provider.targetName,
|
|
13578
13601
|
details: { mode: modeLabel, grader_target: provider.targetName }
|
|
13579
13602
|
};
|
|
13580
13603
|
}
|
|
@@ -13584,7 +13607,13 @@ ${context2.fileChanges}`;
|
|
|
13584
13607
|
mode: modeLabel,
|
|
13585
13608
|
grader_target: provider.targetName
|
|
13586
13609
|
};
|
|
13587
|
-
return this.parseAgentResult(
|
|
13610
|
+
return this.parseAgentResult(
|
|
13611
|
+
assistantContent,
|
|
13612
|
+
rubrics,
|
|
13613
|
+
evaluatorRawRequest,
|
|
13614
|
+
details,
|
|
13615
|
+
provider.targetName
|
|
13616
|
+
);
|
|
13588
13617
|
} catch (error) {
|
|
13589
13618
|
const message = error instanceof Error ? error.message : String(error);
|
|
13590
13619
|
return {
|
|
@@ -13595,6 +13624,7 @@ ${context2.fileChanges}`;
|
|
|
13595
13624
|
],
|
|
13596
13625
|
expectedAspectCount: 1,
|
|
13597
13626
|
evaluatorRawRequest,
|
|
13627
|
+
graderTarget: provider.targetName,
|
|
13598
13628
|
details: {
|
|
13599
13629
|
mode: modeLabel,
|
|
13600
13630
|
grader_target: provider.targetName,
|
|
@@ -13739,7 +13769,7 @@ ${outputSchema}`;
|
|
|
13739
13769
|
* Parse the agent's response text into an EvaluationScore.
|
|
13740
13770
|
* Supports both freeform and rubric modes.
|
|
13741
13771
|
*/
|
|
13742
|
-
parseAgentResult(text, rubrics, evaluatorRawRequest, details) {
|
|
13772
|
+
parseAgentResult(text, rubrics, evaluatorRawRequest, details, graderTarget) {
|
|
13743
13773
|
try {
|
|
13744
13774
|
const parsed = parseJsonFromText(text);
|
|
13745
13775
|
if (rubrics && rubrics.length > 0) {
|
|
@@ -13751,6 +13781,7 @@ ${outputSchema}`;
|
|
|
13751
13781
|
assertions: assertions2,
|
|
13752
13782
|
expectedAspectCount: rubrics.length,
|
|
13753
13783
|
evaluatorRawRequest,
|
|
13784
|
+
graderTarget,
|
|
13754
13785
|
details
|
|
13755
13786
|
};
|
|
13756
13787
|
}
|
|
@@ -13763,6 +13794,7 @@ ${outputSchema}`;
|
|
|
13763
13794
|
assertions,
|
|
13764
13795
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
13765
13796
|
evaluatorRawRequest,
|
|
13797
|
+
graderTarget,
|
|
13766
13798
|
details
|
|
13767
13799
|
};
|
|
13768
13800
|
} catch {
|
|
@@ -13777,6 +13809,7 @@ ${outputSchema}`;
|
|
|
13777
13809
|
],
|
|
13778
13810
|
expectedAspectCount: 1,
|
|
13779
13811
|
evaluatorRawRequest,
|
|
13812
|
+
graderTarget,
|
|
13780
13813
|
details
|
|
13781
13814
|
};
|
|
13782
13815
|
}
|
|
@@ -17790,14 +17823,22 @@ async function runEvaluation(options) {
|
|
|
17790
17823
|
const usePool = poolEnabled !== false && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !useStaticWorkspace;
|
|
17791
17824
|
const resolvedRetainOnSuccess = retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup");
|
|
17792
17825
|
const resolvedRetainOnFailure = retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
|
|
17793
|
-
const
|
|
17794
|
-
const workers = hasSharedWorkspace && !usePool ? 1 : requestedWorkers;
|
|
17826
|
+
const workers = options.maxConcurrency ?? target.workers ?? 1;
|
|
17795
17827
|
setupLog(
|
|
17796
|
-
`sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool}
|
|
17828
|
+
`sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} workers=${workers}`
|
|
17797
17829
|
);
|
|
17798
|
-
if (hasSharedWorkspace && !usePool &&
|
|
17830
|
+
if (hasSharedWorkspace && !usePool && workers > 1) {
|
|
17799
17831
|
console.warn(
|
|
17800
|
-
|
|
17832
|
+
[
|
|
17833
|
+
`Warning: This eval uses a shared workspace with ${workers} workers.`,
|
|
17834
|
+
"If the agent under test makes file edits, concurrent runs may corrupt each other.",
|
|
17835
|
+
"To limit concurrency, add this to your eval YAML:",
|
|
17836
|
+
"",
|
|
17837
|
+
" execution:",
|
|
17838
|
+
" workers: 1",
|
|
17839
|
+
"",
|
|
17840
|
+
"Or pass --workers 1 on the command line."
|
|
17841
|
+
].join("\n")
|
|
17801
17842
|
);
|
|
17802
17843
|
}
|
|
17803
17844
|
const limit = pLimit(workers);
|
|
@@ -18080,7 +18121,8 @@ async function runEvaluation(options) {
|
|
|
18080
18121
|
streamCallbacks,
|
|
18081
18122
|
typeRegistry,
|
|
18082
18123
|
repoManager,
|
|
18083
|
-
evalDir
|
|
18124
|
+
evalDir,
|
|
18125
|
+
verbose
|
|
18084
18126
|
};
|
|
18085
18127
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
18086
18128
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -18160,7 +18202,8 @@ async function runEvaluation(options) {
|
|
|
18160
18202
|
promptInputs,
|
|
18161
18203
|
primaryProvider,
|
|
18162
18204
|
"agent",
|
|
18163
|
-
"provider_error"
|
|
18205
|
+
"provider_error",
|
|
18206
|
+
verbose
|
|
18164
18207
|
);
|
|
18165
18208
|
results.push(errorResult);
|
|
18166
18209
|
if (onResult) {
|
|
@@ -18233,6 +18276,7 @@ async function runBatchEvaluation(options) {
|
|
|
18233
18276
|
nowFn,
|
|
18234
18277
|
onProgress,
|
|
18235
18278
|
onResult,
|
|
18279
|
+
verbose,
|
|
18236
18280
|
resolveGraderProvider,
|
|
18237
18281
|
agentTimeoutMs,
|
|
18238
18282
|
targetResolver,
|
|
@@ -18320,7 +18364,8 @@ async function runBatchEvaluation(options) {
|
|
|
18320
18364
|
startTime,
|
|
18321
18365
|
endTime,
|
|
18322
18366
|
targetResolver,
|
|
18323
|
-
availableTargets
|
|
18367
|
+
availableTargets,
|
|
18368
|
+
verbose
|
|
18324
18369
|
});
|
|
18325
18370
|
if (providerError) {
|
|
18326
18371
|
result = {
|
|
@@ -18341,7 +18386,8 @@ async function runBatchEvaluation(options) {
|
|
|
18341
18386
|
promptInputs,
|
|
18342
18387
|
provider,
|
|
18343
18388
|
"evaluator",
|
|
18344
|
-
"evaluator_error"
|
|
18389
|
+
"evaluator_error",
|
|
18390
|
+
verbose
|
|
18345
18391
|
);
|
|
18346
18392
|
results.push(errorResult);
|
|
18347
18393
|
if (onResult) {
|
|
@@ -18404,7 +18450,8 @@ async function runEvalCase(options) {
|
|
|
18404
18450
|
suiteWorkspaceFile,
|
|
18405
18451
|
typeRegistry: providedTypeRegistry,
|
|
18406
18452
|
repoManager,
|
|
18407
|
-
evalDir
|
|
18453
|
+
evalDir,
|
|
18454
|
+
verbose
|
|
18408
18455
|
} = options;
|
|
18409
18456
|
const setupDebug = process.env.AGENTV_SETUP_DEBUG === "1";
|
|
18410
18457
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
@@ -18441,7 +18488,8 @@ async function runEvalCase(options) {
|
|
|
18441
18488
|
promptInputs,
|
|
18442
18489
|
provider,
|
|
18443
18490
|
"setup",
|
|
18444
|
-
"template_error"
|
|
18491
|
+
"template_error",
|
|
18492
|
+
verbose
|
|
18445
18493
|
);
|
|
18446
18494
|
}
|
|
18447
18495
|
if (caseWorkspaceFile && workspacePath) {
|
|
@@ -18470,7 +18518,8 @@ async function runEvalCase(options) {
|
|
|
18470
18518
|
promptInputs,
|
|
18471
18519
|
provider,
|
|
18472
18520
|
"repo_setup",
|
|
18473
|
-
"local_path_not_found"
|
|
18521
|
+
"local_path_not_found",
|
|
18522
|
+
verbose
|
|
18474
18523
|
);
|
|
18475
18524
|
}
|
|
18476
18525
|
}
|
|
@@ -18496,7 +18545,8 @@ async function runEvalCase(options) {
|
|
|
18496
18545
|
promptInputs,
|
|
18497
18546
|
provider,
|
|
18498
18547
|
"repo_setup",
|
|
18499
|
-
"clone_error"
|
|
18548
|
+
"clone_error",
|
|
18549
|
+
verbose
|
|
18500
18550
|
);
|
|
18501
18551
|
}
|
|
18502
18552
|
}
|
|
@@ -18522,7 +18572,8 @@ async function runEvalCase(options) {
|
|
|
18522
18572
|
promptInputs,
|
|
18523
18573
|
provider,
|
|
18524
18574
|
"setup",
|
|
18525
|
-
"file_copy_error"
|
|
18575
|
+
"file_copy_error",
|
|
18576
|
+
verbose
|
|
18526
18577
|
);
|
|
18527
18578
|
}
|
|
18528
18579
|
}
|
|
@@ -18567,7 +18618,8 @@ async function runEvalCase(options) {
|
|
|
18567
18618
|
promptInputs,
|
|
18568
18619
|
provider,
|
|
18569
18620
|
"setup",
|
|
18570
|
-
"script_error"
|
|
18621
|
+
"script_error",
|
|
18622
|
+
verbose
|
|
18571
18623
|
);
|
|
18572
18624
|
}
|
|
18573
18625
|
}
|
|
@@ -18598,7 +18650,8 @@ async function runEvalCase(options) {
|
|
|
18598
18650
|
promptInputs,
|
|
18599
18651
|
provider,
|
|
18600
18652
|
"setup",
|
|
18601
|
-
"script_error"
|
|
18653
|
+
"script_error",
|
|
18654
|
+
verbose
|
|
18602
18655
|
);
|
|
18603
18656
|
}
|
|
18604
18657
|
}
|
|
@@ -18642,7 +18695,8 @@ async function runEvalCase(options) {
|
|
|
18642
18695
|
promptInputs,
|
|
18643
18696
|
provider,
|
|
18644
18697
|
"agent",
|
|
18645
|
-
"provider_error"
|
|
18698
|
+
"provider_error",
|
|
18699
|
+
verbose
|
|
18646
18700
|
);
|
|
18647
18701
|
if (workspacePath) {
|
|
18648
18702
|
if (forceCleanup) {
|
|
@@ -18663,7 +18717,8 @@ async function runEvalCase(options) {
|
|
|
18663
18717
|
promptInputs,
|
|
18664
18718
|
provider,
|
|
18665
18719
|
"agent",
|
|
18666
|
-
"provider_error"
|
|
18720
|
+
"provider_error",
|
|
18721
|
+
verbose
|
|
18667
18722
|
);
|
|
18668
18723
|
if (workspacePath) {
|
|
18669
18724
|
if (forceCleanup) {
|
|
@@ -18758,7 +18813,8 @@ async function runEvalCase(options) {
|
|
|
18758
18813
|
targetResolver,
|
|
18759
18814
|
availableTargets,
|
|
18760
18815
|
fileChanges,
|
|
18761
|
-
workspacePath
|
|
18816
|
+
workspacePath,
|
|
18817
|
+
verbose
|
|
18762
18818
|
});
|
|
18763
18819
|
const totalDurationMs = Date.now() - caseStartMs;
|
|
18764
18820
|
const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
|
|
@@ -18813,7 +18869,8 @@ async function runEvalCase(options) {
|
|
|
18813
18869
|
promptInputs,
|
|
18814
18870
|
provider,
|
|
18815
18871
|
"evaluator",
|
|
18816
|
-
"evaluator_error"
|
|
18872
|
+
"evaluator_error",
|
|
18873
|
+
verbose
|
|
18817
18874
|
);
|
|
18818
18875
|
if (workspacePath && !isSharedWorkspace) {
|
|
18819
18876
|
if (forceCleanup || (retainOnFailure ?? "keep") === "cleanup") {
|
|
@@ -18955,7 +19012,7 @@ async function evaluateCandidate(options) {
|
|
|
18955
19012
|
let lmRequest;
|
|
18956
19013
|
if (isAgentProvider(provider)) {
|
|
18957
19014
|
agentRequest = {
|
|
18958
|
-
|
|
19015
|
+
...options.verbose ? { input: promptInputs.question } : {}
|
|
18959
19016
|
};
|
|
18960
19017
|
} else {
|
|
18961
19018
|
if (promptInputs.chatPrompt) {
|
|
@@ -18969,8 +19026,9 @@ async function evaluateCandidate(options) {
|
|
|
18969
19026
|
}
|
|
18970
19027
|
}
|
|
18971
19028
|
const evaluatorRequest = scores ? void 0 : score.evaluatorRawRequest;
|
|
18972
|
-
const
|
|
18973
|
-
|
|
19029
|
+
const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
|
|
19030
|
+
const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
|
|
19031
|
+
...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
|
|
18974
19032
|
...lmRequest ? { lm: lmRequest } : {},
|
|
18975
19033
|
...evaluatorRequest ? { evaluator: evaluatorRequest } : {}
|
|
18976
19034
|
} : void 0;
|
|
@@ -18990,9 +19048,9 @@ async function evaluateCandidate(options) {
|
|
|
18990
19048
|
endTime,
|
|
18991
19049
|
requests,
|
|
18992
19050
|
input,
|
|
19051
|
+
output: output ?? [{ role: "assistant", content: candidate }],
|
|
18993
19052
|
scores,
|
|
18994
19053
|
trace: trace2,
|
|
18995
|
-
output: output ?? [{ role: "assistant", content: candidate }],
|
|
18996
19054
|
fileChanges,
|
|
18997
19055
|
executionStatus: classifyQualityStatus(score.score)
|
|
18998
19056
|
};
|
|
@@ -19158,6 +19216,7 @@ async function runEvaluatorList(options) {
|
|
|
19158
19216
|
verdict: score2.verdict,
|
|
19159
19217
|
assertions: score2.assertions,
|
|
19160
19218
|
input: score2.evaluatorRawRequest,
|
|
19219
|
+
target: score2.graderTarget,
|
|
19161
19220
|
details: score2.details,
|
|
19162
19221
|
scores: mapChildResults(score2.scores),
|
|
19163
19222
|
tokenUsage: score2.tokenUsage,
|
|
@@ -19297,13 +19356,13 @@ async function invokeProvider(provider, options) {
|
|
|
19297
19356
|
}
|
|
19298
19357
|
}
|
|
19299
19358
|
}
|
|
19300
|
-
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
|
|
19359
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode, verbose) {
|
|
19301
19360
|
const message = error instanceof Error ? error.message : String(error);
|
|
19302
19361
|
let agentRequest;
|
|
19303
19362
|
let lmRequest;
|
|
19304
19363
|
if (isAgentProvider(provider)) {
|
|
19305
19364
|
agentRequest = {
|
|
19306
|
-
|
|
19365
|
+
...verbose ? { input: promptInputs.question } : {},
|
|
19307
19366
|
error: message
|
|
19308
19367
|
};
|
|
19309
19368
|
} else {
|
|
@@ -19331,10 +19390,10 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
19331
19390
|
conversationId: evalCase.conversation_id,
|
|
19332
19391
|
score: 0,
|
|
19333
19392
|
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
19334
|
-
output: [{ role: "assistant", content: `Error occurred: ${message}` }],
|
|
19335
19393
|
target: targetName,
|
|
19336
19394
|
requests,
|
|
19337
19395
|
input,
|
|
19396
|
+
output: [{ role: "assistant", content: `Error occurred: ${message}` }],
|
|
19338
19397
|
error: message,
|
|
19339
19398
|
executionStatus: "execution_error",
|
|
19340
19399
|
failureStage,
|
|
@@ -20408,6 +20467,7 @@ function createAgentKernel() {
|
|
|
20408
20467
|
extractTargetsFromSuite,
|
|
20409
20468
|
extractTargetsFromTestCase,
|
|
20410
20469
|
extractTrialsConfig,
|
|
20470
|
+
extractWorkersFromSuite,
|
|
20411
20471
|
fileExists,
|
|
20412
20472
|
findGitRoot,
|
|
20413
20473
|
freeformEvaluationSchema,
|