@agentv/core 3.9.1 → 3.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-PC5TLJF6.js → chunk-K7JCJIXA.js} +1 -1
- package/dist/chunk-K7JCJIXA.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +73 -35
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +74 -36
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-PC5TLJF6.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -13357,8 +13357,7 @@ ${context2.fileChanges}`;
|
|
|
13357
13357
|
}
|
|
13358
13358
|
const evaluatorRawRequest = {
|
|
13359
13359
|
userPrompt,
|
|
13360
|
-
systemPrompt
|
|
13361
|
-
target: graderProvider.targetName
|
|
13360
|
+
systemPrompt
|
|
13362
13361
|
};
|
|
13363
13362
|
try {
|
|
13364
13363
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -13376,6 +13375,7 @@ ${context2.fileChanges}`;
|
|
|
13376
13375
|
assertions,
|
|
13377
13376
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
13378
13377
|
evaluatorRawRequest,
|
|
13378
|
+
graderTarget: graderProvider.targetName,
|
|
13379
13379
|
tokenUsage
|
|
13380
13380
|
};
|
|
13381
13381
|
} catch (e) {
|
|
@@ -13387,7 +13387,8 @@ ${context2.fileChanges}`;
|
|
|
13387
13387
|
verdict: "skip",
|
|
13388
13388
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
13389
13389
|
expectedAspectCount: 1,
|
|
13390
|
-
evaluatorRawRequest
|
|
13390
|
+
evaluatorRawRequest,
|
|
13391
|
+
graderTarget: graderProvider.targetName
|
|
13391
13392
|
};
|
|
13392
13393
|
}
|
|
13393
13394
|
}
|
|
@@ -13405,8 +13406,7 @@ ${context2.fileChanges}`;
|
|
|
13405
13406
|
const systemPrompt = buildRubricOutputSchema();
|
|
13406
13407
|
const evaluatorRawRequest = {
|
|
13407
13408
|
userPrompt: prompt,
|
|
13408
|
-
systemPrompt
|
|
13409
|
-
target: graderProvider.targetName
|
|
13409
|
+
systemPrompt
|
|
13410
13410
|
};
|
|
13411
13411
|
try {
|
|
13412
13412
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -13423,6 +13423,7 @@ ${context2.fileChanges}`;
|
|
|
13423
13423
|
assertions,
|
|
13424
13424
|
expectedAspectCount: rubrics.length,
|
|
13425
13425
|
evaluatorRawRequest,
|
|
13426
|
+
graderTarget: graderProvider.targetName,
|
|
13426
13427
|
tokenUsage
|
|
13427
13428
|
};
|
|
13428
13429
|
} catch (e) {
|
|
@@ -13434,7 +13435,8 @@ ${context2.fileChanges}`;
|
|
|
13434
13435
|
verdict: "skip",
|
|
13435
13436
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
13436
13437
|
expectedAspectCount: rubrics.length,
|
|
13437
|
-
evaluatorRawRequest
|
|
13438
|
+
evaluatorRawRequest,
|
|
13439
|
+
graderTarget: graderProvider.targetName
|
|
13438
13440
|
};
|
|
13439
13441
|
}
|
|
13440
13442
|
}
|
|
@@ -13447,8 +13449,7 @@ ${context2.fileChanges}`;
|
|
|
13447
13449
|
const systemPrompt = buildScoreRangeOutputSchema();
|
|
13448
13450
|
const evaluatorRawRequest = {
|
|
13449
13451
|
userPrompt: prompt,
|
|
13450
|
-
systemPrompt
|
|
13451
|
-
target: graderProvider.targetName
|
|
13452
|
+
systemPrompt
|
|
13452
13453
|
};
|
|
13453
13454
|
try {
|
|
13454
13455
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -13465,6 +13466,7 @@ ${context2.fileChanges}`;
|
|
|
13465
13466
|
assertions,
|
|
13466
13467
|
expectedAspectCount: rubrics.length,
|
|
13467
13468
|
evaluatorRawRequest,
|
|
13469
|
+
graderTarget: graderProvider.targetName,
|
|
13468
13470
|
details,
|
|
13469
13471
|
tokenUsage
|
|
13470
13472
|
};
|
|
@@ -13477,7 +13479,8 @@ ${context2.fileChanges}`;
|
|
|
13477
13479
|
verdict: "skip",
|
|
13478
13480
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
13479
13481
|
expectedAspectCount: rubrics.length,
|
|
13480
|
-
evaluatorRawRequest
|
|
13482
|
+
evaluatorRawRequest,
|
|
13483
|
+
graderTarget: graderProvider.targetName
|
|
13481
13484
|
};
|
|
13482
13485
|
}
|
|
13483
13486
|
}
|
|
@@ -13509,7 +13512,6 @@ ${context2.fileChanges}`;
|
|
|
13509
13512
|
mode: "built-in",
|
|
13510
13513
|
systemPrompt,
|
|
13511
13514
|
userPrompt,
|
|
13512
|
-
target: graderProvider.targetName,
|
|
13513
13515
|
maxSteps: this.maxSteps
|
|
13514
13516
|
};
|
|
13515
13517
|
try {
|
|
@@ -13527,7 +13529,13 @@ ${context2.fileChanges}`;
|
|
|
13527
13529
|
steps: steps.length,
|
|
13528
13530
|
tool_calls: toolCallCount
|
|
13529
13531
|
};
|
|
13530
|
-
return this.parseAgentResult(
|
|
13532
|
+
return this.parseAgentResult(
|
|
13533
|
+
text,
|
|
13534
|
+
rubrics,
|
|
13535
|
+
evaluatorRawRequest,
|
|
13536
|
+
details,
|
|
13537
|
+
graderProvider.targetName
|
|
13538
|
+
);
|
|
13531
13539
|
} catch (error) {
|
|
13532
13540
|
const message = error instanceof Error ? error.message : String(error);
|
|
13533
13541
|
return {
|
|
@@ -13536,6 +13544,7 @@ ${context2.fileChanges}`;
|
|
|
13536
13544
|
assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
|
|
13537
13545
|
expectedAspectCount: 1,
|
|
13538
13546
|
evaluatorRawRequest,
|
|
13547
|
+
graderTarget: graderProvider.targetName,
|
|
13539
13548
|
details: { mode: "built-in", error: message }
|
|
13540
13549
|
};
|
|
13541
13550
|
}
|
|
@@ -13588,6 +13597,7 @@ ${context2.fileChanges}`;
|
|
|
13588
13597
|
],
|
|
13589
13598
|
expectedAspectCount: 1,
|
|
13590
13599
|
evaluatorRawRequest,
|
|
13600
|
+
graderTarget: provider.targetName,
|
|
13591
13601
|
details: { mode: modeLabel, grader_target: provider.targetName }
|
|
13592
13602
|
};
|
|
13593
13603
|
}
|
|
@@ -13597,7 +13607,13 @@ ${context2.fileChanges}`;
|
|
|
13597
13607
|
mode: modeLabel,
|
|
13598
13608
|
grader_target: provider.targetName
|
|
13599
13609
|
};
|
|
13600
|
-
return this.parseAgentResult(
|
|
13610
|
+
return this.parseAgentResult(
|
|
13611
|
+
assistantContent,
|
|
13612
|
+
rubrics,
|
|
13613
|
+
evaluatorRawRequest,
|
|
13614
|
+
details,
|
|
13615
|
+
provider.targetName
|
|
13616
|
+
);
|
|
13601
13617
|
} catch (error) {
|
|
13602
13618
|
const message = error instanceof Error ? error.message : String(error);
|
|
13603
13619
|
return {
|
|
@@ -13608,6 +13624,7 @@ ${context2.fileChanges}`;
|
|
|
13608
13624
|
],
|
|
13609
13625
|
expectedAspectCount: 1,
|
|
13610
13626
|
evaluatorRawRequest,
|
|
13627
|
+
graderTarget: provider.targetName,
|
|
13611
13628
|
details: {
|
|
13612
13629
|
mode: modeLabel,
|
|
13613
13630
|
grader_target: provider.targetName,
|
|
@@ -13752,7 +13769,7 @@ ${outputSchema}`;
|
|
|
13752
13769
|
* Parse the agent's response text into an EvaluationScore.
|
|
13753
13770
|
* Supports both freeform and rubric modes.
|
|
13754
13771
|
*/
|
|
13755
|
-
parseAgentResult(text, rubrics, evaluatorRawRequest, details) {
|
|
13772
|
+
parseAgentResult(text, rubrics, evaluatorRawRequest, details, graderTarget) {
|
|
13756
13773
|
try {
|
|
13757
13774
|
const parsed = parseJsonFromText(text);
|
|
13758
13775
|
if (rubrics && rubrics.length > 0) {
|
|
@@ -13764,6 +13781,7 @@ ${outputSchema}`;
|
|
|
13764
13781
|
assertions: assertions2,
|
|
13765
13782
|
expectedAspectCount: rubrics.length,
|
|
13766
13783
|
evaluatorRawRequest,
|
|
13784
|
+
graderTarget,
|
|
13767
13785
|
details
|
|
13768
13786
|
};
|
|
13769
13787
|
}
|
|
@@ -13776,6 +13794,7 @@ ${outputSchema}`;
|
|
|
13776
13794
|
assertions,
|
|
13777
13795
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
13778
13796
|
evaluatorRawRequest,
|
|
13797
|
+
graderTarget,
|
|
13779
13798
|
details
|
|
13780
13799
|
};
|
|
13781
13800
|
} catch {
|
|
@@ -13790,6 +13809,7 @@ ${outputSchema}`;
|
|
|
13790
13809
|
],
|
|
13791
13810
|
expectedAspectCount: 1,
|
|
13792
13811
|
evaluatorRawRequest,
|
|
13812
|
+
graderTarget,
|
|
13793
13813
|
details
|
|
13794
13814
|
};
|
|
13795
13815
|
}
|
|
@@ -18101,7 +18121,8 @@ async function runEvaluation(options) {
|
|
|
18101
18121
|
streamCallbacks,
|
|
18102
18122
|
typeRegistry,
|
|
18103
18123
|
repoManager,
|
|
18104
|
-
evalDir
|
|
18124
|
+
evalDir,
|
|
18125
|
+
verbose
|
|
18105
18126
|
};
|
|
18106
18127
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
18107
18128
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -18181,7 +18202,8 @@ async function runEvaluation(options) {
|
|
|
18181
18202
|
promptInputs,
|
|
18182
18203
|
primaryProvider,
|
|
18183
18204
|
"agent",
|
|
18184
|
-
"provider_error"
|
|
18205
|
+
"provider_error",
|
|
18206
|
+
verbose
|
|
18185
18207
|
);
|
|
18186
18208
|
results.push(errorResult);
|
|
18187
18209
|
if (onResult) {
|
|
@@ -18254,6 +18276,7 @@ async function runBatchEvaluation(options) {
|
|
|
18254
18276
|
nowFn,
|
|
18255
18277
|
onProgress,
|
|
18256
18278
|
onResult,
|
|
18279
|
+
verbose,
|
|
18257
18280
|
resolveGraderProvider,
|
|
18258
18281
|
agentTimeoutMs,
|
|
18259
18282
|
targetResolver,
|
|
@@ -18341,7 +18364,8 @@ async function runBatchEvaluation(options) {
|
|
|
18341
18364
|
startTime,
|
|
18342
18365
|
endTime,
|
|
18343
18366
|
targetResolver,
|
|
18344
|
-
availableTargets
|
|
18367
|
+
availableTargets,
|
|
18368
|
+
verbose
|
|
18345
18369
|
});
|
|
18346
18370
|
if (providerError) {
|
|
18347
18371
|
result = {
|
|
@@ -18362,7 +18386,8 @@ async function runBatchEvaluation(options) {
|
|
|
18362
18386
|
promptInputs,
|
|
18363
18387
|
provider,
|
|
18364
18388
|
"evaluator",
|
|
18365
|
-
"evaluator_error"
|
|
18389
|
+
"evaluator_error",
|
|
18390
|
+
verbose
|
|
18366
18391
|
);
|
|
18367
18392
|
results.push(errorResult);
|
|
18368
18393
|
if (onResult) {
|
|
@@ -18425,7 +18450,8 @@ async function runEvalCase(options) {
|
|
|
18425
18450
|
suiteWorkspaceFile,
|
|
18426
18451
|
typeRegistry: providedTypeRegistry,
|
|
18427
18452
|
repoManager,
|
|
18428
|
-
evalDir
|
|
18453
|
+
evalDir,
|
|
18454
|
+
verbose
|
|
18429
18455
|
} = options;
|
|
18430
18456
|
const setupDebug = process.env.AGENTV_SETUP_DEBUG === "1";
|
|
18431
18457
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
@@ -18462,7 +18488,8 @@ async function runEvalCase(options) {
|
|
|
18462
18488
|
promptInputs,
|
|
18463
18489
|
provider,
|
|
18464
18490
|
"setup",
|
|
18465
|
-
"template_error"
|
|
18491
|
+
"template_error",
|
|
18492
|
+
verbose
|
|
18466
18493
|
);
|
|
18467
18494
|
}
|
|
18468
18495
|
if (caseWorkspaceFile && workspacePath) {
|
|
@@ -18491,7 +18518,8 @@ async function runEvalCase(options) {
|
|
|
18491
18518
|
promptInputs,
|
|
18492
18519
|
provider,
|
|
18493
18520
|
"repo_setup",
|
|
18494
|
-
"local_path_not_found"
|
|
18521
|
+
"local_path_not_found",
|
|
18522
|
+
verbose
|
|
18495
18523
|
);
|
|
18496
18524
|
}
|
|
18497
18525
|
}
|
|
@@ -18517,7 +18545,8 @@ async function runEvalCase(options) {
|
|
|
18517
18545
|
promptInputs,
|
|
18518
18546
|
provider,
|
|
18519
18547
|
"repo_setup",
|
|
18520
|
-
"clone_error"
|
|
18548
|
+
"clone_error",
|
|
18549
|
+
verbose
|
|
18521
18550
|
);
|
|
18522
18551
|
}
|
|
18523
18552
|
}
|
|
@@ -18543,7 +18572,8 @@ async function runEvalCase(options) {
|
|
|
18543
18572
|
promptInputs,
|
|
18544
18573
|
provider,
|
|
18545
18574
|
"setup",
|
|
18546
|
-
"file_copy_error"
|
|
18575
|
+
"file_copy_error",
|
|
18576
|
+
verbose
|
|
18547
18577
|
);
|
|
18548
18578
|
}
|
|
18549
18579
|
}
|
|
@@ -18588,7 +18618,8 @@ async function runEvalCase(options) {
|
|
|
18588
18618
|
promptInputs,
|
|
18589
18619
|
provider,
|
|
18590
18620
|
"setup",
|
|
18591
|
-
"script_error"
|
|
18621
|
+
"script_error",
|
|
18622
|
+
verbose
|
|
18592
18623
|
);
|
|
18593
18624
|
}
|
|
18594
18625
|
}
|
|
@@ -18619,7 +18650,8 @@ async function runEvalCase(options) {
|
|
|
18619
18650
|
promptInputs,
|
|
18620
18651
|
provider,
|
|
18621
18652
|
"setup",
|
|
18622
|
-
"script_error"
|
|
18653
|
+
"script_error",
|
|
18654
|
+
verbose
|
|
18623
18655
|
);
|
|
18624
18656
|
}
|
|
18625
18657
|
}
|
|
@@ -18663,7 +18695,8 @@ async function runEvalCase(options) {
|
|
|
18663
18695
|
promptInputs,
|
|
18664
18696
|
provider,
|
|
18665
18697
|
"agent",
|
|
18666
|
-
"provider_error"
|
|
18698
|
+
"provider_error",
|
|
18699
|
+
verbose
|
|
18667
18700
|
);
|
|
18668
18701
|
if (workspacePath) {
|
|
18669
18702
|
if (forceCleanup) {
|
|
@@ -18684,7 +18717,8 @@ async function runEvalCase(options) {
|
|
|
18684
18717
|
promptInputs,
|
|
18685
18718
|
provider,
|
|
18686
18719
|
"agent",
|
|
18687
|
-
"provider_error"
|
|
18720
|
+
"provider_error",
|
|
18721
|
+
verbose
|
|
18688
18722
|
);
|
|
18689
18723
|
if (workspacePath) {
|
|
18690
18724
|
if (forceCleanup) {
|
|
@@ -18779,7 +18813,8 @@ async function runEvalCase(options) {
|
|
|
18779
18813
|
targetResolver,
|
|
18780
18814
|
availableTargets,
|
|
18781
18815
|
fileChanges,
|
|
18782
|
-
workspacePath
|
|
18816
|
+
workspacePath,
|
|
18817
|
+
verbose
|
|
18783
18818
|
});
|
|
18784
18819
|
const totalDurationMs = Date.now() - caseStartMs;
|
|
18785
18820
|
const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
|
|
@@ -18834,7 +18869,8 @@ async function runEvalCase(options) {
|
|
|
18834
18869
|
promptInputs,
|
|
18835
18870
|
provider,
|
|
18836
18871
|
"evaluator",
|
|
18837
|
-
"evaluator_error"
|
|
18872
|
+
"evaluator_error",
|
|
18873
|
+
verbose
|
|
18838
18874
|
);
|
|
18839
18875
|
if (workspacePath && !isSharedWorkspace) {
|
|
18840
18876
|
if (forceCleanup || (retainOnFailure ?? "keep") === "cleanup") {
|
|
@@ -18976,7 +19012,7 @@ async function evaluateCandidate(options) {
|
|
|
18976
19012
|
let lmRequest;
|
|
18977
19013
|
if (isAgentProvider(provider)) {
|
|
18978
19014
|
agentRequest = {
|
|
18979
|
-
|
|
19015
|
+
...options.verbose ? { input: promptInputs.question } : {}
|
|
18980
19016
|
};
|
|
18981
19017
|
} else {
|
|
18982
19018
|
if (promptInputs.chatPrompt) {
|
|
@@ -18990,8 +19026,9 @@ async function evaluateCandidate(options) {
|
|
|
18990
19026
|
}
|
|
18991
19027
|
}
|
|
18992
19028
|
const evaluatorRequest = scores ? void 0 : score.evaluatorRawRequest;
|
|
18993
|
-
const
|
|
18994
|
-
|
|
19029
|
+
const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
|
|
19030
|
+
const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
|
|
19031
|
+
...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
|
|
18995
19032
|
...lmRequest ? { lm: lmRequest } : {},
|
|
18996
19033
|
...evaluatorRequest ? { evaluator: evaluatorRequest } : {}
|
|
18997
19034
|
} : void 0;
|
|
@@ -19011,9 +19048,9 @@ async function evaluateCandidate(options) {
|
|
|
19011
19048
|
endTime,
|
|
19012
19049
|
requests,
|
|
19013
19050
|
input,
|
|
19051
|
+
output: output ?? [{ role: "assistant", content: candidate }],
|
|
19014
19052
|
scores,
|
|
19015
19053
|
trace: trace2,
|
|
19016
|
-
output: output ?? [{ role: "assistant", content: candidate }],
|
|
19017
19054
|
fileChanges,
|
|
19018
19055
|
executionStatus: classifyQualityStatus(score.score)
|
|
19019
19056
|
};
|
|
@@ -19179,6 +19216,7 @@ async function runEvaluatorList(options) {
|
|
|
19179
19216
|
verdict: score2.verdict,
|
|
19180
19217
|
assertions: score2.assertions,
|
|
19181
19218
|
input: score2.evaluatorRawRequest,
|
|
19219
|
+
target: score2.graderTarget,
|
|
19182
19220
|
details: score2.details,
|
|
19183
19221
|
scores: mapChildResults(score2.scores),
|
|
19184
19222
|
tokenUsage: score2.tokenUsage,
|
|
@@ -19318,13 +19356,13 @@ async function invokeProvider(provider, options) {
|
|
|
19318
19356
|
}
|
|
19319
19357
|
}
|
|
19320
19358
|
}
|
|
19321
|
-
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
|
|
19359
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode, verbose) {
|
|
19322
19360
|
const message = error instanceof Error ? error.message : String(error);
|
|
19323
19361
|
let agentRequest;
|
|
19324
19362
|
let lmRequest;
|
|
19325
19363
|
if (isAgentProvider(provider)) {
|
|
19326
19364
|
agentRequest = {
|
|
19327
|
-
|
|
19365
|
+
...verbose ? { input: promptInputs.question } : {},
|
|
19328
19366
|
error: message
|
|
19329
19367
|
};
|
|
19330
19368
|
} else {
|
|
@@ -19352,10 +19390,10 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
19352
19390
|
conversationId: evalCase.conversation_id,
|
|
19353
19391
|
score: 0,
|
|
19354
19392
|
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
19355
|
-
output: [{ role: "assistant", content: `Error occurred: ${message}` }],
|
|
19356
19393
|
target: targetName,
|
|
19357
19394
|
requests,
|
|
19358
19395
|
input,
|
|
19396
|
+
output: [{ role: "assistant", content: `Error occurred: ${message}` }],
|
|
19359
19397
|
error: message,
|
|
19360
19398
|
executionStatus: "execution_error",
|
|
19361
19399
|
failureStage,
|