@agentv/core 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-IOCVST3R.js → chunk-QHEZJRTU.js} +28 -11
- package/dist/chunk-QHEZJRTU.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +68 -64
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +64 -67
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +137 -85
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +11 -11
- package/dist/index.d.ts +11 -11
- package/dist/index.js +108 -68
- package/dist/index.js.map +1 -1
- package/package.json +14 -12
- package/LICENSE +0 -21
- package/dist/chunk-IOCVST3R.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -147,17 +147,6 @@ interface EvaluatorResult {
|
|
|
147
147
|
*/
|
|
148
148
|
declare function getHitCount(result: Pick<EvaluationResult, "hits">): number;
|
|
149
149
|
|
|
150
|
-
/**
|
|
151
|
-
* Formatting mode for segment content.
|
|
152
|
-
* - 'agent': File references only (for providers with filesystem access)
|
|
153
|
-
* - 'lm': Embedded file content with XML tags (for language model providers)
|
|
154
|
-
*/
|
|
155
|
-
type FormattingMode = 'agent' | 'lm';
|
|
156
|
-
/**
|
|
157
|
-
* Extract fenced code blocks from AgentV user segments.
|
|
158
|
-
*/
|
|
159
|
-
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
160
|
-
|
|
161
150
|
type ChatMessageRole = "system" | "user" | "assistant" | "tool" | "function";
|
|
162
151
|
interface ChatMessage {
|
|
163
152
|
readonly role: ChatMessageRole;
|
|
@@ -273,6 +262,17 @@ interface TargetDefinition {
|
|
|
273
262
|
readonly retryStatusCodes?: unknown | undefined;
|
|
274
263
|
}
|
|
275
264
|
|
|
265
|
+
/**
|
|
266
|
+
* Formatting mode for segment content.
|
|
267
|
+
* - 'agent': File references only (for providers with filesystem access)
|
|
268
|
+
* - 'lm': Embedded file content with XML tags (for language model providers)
|
|
269
|
+
*/
|
|
270
|
+
type FormattingMode = "agent" | "lm";
|
|
271
|
+
/**
|
|
272
|
+
* Extract fenced code blocks from AgentV user segments.
|
|
273
|
+
*/
|
|
274
|
+
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
275
|
+
|
|
276
276
|
/**
|
|
277
277
|
* Build prompt inputs by consolidating user request context and guideline content.
|
|
278
278
|
*/
|
package/dist/index.d.ts
CHANGED
|
@@ -147,17 +147,6 @@ interface EvaluatorResult {
|
|
|
147
147
|
*/
|
|
148
148
|
declare function getHitCount(result: Pick<EvaluationResult, "hits">): number;
|
|
149
149
|
|
|
150
|
-
/**
|
|
151
|
-
* Formatting mode for segment content.
|
|
152
|
-
* - 'agent': File references only (for providers with filesystem access)
|
|
153
|
-
* - 'lm': Embedded file content with XML tags (for language model providers)
|
|
154
|
-
*/
|
|
155
|
-
type FormattingMode = 'agent' | 'lm';
|
|
156
|
-
/**
|
|
157
|
-
* Extract fenced code blocks from AgentV user segments.
|
|
158
|
-
*/
|
|
159
|
-
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
160
|
-
|
|
161
150
|
type ChatMessageRole = "system" | "user" | "assistant" | "tool" | "function";
|
|
162
151
|
interface ChatMessage {
|
|
163
152
|
readonly role: ChatMessageRole;
|
|
@@ -273,6 +262,17 @@ interface TargetDefinition {
|
|
|
273
262
|
readonly retryStatusCodes?: unknown | undefined;
|
|
274
263
|
}
|
|
275
264
|
|
|
265
|
+
/**
|
|
266
|
+
* Formatting mode for segment content.
|
|
267
|
+
* - 'agent': File references only (for providers with filesystem access)
|
|
268
|
+
* - 'lm': Embedded file content with XML tags (for language model providers)
|
|
269
|
+
*/
|
|
270
|
+
type FormattingMode = "agent" | "lm";
|
|
271
|
+
/**
|
|
272
|
+
* Extract fenced code blocks from AgentV user segments.
|
|
273
|
+
*/
|
|
274
|
+
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
275
|
+
|
|
276
276
|
/**
|
|
277
277
|
* Build prompt inputs by consolidating user request context and guideline content.
|
|
278
278
|
*/
|
package/dist/index.js
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import {
|
|
2
|
-
TARGETS_SCHEMA_V2,
|
|
3
2
|
buildDirectoryChain,
|
|
4
3
|
buildSearchRoots,
|
|
5
4
|
fileExists,
|
|
@@ -9,7 +8,7 @@ import {
|
|
|
9
8
|
readTextFile,
|
|
10
9
|
resolveFileReference,
|
|
11
10
|
resolveTargetDefinition
|
|
12
|
-
} from "./chunk-
|
|
11
|
+
} from "./chunk-QHEZJRTU.js";
|
|
13
12
|
|
|
14
13
|
// src/evaluation/types.ts
|
|
15
14
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -71,11 +70,11 @@ function extractCodeBlocks(segments) {
|
|
|
71
70
|
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
72
71
|
const codeBlocks = [];
|
|
73
72
|
for (const segment of segments) {
|
|
74
|
-
const typeValue = segment
|
|
73
|
+
const typeValue = segment.type;
|
|
75
74
|
if (typeof typeValue !== "string" || typeValue !== "text") {
|
|
76
75
|
continue;
|
|
77
76
|
}
|
|
78
|
-
const textValue = segment
|
|
77
|
+
const textValue = segment.value;
|
|
79
78
|
if (typeof textValue !== "string") {
|
|
80
79
|
continue;
|
|
81
80
|
}
|
|
@@ -146,9 +145,9 @@ function asString(value) {
|
|
|
146
145
|
}
|
|
147
146
|
|
|
148
147
|
// src/evaluation/loaders/config-loader.ts
|
|
149
|
-
import micromatch from "micromatch";
|
|
150
148
|
import { readFile } from "node:fs/promises";
|
|
151
149
|
import path2 from "node:path";
|
|
150
|
+
import micromatch from "micromatch";
|
|
152
151
|
import { parse } from "yaml";
|
|
153
152
|
|
|
154
153
|
// src/evaluation/loaders/file-resolver.ts
|
|
@@ -290,8 +289,9 @@ Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
|
|
|
290
289
|
guideline_patterns: guidelinePatterns
|
|
291
290
|
};
|
|
292
291
|
} catch (error) {
|
|
293
|
-
logWarning(
|
|
294
|
-
|
|
292
|
+
logWarning(
|
|
293
|
+
`Could not read .agentv/config.yaml at ${configPath}: ${error.message}`
|
|
294
|
+
);
|
|
295
295
|
}
|
|
296
296
|
}
|
|
297
297
|
return null;
|
|
@@ -334,9 +334,7 @@ var TEMPLATE_VARIABLES = {
|
|
|
334
334
|
REFERENCE_ANSWER: "reference_answer",
|
|
335
335
|
INPUT_MESSAGES: "input_messages"
|
|
336
336
|
};
|
|
337
|
-
var VALID_TEMPLATE_VARIABLES = new Set(
|
|
338
|
-
Object.values(TEMPLATE_VARIABLES)
|
|
339
|
-
);
|
|
337
|
+
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
340
338
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
341
339
|
TEMPLATE_VARIABLES.CANDIDATE_ANSWER,
|
|
342
340
|
TEMPLATE_VARIABLES.EXPECTED_MESSAGES
|
|
@@ -353,13 +351,14 @@ function validateTemplateVariables(content, source) {
|
|
|
353
351
|
const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g;
|
|
354
352
|
const foundVariables = /* @__PURE__ */ new Set();
|
|
355
353
|
const invalidVariables = [];
|
|
356
|
-
let match;
|
|
357
|
-
while (
|
|
354
|
+
let match = variablePattern.exec(content);
|
|
355
|
+
while (match !== null) {
|
|
358
356
|
const varName = match[1];
|
|
359
357
|
foundVariables.add(varName);
|
|
360
358
|
if (!VALID_TEMPLATE_VARIABLES.has(varName)) {
|
|
361
359
|
invalidVariables.push(varName);
|
|
362
360
|
}
|
|
361
|
+
match = variablePattern.exec(content);
|
|
363
362
|
}
|
|
364
363
|
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.CANDIDATE_ANSWER);
|
|
365
364
|
const hasExpectedMessages = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_MESSAGES);
|
|
@@ -796,7 +795,14 @@ function needsRoleMarkers(messages, processedSegmentsByMessage) {
|
|
|
796
795
|
return messagesWithContent > 1;
|
|
797
796
|
}
|
|
798
797
|
function buildChatPromptFromSegments(options) {
|
|
799
|
-
const {
|
|
798
|
+
const {
|
|
799
|
+
messages,
|
|
800
|
+
segmentsByMessage,
|
|
801
|
+
guidelinePatterns,
|
|
802
|
+
guidelineContent,
|
|
803
|
+
systemPrompt,
|
|
804
|
+
mode = "lm"
|
|
805
|
+
} = options;
|
|
800
806
|
if (messages.length === 0) {
|
|
801
807
|
return void 0;
|
|
802
808
|
}
|
|
@@ -878,7 +884,6 @@ function logWarning4(message) {
|
|
|
878
884
|
var ANSI_YELLOW6 = "\x1B[33m";
|
|
879
885
|
var ANSI_RED = "\x1B[31m";
|
|
880
886
|
var ANSI_RESET6 = "\x1B[0m";
|
|
881
|
-
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
882
887
|
async function readTestSuiteMetadata(testFilePath) {
|
|
883
888
|
try {
|
|
884
889
|
const absolutePath = path6.resolve(testFilePath);
|
|
@@ -909,12 +914,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
909
914
|
const datasetNameFromSuite = asString5(suite.dataset)?.trim();
|
|
910
915
|
const fallbackDataset = path6.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
911
916
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
912
|
-
const schema = suite.$schema;
|
|
913
|
-
if (schema !== SCHEMA_EVAL_V2) {
|
|
914
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
915
|
-
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
916
|
-
throw new Error(message);
|
|
917
|
-
}
|
|
918
917
|
const rawTestcases = suite.evalcases;
|
|
919
918
|
if (!Array.isArray(rawTestcases)) {
|
|
920
919
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
@@ -938,11 +937,15 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
938
937
|
const inputMessagesValue = evalcase.input_messages;
|
|
939
938
|
const expectedMessagesValue = evalcase.expected_messages;
|
|
940
939
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
941
|
-
logError(
|
|
940
|
+
logError(
|
|
941
|
+
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
|
|
942
|
+
);
|
|
942
943
|
continue;
|
|
943
944
|
}
|
|
944
945
|
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
945
|
-
const inputMessages = inputMessagesValue.filter(
|
|
946
|
+
const inputMessages = inputMessagesValue.filter(
|
|
947
|
+
(msg) => isTestMessage(msg)
|
|
948
|
+
);
|
|
946
949
|
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
947
950
|
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
948
951
|
logError(`No valid expected message found for eval case: ${id}`);
|
|
@@ -1615,7 +1618,7 @@ function formatTimeoutSuffix(timeoutMs) {
|
|
|
1615
1618
|
import { exec as execCallback, spawn } from "node:child_process";
|
|
1616
1619
|
import { randomUUID } from "node:crypto";
|
|
1617
1620
|
import { constants as constants2, createWriteStream } from "node:fs";
|
|
1618
|
-
import { access as access2,
|
|
1621
|
+
import { access as access2, mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
|
|
1619
1622
|
import { tmpdir } from "node:os";
|
|
1620
1623
|
import path9 from "node:path";
|
|
1621
1624
|
import { promisify as promisify2 } from "node:util";
|
|
@@ -1683,9 +1686,7 @@ function buildPromptDocument(request, inputFiles, options) {
|
|
|
1683
1686
|
options?.guidelineOverrides
|
|
1684
1687
|
);
|
|
1685
1688
|
const inputFilesList = collectInputFiles(inputFiles);
|
|
1686
|
-
const nonGuidelineInputFiles = inputFilesList.filter(
|
|
1687
|
-
(file) => !guidelineFiles.includes(file)
|
|
1688
|
-
);
|
|
1689
|
+
const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
|
|
1689
1690
|
const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
|
|
1690
1691
|
if (prereadBlock.length > 0) {
|
|
1691
1692
|
parts.push("\n", prereadBlock);
|
|
@@ -1857,7 +1858,15 @@ var CodexProvider = class {
|
|
|
1857
1858
|
return path9.resolve(this.config.cwd);
|
|
1858
1859
|
}
|
|
1859
1860
|
buildCodexArgs() {
|
|
1860
|
-
const args = [
|
|
1861
|
+
const args = [
|
|
1862
|
+
"--ask-for-approval",
|
|
1863
|
+
"never",
|
|
1864
|
+
"exec",
|
|
1865
|
+
"--json",
|
|
1866
|
+
"--color",
|
|
1867
|
+
"never",
|
|
1868
|
+
"--skip-git-repo-check"
|
|
1869
|
+
];
|
|
1861
1870
|
if (this.config.args && this.config.args.length > 0) {
|
|
1862
1871
|
args.push(...this.config.args);
|
|
1863
1872
|
}
|
|
@@ -2481,7 +2490,12 @@ var MockProvider = class {
|
|
|
2481
2490
|
|
|
2482
2491
|
// src/evaluation/providers/vscode.ts
|
|
2483
2492
|
import path10 from "node:path";
|
|
2484
|
-
import {
|
|
2493
|
+
import {
|
|
2494
|
+
dispatchAgentSession,
|
|
2495
|
+
dispatchBatchAgent,
|
|
2496
|
+
getSubagentRoot,
|
|
2497
|
+
provisionSubagents
|
|
2498
|
+
} from "subagent";
|
|
2485
2499
|
var VSCodeProvider = class {
|
|
2486
2500
|
id;
|
|
2487
2501
|
kind;
|
|
@@ -2598,9 +2612,7 @@ function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
|
2598
2612
|
}
|
|
2599
2613
|
const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
|
|
2600
2614
|
const attachmentFiles = collectAttachmentFiles(attachments);
|
|
2601
|
-
const nonGuidelineAttachments = attachmentFiles.filter(
|
|
2602
|
-
(file) => !guidelineFiles.includes(file)
|
|
2603
|
-
);
|
|
2615
|
+
const nonGuidelineAttachments = attachmentFiles.filter((file) => !guidelineFiles.includes(file));
|
|
2604
2616
|
const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, nonGuidelineAttachments);
|
|
2605
2617
|
if (prereadBlock.length > 0) {
|
|
2606
2618
|
parts.push("\n", prereadBlock);
|
|
@@ -2709,8 +2721,10 @@ async function ensureVSCodeSubagents(options) {
|
|
|
2709
2721
|
if (result.skippedExisting.length > 0) {
|
|
2710
2722
|
console.log(`Reusing ${result.skippedExisting.length} existing unlocked subagent(s)`);
|
|
2711
2723
|
}
|
|
2712
|
-
console.log(
|
|
2713
|
-
|
|
2724
|
+
console.log(
|
|
2725
|
+
`
|
|
2726
|
+
total unlocked subagents available: ${result.created.length + result.skippedExisting.length}`
|
|
2727
|
+
);
|
|
2714
2728
|
}
|
|
2715
2729
|
return {
|
|
2716
2730
|
provisioned: true,
|
|
@@ -2736,27 +2750,6 @@ import { parse as parse3 } from "yaml";
|
|
|
2736
2750
|
function isRecord(value) {
|
|
2737
2751
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
2738
2752
|
}
|
|
2739
|
-
function checkSchema(parsed, absolutePath) {
|
|
2740
|
-
const schema = parsed.$schema;
|
|
2741
|
-
if (schema === void 0) {
|
|
2742
|
-
throw new Error(
|
|
2743
|
-
`Missing $schema field in targets.yaml at ${absolutePath}.
|
|
2744
|
-
Please add '$schema: ${TARGETS_SCHEMA_V2}' at the top of the file.`
|
|
2745
|
-
);
|
|
2746
|
-
}
|
|
2747
|
-
if (typeof schema !== "string") {
|
|
2748
|
-
throw new Error(
|
|
2749
|
-
`Invalid $schema field in targets.yaml at ${absolutePath}.
|
|
2750
|
-
Expected a string value '${TARGETS_SCHEMA_V2}'.`
|
|
2751
|
-
);
|
|
2752
|
-
}
|
|
2753
|
-
if (schema !== TARGETS_SCHEMA_V2) {
|
|
2754
|
-
throw new Error(
|
|
2755
|
-
`Invalid $schema '${schema}' in targets.yaml at ${absolutePath}.
|
|
2756
|
-
Expected '${TARGETS_SCHEMA_V2}'.`
|
|
2757
|
-
);
|
|
2758
|
-
}
|
|
2759
|
-
}
|
|
2760
2753
|
function extractTargetsArray(parsed, absolutePath) {
|
|
2761
2754
|
const targets = parsed.targets;
|
|
2762
2755
|
if (!Array.isArray(targets)) {
|
|
@@ -2771,7 +2764,9 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
2771
2764
|
const name = value.name;
|
|
2772
2765
|
const provider = value.provider;
|
|
2773
2766
|
if (typeof name !== "string" || name.trim().length === 0) {
|
|
2774
|
-
throw new Error(
|
|
2767
|
+
throw new Error(
|
|
2768
|
+
`targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`
|
|
2769
|
+
);
|
|
2775
2770
|
}
|
|
2776
2771
|
if (typeof provider !== "string" || provider.trim().length === 0) {
|
|
2777
2772
|
throw new Error(`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider'`);
|
|
@@ -2794,11 +2789,12 @@ async function readTargetDefinitions(filePath) {
|
|
|
2794
2789
|
const raw = await readFile6(absolutePath, "utf8");
|
|
2795
2790
|
const parsed = parse3(raw);
|
|
2796
2791
|
if (!isRecord(parsed)) {
|
|
2797
|
-
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with
|
|
2792
|
+
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
2798
2793
|
}
|
|
2799
|
-
checkSchema(parsed, absolutePath);
|
|
2800
2794
|
const targets = extractTargetsArray(parsed, absolutePath);
|
|
2801
|
-
const definitions = targets.map(
|
|
2795
|
+
const definitions = targets.map(
|
|
2796
|
+
(entry, index) => assertTargetDefinition(entry, index, absolutePath)
|
|
2797
|
+
);
|
|
2802
2798
|
return definitions;
|
|
2803
2799
|
}
|
|
2804
2800
|
function listTargetNames(definitions) {
|
|
@@ -2875,7 +2871,11 @@ var LlmJudgeEvaluator = class {
|
|
|
2875
2871
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
2876
2872
|
const variables = {
|
|
2877
2873
|
[TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
2878
|
-
[TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
|
|
2874
|
+
[TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
|
|
2875
|
+
context.evalCase.expected_segments,
|
|
2876
|
+
null,
|
|
2877
|
+
2
|
|
2878
|
+
),
|
|
2879
2879
|
[TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
|
|
2880
2880
|
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
2881
2881
|
[TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
|
|
@@ -3121,7 +3121,7 @@ import { createHash, randomUUID as randomUUID2 } from "node:crypto";
|
|
|
3121
3121
|
import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
|
|
3122
3122
|
import path12 from "node:path";
|
|
3123
3123
|
|
|
3124
|
-
// ../../node_modules/.
|
|
3124
|
+
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
3125
3125
|
var Node = class {
|
|
3126
3126
|
value;
|
|
3127
3127
|
next;
|
|
@@ -3154,6 +3154,9 @@ var Queue = class {
|
|
|
3154
3154
|
}
|
|
3155
3155
|
this.#head = this.#head.next;
|
|
3156
3156
|
this.#size--;
|
|
3157
|
+
if (!this.#head) {
|
|
3158
|
+
this.#tail = void 0;
|
|
3159
|
+
}
|
|
3157
3160
|
return current.value;
|
|
3158
3161
|
}
|
|
3159
3162
|
peek() {
|
|
@@ -3184,7 +3187,7 @@ var Queue = class {
|
|
|
3184
3187
|
}
|
|
3185
3188
|
};
|
|
3186
3189
|
|
|
3187
|
-
// ../../node_modules/.
|
|
3190
|
+
// ../../node_modules/.bun/p-limit@6.2.0/node_modules/p-limit/index.js
|
|
3188
3191
|
function pLimit(concurrency) {
|
|
3189
3192
|
validateConcurrency(concurrency);
|
|
3190
3193
|
const queue = new Queue();
|
|
@@ -3360,7 +3363,9 @@ async function runEvaluation(options) {
|
|
|
3360
3363
|
} catch (error) {
|
|
3361
3364
|
if (verbose) {
|
|
3362
3365
|
const message = error instanceof Error ? error.message : String(error);
|
|
3363
|
-
console.warn(
|
|
3366
|
+
console.warn(
|
|
3367
|
+
`Provider batch execution failed, falling back to per-case dispatch: ${message}`
|
|
3368
|
+
);
|
|
3364
3369
|
}
|
|
3365
3370
|
}
|
|
3366
3371
|
}
|
|
@@ -3524,7 +3529,14 @@ async function runBatchEvaluation(options) {
|
|
|
3524
3529
|
agentTimeoutMs
|
|
3525
3530
|
});
|
|
3526
3531
|
} catch (error) {
|
|
3527
|
-
const errorResult = buildErrorResult(
|
|
3532
|
+
const errorResult = buildErrorResult(
|
|
3533
|
+
evalCase,
|
|
3534
|
+
target.name,
|
|
3535
|
+
nowFn(),
|
|
3536
|
+
error,
|
|
3537
|
+
promptInputs,
|
|
3538
|
+
provider
|
|
3539
|
+
);
|
|
3528
3540
|
results.push(errorResult);
|
|
3529
3541
|
if (onResult) {
|
|
3530
3542
|
await onResult(errorResult);
|
|
@@ -3702,7 +3714,18 @@ async function evaluateCandidate(options) {
|
|
|
3702
3714
|
};
|
|
3703
3715
|
}
|
|
3704
3716
|
async function runEvaluatorsForCase(options) {
|
|
3705
|
-
const {
|
|
3717
|
+
const {
|
|
3718
|
+
evalCase,
|
|
3719
|
+
candidate,
|
|
3720
|
+
target,
|
|
3721
|
+
provider,
|
|
3722
|
+
evaluators,
|
|
3723
|
+
attempt,
|
|
3724
|
+
promptInputs,
|
|
3725
|
+
now,
|
|
3726
|
+
judgeProvider,
|
|
3727
|
+
agentTimeoutMs
|
|
3728
|
+
} = options;
|
|
3706
3729
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
3707
3730
|
return runEvaluatorList({
|
|
3708
3731
|
evalCase,
|
|
@@ -3803,7 +3826,6 @@ async function runEvaluatorList(options) {
|
|
|
3803
3826
|
reasoning: score2.reasoning,
|
|
3804
3827
|
evaluator_provider_request: score2.evaluatorRawRequest
|
|
3805
3828
|
});
|
|
3806
|
-
continue;
|
|
3807
3829
|
}
|
|
3808
3830
|
} catch (error) {
|
|
3809
3831
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -3814,7 +3836,11 @@ async function runEvaluatorList(options) {
|
|
|
3814
3836
|
expectedAspectCount: 1,
|
|
3815
3837
|
reasoning: message
|
|
3816
3838
|
};
|
|
3817
|
-
scored.push({
|
|
3839
|
+
scored.push({
|
|
3840
|
+
score: fallbackScore,
|
|
3841
|
+
name: evaluator.name ?? "unknown",
|
|
3842
|
+
type: evaluator.type ?? "unknown"
|
|
3843
|
+
});
|
|
3818
3844
|
evaluatorResults.push({
|
|
3819
3845
|
name: evaluator.name ?? "unknown",
|
|
3820
3846
|
type: evaluator.type ?? "unknown",
|
|
@@ -3828,7 +3854,10 @@ async function runEvaluatorList(options) {
|
|
|
3828
3854
|
const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
|
|
3829
3855
|
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
3830
3856
|
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
3831
|
-
const expectedAspectCount = scored.reduce(
|
|
3857
|
+
const expectedAspectCount = scored.reduce(
|
|
3858
|
+
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
3859
|
+
0
|
|
3860
|
+
);
|
|
3832
3861
|
const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
|
|
3833
3862
|
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
|
|
3834
3863
|
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
@@ -3843,7 +3872,18 @@ async function runEvaluatorList(options) {
|
|
|
3843
3872
|
return { score, evaluatorResults };
|
|
3844
3873
|
}
|
|
3845
3874
|
async function runLlmJudgeEvaluator(options) {
|
|
3846
|
-
const {
|
|
3875
|
+
const {
|
|
3876
|
+
config,
|
|
3877
|
+
evalCase,
|
|
3878
|
+
candidate,
|
|
3879
|
+
target,
|
|
3880
|
+
provider,
|
|
3881
|
+
evaluatorRegistry,
|
|
3882
|
+
attempt,
|
|
3883
|
+
promptInputs,
|
|
3884
|
+
now,
|
|
3885
|
+
judgeProvider
|
|
3886
|
+
} = options;
|
|
3847
3887
|
const customPrompt = await resolveCustomPrompt(config);
|
|
3848
3888
|
return evaluatorRegistry.llm_judge.evaluate({
|
|
3849
3889
|
evalCase,
|