@agentv/core 0.5.3 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-NL7K4CAK.js → chunk-OW3SHBIJ.js} +7 -2
- package/dist/chunk-OW3SHBIJ.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +172 -5
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +28 -2
- package/dist/index.d.ts +28 -2
- package/dist/index.js +167 -6
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-NL7K4CAK.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { AxChatRequest } from '@ax-llm/ax';
|
|
1
|
+
import { AxChatRequest, AxAI } from '@ax-llm/ax';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* JSON primitive values appearing in AgentV payloads.
|
|
@@ -99,6 +99,7 @@ type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
|
|
|
99
99
|
*/
|
|
100
100
|
interface EvalCase {
|
|
101
101
|
readonly id: string;
|
|
102
|
+
readonly dataset: string;
|
|
102
103
|
readonly conversation_id?: string;
|
|
103
104
|
readonly task: string;
|
|
104
105
|
readonly user_segments: readonly JsonObject[];
|
|
@@ -117,6 +118,7 @@ interface EvalCase {
|
|
|
117
118
|
*/
|
|
118
119
|
interface EvaluationResult {
|
|
119
120
|
readonly eval_id: string;
|
|
121
|
+
readonly dataset: string;
|
|
120
122
|
readonly conversation_id?: string;
|
|
121
123
|
readonly score: number;
|
|
122
124
|
readonly hits: readonly string[];
|
|
@@ -171,6 +173,11 @@ declare function buildPromptInputs(testCase: EvalCase): Promise<{
|
|
|
171
173
|
}>;
|
|
172
174
|
|
|
173
175
|
declare function fileExists(filePath: string): Promise<boolean>;
|
|
176
|
+
/**
|
|
177
|
+
* Read a text file and normalize line endings to LF (\n).
|
|
178
|
+
* This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
|
|
179
|
+
*/
|
|
180
|
+
declare function readTextFile(filePath: string): Promise<string>;
|
|
174
181
|
/**
|
|
175
182
|
* Find git repository root by walking up the directory tree.
|
|
176
183
|
*/
|
|
@@ -229,6 +236,11 @@ interface Provider {
|
|
|
229
236
|
* the orchestrator may send multiple requests in a single provider session.
|
|
230
237
|
*/
|
|
231
238
|
invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
|
|
239
|
+
/**
|
|
240
|
+
* Optional access to the underlying AxAI instance.
|
|
241
|
+
* This enables using advanced Ax features like structured output signatures.
|
|
242
|
+
*/
|
|
243
|
+
getAxAI?(): AxAI;
|
|
232
244
|
}
|
|
233
245
|
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
234
246
|
interface TargetDefinition {
|
|
@@ -372,6 +384,16 @@ interface EnsureSubagentsResult {
|
|
|
372
384
|
*/
|
|
373
385
|
declare function ensureVSCodeSubagents(options: EnsureSubagentsOptions): Promise<EnsureSubagentsResult>;
|
|
374
386
|
|
|
387
|
+
type CodexLogEntry = {
|
|
388
|
+
readonly filePath: string;
|
|
389
|
+
readonly evalCaseId?: string;
|
|
390
|
+
readonly targetName: string;
|
|
391
|
+
readonly attempt?: number;
|
|
392
|
+
};
|
|
393
|
+
type CodexLogListener = (entry: CodexLogEntry) => void;
|
|
394
|
+
declare function consumeCodexLogEntries(): CodexLogEntry[];
|
|
395
|
+
declare function subscribeToCodexLogEntries(listener: CodexLogListener): () => void;
|
|
396
|
+
|
|
375
397
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
376
398
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
377
399
|
|
|
@@ -420,6 +442,10 @@ declare class LlmJudgeEvaluator implements Evaluator {
|
|
|
420
442
|
private readonly customPrompt?;
|
|
421
443
|
constructor(options: LlmJudgeEvaluatorOptions);
|
|
422
444
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
445
|
+
private evaluateWithAx;
|
|
446
|
+
private evaluateWithPrompt;
|
|
447
|
+
private buildJudgeForwardOptions;
|
|
448
|
+
private buildJudgeModelConfig;
|
|
423
449
|
}
|
|
424
450
|
interface CodeEvaluatorOptions {
|
|
425
451
|
readonly script: string;
|
|
@@ -492,4 +518,4 @@ type AgentKernel = {
|
|
|
492
518
|
};
|
|
493
519
|
declare function createAgentKernel(): AgentKernel;
|
|
494
520
|
|
|
495
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, readTargetDefinitions, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation };
|
|
521
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, readTargetDefinitions, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { AxChatRequest } from '@ax-llm/ax';
|
|
1
|
+
import { AxChatRequest, AxAI } from '@ax-llm/ax';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* JSON primitive values appearing in AgentV payloads.
|
|
@@ -99,6 +99,7 @@ type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
|
|
|
99
99
|
*/
|
|
100
100
|
interface EvalCase {
|
|
101
101
|
readonly id: string;
|
|
102
|
+
readonly dataset: string;
|
|
102
103
|
readonly conversation_id?: string;
|
|
103
104
|
readonly task: string;
|
|
104
105
|
readonly user_segments: readonly JsonObject[];
|
|
@@ -117,6 +118,7 @@ interface EvalCase {
|
|
|
117
118
|
*/
|
|
118
119
|
interface EvaluationResult {
|
|
119
120
|
readonly eval_id: string;
|
|
121
|
+
readonly dataset: string;
|
|
120
122
|
readonly conversation_id?: string;
|
|
121
123
|
readonly score: number;
|
|
122
124
|
readonly hits: readonly string[];
|
|
@@ -171,6 +173,11 @@ declare function buildPromptInputs(testCase: EvalCase): Promise<{
|
|
|
171
173
|
}>;
|
|
172
174
|
|
|
173
175
|
declare function fileExists(filePath: string): Promise<boolean>;
|
|
176
|
+
/**
|
|
177
|
+
* Read a text file and normalize line endings to LF (\n).
|
|
178
|
+
* This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
|
|
179
|
+
*/
|
|
180
|
+
declare function readTextFile(filePath: string): Promise<string>;
|
|
174
181
|
/**
|
|
175
182
|
* Find git repository root by walking up the directory tree.
|
|
176
183
|
*/
|
|
@@ -229,6 +236,11 @@ interface Provider {
|
|
|
229
236
|
* the orchestrator may send multiple requests in a single provider session.
|
|
230
237
|
*/
|
|
231
238
|
invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
|
|
239
|
+
/**
|
|
240
|
+
* Optional access to the underlying AxAI instance.
|
|
241
|
+
* This enables using advanced Ax features like structured output signatures.
|
|
242
|
+
*/
|
|
243
|
+
getAxAI?(): AxAI;
|
|
232
244
|
}
|
|
233
245
|
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
234
246
|
interface TargetDefinition {
|
|
@@ -372,6 +384,16 @@ interface EnsureSubagentsResult {
|
|
|
372
384
|
*/
|
|
373
385
|
declare function ensureVSCodeSubagents(options: EnsureSubagentsOptions): Promise<EnsureSubagentsResult>;
|
|
374
386
|
|
|
387
|
+
type CodexLogEntry = {
|
|
388
|
+
readonly filePath: string;
|
|
389
|
+
readonly evalCaseId?: string;
|
|
390
|
+
readonly targetName: string;
|
|
391
|
+
readonly attempt?: number;
|
|
392
|
+
};
|
|
393
|
+
type CodexLogListener = (entry: CodexLogEntry) => void;
|
|
394
|
+
declare function consumeCodexLogEntries(): CodexLogEntry[];
|
|
395
|
+
declare function subscribeToCodexLogEntries(listener: CodexLogListener): () => void;
|
|
396
|
+
|
|
375
397
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
376
398
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
377
399
|
|
|
@@ -420,6 +442,10 @@ declare class LlmJudgeEvaluator implements Evaluator {
|
|
|
420
442
|
private readonly customPrompt?;
|
|
421
443
|
constructor(options: LlmJudgeEvaluatorOptions);
|
|
422
444
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
445
|
+
private evaluateWithAx;
|
|
446
|
+
private evaluateWithPrompt;
|
|
447
|
+
private buildJudgeForwardOptions;
|
|
448
|
+
private buildJudgeModelConfig;
|
|
423
449
|
}
|
|
424
450
|
interface CodeEvaluatorOptions {
|
|
425
451
|
readonly script: string;
|
|
@@ -492,4 +518,4 @@ type AgentKernel = {
|
|
|
492
518
|
};
|
|
493
519
|
declare function createAgentKernel(): AgentKernel;
|
|
494
520
|
|
|
495
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, readTargetDefinitions, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation };
|
|
521
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, readTargetDefinitions, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
package/dist/index.js
CHANGED
|
@@ -4,8 +4,9 @@ import {
|
|
|
4
4
|
buildSearchRoots,
|
|
5
5
|
fileExists,
|
|
6
6
|
findGitRoot,
|
|
7
|
+
readTextFile,
|
|
7
8
|
resolveFileReference
|
|
8
|
-
} from "./chunk-NL7K4CAK.js";
|
|
9
|
+
} from "./chunk-OW3SHBIJ.js";
|
|
9
10
|
|
|
10
11
|
// src/evaluation/types.ts
|
|
11
12
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -149,6 +150,9 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
149
150
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
150
151
|
}
|
|
151
152
|
const suite = parsed;
|
|
153
|
+
const datasetNameFromSuite = asString(suite.dataset)?.trim();
|
|
154
|
+
const fallbackDataset = path.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
155
|
+
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
152
156
|
const schema = suite.$schema;
|
|
153
157
|
if (schema !== SCHEMA_EVAL_V2) {
|
|
154
158
|
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
@@ -296,6 +300,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
296
300
|
];
|
|
297
301
|
const testCase = {
|
|
298
302
|
id,
|
|
303
|
+
dataset: datasetName,
|
|
299
304
|
conversation_id: conversationId,
|
|
300
305
|
task: userTextPrompt,
|
|
301
306
|
user_segments: userSegments,
|
|
@@ -676,6 +681,9 @@ var AzureProvider = class {
|
|
|
676
681
|
);
|
|
677
682
|
return mapResponse(ensureChatResponse(response));
|
|
678
683
|
}
|
|
684
|
+
getAxAI() {
|
|
685
|
+
return this.ai;
|
|
686
|
+
}
|
|
679
687
|
};
|
|
680
688
|
var AnthropicProvider = class {
|
|
681
689
|
constructor(targetName, config) {
|
|
@@ -710,6 +718,9 @@ var AnthropicProvider = class {
|
|
|
710
718
|
);
|
|
711
719
|
return mapResponse(ensureChatResponse(response));
|
|
712
720
|
}
|
|
721
|
+
getAxAI() {
|
|
722
|
+
return this.ai;
|
|
723
|
+
}
|
|
713
724
|
};
|
|
714
725
|
var GeminiProvider = class {
|
|
715
726
|
constructor(targetName, config) {
|
|
@@ -743,6 +754,9 @@ var GeminiProvider = class {
|
|
|
743
754
|
);
|
|
744
755
|
return mapResponse(ensureChatResponse(response));
|
|
745
756
|
}
|
|
757
|
+
getAxAI() {
|
|
758
|
+
return this.ai;
|
|
759
|
+
}
|
|
746
760
|
};
|
|
747
761
|
|
|
748
762
|
// src/evaluation/providers/cli.ts
|
|
@@ -1063,6 +1077,59 @@ function pathToFileUri(filePath) {
|
|
|
1063
1077
|
return `file://${normalizedPath}`;
|
|
1064
1078
|
}
|
|
1065
1079
|
|
|
1080
|
+
// src/evaluation/providers/codex-log-tracker.ts
|
|
1081
|
+
var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
|
|
1082
|
+
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
|
|
1083
|
+
function getCodexLogStore() {
|
|
1084
|
+
const globalObject = globalThis;
|
|
1085
|
+
const existing = globalObject[GLOBAL_LOGS_KEY];
|
|
1086
|
+
if (existing) {
|
|
1087
|
+
return existing;
|
|
1088
|
+
}
|
|
1089
|
+
const created = [];
|
|
1090
|
+
globalObject[GLOBAL_LOGS_KEY] = created;
|
|
1091
|
+
return created;
|
|
1092
|
+
}
|
|
1093
|
+
function getSubscriberStore() {
|
|
1094
|
+
const globalObject = globalThis;
|
|
1095
|
+
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
|
|
1096
|
+
if (existing) {
|
|
1097
|
+
return existing;
|
|
1098
|
+
}
|
|
1099
|
+
const created = /* @__PURE__ */ new Set();
|
|
1100
|
+
globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
|
|
1101
|
+
return created;
|
|
1102
|
+
}
|
|
1103
|
+
function notifySubscribers(entry) {
|
|
1104
|
+
const subscribers = Array.from(getSubscriberStore());
|
|
1105
|
+
for (const listener of subscribers) {
|
|
1106
|
+
try {
|
|
1107
|
+
listener(entry);
|
|
1108
|
+
} catch (error) {
|
|
1109
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1110
|
+
console.warn(`Codex log subscriber failed: ${message}`);
|
|
1111
|
+
}
|
|
1112
|
+
}
|
|
1113
|
+
}
|
|
1114
|
+
function recordCodexLogEntry(entry) {
|
|
1115
|
+
getCodexLogStore().push(entry);
|
|
1116
|
+
notifySubscribers(entry);
|
|
1117
|
+
}
|
|
1118
|
+
function consumeCodexLogEntries() {
|
|
1119
|
+
const store = getCodexLogStore();
|
|
1120
|
+
if (store.length === 0) {
|
|
1121
|
+
return [];
|
|
1122
|
+
}
|
|
1123
|
+
return store.splice(0, store.length);
|
|
1124
|
+
}
|
|
1125
|
+
function subscribeToCodexLogEntries(listener) {
|
|
1126
|
+
const store = getSubscriberStore();
|
|
1127
|
+
store.add(listener);
|
|
1128
|
+
return () => {
|
|
1129
|
+
store.delete(listener);
|
|
1130
|
+
};
|
|
1131
|
+
}
|
|
1132
|
+
|
|
1066
1133
|
// src/evaluation/providers/codex.ts
|
|
1067
1134
|
var execAsync2 = promisify2(execCallback);
|
|
1068
1135
|
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
@@ -1259,7 +1326,12 @@ var CodexProvider = class {
|
|
|
1259
1326
|
attempt: request.attempt,
|
|
1260
1327
|
format: this.config.logFormat ?? "summary"
|
|
1261
1328
|
});
|
|
1262
|
-
|
|
1329
|
+
recordCodexLogEntry({
|
|
1330
|
+
filePath,
|
|
1331
|
+
targetName: this.targetName,
|
|
1332
|
+
evalCaseId: request.evalCaseId,
|
|
1333
|
+
attempt: request.attempt
|
|
1334
|
+
});
|
|
1263
1335
|
return logger;
|
|
1264
1336
|
} catch (error) {
|
|
1265
1337
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -2644,7 +2716,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
2644
2716
|
}
|
|
2645
2717
|
|
|
2646
2718
|
// src/evaluation/evaluators.ts
|
|
2719
|
+
import { ax, f } from "@ax-llm/ax";
|
|
2647
2720
|
import { randomUUID as randomUUID2 } from "node:crypto";
|
|
2721
|
+
var LLM_JUDGE_SIGNATURE = f().input(
|
|
2722
|
+
"evaluationContext",
|
|
2723
|
+
f.object(
|
|
2724
|
+
{
|
|
2725
|
+
expectedOutcome: f.string("The expected outcome for the original task"),
|
|
2726
|
+
request: f.string("The original task request"),
|
|
2727
|
+
referenceAnswer: f.string("The gold standard reference answer"),
|
|
2728
|
+
generatedAnswer: f.string("The answer to evaluate"),
|
|
2729
|
+
guidelines: f.string("Additional evaluation guidelines or instructions").optional()
|
|
2730
|
+
},
|
|
2731
|
+
"Complete evaluation context for the judge"
|
|
2732
|
+
)
|
|
2733
|
+
).output(
|
|
2734
|
+
"evaluation",
|
|
2735
|
+
f.object({
|
|
2736
|
+
score: f.number("Score between 0.0 and 1.0").min(0).max(1),
|
|
2737
|
+
hits: f.string("Brief specific achievement").array(),
|
|
2738
|
+
misses: f.string("Brief specific failure or omission").array(),
|
|
2739
|
+
reasoning: f.string("Concise explanation for the score").max(500)
|
|
2740
|
+
})
|
|
2741
|
+
).build();
|
|
2742
|
+
var LLM_JUDGE = ax(LLM_JUDGE_SIGNATURE);
|
|
2648
2743
|
var LlmJudgeEvaluator = class {
|
|
2649
2744
|
kind = "llm_judge";
|
|
2650
2745
|
resolveJudgeProvider;
|
|
@@ -2662,6 +2757,44 @@ var LlmJudgeEvaluator = class {
|
|
|
2662
2757
|
if (!judgeProvider) {
|
|
2663
2758
|
throw new Error("No judge provider available for LLM grading");
|
|
2664
2759
|
}
|
|
2760
|
+
if (providerSupportsAx(judgeProvider)) {
|
|
2761
|
+
return this.evaluateWithAx(context, judgeProvider);
|
|
2762
|
+
}
|
|
2763
|
+
return this.evaluateWithPrompt(context, judgeProvider);
|
|
2764
|
+
}
|
|
2765
|
+
async evaluateWithAx(context, judgeProvider) {
|
|
2766
|
+
const ai = judgeProvider.getAxAI();
|
|
2767
|
+
const guidelines = context.promptInputs.guidelines?.trim();
|
|
2768
|
+
const evaluationContext = {
|
|
2769
|
+
expectedOutcome: context.evalCase.outcome.trim(),
|
|
2770
|
+
request: context.evalCase.task.trim(),
|
|
2771
|
+
referenceAnswer: context.evalCase.expected_assistant_raw.trim(),
|
|
2772
|
+
generatedAnswer: context.candidate.trim(),
|
|
2773
|
+
...guidelines ? { guidelines } : {}
|
|
2774
|
+
};
|
|
2775
|
+
const options = this.buildJudgeForwardOptions(context);
|
|
2776
|
+
const result = await LLM_JUDGE.forward(ai, { evaluationContext }, options);
|
|
2777
|
+
const evaluation = result.evaluation;
|
|
2778
|
+
const expectedAspectCount = Math.max(
|
|
2779
|
+
evaluation.hits.length + evaluation.misses.length,
|
|
2780
|
+
1
|
|
2781
|
+
);
|
|
2782
|
+
return {
|
|
2783
|
+
score: evaluation.score,
|
|
2784
|
+
hits: evaluation.hits,
|
|
2785
|
+
misses: evaluation.misses,
|
|
2786
|
+
expectedAspectCount,
|
|
2787
|
+
reasoning: evaluation.reasoning,
|
|
2788
|
+
evaluatorRawRequest: {
|
|
2789
|
+
id: randomUUID2(),
|
|
2790
|
+
provider: judgeProvider.id,
|
|
2791
|
+
target: context.target.name,
|
|
2792
|
+
method: "ax-structured-output",
|
|
2793
|
+
signature: LLM_JUDGE_SIGNATURE.toString()
|
|
2794
|
+
}
|
|
2795
|
+
};
|
|
2796
|
+
}
|
|
2797
|
+
async evaluateWithPrompt(context, judgeProvider) {
|
|
2665
2798
|
const prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
2666
2799
|
const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
|
|
2667
2800
|
const metadata = {
|
|
@@ -2681,6 +2814,7 @@ var LlmJudgeEvaluator = class {
|
|
|
2681
2814
|
const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
2682
2815
|
const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
2683
2816
|
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
2817
|
+
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
2684
2818
|
const evaluatorRawRequest = {
|
|
2685
2819
|
id: randomUUID2(),
|
|
2686
2820
|
provider: judgeProvider.id,
|
|
@@ -2693,12 +2827,34 @@ var LlmJudgeEvaluator = class {
|
|
|
2693
2827
|
score,
|
|
2694
2828
|
hits,
|
|
2695
2829
|
misses,
|
|
2696
|
-
expectedAspectCount
|
|
2830
|
+
expectedAspectCount,
|
|
2697
2831
|
reasoning,
|
|
2698
2832
|
evaluatorRawRequest
|
|
2699
2833
|
};
|
|
2700
2834
|
}
|
|
2835
|
+
buildJudgeForwardOptions(context) {
|
|
2836
|
+
const modelConfig = this.buildJudgeModelConfig();
|
|
2837
|
+
if (modelConfig === void 0 && context.judgeModel === void 0) {
|
|
2838
|
+
return void 0;
|
|
2839
|
+
}
|
|
2840
|
+
return {
|
|
2841
|
+
...context.judgeModel ? { model: context.judgeModel } : {},
|
|
2842
|
+
...modelConfig ? { modelConfig } : {}
|
|
2843
|
+
};
|
|
2844
|
+
}
|
|
2845
|
+
buildJudgeModelConfig() {
|
|
2846
|
+
if (this.maxOutputTokens === void 0 && this.temperature === void 0) {
|
|
2847
|
+
return void 0;
|
|
2848
|
+
}
|
|
2849
|
+
return {
|
|
2850
|
+
...this.maxOutputTokens !== void 0 ? { maxTokens: this.maxOutputTokens } : {},
|
|
2851
|
+
...this.temperature !== void 0 ? { temperature: this.temperature } : {}
|
|
2852
|
+
};
|
|
2853
|
+
}
|
|
2701
2854
|
};
|
|
2855
|
+
function providerSupportsAx(provider) {
|
|
2856
|
+
return typeof provider.getAxAI === "function";
|
|
2857
|
+
}
|
|
2702
2858
|
var QUALITY_SYSTEM_PROMPT = [
|
|
2703
2859
|
"You are an expert evaluator. Your goal is to grade the generated_answer based on how well it achieves the expected_outcome for the original task.",
|
|
2704
2860
|
"",
|
|
@@ -2922,7 +3078,7 @@ function parseJsonSafe(payload) {
|
|
|
2922
3078
|
|
|
2923
3079
|
// src/evaluation/orchestrator.ts
|
|
2924
3080
|
import { createHash, randomUUID as randomUUID3 } from "node:crypto";
|
|
2925
|
-
import { mkdir as mkdir2,
|
|
3081
|
+
import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
|
|
2926
3082
|
import path7 from "node:path";
|
|
2927
3083
|
|
|
2928
3084
|
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
@@ -3469,6 +3625,7 @@ async function evaluateCandidate(options) {
|
|
|
3469
3625
|
};
|
|
3470
3626
|
return {
|
|
3471
3627
|
eval_id: evalCase.id,
|
|
3628
|
+
dataset: evalCase.dataset,
|
|
3472
3629
|
conversation_id: evalCase.conversation_id,
|
|
3473
3630
|
score: score.score,
|
|
3474
3631
|
hits: score.hits,
|
|
@@ -3645,7 +3802,7 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
3645
3802
|
async function resolveCustomPrompt(config) {
|
|
3646
3803
|
if (config.promptPath) {
|
|
3647
3804
|
try {
|
|
3648
|
-
return await
|
|
3805
|
+
return await readTextFile(config.promptPath);
|
|
3649
3806
|
} catch (error) {
|
|
3650
3807
|
const message = error instanceof Error ? error.message : String(error);
|
|
3651
3808
|
console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
|
|
@@ -3733,6 +3890,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
|
|
|
3733
3890
|
};
|
|
3734
3891
|
return {
|
|
3735
3892
|
eval_id: evalCase.id,
|
|
3893
|
+
dataset: evalCase.dataset,
|
|
3736
3894
|
conversation_id: evalCase.conversation_id,
|
|
3737
3895
|
score: 0,
|
|
3738
3896
|
hits: [],
|
|
@@ -3782,6 +3940,7 @@ export {
|
|
|
3782
3940
|
buildDirectoryChain,
|
|
3783
3941
|
buildPromptInputs,
|
|
3784
3942
|
buildSearchRoots,
|
|
3943
|
+
consumeCodexLogEntries,
|
|
3785
3944
|
createAgentKernel,
|
|
3786
3945
|
createProvider,
|
|
3787
3946
|
ensureVSCodeSubagents,
|
|
@@ -3798,10 +3957,12 @@ export {
|
|
|
3798
3957
|
listTargetNames,
|
|
3799
3958
|
loadEvalCases,
|
|
3800
3959
|
readTargetDefinitions,
|
|
3960
|
+
readTextFile,
|
|
3801
3961
|
resolveAndCreateProvider,
|
|
3802
3962
|
resolveFileReference,
|
|
3803
3963
|
resolveTargetDefinition,
|
|
3804
3964
|
runEvalCase,
|
|
3805
|
-
runEvaluation
|
|
3965
|
+
runEvaluation,
|
|
3966
|
+
subscribeToCodexLogEntries
|
|
3806
3967
|
};
|
|
3807
3968
|
//# sourceMappingURL=index.js.map
|