@agentv/core 2.17.0 → 2.17.1-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-CPPYERD2.js → chunk-PSYFRPNT.js} +1 -1
- package/dist/chunk-PSYFRPNT.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +50 -17
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +51 -18
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-CPPYERD2.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -1166,6 +1166,12 @@ interface EvaluatorResult {
|
|
|
1166
1166
|
readonly details?: JsonObject;
|
|
1167
1167
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1168
1168
|
readonly tokenUsage?: TokenUsage;
|
|
1169
|
+
/** Wall-clock duration of this judge execution in milliseconds. */
|
|
1170
|
+
readonly durationMs?: number;
|
|
1171
|
+
/** ISO 8601 UTC timestamp when this judge started executing. */
|
|
1172
|
+
readonly startedAt?: string;
|
|
1173
|
+
/** ISO 8601 UTC timestamp when this judge finished executing. */
|
|
1174
|
+
readonly endedAt?: string;
|
|
1169
1175
|
}
|
|
1170
1176
|
/**
|
|
1171
1177
|
* Convenience accessor matching the Python hit_count property.
|
package/dist/index.d.ts
CHANGED
|
@@ -1166,6 +1166,12 @@ interface EvaluatorResult {
|
|
|
1166
1166
|
readonly details?: JsonObject;
|
|
1167
1167
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1168
1168
|
readonly tokenUsage?: TokenUsage;
|
|
1169
|
+
/** Wall-clock duration of this judge execution in milliseconds. */
|
|
1170
|
+
readonly durationMs?: number;
|
|
1171
|
+
/** ISO 8601 UTC timestamp when this judge started executing. */
|
|
1172
|
+
readonly startedAt?: string;
|
|
1173
|
+
/** ISO 8601 UTC timestamp when this judge finished executing. */
|
|
1174
|
+
readonly endedAt?: string;
|
|
1169
1175
|
}
|
|
1170
1176
|
/**
|
|
1171
1177
|
* Convenience accessor matching the Python hit_count property.
|
package/dist/index.js
CHANGED
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
readTextFile,
|
|
18
18
|
resolveFileReference,
|
|
19
19
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-CPPYERD2.js";
|
|
20
|
+
} from "./chunk-PSYFRPNT.js";
|
|
21
21
|
import {
|
|
22
22
|
OtlpJsonFileExporter
|
|
23
23
|
} from "./chunk-HFSYZHGF.js";
|
|
@@ -151,6 +151,25 @@ import path8 from "node:path";
|
|
|
151
151
|
import micromatch3 from "micromatch";
|
|
152
152
|
import { parse as parse2 } from "yaml";
|
|
153
153
|
|
|
154
|
+
// src/evaluation/interpolation.ts
|
|
155
|
+
// Matches `${{ NAME }}` placeholders (optional whitespace inside the braces).
// NAME must look like an identifier: a letter or underscore followed by
// letters, digits, or underscores. The `g` flag makes `replace` substitute
// every occurrence in a string.
var ENV_VAR_PATTERN = /\$\{\{\s*([A-Za-z_][A-Za-z0-9_]*)\s*\}\}/g;

/**
 * Recursively substitutes `${{ NAME }}` placeholders in every string found in
 * `value` with the matching entry of `env`.
 *
 * - Strings: each placeholder is replaced by `env[NAME]`; a variable that is
 *   missing (or null/undefined) becomes the empty string.
 * - Arrays: a new array is returned with every element interpolated.
 * - Objects: a new object is returned with every own enumerable value
 *   interpolated; keys are left untouched.
 * - Anything else (numbers, booleans, null, undefined) is returned as-is.
 *
 * The input is never mutated.
 *
 * @param {unknown} value - Parsed YAML/JSON data (or any value) to interpolate.
 * @param {Record<string, string | undefined>} env - Variable lookup table,
 *   e.g. `process.env`.
 * @returns {unknown} A copy of `value` with placeholders expanded.
 */
function interpolateEnv(value, env) {
  if (typeof value === "string") {
    return value.replace(ENV_VAR_PATTERN, (_, varName) => {
      // Only own entries count as variables. Without this check a placeholder
      // such as `${{ toString }}` or `${{ constructor }}` would resolve to a
      // member inherited from Object.prototype and be stringified into the
      // output instead of becoming "".
      if (!Object.prototype.hasOwnProperty.call(env, varName)) {
        return "";
      }
      return env[varName] ?? "";
    });
  }
  if (Array.isArray(value)) {
    return value.map((item) => interpolateEnv(item, env));
  }
  if (value !== null && typeof value === "object") {
    // Object.fromEntries defines own data properties, so a key literally named
    // "__proto__" (possible in parsed JSON/YAML) is copied as data. Plain
    // `result[key] = ...` assignment would instead invoke the __proto__ setter,
    // replacing the result's prototype and dropping the key.
    return Object.fromEntries(
      Object.entries(value).map(([key, val]) => [key, interpolateEnv(val, env)])
    );
  }
  return value;
}
|
|
172
|
+
|
|
154
173
|
// src/evaluation/loaders/case-file-loader.ts
|
|
155
174
|
import { readFile } from "node:fs/promises";
|
|
156
175
|
import path from "node:path";
|
|
@@ -169,7 +188,8 @@ function isGlobPattern(filePath) {
|
|
|
169
188
|
return filePath.includes("*") || filePath.includes("?") || filePath.includes("{");
|
|
170
189
|
}
|
|
171
190
|
function parseYamlCases(content, filePath) {
|
|
172
|
-
const parsed = parseYaml(content);
|
|
191
|
+
const raw = parseYaml(content);
|
|
192
|
+
const parsed = interpolateEnv(raw, process.env);
|
|
173
193
|
if (!Array.isArray(parsed)) {
|
|
174
194
|
throw new Error(
|
|
175
195
|
`External test file must contain a YAML array, got ${typeof parsed}: ${filePath}`
|
|
@@ -191,7 +211,8 @@ function parseJsonlCases(content, filePath) {
|
|
|
191
211
|
const line = lines[i].trim();
|
|
192
212
|
if (line === "") continue;
|
|
193
213
|
try {
|
|
194
|
-
const parsed = JSON.parse(line);
|
|
214
|
+
const raw = JSON.parse(line);
|
|
215
|
+
const parsed = interpolateEnv(raw, process.env);
|
|
195
216
|
if (!isJsonObject(parsed)) {
|
|
196
217
|
throw new Error("Expected JSON object");
|
|
197
218
|
}
|
|
@@ -2340,7 +2361,7 @@ async function loadSidecarMetadata(jsonlPath, verbose) {
|
|
|
2340
2361
|
}
|
|
2341
2362
|
try {
|
|
2342
2363
|
const content = await readFile5(sidecarPath, "utf8");
|
|
2343
|
-
const parsed = parseYaml2(content);
|
|
2364
|
+
const parsed = interpolateEnv(parseYaml2(content), process.env);
|
|
2344
2365
|
if (!isJsonObject(parsed)) {
|
|
2345
2366
|
logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
|
|
2346
2367
|
return {};
|
|
@@ -2363,7 +2384,8 @@ function parseJsonlContent(content, filePath) {
|
|
|
2363
2384
|
const line = lines[i].trim();
|
|
2364
2385
|
if (line === "") continue;
|
|
2365
2386
|
try {
|
|
2366
|
-
const parsed = JSON.parse(line);
|
|
2387
|
+
const raw = JSON.parse(line);
|
|
2388
|
+
const parsed = interpolateEnv(raw, process.env);
|
|
2367
2389
|
if (!isJsonObject(parsed)) {
|
|
2368
2390
|
throw new Error("Expected JSON object");
|
|
2369
2391
|
}
|
|
@@ -2420,9 +2442,10 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2420
2442
|
}
|
|
2421
2443
|
const inputMessages = resolveInputMessages(evalcase);
|
|
2422
2444
|
const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
|
|
2423
|
-
|
|
2445
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
|
|
2446
|
+
if (!id || !hasEvaluationSpec || !inputMessages || inputMessages.length === 0) {
|
|
2424
2447
|
logError(
|
|
2425
|
-
`Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id,
|
|
2448
|
+
`Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
|
|
2426
2449
|
);
|
|
2427
2450
|
continue;
|
|
2428
2451
|
}
|
|
@@ -2500,7 +2523,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2500
2523
|
guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
|
|
2501
2524
|
guideline_patterns: guidelinePatterns,
|
|
2502
2525
|
file_paths: allFilePaths,
|
|
2503
|
-
criteria: outcome,
|
|
2526
|
+
criteria: outcome ?? "",
|
|
2504
2527
|
evaluator: evalCaseEvaluatorKind,
|
|
2505
2528
|
evaluators
|
|
2506
2529
|
};
|
|
@@ -2813,7 +2836,7 @@ async function readTestSuiteMetadata(testFilePath) {
|
|
|
2813
2836
|
try {
|
|
2814
2837
|
const absolutePath = path8.resolve(testFilePath);
|
|
2815
2838
|
const content = await readFile7(absolutePath, "utf8");
|
|
2816
|
-
const parsed = parse2(content);
|
|
2839
|
+
const parsed = interpolateEnv(parse2(content), process.env);
|
|
2817
2840
|
if (!isJsonObject(parsed)) {
|
|
2818
2841
|
return {};
|
|
2819
2842
|
}
|
|
@@ -2863,11 +2886,11 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2863
2886
|
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
2864
2887
|
const guidelinePatterns = config?.guideline_patterns;
|
|
2865
2888
|
const rawFile = await readFile7(absoluteTestPath, "utf8");
|
|
2866
|
-
const
|
|
2867
|
-
if (!isJsonObject(
|
|
2889
|
+
const interpolated = interpolateEnv(parse2(rawFile), process.env);
|
|
2890
|
+
if (!isJsonObject(interpolated)) {
|
|
2868
2891
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
2869
2892
|
}
|
|
2870
|
-
const suite =
|
|
2893
|
+
const suite = interpolated;
|
|
2871
2894
|
const datasetNameFromSuite = asString6(suite.dataset)?.trim();
|
|
2872
2895
|
const fallbackDataset = path8.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
2873
2896
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
@@ -2911,9 +2934,10 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2911
2934
|
}
|
|
2912
2935
|
const testInputMessages = resolveInputMessages(evalcase);
|
|
2913
2936
|
const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
|
|
2914
|
-
|
|
2937
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
|
|
2938
|
+
if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
|
|
2915
2939
|
logError2(
|
|
2916
|
-
`Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id,
|
|
2940
|
+
`Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
|
|
2917
2941
|
);
|
|
2918
2942
|
continue;
|
|
2919
2943
|
}
|
|
@@ -3009,7 +3033,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
3009
3033
|
guideline_paths: guidelinePaths.map((guidelinePath) => path8.resolve(guidelinePath)),
|
|
3010
3034
|
guideline_patterns: guidelinePatterns,
|
|
3011
3035
|
file_paths: allFilePaths,
|
|
3012
|
-
criteria: outcome,
|
|
3036
|
+
criteria: outcome ?? "",
|
|
3013
3037
|
evaluator: evalCaseEvaluatorKind,
|
|
3014
3038
|
evaluators,
|
|
3015
3039
|
workspace: mergedWorkspace,
|
|
@@ -3149,7 +3173,7 @@ async function resolveWorkspaceConfig(raw, evalFileDir) {
|
|
|
3149
3173
|
} catch {
|
|
3150
3174
|
throw new Error(`Workspace file not found: ${raw} (resolved to ${workspaceFilePath})`);
|
|
3151
3175
|
}
|
|
3152
|
-
const parsed = parse2(content);
|
|
3176
|
+
const parsed = interpolateEnv(parse2(content), process.env);
|
|
3153
3177
|
if (!isJsonObject(parsed)) {
|
|
3154
3178
|
throw new Error(
|
|
3155
3179
|
`Invalid workspace file format: ${workspaceFilePath} (expected a YAML object)`
|
|
@@ -14847,9 +14871,11 @@ async function runEvaluatorList(options) {
|
|
|
14847
14871
|
registry: typeRegistry
|
|
14848
14872
|
};
|
|
14849
14873
|
for (const evaluatorConfig of evaluators ?? []) {
|
|
14874
|
+
const startedAt = /* @__PURE__ */ new Date();
|
|
14850
14875
|
try {
|
|
14851
14876
|
const evaluatorInstance = await typeRegistry.create(evaluatorConfig, dispatchContext);
|
|
14852
14877
|
const score2 = await evaluatorInstance.evaluate(evalContext);
|
|
14878
|
+
const endedAt = /* @__PURE__ */ new Date();
|
|
14853
14879
|
const weight = evaluatorConfig.weight ?? 1;
|
|
14854
14880
|
scored.push({
|
|
14855
14881
|
score: score2,
|
|
@@ -14870,9 +14896,13 @@ async function runEvaluatorList(options) {
|
|
|
14870
14896
|
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
14871
14897
|
details: score2.details,
|
|
14872
14898
|
scores: mapChildResults(score2.scores),
|
|
14873
|
-
tokenUsage: score2.tokenUsage
|
|
14899
|
+
tokenUsage: score2.tokenUsage,
|
|
14900
|
+
durationMs: endedAt.getTime() - startedAt.getTime(),
|
|
14901
|
+
startedAt: startedAt.toISOString(),
|
|
14902
|
+
endedAt: endedAt.toISOString()
|
|
14874
14903
|
});
|
|
14875
14904
|
} catch (error) {
|
|
14905
|
+
const endedAt = /* @__PURE__ */ new Date();
|
|
14876
14906
|
const message = error instanceof Error ? error.message : String(error);
|
|
14877
14907
|
const fallbackScore = {
|
|
14878
14908
|
score: 0,
|
|
@@ -14898,7 +14928,10 @@ async function runEvaluatorList(options) {
|
|
|
14898
14928
|
verdict: "fail",
|
|
14899
14929
|
hits: [],
|
|
14900
14930
|
misses: [`Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`],
|
|
14901
|
-
reasoning: message
|
|
14931
|
+
reasoning: message,
|
|
14932
|
+
durationMs: endedAt.getTime() - startedAt.getTime(),
|
|
14933
|
+
startedAt: startedAt.toISOString(),
|
|
14934
|
+
endedAt: endedAt.toISOString()
|
|
14902
14935
|
});
|
|
14903
14936
|
}
|
|
14904
14937
|
if (evaluatorConfig.negate === true && scored.length > 0) {
|