@ls-stack/agent-eval 0.24.0 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-DYRmucgj.mjs → app-DS3j_AyX.mjs} +6 -3
- package/dist/apps/web/dist/assets/index-DNsZjOms.css +1 -0
- package/dist/apps/web/dist/assets/index-DqR1YaMG.js +118 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-Be0x8CS3.mjs → cli-ETfZ15RB.mjs} +151 -42
- package/dist/index.d.mts +61 -6
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +28 -18
- package/dist/{runOrchestration-D697g6Qe.mjs → runOrchestration-B31SV_Bq.mjs} +222 -87
- package/dist/{runner-jSujaSKt.mjs → runner-B2f2TEjp.mjs} +1 -1
- package/dist/{runner-B4SosWgD.mjs → runner-cj1TkR-H.mjs} +2 -2
- package/dist/src-CyNb2ycA.mjs +3 -0
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +6 -1
- package/dist/apps/web/dist/assets/index-KbbX3NYr.js +0 -118
- package/dist/apps/web/dist/assets/index-r0dVFK0B.css +0 -1
- package/dist/src-D6cettg0.mjs +0 -3
|
@@ -1506,7 +1506,7 @@ function createTraceCache(generateSpanId) {
|
|
|
1506
1506
|
namespace,
|
|
1507
1507
|
operationType: "value",
|
|
1508
1508
|
operationName: info.name,
|
|
1509
|
-
storedAt:
|
|
1509
|
+
storedAt: new Date(getRealDateNowMs()).toISOString(),
|
|
1510
1510
|
codeFingerprint: cacheCtx.codeFingerprint,
|
|
1511
1511
|
recording: await serializeCacheRecording(recording)
|
|
1512
1512
|
}, {
|
|
@@ -1940,7 +1940,7 @@ async function traceSpanInternal(info, fn) {
|
|
|
1940
1940
|
operationName: info.name,
|
|
1941
1941
|
spanName: info.name,
|
|
1942
1942
|
spanKind: info.kind,
|
|
1943
|
-
storedAt:
|
|
1943
|
+
storedAt: new Date(getRealDateNowMs()).toISOString(),
|
|
1944
1944
|
codeFingerprint: ctx.codeFingerprint,
|
|
1945
1945
|
recording: await serializeCacheRecording(recording)
|
|
1946
1946
|
};
|
|
@@ -2541,6 +2541,12 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
|
|
|
2541
2541
|
const evalStatsConfigSchema = z.array(evalStatItemSchema);
|
|
2542
2542
|
/** Schema summarizing a discovered eval for list and overview screens. */
|
|
2543
2543
|
const evalSummarySchema = z.object({
|
|
2544
|
+
/**
|
|
2545
|
+
* Stable eval identity derived from the workspace-relative file path and
|
|
2546
|
+
* authored eval id. Older clients should display `id`; callers that need an
|
|
2547
|
+
* exact eval must use `key`.
|
|
2548
|
+
*/
|
|
2549
|
+
key: z.string().default(""),
|
|
2544
2550
|
id: z.string(),
|
|
2545
2551
|
title: z.string().optional(),
|
|
2546
2552
|
/** Eval file path relative to the active workspace root. */
|
|
@@ -2580,6 +2586,16 @@ const evalSummarySchema = z.object({
|
|
|
2580
2586
|
});
|
|
2581
2587
|
/** Schema for one case row in an eval run result table. */
|
|
2582
2588
|
const caseRowSchema = z.object({
|
|
2589
|
+
/**
|
|
2590
|
+
* Stable eval identity for this case row. Legacy rows may omit it and fall
|
|
2591
|
+
* back to `evalId`.
|
|
2592
|
+
*/
|
|
2593
|
+
evalKey: z.string().optional(),
|
|
2594
|
+
/**
|
|
2595
|
+
* Stable case identity derived from file path, eval id, and case id. Legacy
|
|
2596
|
+
* rows may omit it and fall back to `caseId`.
|
|
2597
|
+
*/
|
|
2598
|
+
caseKey: z.string().optional(),
|
|
2583
2599
|
caseId: z.string(),
|
|
2584
2600
|
evalId: z.string(),
|
|
2585
2601
|
status: z.enum([
|
|
@@ -2657,6 +2673,10 @@ const scoreTraceSchema = z.object({
|
|
|
2657
2673
|
});
|
|
2658
2674
|
/** Schema for the detailed payload shown when opening a specific case. */
|
|
2659
2675
|
const caseDetailSchema = z.object({
|
|
2676
|
+
/** Stable eval identity for this case detail. */
|
|
2677
|
+
evalKey: z.string().optional(),
|
|
2678
|
+
/** Stable case identity for this case detail. */
|
|
2679
|
+
caseKey: z.string().optional(),
|
|
2660
2680
|
caseId: z.string(),
|
|
2661
2681
|
evalId: z.string(),
|
|
2662
2682
|
status: z.enum([
|
|
@@ -2694,6 +2714,36 @@ const caseDetailSchema = z.object({
|
|
|
2694
2714
|
*/
|
|
2695
2715
|
cacheRefs: z.array(traceCacheRefSchema).default([])
|
|
2696
2716
|
});
|
|
2717
|
+
/** Schema for discovery problems that should be shown before running evals. */
|
|
2718
|
+
const discoveryIssueSchema = z.object({
|
|
2719
|
+
type: z.enum(["duplicate-eval-id"]),
|
|
2720
|
+
severity: z.enum(["error"]),
|
|
2721
|
+
filePath: z.string(),
|
|
2722
|
+
evalId: z.string(),
|
|
2723
|
+
message: z.string()
|
|
2724
|
+
});
|
|
2725
|
+
//#endregion
|
|
2726
|
+
//#region ../shared/src/evalIdentity.ts
|
|
2727
|
+
/**
 * Compose the stable identity of a single eval inside a workspace.
 *
 * The file path and the eval id are each URI-component-encoded and then
 * joined with `#`, so a literal `#` inside either part is encoded (`%23`)
 * and cannot be mistaken for the separator.
 *
 * @param {{ filePath: string, evalId: string }} params
 * @returns {string} `<encoded filePath>#<encoded evalId>`
 */
function buildEvalKey(params) {
  const encodedFilePath = encodeURIComponent(params.filePath);
  const encodedEvalId = encodeURIComponent(params.evalId);
  return [encodedFilePath, encodedEvalId].join("#");
}
|
|
2731
|
+
/**
 * Compose the stable identity of a single eval case inside a workspace.
 *
 * The file path, eval id and case id are each URI-component-encoded and
 * joined with `#`, in that order.
 *
 * @param {{ filePath: string, evalId: string, caseId: string }} params
 * @returns {string} `<encoded filePath>#<encoded evalId>#<encoded caseId>`
 */
function buildCaseKey(params) {
  const filePart = encodeURIComponent(params.filePath);
  const evalPart = encodeURIComponent(params.evalId);
  const casePart = encodeURIComponent(params.caseId);
  return `${filePart}#${evalPart}#${casePart}`;
}
|
|
2739
|
+
/**
 * Read the eval key recorded on a case row.
 *
 * Rows whose `evalKey` is missing (`undefined` or `null`) fall back to the
 * row's `evalId`.
 *
 * @param {{ evalKey?: string, evalId: string }} row
 * @returns {string} The row's `evalKey`, or its `evalId` when no key is stored.
 */
function getCaseRowEvalKey(row) {
  const storedKey = row.evalKey;
  if (storedKey === undefined || storedKey === null) return row.evalId;
  return storedKey;
}
|
|
2743
|
+
/**
 * Read the case key recorded on a case row.
 *
 * Rows whose `caseKey` is missing (`undefined` or `null`) fall back to the
 * row's `caseId`.
 *
 * @param {{ caseKey?: string, caseId: string }} row
 * @returns {string} The row's `caseKey`, or its `caseId` when no key is stored.
 */
function getCaseRowCaseKey(row) {
  const storedKey = row.caseKey;
  if (storedKey === undefined || storedKey === null) return row.caseId;
  return storedKey;
}
|
|
2697
2747
|
//#endregion
|
|
2698
2748
|
//#region ../shared/src/schemas/config.ts
|
|
2699
2749
|
/** Strategy used to collapse repeated trials into one stored case result. */
|
|
@@ -3072,8 +3122,8 @@ const runManifestSchema = z.object({
|
|
|
3072
3122
|
*/
|
|
3073
3123
|
commitSha: z.string().nullable().optional().default(null),
|
|
3074
3124
|
/**
|
|
3075
|
-
* Eval-file fingerprints captured for this run, keyed by eval
|
|
3076
|
-
* persisted runs may
|
|
3125
|
+
* Eval-file fingerprints captured for this run, keyed by exact eval key.
|
|
3126
|
+
* Older persisted runs may use authored eval ids or omit this field.
|
|
3077
3127
|
*/
|
|
3078
3128
|
evalSourceFingerprints: z.record(z.string(), z.string()).optional().default({}),
|
|
3079
3129
|
target: z.object({
|
|
@@ -3082,6 +3132,10 @@ const runManifestSchema = z.object({
|
|
|
3082
3132
|
"evalIds",
|
|
3083
3133
|
"caseIds"
|
|
3084
3134
|
]),
|
|
3135
|
+
/** Exact stable eval identities (`filePath + evalId`) selected by UI/API callers. */
|
|
3136
|
+
evalKeys: z.array(z.string()).optional(),
|
|
3137
|
+
/** Workspace-relative file paths or glob patterns used to filter selected evals. */
|
|
3138
|
+
files: z.array(z.string()).optional(),
|
|
3085
3139
|
evalIds: z.array(z.string()).optional(),
|
|
3086
3140
|
caseIds: z.array(z.string()).optional()
|
|
3087
3141
|
}),
|
|
@@ -3808,6 +3862,10 @@ const createRunRequestSchema = z.object({
|
|
|
3808
3862
|
"evalIds",
|
|
3809
3863
|
"caseIds"
|
|
3810
3864
|
]),
|
|
3865
|
+
/** Exact stable eval identities (`filePath + evalId`) selected by UI/API callers. */
|
|
3866
|
+
evalKeys: z.array(z.string()).optional(),
|
|
3867
|
+
/** Workspace-relative file paths or glob patterns used to filter selected evals. */
|
|
3868
|
+
files: z.array(z.string()).optional(),
|
|
3811
3869
|
evalIds: z.array(z.string()).optional(),
|
|
3812
3870
|
caseIds: z.array(z.string()).optional()
|
|
3813
3871
|
}),
|
|
@@ -4778,7 +4836,8 @@ function addDefaultOutputs(params) {
|
|
|
4778
4836
|
//#region ../runner/src/discovery.ts
|
|
4779
4837
|
const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
|
|
4780
4838
|
const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
|
|
4781
|
-
|
|
4839
|
+
/** Parse static eval metadata and discovery issues from one eval file. */
|
|
4840
|
+
function parseEvalDiscovery(filePath, content) {
|
|
4782
4841
|
const metas = [];
|
|
4783
4842
|
let searchIndex = 0;
|
|
4784
4843
|
while (searchIndex < content.length) {
|
|
@@ -4801,7 +4860,20 @@ function parseEvalMetas(filePath, content) {
|
|
|
4801
4860
|
}
|
|
4802
4861
|
searchIndex = extracted.nextIndex;
|
|
4803
4862
|
}
|
|
4804
|
-
|
|
4863
|
+
const countsById = /* @__PURE__ */ new Map();
|
|
4864
|
+
for (const meta of metas) countsById.set(meta.id, (countsById.get(meta.id) ?? 0) + 1);
|
|
4865
|
+
const duplicateIds = new Set([...countsById].filter(([, count]) => count > 1).map(([id]) => id));
|
|
4866
|
+
const issues = [...duplicateIds].map((evalId) => ({
|
|
4867
|
+
type: "duplicate-eval-id",
|
|
4868
|
+
severity: "error",
|
|
4869
|
+
filePath,
|
|
4870
|
+
evalId,
|
|
4871
|
+
message: `Duplicate eval id "${evalId}" in ${filePath}. Eval ids must be unique within one file.`
|
|
4872
|
+
}));
|
|
4873
|
+
return {
|
|
4874
|
+
metas: metas.filter((meta) => !duplicateIds.has(meta.id)),
|
|
4875
|
+
issues
|
|
4876
|
+
};
|
|
4805
4877
|
}
|
|
4806
4878
|
function extractDefineEvalObject(content, defineEvalIndex) {
|
|
4807
4879
|
const openParenIndex = content.indexOf("(", defineEvalIndex);
|
|
@@ -4920,40 +4992,6 @@ function getRunFreshnessTimestamp(manifest) {
|
|
|
4920
4992
|
return manifest.endedAt ?? manifest.startedAt;
|
|
4921
4993
|
}
|
|
4922
4994
|
//#endregion
|
|
4923
|
-
//#region ../runner/src/evalSummaries.ts
|
|
4924
|
-
/** Build the API/UI summary payload for one discovered eval. */
|
|
4925
|
-
function buildEvalSummary(params) {
|
|
4926
|
-
const { meta, config, gitState, latestRun, lastRunStatus } = params;
|
|
4927
|
-
const { sourceFingerprint, ...summaryMeta } = meta;
|
|
4928
|
-
const freshness = deriveEvalFreshness({
|
|
4929
|
-
latestRun,
|
|
4930
|
-
gitState,
|
|
4931
|
-
currentEvalSourceFingerprint: sourceFingerprint,
|
|
4932
|
-
staleAfterDays: config.staleAfterDays ?? 14
|
|
4933
|
-
});
|
|
4934
|
-
return {
|
|
4935
|
-
...summaryMeta,
|
|
4936
|
-
stale: freshness.stale,
|
|
4937
|
-
outdated: freshness.outdated,
|
|
4938
|
-
freshnessStatus: freshness.freshnessStatus,
|
|
4939
|
-
latestRunAt: latestRun?.startedAt ?? null,
|
|
4940
|
-
latestRunCommitSha: latestRun?.commitSha ?? null,
|
|
4941
|
-
currentCommitSha: gitState.commitSha,
|
|
4942
|
-
lastRunStatus
|
|
4943
|
-
};
|
|
4944
|
-
}
|
|
4945
|
-
/** Resolve which eval ids a run request should mark as the latest run. */
|
|
4946
|
-
function getTargetEvalIds(params) {
|
|
4947
|
-
const { request, sortedEvalIds, knownEvalIds } = params;
|
|
4948
|
-
if (request.target.evalIds && request.target.evalIds.length > 0) return request.target.evalIds.filter((evalId) => knownEvalIds.has(evalId));
|
|
4949
|
-
return sortedEvalIds;
|
|
4950
|
-
}
|
|
4951
|
-
/** Write one latest-run snapshot to each targeted eval id. */
|
|
4952
|
-
function setLatestRunInfoMap(params) {
|
|
4953
|
-
const { latestRunInfoMap, evalIds, info } = params;
|
|
4954
|
-
for (const evalId of evalIds) latestRunInfoMap.set(evalId, info);
|
|
4955
|
-
}
|
|
4956
|
-
//#endregion
|
|
4957
4995
|
//#region ../runner/src/outputArtifacts.ts
|
|
4958
4996
|
const mimeTypeExtensionMap = {
|
|
4959
4997
|
"application/json": ".json",
|
|
@@ -5064,9 +5102,9 @@ function recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds) {
|
|
|
5064
5102
|
return caseRow.status === "error" ? "error" : "pass";
|
|
5065
5103
|
}
|
|
5066
5104
|
/**
 * Decide whether a persisted run involves the given eval.
 *
 * A run touches the eval when:
 *  1. any of its case rows belongs to the eval — matched by eval key, or by
 *     authored `evalId` for rows that carry no `evalKey`; or
 *  2. the run targeted everything (`mode === "all"`) and the eval exists; or
 *  3. the run targeted specific evals (`mode === "evalIds"`) and the eval is
 *     named by `target.evalKeys`, or — when no keys were given — by
 *     `target.evalIds`.
 *
 * @param {object} params
 * @param {Array<object>} params.caseRows Case rows recorded for the run.
 * @param {object} params.target The run manifest's target descriptor.
 * @param {string} params.evalKey Exact eval key to look for.
 * @param {string} [params.evalId] Authored eval id, used for legacy matching.
 * @param {boolean} params.evalExists Returned as-is for `mode === "all"` runs.
 * @returns {boolean}
 */
function runTouchesEval(params) {
  const { caseRows, target, evalKey, evalId, evalExists } = params;
  const rowBelongsToEval = (caseRow) =>
    getCaseRowEvalKey(caseRow) === evalKey ||
    (caseRow.evalKey === void 0 && caseRow.evalId === evalId);
  if (caseRows.some(rowBelongsToEval)) return true;
  if (target.mode === "all") return evalExists;
  if (target.mode !== "evalIds") return false;
  // An empty `evalKeys` list means "no key filter" (matchesEvalKeys treats it
  // the same way), so it must not shadow the `evalIds` fallback. Previously
  // `[].includes(...)` produced `false`, which `??` does not fall through on.
  const { evalKeys, evalIds } = target;
  if (evalKeys && evalKeys.length > 0) return evalKeys.includes(evalKey);
  return evalIds?.includes(evalId ?? evalKey) ?? false;
}
|
|
5072
5110
|
async function recomputeEvalStatusesInRuns(params) {
|
|
@@ -5075,14 +5113,15 @@ async function recomputeEvalStatusesInRuns(params) {
|
|
|
5075
5113
|
if (!runTouchesEval({
|
|
5076
5114
|
target: run.manifest.target,
|
|
5077
5115
|
caseRows: run.cases,
|
|
5116
|
+
evalKey: params.evalKey,
|
|
5078
5117
|
evalId: params.evalId,
|
|
5079
5118
|
evalExists: params.evalExists
|
|
5080
5119
|
})) continue;
|
|
5081
5120
|
if (run.manifest.status === "running") continue;
|
|
5082
5121
|
let changed = false;
|
|
5083
5122
|
for (const caseRow of run.cases) {
|
|
5084
|
-
if (caseRow.evalId
|
|
5085
|
-
const caseDetail = run.caseDetails.get(caseRow
|
|
5123
|
+
if (getCaseRowEvalKey(caseRow) !== params.evalKey && !(caseRow.evalKey === void 0 && caseRow.evalId === params.evalId)) continue;
|
|
5124
|
+
const caseDetail = run.caseDetails.get(getCaseRowCaseKey(caseRow));
|
|
5086
5125
|
const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, params.scoreThresholds);
|
|
5087
5126
|
if (caseRow.status === nextStatus) continue;
|
|
5088
5127
|
caseRow.status = nextStatus;
|
|
@@ -5150,8 +5189,8 @@ async function loadPersistedRunSnapshots(localStateDir) {
|
|
|
5150
5189
|
}
|
|
5151
5190
|
return snapshots;
|
|
5152
5191
|
}
|
|
5153
|
-
async function persistCaseDetail(runDir, caseDetail) {
|
|
5154
|
-
await writeFile(join(runDir, "case-details", `${encodeCaseDetailFileName(
|
|
5192
|
+
async function persistCaseDetail(runDir, caseDetail, fileId = caseDetail.caseId) {
|
|
5193
|
+
await writeFile(join(runDir, "case-details", `${encodeCaseDetailFileName(fileId)}.json`), JSON.stringify(caseDetail, null, 2));
|
|
5155
5194
|
}
|
|
5156
5195
|
function getLastRunStatuses(params) {
|
|
5157
5196
|
const latestRunInfos = getLatestRunInfos(params);
|
|
@@ -5164,14 +5203,15 @@ function getLastRunStatuses(params) {
|
|
|
5164
5203
|
function getLatestRunInfos(params) {
|
|
5165
5204
|
const { runs, knownEvals } = params;
|
|
5166
5205
|
const knownEvalMetas = [...knownEvals];
|
|
5167
|
-
const
|
|
5206
|
+
const evalIdByKey = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.key, evalMeta.id]));
|
|
5207
|
+
const manualScoreKeysByEval = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.key, evalMeta.columnDefs.filter((columnDef) => columnDef.isManualScore === true).map((columnDef) => columnDef.key)]));
|
|
5168
5208
|
const orderedRuns = [...runs].toSorted((a, b) => new Date(getRunFreshnessTimestamp(a.manifest)).getTime() - new Date(getRunFreshnessTimestamp(b.manifest)).getTime());
|
|
5169
5209
|
const latestRunInfos = /* @__PURE__ */ new Map();
|
|
5170
|
-
for (const run of orderedRuns) for (const
|
|
5171
|
-
status: getEvalStatusForRun(run,
|
|
5210
|
+
for (const run of orderedRuns) for (const evalKey of getRunEvalKeys(run, knownEvalMetas)) latestRunInfos.set(evalKey, {
|
|
5211
|
+
status: getEvalStatusForRun(run, evalKey, evalIdByKey.get(evalKey), manualScoreKeysByEval.get(evalKey) ?? []),
|
|
5172
5212
|
startedAt: getRunFreshnessTimestamp(run.manifest),
|
|
5173
5213
|
commitSha: run.manifest.commitSha ?? null,
|
|
5174
|
-
evalSourceFingerprint: run.manifest.evalSourceFingerprints[
|
|
5214
|
+
evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalKey] ?? run.manifest.evalSourceFingerprints[evalIdByKey.get(evalKey) ?? ""] ?? null
|
|
5175
5215
|
});
|
|
5176
5216
|
return latestRunInfos;
|
|
5177
5217
|
}
|
|
@@ -5224,18 +5264,25 @@ async function readCaseDetails(runDir) {
|
|
|
5224
5264
|
if (!entry.isFile() || !entry.name.endsWith(".json")) continue;
|
|
5225
5265
|
const detail = await readParsedJsonFile(join(detailsDir, entry.name), { safeParse: caseDetailSchema.safeParse.bind(caseDetailSchema) });
|
|
5226
5266
|
if (!detail) continue;
|
|
5227
|
-
caseDetails.set(detail.caseId, detail);
|
|
5267
|
+
caseDetails.set(detail.caseKey ?? detail.caseId, detail);
|
|
5228
5268
|
}
|
|
5229
5269
|
return caseDetails;
|
|
5230
5270
|
}
|
|
5231
|
-
function
|
|
5232
|
-
const
|
|
5233
|
-
|
|
5234
|
-
|
|
5235
|
-
|
|
5236
|
-
|
|
5237
|
-
|
|
5238
|
-
|
|
5271
|
+
/**
 * Collect every eval key a persisted run is associated with.
 *
 * Keys come from, in this order:
 *  1. each case row (its `evalKey`, or `evalId` when the row has no key);
 *  2. for rows without an `evalKey`, every known eval whose `id` equals the
 *     row's `evalId`;
 *  3. for `mode === "evalIds"` runs, the manifest's `target.evalKeys`, plus
 *     every known eval whose `id` appears in `target.evalIds`;
 *  4. for `mode === "all"` runs that produced no keys at all, every known eval.
 *
 * @param {object} run Persisted run snapshot (`cases`, `manifest.target`).
 * @param {Iterable<object>} knownEvals Discovered eval metas (`id`, `key`).
 * @returns {string[]} De-duplicated keys in first-seen order.
 */
function getRunEvalKeys(run, knownEvals) {
  const metas = [...knownEvals];
  const collected = /* @__PURE__ */ new Set();
  // Adds the key of every known eval authored with the given id.
  const addKeysForEvalId = (evalId) => {
    for (const meta of metas) {
      if (meta.id === evalId) collected.add(meta.key);
    }
  };

  for (const caseRow of run.cases) collected.add(getCaseRowEvalKey(caseRow));
  for (const caseRow of run.cases) {
    if (caseRow.evalKey === void 0) addKeysForEvalId(caseRow.evalId);
  }

  const { target } = run.manifest;
  if (target.mode === "evalIds") {
    for (const evalKey of target.evalKeys ?? []) collected.add(evalKey);
    for (const evalId of target.evalIds ?? []) addKeysForEvalId(evalId);
  } else if (target.mode === "all" && collected.size === 0) {
    for (const meta of metas) collected.add(meta.key);
  }
  return [...collected];
}
|
|
5284
|
+
function getEvalStatusForRun(run, evalKey, evalId, manualScoreKeys) {
|
|
5285
|
+
const evalCases = run.cases.filter((caseRow) => getCaseRowEvalKey(caseRow) === evalKey || caseRow.evalKey === void 0 && caseRow.evalId === evalId);
|
|
5239
5286
|
if (evalCases.length > 0) {
|
|
5240
5287
|
if (hasPendingManualScores(evalCases, manualScoreKeys)) return "unscored";
|
|
5241
5288
|
return toLastRunStatus$1(deriveStatusFromCaseRows({ caseRows: evalCases }));
|
|
@@ -5406,8 +5453,7 @@ function resolveTracePresentation(spans, globalTraceDisplay, evalTraceDisplay) {
|
|
|
5406
5453
|
}
|
|
5407
5454
|
//#endregion
|
|
5408
5455
|
//#region ../runner/src/runExecution.ts
|
|
5409
|
-
function filterEvalCases(cases,
|
|
5410
|
-
if (evalIds && evalIds.length > 0 && !evalIds.includes(evalId)) return [];
|
|
5456
|
+
function filterEvalCases(cases, caseIds) {
|
|
5411
5457
|
if (!caseIds || caseIds.length === 0) return cases;
|
|
5412
5458
|
const selectedCaseIds = new Set(caseIds);
|
|
5413
5459
|
return cases.filter((evalCase) => selectedCaseIds.has(evalCase.id));
|
|
@@ -5436,13 +5482,18 @@ async function callWithUnknownResult(fn, args) {
|
|
|
5436
5482
|
return await Reflect.apply(fn, void 0, args);
|
|
5437
5483
|
}
|
|
5438
5484
|
async function runCase(params) {
|
|
5439
|
-
const { evalDef, evalId, evalCase, globalTraceDisplay, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
5485
|
+
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
5440
5486
|
const scopedIdPrefix = buildScopedEvalIdPrefix({
|
|
5441
5487
|
evalId,
|
|
5442
5488
|
evalFilePath,
|
|
5443
5489
|
caseId: evalCase.id,
|
|
5444
5490
|
workspaceRoot
|
|
5445
5491
|
});
|
|
5492
|
+
const caseKey = buildCaseKey({
|
|
5493
|
+
filePath: evalFileRelativePath,
|
|
5494
|
+
evalId,
|
|
5495
|
+
caseId: evalCase.id
|
|
5496
|
+
});
|
|
5446
5497
|
const { scope, error: executeError } = await runInEvalScope(evalCase.id, async () => {
|
|
5447
5498
|
const execute = async () => {
|
|
5448
5499
|
await Reflect.apply(evalDef.execute, evalDef, [{
|
|
@@ -5608,6 +5659,8 @@ async function runCase(params) {
|
|
|
5608
5659
|
stack: nonAssertError.stack
|
|
5609
5660
|
} : null;
|
|
5610
5661
|
const caseDetail = {
|
|
5662
|
+
evalKey,
|
|
5663
|
+
caseKey,
|
|
5611
5664
|
caseId: evalCase.id,
|
|
5612
5665
|
evalId,
|
|
5613
5666
|
status,
|
|
@@ -5693,6 +5746,56 @@ async function executeQueuedCase(params) {
|
|
|
5693
5746
|
await queuedCase.onComplete(result);
|
|
5694
5747
|
}
|
|
5695
5748
|
//#endregion
|
|
5749
|
+
//#region ../runner/src/targeting.ts
|
|
5750
|
+
/**
 * Backslash-escape regular-expression metacharacters in `value`.
 *
 * `*` is intentionally not in the escaped set: globToRegex consumes every
 * `*` itself before a character can reach this helper.
 *
 * @param {string} value
 * @returns {string}
 */
function escapeRegex(value) {
  return value.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
}
/**
 * Compile a glob pattern into an anchored regular expression.
 *
 * Supported syntax (backslashes in the pattern are first rewritten to `/`):
 *  - a whole-segment globstar (`**` followed by `/`, at the pattern start or
 *    right after a `/`) matches zero or more directories, so
 *    `**` + `/*.eval.ts` also matches a file at the root;
 *  - any other `**` matches any characters, including `/`;
 *  - `*` matches any run of characters except `/`;
 *  - `?` matches exactly one character except `/`;
 *  - everything else matches literally.
 *
 * @param {string} pattern
 * @returns {RegExp}
 */
function globToRegex(pattern) {
  const normalized = pattern.replaceAll("\\", "/");
  let regex = "^";
  let i = 0;
  while (i < normalized.length) {
    const char = normalized[i];
    if (char === "*" && normalized[i + 1] === "*") {
      const startsSegment = i === 0 || normalized[i - 1] === "/";
      if (startsSegment && normalized[i + 2] === "/") {
        // Globstar segment: the directory prefix is optional, so zero
        // intermediate directories match as well.
        regex += "(?:.*/)?";
        i += 3;
      } else {
        regex += ".*";
        i += 2;
      }
      continue;
    }
    if (char === "*") regex += "[^/]*";
    else if (char === "?") regex += "[^/]";
    else regex += escapeRegex(char ?? "");
    i++;
  }
  regex += "$";
  return new RegExp(regex);
}
|
|
5769
|
+
/**
 * Test whether `filePath` is selected by `pattern`.
 *
 * Backslashes are rewritten to `/` on BOTH sides before comparing, so a
 * backslash-separated path can be matched. An exact string match wins
 * outright; otherwise the pattern is compiled with globToRegex.
 *
 * @param {string} pattern Literal path or glob pattern.
 * @param {string} filePath Path to test.
 * @returns {boolean}
 */
function fileMatches(pattern, filePath) {
  const normalizedPattern = pattern.replaceAll("\\", "/");
  const normalizedPath = filePath.replaceAll("\\", "/");
  if (normalizedPattern === normalizedPath) return true;
  return globToRegex(normalizedPattern).test(normalizedPath);
}
|
|
5774
|
+
/**
 * Apply the optional file filter of a run target to one eval.
 *
 * @param {{ filePath: string }} evalMeta Discovered eval.
 * @param {string[] | undefined} files Paths or glob patterns; `undefined` or
 *   an empty list means "no filter".
 * @returns {boolean} `true` when no filter is given or at least one entry
 *   matches the eval's `filePath`.
 */
function matchesFiles(evalMeta, files) {
  const filterIsActive = files !== void 0 && files.length !== 0;
  if (!filterIsActive) return true;
  return files.some((pattern) => fileMatches(pattern, evalMeta.filePath));
}
|
|
5778
|
+
/**
 * Apply the optional eval-id filter of a run target to one eval.
 *
 * @param {{ id: string }} evalMeta Discovered eval.
 * @param {string[] | undefined} evalIds Authored eval ids; `undefined` or an
 *   empty list means "no filter".
 * @returns {boolean} `true` when no filter is given or the eval's `id` is listed.
 */
function matchesEvalIds(evalMeta, evalIds) {
  const filterIsActive = evalIds !== void 0 && evalIds.length !== 0;
  return filterIsActive ? evalIds.includes(evalMeta.id) : true;
}
|
|
5782
|
+
/**
 * Apply the optional eval-key filter of a run target to one eval.
 *
 * @param {{ key: string }} evalMeta Discovered eval.
 * @param {string[] | undefined} evalKeys Exact eval keys; `undefined` or an
 *   empty list means "no filter".
 * @returns {boolean} `true` when no filter is given or the eval's `key` is listed.
 */
function matchesEvalKeys(evalMeta, evalKeys) {
  const filterIsActive = evalKeys !== void 0 && evalKeys.length !== 0;
  return filterIsActive ? evalKeys.includes(evalMeta.key) : true;
}
|
|
5786
|
+
/**
 * Select the discovered evals a run request targets.
 *
 * An eval is kept only when it passes all three target filters — `evalKeys`,
 * `evalIds` and `files` — each of which is a no-op when absent or empty. The
 * result is ordered by `filePath` using `localeCompare`.
 *
 * @param {{ evals: Iterable<object>, request: { target: object } }} params
 * @returns {object[]} A new array of the selected eval metas.
 */
function getTargetEvals(params) {
  const { target } = params.request;
  const selected = [];
  for (const evalMeta of params.evals) {
    const isTargeted =
      matchesEvalKeys(evalMeta, target.evalKeys) &&
      matchesEvalIds(evalMeta, target.evalIds) &&
      matchesFiles(evalMeta, target.files);
    if (isTargeted) selected.push(evalMeta);
  }
  // `selected` is a fresh array, so sorting it in place does not touch the input.
  selected.sort((left, right) => left.filePath.localeCompare(right.filePath));
  return selected;
}
|
|
5791
|
+
/**
 * Resolve the exact eval keys a run request can affect.
 *
 * Delegates the selection to getTargetEvals and projects each selected eval
 * to its `key`.
 *
 * @param {{ sortedEvals: Iterable<object>, request: object }} params
 * @returns {string[]}
 */
function getTargetEvalKeys(params) {
  const { sortedEvals, request } = params;
  const targetedEvals = getTargetEvals({ evals: sortedEvals, request });
  return targetedEvals.map(({ key }) => key);
}
|
|
5798
|
+
//#endregion
|
|
5696
5799
|
//#region ../runner/src/runOrchestration.ts
|
|
5697
5800
|
/**
|
|
5698
5801
|
* Ranks case statuses from worst to best. Used to order trial attempts so the
|
|
@@ -5743,6 +5846,20 @@ function formatUnknownErrorDetails(error) {
|
|
|
5743
5846
|
if (typeof error === "string") return error;
|
|
5744
5847
|
return String(error);
|
|
5745
5848
|
}
|
|
5849
|
+
/**
 * List the case ids that occur more than once in `cases`.
 *
 * @param {Iterable<{ id: string }>} cases
 * @returns {string[]} Each repeated id once, in default sort order.
 */
function findDuplicateCaseIds(cases) {
  const seenIds = /* @__PURE__ */ new Set();
  const repeatedIds = /* @__PURE__ */ new Set();
  for (const evalCase of cases) {
    const caseId = evalCase.id;
    if (seenIds.has(caseId)) repeatedIds.add(caseId);
    else seenIds.add(caseId);
  }
  // Walk ids in first-occurrence order before sorting.
  return [...seenIds].filter((caseId) => repeatedIds.has(caseId)).sort();
}
|
|
5854
|
+
/**
 * Find case ids that are claimed by more than one prepared eval.
 *
 * Each prepared eval is identified by the label `<filePath>#<id>` built from
 * its `evalMeta`. A case id is ambiguous when two or more distinct labels
 * own a prepared case with that id.
 *
 * @param {Iterable<{ evalMeta: { filePath: string, id: string }, preparedCases: Iterable<{ caseId: string }> }>} preparedEvals
 * @returns {string[]} One `"<caseId> (<label>, <label>, ...)"` entry per
 *   ambiguous case id, in first-seen order.
 */
function findAmbiguousTargetCaseIds(preparedEvals) {
  const ownersByCaseId = /* @__PURE__ */ new Map();
  for (const preparedEval of preparedEvals) {
    for (const { caseId } of preparedEval.preparedCases) {
      if (!ownersByCaseId.has(caseId)) ownersByCaseId.set(caseId, /* @__PURE__ */ new Set());
      const ownerLabel = `${preparedEval.evalMeta.filePath}#${preparedEval.evalMeta.id}`;
      ownersByCaseId.get(caseId).add(ownerLabel);
    }
  }
  const ambiguous = [];
  for (const [caseId, owners] of ownersByCaseId) {
    if (owners.size <= 1) continue;
    ambiguous.push(`${caseId} (${[...owners].join(", ")})`);
  }
  return ambiguous;
}
|
|
5746
5863
|
function buildRunErrorMessage(errors) {
|
|
5747
5864
|
return errors.map((entry) => {
|
|
5748
5865
|
const [firstLine, ...detailLines] = entry.details.split("\n");
|
|
@@ -5762,14 +5879,15 @@ async function finalizePreparedCase(params) {
|
|
|
5762
5879
|
scoreKeys: preparedEval.scoreKeys
|
|
5763
5880
|
});
|
|
5764
5881
|
if (winningTrial.bufferedCacheStore !== null) await winningTrial.bufferedCacheStore.commit();
|
|
5882
|
+
const artifactFileId = getCaseArtifactFileId(runState, winningTrial.caseRow);
|
|
5765
5883
|
runState.cases.push(winningTrial.caseRow);
|
|
5766
|
-
runState.caseDetails.set(
|
|
5884
|
+
runState.caseDetails.set(getCaseRowCaseKey(winningTrial.caseRow), winningTrial.caseDetail);
|
|
5767
5885
|
preparedEval.mergeColumns(winningTrial.caseDetail.columns);
|
|
5768
5886
|
if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
|
|
5769
5887
|
else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
|
|
5770
5888
|
else runState.summary.failedCases++;
|
|
5771
|
-
await writeFile(join(runDir, "traces", `${
|
|
5772
|
-
await persistCaseDetail(runDir, winningTrial.caseDetail);
|
|
5889
|
+
await writeFile(join(runDir, "traces", `${encodeURIComponent(artifactFileId)}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
|
|
5890
|
+
await persistCaseDetail(runDir, winningTrial.caseDetail, artifactFileId);
|
|
5773
5891
|
onCaseFinished?.(winningTrial.caseDetail, winningTrial.caseRow);
|
|
5774
5892
|
emitEvent(runState, {
|
|
5775
5893
|
type: "case.finished",
|
|
@@ -5780,20 +5898,24 @@ async function finalizePreparedCase(params) {
|
|
|
5780
5898
|
preparedEval.evalCaseRows.push(winningTrial.caseRow);
|
|
5781
5899
|
}
|
|
5782
5900
|
/**
 * Build the lookup key used to order a case row.
 *
 * The key is the row's `evalKey` (or `evalId` when no key is stored) and its
 * `caseId`, separated by a NUL character.
 *
 * @param {{ evalKey?: string, evalId: string, caseId: string }} caseRow
 * @returns {string}
 */
function getPreparedCaseOrderKey(caseRow) {
  const evalPart = caseRow.evalKey ?? caseRow.evalId;
  const casePart = caseRow.caseId;
  return `${evalPart}\u0000${casePart}`;
}
|
|
5903
|
+
/**
 * Choose the id used to name a case's artifact files.
 *
 * The plain `caseId` is used unless `runState.cases` already holds a row with
 * the same `caseId` but a different case key; in that situation the row's
 * case key is returned instead.
 *
 * @param {{ cases: Array<object> }} runState Run state with the rows recorded so far.
 * @param {object} caseRow Row about to be recorded.
 * @returns {string}
 */
function getCaseArtifactFileId(runState, caseRow) {
  const caseKey = getCaseRowCaseKey(caseRow);
  const collidesWithRecordedRow = runState.cases.some((existing) => {
    if (existing.caseId !== caseRow.caseId) return false;
    return getCaseRowCaseKey(existing) !== caseKey;
  });
  if (collidesWithRecordedRow) return caseKey;
  return caseRow.caseId;
}
|
|
5785
5907
|
function sortCaseRowsByPreparedOrder(caseRows, preparedEvals) {
|
|
5786
5908
|
const orderByCase = /* @__PURE__ */ new Map();
|
|
5787
5909
|
let order = 0;
|
|
5788
5910
|
for (const preparedEval of preparedEvals) for (const preparedCase of preparedEval.preparedCases) {
|
|
5789
|
-
orderByCase.set(`${preparedEval.evalMeta.
|
|
5911
|
+
orderByCase.set(`${preparedEval.evalMeta.key}\u0000${preparedCase.caseId}`, order);
|
|
5790
5912
|
order++;
|
|
5791
5913
|
}
|
|
5792
5914
|
caseRows.sort((left, right) => {
|
|
5793
5915
|
return (orderByCase.get(getPreparedCaseOrderKey(left)) ?? Number.MAX_SAFE_INTEGER) - (orderByCase.get(getPreparedCaseOrderKey(right)) ?? Number.MAX_SAFE_INTEGER);
|
|
5794
5916
|
});
|
|
5795
5917
|
}
|
|
5796
|
-
async function executeRun({ runState, request, runDir, config,
|
|
5918
|
+
async function executeRun({ runState, request, runDir, config, cacheStore, lastRunStatusMap, latestRunInfoMap, emitEvent, emitDiscoveryEvent, workspaceRoot, getSourceFingerprint, getConfiguredConcurrency, getSortedEvalMetas, getTargetEvals, onCaseFinished }) {
|
|
5797
5919
|
try {
|
|
5798
5920
|
const targetEvals = getTargetEvals(request);
|
|
5799
5921
|
emitEvent(runState, {
|
|
@@ -5822,10 +5944,10 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5822
5944
|
codeFingerprint = "";
|
|
5823
5945
|
}
|
|
5824
5946
|
if (codeFingerprint.length > 0) {
|
|
5825
|
-
runState.manifest.evalSourceFingerprints[evalMeta.
|
|
5947
|
+
runState.manifest.evalSourceFingerprints[evalMeta.key] = codeFingerprint;
|
|
5826
5948
|
evalMeta.sourceFingerprint = codeFingerprint;
|
|
5827
5949
|
} else {
|
|
5828
|
-
delete runState.manifest.evalSourceFingerprints[evalMeta.
|
|
5950
|
+
delete runState.manifest.evalSourceFingerprints[evalMeta.key];
|
|
5829
5951
|
evalMeta.sourceFingerprint = null;
|
|
5830
5952
|
}
|
|
5831
5953
|
try {
|
|
@@ -5846,10 +5968,13 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5846
5968
|
await runWithModuleIsolation(moduleIsolation, async () => {
|
|
5847
5969
|
await runInEvalRuntimeScope("cases", async () => {
|
|
5848
5970
|
await entry.use(async (evalDef) => {
|
|
5849
|
-
const
|
|
5971
|
+
const runnableCases = resolveRunnableEvalCases({
|
|
5850
5972
|
cases: await runWithEvalClock(evalDef.startTime, async () => typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [], { freezeTime: evalDef.freezeTime }),
|
|
5851
5973
|
evalId: evalMeta.id
|
|
5852
|
-
})
|
|
5974
|
+
});
|
|
5975
|
+
const duplicateCaseIds = findDuplicateCaseIds(runnableCases);
|
|
5976
|
+
if (duplicateCaseIds.length > 0) throw new Error(`Duplicate case id${duplicateCaseIds.length === 1 ? "" : "s"} in ${evalMeta.filePath}#${evalMeta.id}: ${duplicateCaseIds.join(", ")}`);
|
|
5977
|
+
const cases = filterEvalCases(runnableCases, request.target.caseIds);
|
|
5853
5978
|
runState.summary.totalCases += cases.length;
|
|
5854
5979
|
const defaultConfig = resolveEvalDefaultConfig({
|
|
5855
5980
|
evalDef,
|
|
@@ -5895,6 +6020,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5895
6020
|
const { caseDetail, caseRowUpdate } = await runCase({
|
|
5896
6021
|
evalDef,
|
|
5897
6022
|
evalId: evalMeta.id,
|
|
6023
|
+
evalKey: evalMeta.key,
|
|
5898
6024
|
evalCase,
|
|
5899
6025
|
globalTraceDisplay,
|
|
5900
6026
|
llmCallsConfig,
|
|
@@ -5907,6 +6033,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5907
6033
|
codeFingerprint,
|
|
5908
6034
|
moduleIsolation,
|
|
5909
6035
|
evalFilePath,
|
|
6036
|
+
evalFileRelativePath: evalMeta.filePath,
|
|
5910
6037
|
workspaceRoot,
|
|
5911
6038
|
artifactDir: join(runDir, "artifacts"),
|
|
5912
6039
|
runId: runState.manifest.id
|
|
@@ -5916,6 +6043,8 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5916
6043
|
caseRow: {
|
|
5917
6044
|
caseId: evalCase.id,
|
|
5918
6045
|
evalId: evalMeta.id,
|
|
6046
|
+
evalKey: evalMeta.key,
|
|
6047
|
+
caseKey: caseDetail.caseKey,
|
|
5919
6048
|
status: caseRowUpdate.status ?? "pending",
|
|
5920
6049
|
durationMs: caseRowUpdate.durationMs ?? null,
|
|
5921
6050
|
columns: caseRowUpdate.columns ?? {},
|
|
@@ -5951,16 +6080,23 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5951
6080
|
evalId: evalMeta.id,
|
|
5952
6081
|
details: formatUnknownErrorDetails(error)
|
|
5953
6082
|
});
|
|
5954
|
-
lastRunStatusMap.set(evalMeta.
|
|
5955
|
-
latestRunInfoMap.set(evalMeta.
|
|
6083
|
+
lastRunStatusMap.set(evalMeta.key, "error");
|
|
6084
|
+
latestRunInfoMap.set(evalMeta.key, {
|
|
5956
6085
|
status: "error",
|
|
5957
6086
|
startedAt: runState.manifest.endedAt ?? runState.manifest.startedAt,
|
|
5958
6087
|
commitSha: runState.manifest.commitSha ?? null,
|
|
5959
|
-
evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalMeta.
|
|
6088
|
+
evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalMeta.key] ?? null
|
|
5960
6089
|
});
|
|
5961
6090
|
}
|
|
5962
6091
|
}
|
|
5963
|
-
|
|
6092
|
+
const ambiguousCaseTargets = request.target.caseIds && request.target.caseIds.length > 0 ? findAmbiguousTargetCaseIds(preparedEvals) : [];
|
|
6093
|
+
if (ambiguousCaseTargets.length > 0) {
|
|
6094
|
+
queuedCases.length = 0;
|
|
6095
|
+
evalErrors.push({
|
|
6096
|
+
evalId: "target",
|
|
6097
|
+
details: `Ambiguous --case target. Narrow it with --file and/or --eval: ${ambiguousCaseTargets.join("; ")}`
|
|
6098
|
+
});
|
|
6099
|
+
} else await executeQueuedCases({
|
|
5964
6100
|
queuedCases,
|
|
5965
6101
|
concurrency: getConfiguredConcurrency(),
|
|
5966
6102
|
globalTraceDisplay: config.traceDisplay
|
|
@@ -5975,13 +6111,13 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5975
6111
|
emitEvent
|
|
5976
6112
|
});
|
|
5977
6113
|
preparedEval.evalMeta.columnDefs = [...preparedEval.accumulatedColumns.values()];
|
|
5978
|
-
lastRunStatusMap.set(preparedEval.evalMeta.
|
|
5979
|
-
const latestStatus = lastRunStatusMap.get(preparedEval.evalMeta.
|
|
5980
|
-
latestRunInfoMap.set(preparedEval.evalMeta.
|
|
6114
|
+
lastRunStatusMap.set(preparedEval.evalMeta.key, toLastRunStatus(deriveStatusFromCaseRows({ caseRows: preparedEval.evalCaseRows })));
|
|
6115
|
+
const latestStatus = lastRunStatusMap.get(preparedEval.evalMeta.key) ?? null;
|
|
6116
|
+
latestRunInfoMap.set(preparedEval.evalMeta.key, {
|
|
5981
6117
|
status: latestStatus,
|
|
5982
6118
|
startedAt: runState.manifest.endedAt ?? runState.manifest.startedAt,
|
|
5983
6119
|
commitSha: runState.manifest.commitSha ?? null,
|
|
5984
|
-
evalSourceFingerprint: runState.manifest.evalSourceFingerprints[preparedEval.evalMeta.
|
|
6120
|
+
evalSourceFingerprint: runState.manifest.evalSourceFingerprints[preparedEval.evalMeta.key] ?? null
|
|
5985
6121
|
});
|
|
5986
6122
|
}
|
|
5987
6123
|
sortCaseRowsByPreparedOrder(runState.cases, preparedEvals);
|
|
@@ -5994,20 +6130,19 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5994
6130
|
const completedRunAt = endTime.toISOString();
|
|
5995
6131
|
runState.manifest.endedAt = completedRunAt;
|
|
5996
6132
|
runState.summary.errorMessage = evalErrors.length > 0 ? buildRunErrorMessage(evalErrors) : null;
|
|
5997
|
-
for (const
|
|
6133
|
+
for (const evalKey of getTargetEvalKeys({
|
|
5998
6134
|
request,
|
|
5999
|
-
|
|
6000
|
-
knownEvalIds: new Set(evals.keys())
|
|
6135
|
+
sortedEvals: getSortedEvalMetas()
|
|
6001
6136
|
})) {
|
|
6002
|
-
const latestStatus = lastRunStatusMap.get(
|
|
6137
|
+
const latestStatus = lastRunStatusMap.get(evalKey) ?? toLastRunStatus(deriveStatusFromCaseRows({
|
|
6003
6138
|
caseRows: [],
|
|
6004
6139
|
lifecycleStatus: runState.manifest.status
|
|
6005
6140
|
}));
|
|
6006
|
-
latestRunInfoMap.set(
|
|
6141
|
+
latestRunInfoMap.set(evalKey, {
|
|
6007
6142
|
status: latestStatus,
|
|
6008
6143
|
startedAt: completedRunAt,
|
|
6009
6144
|
commitSha: runState.manifest.commitSha ?? null,
|
|
6010
|
-
evalSourceFingerprint: runState.manifest.evalSourceFingerprints[
|
|
6145
|
+
evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalKey] ?? null
|
|
6011
6146
|
});
|
|
6012
6147
|
}
|
|
6013
6148
|
await persistRunState(runState);
|
|
@@ -6050,4 +6185,4 @@ function toLastRunStatus(status) {
|
|
|
6050
6185
|
return status === "pending" ? null : status;
|
|
6051
6186
|
}
|
|
6052
6187
|
//#endregion
|
|
6053
|
-
export { llmCallsConfigSchema as $,
|
|
6188
|
+
export { llmCallsConfigSchema as $, traceSpanKindSchema as $t, extractApiCalls as A, getEvalStartTime as An, evalChartTypeSchema as At, runSummarySchema as B, startEvalBackgroundJob as Bn, cacheRecordingOpSchema as Bt, validateCharts as C, advanceEvalTime as Cn, evalChartAggregateSchema as Ct, sseEnvelopeSchema as D, evalLog as Dn, evalChartConfigSchema as Dt, updateManualScoreRequestSchema as E, evalAssert as En, evalChartColorSchema as Et, getEvalDisplayStatus as F, runInEvalRuntimeScope as Fn, cacheEntryWithDebugKeySchema as Ft, apiCallMetricPlacementSchema as G, traceCacheRefSchema as Gt, DEFAULT_LLM_CALLS_CONFIG as H, defineEval as Hn, cacheStatusSchema as Ht, deriveScopedSummaryFromCases as I, runInEvalScope as In, cacheFileSchema as It, defaultConfigKeySchema as J, traceAttributeDisplayPlacementSchema as Jt, apiCallMetricSchema as K, traceAttributeDisplayFormatSchema as Kt, deriveStatusFromCaseRows as L, runInExistingEvalScope as Ln, cacheListItemSchema as Lt, applyDerivedCallAttributes as M, isInEvalScope as Mn, cacheDebugKeyEntrySchema as Mt, getNestedAttribute as N, mergeEvalOutput as Nn, cacheDebugKeyFileSchema as Nt, extractCacheEntries as O, getCurrentScope as On, evalChartMetricSchema as Ot, getEvalTitle as P, nextEvalId as Pn, cacheEntrySchema as Pt, llmCallPricingSchema as Q, traceSpanErrorSchema as Qt, deriveStatusFromChildStatuses as R, setEvalOutput as Rn, cacheModeSchema as Rt, normalizeScoreDef as S, EvalAssertionError as Sn, scoreTraceSchema as St, createRunRequestSchema as T, configureEvalRunLogs as Tn, evalChartBuiltinMetricSchema as Tt, agentEvalsConfigSchema as U, getEvalRegistry as Un, serializedCacheSpanSchema as Ut, DEFAULT_API_CALLS_CONFIG as V, repoFile as Vn, cacheRecordingSchema as Vt, apiCallMetricFormatSchema as W, spanCacheOptionsSchema as Wt, llmCallMetricPlacementSchema as X, traceDisplayConfigSchema as Xt, llmCallMetricFormatSchema as Y, traceAttributeDisplaySchema as Yt, llmCallMetricSchema as Z, traceDisplayInputConfigSchema 
as Zt, loadEvalModule as _, hashCacheKeySync as _n, evalSummarySchema as _t, getLastRunStatuses as a, columnKindSchema as an, buildCaseKey as at, loadConfig as b, serializeCacheRecording as bn, runLogLocationSchema as bt, loadPersistedRunSnapshots as c, numberDisplayOptionsSchema as cn, getCaseRowEvalKey as ct, persistRunState as d, z$1 as dn, caseRowSchema as dt, traceSpanSchema as en, removeDefaultConfigSchema as et, recomputeEvalStatusesInRuns as f, buildTraceTree as fn, discoveryIssueSchema as ft, deriveEvalFreshness as g, hashCacheKey as gn, evalStatsConfigSchema as gt, resolveArtifactPath as h, evalTracer as hn, evalStatItemSchema as ht, generateRunId as i, columnFormatSchema as in, trialSelectionModeSchema as it, extractLlmCalls as j, incrementEvalOutput as jn, evalChartsConfigSchema as jt, extractCacheHits as k, getEvalCaseInput as kn, evalChartTooltipExtraSchema as kt, nextShortIdFromSnapshots as l, repoFileRefSchema as ln, assertionFailureSchema as lt, runTouchesEval as m, evalSpan as mn, evalStatAggregateSchema as mt, getTargetEvalKeys as n, cellValueSchema as nn, resolveLlmCallsConfig as nt, getLatestRunInfos as o, fileRefSchema as on, buildEvalKey as ot, recomputePersistedCaseStatus as p, captureEvalSpanError as pn, evalFreshnessStatusSchema as pt, apiCallsConfigSchema as q, traceAttributeDisplayInputSchema as qt, getTargetEvals as r, columnDefSchema as rn, runLogsConfigSchema as rt, loadPersistedRunSnapshot as s, jsonCellSchema as sn, getCaseRowCaseKey as st, executeRun as t, traceSpanWarningSchema as tn, resolveApiCallsConfig as tt, persistCaseDetail as u, runArtifactRefSchema as un, caseDetailSchema as ut, parseEvalDiscovery as v, deserializeCacheRecording as vn, runLogEntrySchema as vt, createFsCacheStore as w, appendToEvalOutput as wn, evalChartAxisSchema as wt, buildDeclaredColumnDefs as x, serializeCacheValue as xn, runLogPhaseSchema as xt, resolveEvalDefaultConfig as y, deserializeCacheValue as yn, runLogLevelSchema as yt, runManifestSchema as 
z, setScopeCacheContext as zn, cacheOperationTypeSchema as zt };
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-B4SosWgD.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-cj1TkR-H.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-Be0x8CS3.mjs";
|
|
2
|
-
import "./src-D6cettg0.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-ETfZ15RB.mjs";
|
|
2
|
+
import "./src-CyNb2ycA.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.24.0",
|
|
3
|
+
"version": "0.25.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -59,8 +59,8 @@
|
|
|
59
59
|
"@types/node": "^24.7.2",
|
|
60
60
|
"typescript": "^5.9.2",
|
|
61
61
|
"@agent-evals/runner": "0.0.1",
|
|
62
|
-
"@agent-evals/
|
|
63
|
-
"@agent-evals/
|
|
62
|
+
"@agent-evals/sdk": "0.0.1",
|
|
63
|
+
"@agent-evals/shared": "0.0.1"
|
|
64
64
|
},
|
|
65
65
|
"scripts": {
|
|
66
66
|
"build": "pnpm --filter @agent-evals/web build && tsdown",
|