@ls-stack/agent-eval 0.56.0 → 0.56.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-Bpe6Monh.mjs → app-Db_x-Rit.mjs} +4 -4
- package/dist/apps/web/dist/assets/{index-CfSiAVmi.js → index-bB8IBDp1.js} +27 -27
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +1 -1
- package/dist/{cli-DQO2Fpt2.mjs → cli-Ck0mqxd-.mjs} +4 -4
- package/dist/index.d.mts +20 -20
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-6lrtj48K.mjs → runExecution-BH7DlMXl.mjs} +5 -1
- package/dist/{runOrchestration-BYaN2mzS.mjs → runOrchestration-C1Ex9QI-.mjs} +1 -1
- package/dist/{runner-DYlwuAT3.mjs → runner-B3hEOT_I.mjs} +2 -2
- package/dist/{runner-C3CiS2o7.mjs → runner-DbVYcapC.mjs} +1 -1
- package/dist/{src-DCGrFAmO.mjs → src-B3iq-tuv.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +3 -1
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-CfSiAVmi.js"></script>
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-bB8IBDp1.js"></script>
|
|
29
29
|
<link rel="stylesheet" crossorigin href="/assets/index-Xa_7PteQ.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
package/dist/bin.mjs
CHANGED
package/dist/caseChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { I as configureEvalRunLogs, Pt as runWithEvalRegistry, St as resolveLlmCallsConfig, _ as createBufferedCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, q as runInEvalRuntimeScope, r as runCase, v as createFsCacheStore, xt as resolveApiCallsConfig } from "./runExecution-
|
|
1
|
+
import { I as configureEvalRunLogs, Pt as runWithEvalRegistry, St as resolveLlmCallsConfig, _ as createBufferedCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, q as runInEvalRuntimeScope, r as runCase, v as createFsCacheStore, xt as resolveApiCallsConfig } from "./runExecution-BH7DlMXl.mjs";
|
|
2
2
|
//#region ../runner/src/caseChild.ts
|
|
3
3
|
let fatalErrorReported = false;
|
|
4
4
|
let disconnectExpected = false;
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Ct as buildEvalKey, Nt as getEvalRegistry, St as resolveLlmCallsConfig, bt as runSummarySchema, c as resolveArtifactPath, ct as applyDerivedCallAttributes, dt as getEvalDisplayStatus, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, gt as matchesTagsFilter, h as normalizeScoreDef, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, s as resolveTracePresentation, ut as getEvalTitle, v as createFsCacheStore, wt as getCaseRowCaseKey, xt as resolveApiCallsConfig } from "./runExecution-
|
|
2
|
-
import { C as validateCharts, S as parseEvalDiscovery, _ as runTouchesEval, a as validateTagsFilters, b as deriveEvalFreshness, c as getLatestRunInfos, d as nextShortIdFromSnapshots, f as persistCaseDetail, g as recomputePersistedCaseStatus, h as recomputeEvalStatusesInRuns, i as resolveEvalTags, l as loadPersistedRunSnapshot, m as persistRunState, n as getTargetEvalKeys, o as generateRunId, p as deleteTemporaryRuns, s as getLastRunStatuses, u as loadPersistedRunSnapshots, v as buildManualInputDescriptor, x as loadIsolatedEvalRegistry, y as parseManualInputValues } from "./runOrchestration-
|
|
1
|
+
import { Ct as buildEvalKey, Nt as getEvalRegistry, St as resolveLlmCallsConfig, bt as runSummarySchema, c as resolveArtifactPath, ct as applyDerivedCallAttributes, dt as getEvalDisplayStatus, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, gt as matchesTagsFilter, h as normalizeScoreDef, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, s as resolveTracePresentation, ut as getEvalTitle, v as createFsCacheStore, wt as getCaseRowCaseKey, xt as resolveApiCallsConfig } from "./runExecution-BH7DlMXl.mjs";
|
|
2
|
+
import { C as validateCharts, S as parseEvalDiscovery, _ as runTouchesEval, a as validateTagsFilters, b as deriveEvalFreshness, c as getLatestRunInfos, d as nextShortIdFromSnapshots, f as persistCaseDetail, g as recomputePersistedCaseStatus, h as recomputeEvalStatusesInRuns, i as resolveEvalTags, l as loadPersistedRunSnapshot, m as persistRunState, n as getTargetEvalKeys, o as generateRunId, p as deleteTemporaryRuns, s as getLastRunStatuses, u as loadPersistedRunSnapshots, v as buildManualInputDescriptor, x as loadIsolatedEvalRegistry, y as parseManualInputValues } from "./runOrchestration-C1Ex9QI-.mjs";
|
|
3
3
|
import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
5
5
|
import { createHash, randomUUID } from "node:crypto";
|
|
@@ -2171,8 +2171,8 @@ async function commandApp(args) {
|
|
|
2171
2171
|
const { serve } = await import("@hono/node-server");
|
|
2172
2172
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
2173
2173
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
2174
|
-
const appModule = await import("./app-Bpe6Monh.mjs");
|
|
2175
|
-
const runnerModule = await import("./runner-C3CiS2o7.mjs");
|
|
2174
|
+
const appModule = await import("./app-Db_x-Rit.mjs");
|
|
2175
|
+
const runnerModule = await import("./runner-DbVYcapC.mjs");
|
|
2176
2176
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
2177
2177
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
2178
2178
|
await runnerModule.initRunner();
|
package/dist/index.d.mts
CHANGED
|
@@ -2660,9 +2660,9 @@ declare const caseRowSchema$1: z$1.ZodObject<{
|
|
|
2660
2660
|
error: "error";
|
|
2661
2661
|
running: "running";
|
|
2662
2662
|
cancelled: "cancelled";
|
|
2663
|
+
pending: "pending";
|
|
2663
2664
|
pass: "pass";
|
|
2664
2665
|
fail: "fail";
|
|
2665
|
-
pending: "pending";
|
|
2666
2666
|
}>;
|
|
2667
2667
|
durationMs: z$1.ZodNullable<z$1.ZodNumber>;
|
|
2668
2668
|
cacheHits: z$1.ZodOptional<z$1.ZodNumber>;
|
|
@@ -2860,10 +2860,10 @@ declare const scoreTraceSchema: z$1.ZodObject<{
|
|
|
2860
2860
|
namespace: z$1.ZodString;
|
|
2861
2861
|
key: z$1.ZodString;
|
|
2862
2862
|
status: z$1.ZodEnum<{
|
|
2863
|
+
bypass: "bypass";
|
|
2864
|
+
refresh: "refresh";
|
|
2863
2865
|
hit: "hit";
|
|
2864
2866
|
miss: "miss";
|
|
2865
|
-
refresh: "refresh";
|
|
2866
|
-
bypass: "bypass";
|
|
2867
2867
|
}>;
|
|
2868
2868
|
read: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2869
2869
|
stored: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
@@ -2884,9 +2884,9 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
|
|
|
2884
2884
|
error: "error";
|
|
2885
2885
|
running: "running";
|
|
2886
2886
|
cancelled: "cancelled";
|
|
2887
|
+
pending: "pending";
|
|
2887
2888
|
pass: "pass";
|
|
2888
2889
|
fail: "fail";
|
|
2889
|
-
pending: "pending";
|
|
2890
2890
|
}>;
|
|
2891
2891
|
input: z$1.ZodUnknown;
|
|
2892
2892
|
trace: z$1.ZodArray<z$1.ZodObject<{
|
|
@@ -3032,10 +3032,10 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
|
|
|
3032
3032
|
namespace: z$1.ZodString;
|
|
3033
3033
|
key: z$1.ZodString;
|
|
3034
3034
|
status: z$1.ZodEnum<{
|
|
3035
|
+
bypass: "bypass";
|
|
3036
|
+
refresh: "refresh";
|
|
3035
3037
|
hit: "hit";
|
|
3036
3038
|
miss: "miss";
|
|
3037
|
-
refresh: "refresh";
|
|
3038
|
-
bypass: "bypass";
|
|
3039
3039
|
}>;
|
|
3040
3040
|
read: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
3041
3041
|
stored: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
@@ -3138,10 +3138,10 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
|
|
|
3138
3138
|
namespace: z$1.ZodString;
|
|
3139
3139
|
key: z$1.ZodString;
|
|
3140
3140
|
status: z$1.ZodEnum<{
|
|
3141
|
+
bypass: "bypass";
|
|
3142
|
+
refresh: "refresh";
|
|
3141
3143
|
hit: "hit";
|
|
3142
3144
|
miss: "miss";
|
|
3143
|
-
refresh: "refresh";
|
|
3144
|
-
bypass: "bypass";
|
|
3145
3145
|
}>;
|
|
3146
3146
|
read: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
3147
3147
|
stored: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
@@ -3499,8 +3499,8 @@ declare const runManifestSchema$1: z$1.ZodObject<{
|
|
|
3499
3499
|
target: z$1.ZodObject<{
|
|
3500
3500
|
mode: z$1.ZodEnum<{
|
|
3501
3501
|
all: "all";
|
|
3502
|
-
caseIds: "caseIds";
|
|
3503
3502
|
evalIds: "evalIds";
|
|
3503
|
+
caseIds: "caseIds";
|
|
3504
3504
|
}>;
|
|
3505
3505
|
evalKeys: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
3506
3506
|
files: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
@@ -3514,9 +3514,9 @@ declare const runManifestSchema$1: z$1.ZodObject<{
|
|
|
3514
3514
|
median: "median";
|
|
3515
3515
|
}>>>;
|
|
3516
3516
|
cacheMode: z$1.ZodOptional<z$1.ZodEnum<{
|
|
3517
|
-
refresh: "refresh";
|
|
3518
|
-
bypass: "bypass";
|
|
3519
3517
|
use: "use";
|
|
3518
|
+
bypass: "bypass";
|
|
3519
|
+
refresh: "refresh";
|
|
3520
3520
|
}>>;
|
|
3521
3521
|
}, z$1.core.$strip>;
|
|
3522
3522
|
/** Persisted lifecycle metadata for a single eval run. */
|
|
@@ -4436,9 +4436,9 @@ declare function extractApiCalls(spans: EvalTraceSpan$1[], config: ResolvedApiCa
|
|
|
4436
4436
|
* - `refresh`: never read, always write (forces re-execution and overwrites).
|
|
4437
4437
|
*/
|
|
4438
4438
|
declare const cacheModeSchema: z$1.ZodEnum<{
|
|
4439
|
-
refresh: "refresh";
|
|
4440
|
-
bypass: "bypass";
|
|
4441
4439
|
use: "use";
|
|
4440
|
+
bypass: "bypass";
|
|
4441
|
+
refresh: "refresh";
|
|
4442
4442
|
}>;
|
|
4443
4443
|
/** Mode controlling how cached spans behave during a run. */
|
|
4444
4444
|
type CacheMode = z$1.infer<typeof cacheModeSchema>;
|
|
@@ -4459,10 +4459,10 @@ declare const cacheOperationTypeSchema: z$1.ZodEnum<{
|
|
|
4459
4459
|
type CacheOperationType = z$1.infer<typeof cacheOperationTypeSchema>;
|
|
4460
4460
|
/** Status of a cache lookup recorded on a span or case scope. */
|
|
4461
4461
|
declare const cacheStatusSchema: z$1.ZodEnum<{
|
|
4462
|
+
bypass: "bypass";
|
|
4463
|
+
refresh: "refresh";
|
|
4462
4464
|
hit: "hit";
|
|
4463
4465
|
miss: "miss";
|
|
4464
|
-
refresh: "refresh";
|
|
4465
|
-
bypass: "bypass";
|
|
4466
4466
|
}>;
|
|
4467
4467
|
/** Status of a cache lookup recorded on a span or case scope. */
|
|
4468
4468
|
type CacheStatus = z$1.infer<typeof cacheStatusSchema>;
|
|
@@ -4479,10 +4479,10 @@ declare const traceCacheRefSchema: z$1.ZodObject<{
|
|
|
4479
4479
|
namespace: z$1.ZodString;
|
|
4480
4480
|
key: z$1.ZodString;
|
|
4481
4481
|
status: z$1.ZodEnum<{
|
|
4482
|
+
bypass: "bypass";
|
|
4483
|
+
refresh: "refresh";
|
|
4482
4484
|
hit: "hit";
|
|
4483
4485
|
miss: "miss";
|
|
4484
|
-
refresh: "refresh";
|
|
4485
|
-
bypass: "bypass";
|
|
4486
4486
|
}>;
|
|
4487
4487
|
read: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
4488
4488
|
stored: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
@@ -5435,8 +5435,8 @@ declare const createRunRequestSchema$1: z$1.ZodObject<{
|
|
|
5435
5435
|
target: z$1.ZodObject<{
|
|
5436
5436
|
mode: z$1.ZodEnum<{
|
|
5437
5437
|
all: "all";
|
|
5438
|
-
caseIds: "caseIds";
|
|
5439
5438
|
evalIds: "evalIds";
|
|
5439
|
+
caseIds: "caseIds";
|
|
5440
5440
|
}>;
|
|
5441
5441
|
evalKeys: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
5442
5442
|
files: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
@@ -5448,9 +5448,9 @@ declare const createRunRequestSchema$1: z$1.ZodObject<{
|
|
|
5448
5448
|
temporary: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
5449
5449
|
cache: z$1.ZodOptional<z$1.ZodObject<{
|
|
5450
5450
|
mode: z$1.ZodDefault<z$1.ZodEnum<{
|
|
5451
|
-
refresh: "refresh";
|
|
5452
|
-
bypass: "bypass";
|
|
5453
5451
|
use: "use";
|
|
5452
|
+
bypass: "bypass";
|
|
5453
|
+
refresh: "refresh";
|
|
5454
5454
|
}>>;
|
|
5455
5455
|
}, z$1.core.$strip>>;
|
|
5456
5456
|
manualInputs: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as startEvalBackgroundJob, A as repoFile, B as getCurrentScope, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as appendToEvalOutput, G as mergeEvalOutput, H as incrementEvalOutput, J as runInEvalScope, K as nextEvalId, L as evalAssert, M as readManualInputFile, N as evalExpect, Nt as getEvalRegistry, O as serializeCacheRecording, P as EvalAssertionError, Q as setScopeCacheContext, R as evalLog, S as evalSpan, T as hashCacheKeySync, U as isInEvalScope, V as getEvalCaseInput, Y as runInExistingEvalScope, Z as setEvalOutput, at as extractLlmCalls, b as buildTraceTree, it as extractApiCalls, j as manualInputFileValueSchema, k as serializeCacheValue, lt as getNestedAttribute, nt as extractCacheEntries, ot as simulateLlmCallCost, q as runInEvalRuntimeScope, rt as extractCacheHits, st as simulateTokenAllocation, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalTime } from "./runExecution-
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-DQO2Fpt2.mjs";
|
|
3
|
-
import { n as matchesEvalTags, t as defineEval } from "./src-DCGrFAmO.mjs";
|
|
1
|
+
import { $ as startEvalBackgroundJob, A as repoFile, B as getCurrentScope, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as appendToEvalOutput, G as mergeEvalOutput, H as incrementEvalOutput, J as runInEvalScope, K as nextEvalId, L as evalAssert, M as readManualInputFile, N as evalExpect, Nt as getEvalRegistry, O as serializeCacheRecording, P as EvalAssertionError, Q as setScopeCacheContext, R as evalLog, S as evalSpan, T as hashCacheKeySync, U as isInEvalScope, V as getEvalCaseInput, Y as runInExistingEvalScope, Z as setEvalOutput, at as extractLlmCalls, b as buildTraceTree, it as extractApiCalls, j as manualInputFileValueSchema, k as serializeCacheValue, lt as getNestedAttribute, nt as extractCacheEntries, ot as simulateLlmCallCost, q as runInEvalRuntimeScope, rt as extractCacheHits, st as simulateTokenAllocation, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalTime } from "./runExecution-BH7DlMXl.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Ck0mqxd-.mjs";
|
|
3
|
+
import { n as matchesEvalTags, t as defineEval } from "./src-B3iq-tuv.mjs";
|
|
4
4
|
export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { At as evalChartsConfigSchema, Ct as buildEvalKey, Dt as evalStatAggregateSchema, I as configureEvalRunLogs, Ot as evalStatsConfigSchema, bt as runSummarySchema, et as createRunRequestSchema, jt as columnDefSchema, kt as manualInputDescriptorSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, v as createFsCacheStore, yt as runManifestSchema } from "./runExecution-
|
|
2
|
-
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-BYaN2mzS.mjs";
|
|
1
|
+
import { At as evalChartsConfigSchema, Ct as buildEvalKey, Dt as evalStatAggregateSchema, I as configureEvalRunLogs, Ot as evalStatsConfigSchema, bt as runSummarySchema, et as createRunRequestSchema, jt as columnDefSchema, kt as manualInputDescriptorSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, v as createFsCacheStore, yt as runManifestSchema } from "./runExecution-BH7DlMXl.mjs";
|
|
2
|
+
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-C1Ex9QI-.mjs";
|
|
3
3
|
import { z } from "zod/v4";
|
|
4
4
|
import { readFile } from "node:fs/promises";
|
|
5
5
|
import { relative } from "node:path";
|
|
@@ -6181,6 +6181,10 @@ function sumNullable(values) {
|
|
|
6181
6181
|
}
|
|
6182
6182
|
return hasValue ? total : void 0;
|
|
6183
6183
|
}
|
|
6184
|
+
function getMaxLlmTurns(calls) {
|
|
6185
|
+
if (calls.length === 0) return void 0;
|
|
6186
|
+
return Math.max(...calls.map((call) => Math.max(call.stepCount ?? 1, 1)));
|
|
6187
|
+
}
|
|
6184
6188
|
function assignIfMissing(params) {
|
|
6185
6189
|
if (!params.activeKeys.has(params.key)) return;
|
|
6186
6190
|
if (params.key in params.outputs) return;
|
|
@@ -6202,7 +6206,7 @@ function addDefaultOutputs(params) {
|
|
|
6202
6206
|
assignIfMissing({
|
|
6203
6207
|
outputs: params.outputs,
|
|
6204
6208
|
key: "llmTurns",
|
|
6205
|
-
value: calls
|
|
6209
|
+
value: getMaxLlmTurns(calls),
|
|
6206
6210
|
activeKeys
|
|
6207
6211
|
});
|
|
6208
6212
|
assignIfMissing({
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-
|
|
1
|
+
import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-BH7DlMXl.mjs";
|
|
2
2
|
import { readFile, readdir, rm, writeFile } from "node:fs/promises";
|
|
3
3
|
import { dirname, join } from "node:path";
|
|
4
4
|
import { existsSync } from "node:fs";
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-DQO2Fpt2.mjs";
|
|
2
|
-
import "./src-DCGrFAmO.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-Ck0mqxd-.mjs";
|
|
2
|
+
import "./src-B3iq-tuv.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-DYlwuAT3.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-B3hEOT_I.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-6lrtj48K.mjs";
|
|
2
|
-
import "./cli-DQO2Fpt2.mjs";
|
|
1
|
+
import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-BH7DlMXl.mjs";
|
|
2
|
+
import "./cli-Ck0mqxd-.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.56.0",
|
|
3
|
+
"version": "0.56.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -33,8 +33,8 @@
|
|
|
33
33
|
"@types/node": "^24.7.2",
|
|
34
34
|
"typescript": "^5.9.2",
|
|
35
35
|
"@agent-evals/runner": "0.0.1",
|
|
36
|
-
"@agent-evals/
|
|
37
|
-
"@agent-evals/
|
|
36
|
+
"@agent-evals/shared": "0.0.1",
|
|
37
|
+
"@agent-evals/sdk": "0.0.1"
|
|
38
38
|
},
|
|
39
39
|
"scripts": {
|
|
40
40
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
@@ -393,7 +393,9 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
393
393
|
tokens, and output tokens separately and use `dedupeConsecutiveValues: true`
|
|
394
394
|
to skip repeated adjacent chart values. `totalTokens` is input + output only;
|
|
395
395
|
cache read/write tokens stay separate and affect `costUsd` at their own
|
|
396
|
-
rates.
|
|
396
|
+
rates. `llmTurns` is the maximum per-call turn count in the case run, using
|
|
397
|
+
configured steps when available and otherwise one turn per matched LLM call
|
|
398
|
+
span.
|
|
397
399
|
Derived base input cost uses `inputTokens - cachedInputTokens -
|
|
398
400
|
cacheCreationInputTokens` so cache details are not double-counted.
|
|
399
401
|
`cacheCreationInputTokens` is the total cache-write count; optional
|