@ls-stack/agent-eval 0.33.0 → 0.35.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-Dc6vvHRL.mjs → app-CcZv9l_q.mjs} +4 -4
- package/dist/apps/web/dist/assets/index-BJpxc61J.css +1 -0
- package/dist/apps/web/dist/assets/index-sWPMWjFJ.js +140 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-huuJbDNb.mjs → cli-CVwIjcsX.mjs} +3 -3
- package/dist/index.d.mts +40 -40
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-ZpN7xty_.mjs → runOrchestration-DoslE_Oo.mjs} +15 -62
- package/dist/{runner-Dkol2ukD.mjs → runner-ChHgWruW.mjs} +2 -2
- package/dist/{runner-BPXPvinB.mjs → runner-DA_o115w.mjs} +1 -1
- package/dist/src-Bcc2ZHK8.mjs +3 -0
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +9 -5
- package/dist/apps/web/dist/assets/index-BPMMRktE.css +0 -1
- package/dist/apps/web/dist/assets/index-BV_DM8fZ.js +0 -118
- package/dist/src-1Qvuh0NH.mjs +0 -3
|
@@ -25,8 +25,8 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-BV_DM8fZ.js"></script>
|
|
29
|
-
<link rel="stylesheet" crossorigin href="/assets/index-BPMMRktE.css">
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-sWPMWjFJ.js"></script>
|
|
29
|
+
<link rel="stylesheet" crossorigin href="/assets/index-BJpxc61J.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
|
32
32
|
<div id="root"></div>
|
package/dist/bin.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { C as loadConfig, D as createFsCacheStore, E as validateCharts, H as getEvalDisplayStatus, S as resolveEvalDefaultConfig, T as normalizeScoreDef, U as deriveScopedSummaryFromCases, V as getEvalTitle, _ as buildManualInputDescriptor, a as getLastRunStatuses, b as loadEvalModule, bt as getCaseRowEvalKey, c as loadPersistedRunSnapshots, d as persistRunState, f as recomputeEvalStatusesInRuns, g as resolveArtifactPath, h as resolveTracePresentation, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, mt as resolveLlmCallsConfig, n as getTargetEvalKeys, o as getLatestRunInfos, p as recomputePersistedCaseStatus, pr as getEvalRegistry, pt as resolveApiCallsConfig, q as runSummarySchema, s as loadPersistedRunSnapshot, u as persistCaseDetail, v as parseManualInputValues, vt as buildEvalKey, w as buildDeclaredColumnDefs, x as parseEvalDiscovery, y as deriveEvalFreshness, yt as getCaseRowCaseKey, z as applyDerivedCallAttributes } from "./runOrchestration-ZpN7xty_.mjs";
|
|
1
|
+
import { C as loadConfig, D as createFsCacheStore, E as validateCharts, H as getEvalDisplayStatus, S as resolveEvalDefaultConfig, T as normalizeScoreDef, U as deriveScopedSummaryFromCases, V as getEvalTitle, _ as buildManualInputDescriptor, a as getLastRunStatuses, b as loadEvalModule, bt as getCaseRowEvalKey, c as loadPersistedRunSnapshots, d as persistRunState, f as recomputeEvalStatusesInRuns, g as resolveArtifactPath, h as resolveTracePresentation, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, mt as resolveLlmCallsConfig, n as getTargetEvalKeys, o as getLatestRunInfos, p as recomputePersistedCaseStatus, pr as getEvalRegistry, pt as resolveApiCallsConfig, q as runSummarySchema, s as loadPersistedRunSnapshot, u as persistCaseDetail, v as parseManualInputValues, vt as buildEvalKey, w as buildDeclaredColumnDefs, x as parseEvalDiscovery, y as deriveEvalFreshness, yt as getCaseRowCaseKey, z as applyDerivedCallAttributes } from "./runOrchestration-DoslE_Oo.mjs";
|
|
2
2
|
import { createHash, randomUUID } from "node:crypto";
|
|
3
3
|
import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
@@ -1940,8 +1940,8 @@ async function commandApp(args) {
|
|
|
1940
1940
|
const { serve } = await import("@hono/node-server");
|
|
1941
1941
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
1942
1942
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
1943
|
-
const appModule = await import("./app-Dc6vvHRL.mjs");
|
|
1944
|
-
const runnerModule = await import("./runner-BPXPvinB.mjs");
|
|
1943
|
+
const appModule = await import("./app-CcZv9l_q.mjs");
|
|
1944
|
+
const runnerModule = await import("./runner-DA_o115w.mjs");
|
|
1945
1945
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
1946
1946
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
1947
1947
|
await runnerModule.initRunner();
|
package/dist/index.d.mts
CHANGED
|
@@ -2908,13 +2908,13 @@ type ColumnKind = z$1.infer<typeof columnKindSchema>;
|
|
|
2908
2908
|
declare const columnFormatSchema: z$1.ZodEnum<{
|
|
2909
2909
|
number: "number";
|
|
2910
2910
|
boolean: "boolean";
|
|
2911
|
-
duration: "duration";
|
|
2912
|
-
json: "json";
|
|
2913
2911
|
file: "file";
|
|
2914
2912
|
markdown: "markdown";
|
|
2913
|
+
json: "json";
|
|
2915
2914
|
image: "image";
|
|
2916
2915
|
audio: "audio";
|
|
2917
2916
|
video: "video";
|
|
2917
|
+
duration: "duration";
|
|
2918
2918
|
percent: "percent";
|
|
2919
2919
|
passFail: "passFail";
|
|
2920
2920
|
stars: "stars";
|
|
@@ -2933,13 +2933,13 @@ declare const columnDefSchema: z$1.ZodObject<{
|
|
|
2933
2933
|
format: z$1.ZodOptional<z$1.ZodEnum<{
|
|
2934
2934
|
number: "number";
|
|
2935
2935
|
boolean: "boolean";
|
|
2936
|
-
duration: "duration";
|
|
2937
|
-
json: "json";
|
|
2938
2936
|
file: "file";
|
|
2939
2937
|
markdown: "markdown";
|
|
2938
|
+
json: "json";
|
|
2940
2939
|
image: "image";
|
|
2941
2940
|
audio: "audio";
|
|
2942
2941
|
video: "video";
|
|
2942
|
+
duration: "duration";
|
|
2943
2943
|
percent: "percent";
|
|
2944
2944
|
passFail: "passFail";
|
|
2945
2945
|
stars: "stars";
|
|
@@ -2984,8 +2984,8 @@ declare const traceSpanKindSchema: z$1.ZodString;
|
|
|
2984
2984
|
declare const traceAttributeDisplayFormatSchema: z$1.ZodEnum<{
|
|
2985
2985
|
string: "string";
|
|
2986
2986
|
number: "number";
|
|
2987
|
-
duration: "duration";
|
|
2988
2987
|
json: "json";
|
|
2988
|
+
duration: "duration";
|
|
2989
2989
|
}>;
|
|
2990
2990
|
/**
|
|
2991
2991
|
* Formatting hint for trace attribute values rendered by the UI.
|
|
@@ -3009,8 +3009,8 @@ declare const traceAttributeDisplaySchema: z$1.ZodObject<{
|
|
|
3009
3009
|
format: z$1.ZodOptional<z$1.ZodEnum<{
|
|
3010
3010
|
string: "string";
|
|
3011
3011
|
number: "number";
|
|
3012
|
-
duration: "duration";
|
|
3013
3012
|
json: "json";
|
|
3013
|
+
duration: "duration";
|
|
3014
3014
|
}>>;
|
|
3015
3015
|
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
3016
3016
|
placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
|
|
@@ -3045,8 +3045,8 @@ declare const traceDisplayConfigSchema: z$1.ZodObject<{
|
|
|
3045
3045
|
format: z$1.ZodOptional<z$1.ZodEnum<{
|
|
3046
3046
|
string: "string";
|
|
3047
3047
|
number: "number";
|
|
3048
|
-
duration: "duration";
|
|
3049
3048
|
json: "json";
|
|
3049
|
+
duration: "duration";
|
|
3050
3050
|
}>>;
|
|
3051
3051
|
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
3052
3052
|
placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
|
|
@@ -3085,8 +3085,8 @@ declare const traceAttributeDisplayInputSchema: z$1.ZodObject<{
|
|
|
3085
3085
|
format: z$1.ZodOptional<z$1.ZodEnum<{
|
|
3086
3086
|
string: "string";
|
|
3087
3087
|
number: "number";
|
|
3088
|
-
duration: "duration";
|
|
3089
3088
|
json: "json";
|
|
3089
|
+
duration: "duration";
|
|
3090
3090
|
}>>;
|
|
3091
3091
|
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
3092
3092
|
placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
|
|
@@ -3123,8 +3123,8 @@ declare const traceDisplayInputConfigSchema: z$1.ZodObject<{
|
|
|
3123
3123
|
format: z$1.ZodOptional<z$1.ZodEnum<{
|
|
3124
3124
|
string: "string";
|
|
3125
3125
|
number: "number";
|
|
3126
|
-
duration: "duration";
|
|
3127
3126
|
json: "json";
|
|
3127
|
+
duration: "duration";
|
|
3128
3128
|
}>>;
|
|
3129
3129
|
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
3130
3130
|
placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
|
|
@@ -3255,13 +3255,13 @@ declare const evalStatItemSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
|
|
|
3255
3255
|
format: z$1.ZodOptional<z$1.ZodEnum<{
|
|
3256
3256
|
number: "number";
|
|
3257
3257
|
boolean: "boolean";
|
|
3258
|
-
duration: "duration";
|
|
3259
|
-
json: "json";
|
|
3260
3258
|
file: "file";
|
|
3261
3259
|
markdown: "markdown";
|
|
3260
|
+
json: "json";
|
|
3262
3261
|
image: "image";
|
|
3263
3262
|
audio: "audio";
|
|
3264
3263
|
video: "video";
|
|
3264
|
+
duration: "duration";
|
|
3265
3265
|
percent: "percent";
|
|
3266
3266
|
passFail: "passFail";
|
|
3267
3267
|
stars: "stars";
|
|
@@ -3297,13 +3297,13 @@ declare const evalStatsConfigSchema: z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1
|
|
|
3297
3297
|
format: z$1.ZodOptional<z$1.ZodEnum<{
|
|
3298
3298
|
number: "number";
|
|
3299
3299
|
boolean: "boolean";
|
|
3300
|
-
duration: "duration";
|
|
3301
|
-
json: "json";
|
|
3302
3300
|
file: "file";
|
|
3303
3301
|
markdown: "markdown";
|
|
3302
|
+
json: "json";
|
|
3304
3303
|
image: "image";
|
|
3305
3304
|
audio: "audio";
|
|
3306
3305
|
video: "video";
|
|
3306
|
+
duration: "duration";
|
|
3307
3307
|
percent: "percent";
|
|
3308
3308
|
passFail: "passFail";
|
|
3309
3309
|
stars: "stars";
|
|
@@ -3340,13 +3340,13 @@ declare const evalSummarySchema: z$1.ZodObject<{
|
|
|
3340
3340
|
format: z$1.ZodOptional<z$1.ZodEnum<{
|
|
3341
3341
|
number: "number";
|
|
3342
3342
|
boolean: "boolean";
|
|
3343
|
-
duration: "duration";
|
|
3344
|
-
json: "json";
|
|
3345
3343
|
file: "file";
|
|
3346
3344
|
markdown: "markdown";
|
|
3345
|
+
json: "json";
|
|
3347
3346
|
image: "image";
|
|
3348
3347
|
audio: "audio";
|
|
3349
3348
|
video: "video";
|
|
3349
|
+
duration: "duration";
|
|
3350
3350
|
percent: "percent";
|
|
3351
3351
|
passFail: "passFail";
|
|
3352
3352
|
stars: "stars";
|
|
@@ -3398,13 +3398,13 @@ declare const evalSummarySchema: z$1.ZodObject<{
|
|
|
3398
3398
|
format: z$1.ZodOptional<z$1.ZodEnum<{
|
|
3399
3399
|
number: "number";
|
|
3400
3400
|
boolean: "boolean";
|
|
3401
|
-
duration: "duration";
|
|
3402
|
-
json: "json";
|
|
3403
3401
|
file: "file";
|
|
3404
3402
|
markdown: "markdown";
|
|
3403
|
+
json: "json";
|
|
3405
3404
|
image: "image";
|
|
3406
3405
|
audio: "audio";
|
|
3407
3406
|
video: "video";
|
|
3407
|
+
duration: "duration";
|
|
3408
3408
|
percent: "percent";
|
|
3409
3409
|
passFail: "passFail";
|
|
3410
3410
|
stars: "stars";
|
|
@@ -3429,8 +3429,8 @@ declare const evalSummarySchema: z$1.ZodObject<{
|
|
|
3429
3429
|
}>;
|
|
3430
3430
|
label: z$1.ZodOptional<z$1.ZodString>;
|
|
3431
3431
|
color: z$1.ZodOptional<z$1.ZodEnum<{
|
|
3432
|
-
error: "error";
|
|
3433
3432
|
success: "success";
|
|
3433
|
+
error: "error";
|
|
3434
3434
|
warning: "warning";
|
|
3435
3435
|
accent: "accent";
|
|
3436
3436
|
accentDim: "accentDim";
|
|
@@ -3453,8 +3453,8 @@ declare const evalSummarySchema: z$1.ZodObject<{
|
|
|
3453
3453
|
}>;
|
|
3454
3454
|
label: z$1.ZodOptional<z$1.ZodString>;
|
|
3455
3455
|
color: z$1.ZodOptional<z$1.ZodEnum<{
|
|
3456
|
-
error: "error";
|
|
3457
3456
|
success: "success";
|
|
3457
|
+
error: "error";
|
|
3458
3458
|
warning: "warning";
|
|
3459
3459
|
accent: "accent";
|
|
3460
3460
|
accentDim: "accentDim";
|
|
@@ -3718,8 +3718,8 @@ declare const scoreTraceSchema: z$1.ZodObject<{
|
|
|
3718
3718
|
format: z$1.ZodOptional<z$1.ZodEnum<{
|
|
3719
3719
|
string: "string";
|
|
3720
3720
|
number: "number";
|
|
3721
|
-
duration: "duration";
|
|
3722
3721
|
json: "json";
|
|
3722
|
+
duration: "duration";
|
|
3723
3723
|
}>>;
|
|
3724
3724
|
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
3725
3725
|
placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
|
|
@@ -3804,8 +3804,8 @@ declare const caseDetailSchema: z$1.ZodObject<{
|
|
|
3804
3804
|
format: z$1.ZodOptional<z$1.ZodEnum<{
|
|
3805
3805
|
string: "string";
|
|
3806
3806
|
number: "number";
|
|
3807
|
-
duration: "duration";
|
|
3808
3807
|
json: "json";
|
|
3808
|
+
duration: "duration";
|
|
3809
3809
|
}>>;
|
|
3810
3810
|
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
3811
3811
|
placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
|
|
@@ -3873,8 +3873,8 @@ declare const caseDetailSchema: z$1.ZodObject<{
|
|
|
3873
3873
|
format: z$1.ZodOptional<z$1.ZodEnum<{
|
|
3874
3874
|
string: "string";
|
|
3875
3875
|
number: "number";
|
|
3876
|
-
duration: "duration";
|
|
3877
3876
|
json: "json";
|
|
3877
|
+
duration: "duration";
|
|
3878
3878
|
}>>;
|
|
3879
3879
|
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
3880
3880
|
placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
|
|
@@ -4037,8 +4037,8 @@ type EvalChartAggregate = z$1.infer<typeof evalChartAggregateSchema>;
|
|
|
4037
4037
|
* not emit raw hex so authored evals stay decoupled from the web theme.
|
|
4038
4038
|
*/
|
|
4039
4039
|
declare const evalChartColorSchema: z$1.ZodEnum<{
|
|
4040
|
-
error: "error";
|
|
4041
4040
|
success: "success";
|
|
4041
|
+
error: "error";
|
|
4042
4042
|
warning: "warning";
|
|
4043
4043
|
accent: "accent";
|
|
4044
4044
|
accentDim: "accentDim";
|
|
@@ -4066,8 +4066,8 @@ declare const evalChartMetricSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
|
|
|
4066
4066
|
}>;
|
|
4067
4067
|
label: z$1.ZodOptional<z$1.ZodString>;
|
|
4068
4068
|
color: z$1.ZodOptional<z$1.ZodEnum<{
|
|
4069
|
-
error: "error";
|
|
4070
4069
|
success: "success";
|
|
4070
|
+
error: "error";
|
|
4071
4071
|
warning: "warning";
|
|
4072
4072
|
accent: "accent";
|
|
4073
4073
|
accentDim: "accentDim";
|
|
@@ -4090,8 +4090,8 @@ declare const evalChartMetricSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
|
|
|
4090
4090
|
}>;
|
|
4091
4091
|
label: z$1.ZodOptional<z$1.ZodString>;
|
|
4092
4092
|
color: z$1.ZodOptional<z$1.ZodEnum<{
|
|
4093
|
-
error: "error";
|
|
4094
4093
|
success: "success";
|
|
4094
|
+
error: "error";
|
|
4095
4095
|
warning: "warning";
|
|
4096
4096
|
accent: "accent";
|
|
4097
4097
|
accentDim: "accentDim";
|
|
@@ -4149,8 +4149,8 @@ declare const evalChartConfigSchema: z$1.ZodObject<{
|
|
|
4149
4149
|
}>;
|
|
4150
4150
|
label: z$1.ZodOptional<z$1.ZodString>;
|
|
4151
4151
|
color: z$1.ZodOptional<z$1.ZodEnum<{
|
|
4152
|
-
error: "error";
|
|
4153
4152
|
success: "success";
|
|
4153
|
+
error: "error";
|
|
4154
4154
|
warning: "warning";
|
|
4155
4155
|
accent: "accent";
|
|
4156
4156
|
accentDim: "accentDim";
|
|
@@ -4173,8 +4173,8 @@ declare const evalChartConfigSchema: z$1.ZodObject<{
|
|
|
4173
4173
|
}>;
|
|
4174
4174
|
label: z$1.ZodOptional<z$1.ZodString>;
|
|
4175
4175
|
color: z$1.ZodOptional<z$1.ZodEnum<{
|
|
4176
|
-
error: "error";
|
|
4177
4176
|
success: "success";
|
|
4177
|
+
error: "error";
|
|
4178
4178
|
warning: "warning";
|
|
4179
4179
|
accent: "accent";
|
|
4180
4180
|
accentDim: "accentDim";
|
|
@@ -4239,8 +4239,8 @@ declare const evalChartsConfigSchema: z$1.ZodArray<z$1.ZodObject<{
|
|
|
4239
4239
|
}>;
|
|
4240
4240
|
label: z$1.ZodOptional<z$1.ZodString>;
|
|
4241
4241
|
color: z$1.ZodOptional<z$1.ZodEnum<{
|
|
4242
|
-
error: "error";
|
|
4243
4242
|
success: "success";
|
|
4243
|
+
error: "error";
|
|
4244
4244
|
warning: "warning";
|
|
4245
4245
|
accent: "accent";
|
|
4246
4246
|
accentDim: "accentDim";
|
|
@@ -4263,8 +4263,8 @@ declare const evalChartsConfigSchema: z$1.ZodArray<z$1.ZodObject<{
|
|
|
4263
4263
|
}>;
|
|
4264
4264
|
label: z$1.ZodOptional<z$1.ZodString>;
|
|
4265
4265
|
color: z$1.ZodOptional<z$1.ZodEnum<{
|
|
4266
|
-
error: "error";
|
|
4267
4266
|
success: "success";
|
|
4267
|
+
error: "error";
|
|
4268
4268
|
warning: "warning";
|
|
4269
4269
|
accent: "accent";
|
|
4270
4270
|
accentDim: "accentDim";
|
|
@@ -4574,8 +4574,8 @@ declare const llmCallMetricFormatSchema: z$1.ZodEnum<{
|
|
|
4574
4574
|
string: "string";
|
|
4575
4575
|
number: "number";
|
|
4576
4576
|
boolean: "boolean";
|
|
4577
|
-
duration: "duration";
|
|
4578
4577
|
json: "json";
|
|
4578
|
+
duration: "duration";
|
|
4579
4579
|
}>;
|
|
4580
4580
|
/** Render format applied to an LLM-call metric value. */
|
|
4581
4581
|
type LlmCallMetricFormat = z$1.infer<typeof llmCallMetricFormatSchema>;
|
|
@@ -4584,8 +4584,8 @@ declare const apiCallMetricFormatSchema: z$1.ZodEnum<{
|
|
|
4584
4584
|
string: "string";
|
|
4585
4585
|
number: "number";
|
|
4586
4586
|
boolean: "boolean";
|
|
4587
|
-
duration: "duration";
|
|
4588
4587
|
json: "json";
|
|
4588
|
+
duration: "duration";
|
|
4589
4589
|
}>;
|
|
4590
4590
|
/** Render format applied to an API-call metric value. */
|
|
4591
4591
|
type ApiCallMetricFormat = z$1.infer<typeof apiCallMetricFormatSchema>;
|
|
@@ -4654,8 +4654,8 @@ declare const llmCallMetricSchema: z$1.ZodObject<{
|
|
|
4654
4654
|
string: "string";
|
|
4655
4655
|
number: "number";
|
|
4656
4656
|
boolean: "boolean";
|
|
4657
|
-
duration: "duration";
|
|
4658
4657
|
json: "json";
|
|
4658
|
+
duration: "duration";
|
|
4659
4659
|
}>>;
|
|
4660
4660
|
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
4661
4661
|
placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
|
|
@@ -4683,8 +4683,8 @@ declare const apiCallMetricSchema: z$1.ZodObject<{
|
|
|
4683
4683
|
string: "string";
|
|
4684
4684
|
number: "number";
|
|
4685
4685
|
boolean: "boolean";
|
|
4686
|
-
duration: "duration";
|
|
4687
4686
|
json: "json";
|
|
4687
|
+
duration: "duration";
|
|
4688
4688
|
}>>;
|
|
4689
4689
|
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
4690
4690
|
placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
|
|
@@ -4797,8 +4797,8 @@ declare const llmCallsConfigSchema: z$1.ZodObject<{
|
|
|
4797
4797
|
string: "string";
|
|
4798
4798
|
number: "number";
|
|
4799
4799
|
boolean: "boolean";
|
|
4800
|
-
duration: "duration";
|
|
4801
4800
|
json: "json";
|
|
4801
|
+
duration: "duration";
|
|
4802
4802
|
}>>;
|
|
4803
4803
|
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
4804
4804
|
placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
|
|
@@ -4833,8 +4833,8 @@ declare const apiCallsConfigSchema: z$1.ZodObject<{
|
|
|
4833
4833
|
string: "string";
|
|
4834
4834
|
number: "number";
|
|
4835
4835
|
boolean: "boolean";
|
|
4836
|
-
duration: "duration";
|
|
4837
4836
|
json: "json";
|
|
4837
|
+
duration: "duration";
|
|
4838
4838
|
}>>;
|
|
4839
4839
|
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
4840
4840
|
placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
|
|
@@ -5135,8 +5135,8 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
5135
5135
|
format: z$1.ZodOptional<z$1.ZodEnum<{
|
|
5136
5136
|
string: "string";
|
|
5137
5137
|
number: "number";
|
|
5138
|
-
duration: "duration";
|
|
5139
5138
|
json: "json";
|
|
5139
|
+
duration: "duration";
|
|
5140
5140
|
}>>;
|
|
5141
5141
|
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
5142
5142
|
placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
|
|
@@ -5183,13 +5183,13 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
5183
5183
|
format: z$1.ZodOptional<z$1.ZodEnum<{
|
|
5184
5184
|
number: "number";
|
|
5185
5185
|
boolean: "boolean";
|
|
5186
|
-
duration: "duration";
|
|
5187
|
-
json: "json";
|
|
5188
5186
|
file: "file";
|
|
5189
5187
|
markdown: "markdown";
|
|
5188
|
+
json: "json";
|
|
5190
5189
|
image: "image";
|
|
5191
5190
|
audio: "audio";
|
|
5192
5191
|
video: "video";
|
|
5192
|
+
duration: "duration";
|
|
5193
5193
|
percent: "percent";
|
|
5194
5194
|
passFail: "passFail";
|
|
5195
5195
|
stars: "stars";
|
|
@@ -5248,8 +5248,8 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
5248
5248
|
string: "string";
|
|
5249
5249
|
number: "number";
|
|
5250
5250
|
boolean: "boolean";
|
|
5251
|
-
duration: "duration";
|
|
5252
5251
|
json: "json";
|
|
5252
|
+
duration: "duration";
|
|
5253
5253
|
}>>;
|
|
5254
5254
|
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
5255
5255
|
placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
|
|
@@ -5293,8 +5293,8 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
5293
5293
|
string: "string";
|
|
5294
5294
|
number: "number";
|
|
5295
5295
|
boolean: "boolean";
|
|
5296
|
-
duration: "duration";
|
|
5297
5296
|
json: "json";
|
|
5297
|
+
duration: "duration";
|
|
5298
5298
|
}>>;
|
|
5299
5299
|
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
5300
5300
|
placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as apiCallMetricSchema, $n as getCurrentScope, $t as cacheDebugKeyEntrySchema, A as createRunRequestSchema, An as repoFileRefSchema, At as runLogEntrySchema, B as getNestedAttribute, Bn as deserializeCacheValue, Bt as manualInputNumberFieldSchema, Cn as cellValueSchema, Ct as caseRowSchema, Dn as fileRefSchema, Dt as evalStatItemSchema, En as columnKindSchema, Et as evalStatAggregateSchema, F as extractApiCalls, Fn as evalSpan, Ft as manualInputBooleanFieldSchema, G as deriveStatusFromChildStatuses, Gn as readManualInputFile, Gt as evalChartAxisSchema, H as getEvalDisplayStatus, Hn as serializeCacheValue, Ht as manualInputSelectOptionSchema, I as extractLlmCalls, In as evalTracer, It as manualInputDescriptorSchema, J as DEFAULT_API_CALLS_CONFIG, Jn as advanceEvalTime, Jt as evalChartConfigSchema, K as runManifestSchema, Kn as evalExpect, Kt as evalChartBuiltinMetricSchema, L as simulateLlmCallCost, Ln as hashCacheKey, Lt as manualInputFieldDescriptorSchema, M as sseEnvelopeSchema, Mn as z, Mt as runLogLocationSchema, N as extractCacheEntries, Nn as buildTraceTree, Nt as runLogPhaseSchema, O as configReloadStateSchema, On as jsonCellSchema, Ot as evalStatsConfigSchema, P as extractCacheHits, Pn as captureEvalSpanError, Pt as scoreTraceSchema, Q as apiCallMetricPlacementSchema, Qn as evalLog, Qt as evalChartsConfigSchema, R as simulateTokenAllocation, Rn as hashCacheKeySync, Rt as manualInputJsonFieldSchema, Sn as traceSpanWarningSchema, St as caseDetailSchema, Tn as columnFormatSchema, Tt as evalFreshnessStatusSchema, U as deriveScopedSummaryFromCases, Un as repoFile, Ut as manualInputTextFieldSchema, V as getEvalTitle, Vn as serializeCacheRecording, Vt as manualInputSelectFieldSchema, W as deriveStatusFromCaseRows, Wn as manualInputFileValueSchema, Wt as evalChartAggregateSchema, X as agentEvalsConfigSchema, Xt as evalChartTooltipExtraSchema, Y as DEFAULT_LLM_CALLS_CONFIG, Yn as appendToEvalOutput, Yt as evalChartMetricSchema, Z as 
apiCallMetricFormatSchema, Zn as evalAssert, Zt as evalChartTypeSchema, _n as traceDisplayConfigSchema, _t as buildCaseKey, an as cacheModeSchema, ar as nextEvalId, at as llmCallCostCurrencySchema, bn as traceSpanKindSchema, bt as getCaseRowEvalKey, cn as cacheRecordingSchema, cr as runInExistingEvalScope, ct as llmCallMetricSchema, dn as spanCacheOptionsSchema, dr as startEvalBackgroundJob, dt as llmCallsConfigSchema, en as cacheDebugKeyFileSchema, er as getEvalCaseInput, et as apiCallsConfigSchema, fn as traceCacheRefSchema, fr as defineEval, ft as removeDefaultConfigSchema, gn as traceAttributeDisplaySchema, gt as trialSelectionModeSchema, hn as traceAttributeDisplayPlacementSchema, ht as runLogsConfigSchema, in as cacheListItemSchema, ir as mergeEvalOutput, it as evalDeriveConfigSchema, j as updateManualScoreRequestSchema, jn as runArtifactRefSchema, jt as runLogLevelSchema, k as configReloadStatusSchema, kn as numberDisplayOptionsSchema, kt as evalSummarySchema, ln as cacheStatusSchema, lr as setEvalOutput, lt as llmCallPricingRateSchema, mn as traceAttributeDisplayInputSchema, mt as resolveLlmCallsConfig, nn as cacheEntryWithDebugKeySchema, nr as incrementEvalOutput, nt as evalColumnOverrideSchema, on as cacheOperationTypeSchema, or as runInEvalRuntimeScope, ot as llmCallMetricFormatSchema, pn as traceAttributeDisplayFormatSchema, pr as getEvalRegistry, pt as resolveApiCallsConfig, q as runSummarySchema, qn as EvalAssertionError, qt as evalChartColorSchema, rn as cacheFileSchema, rr as isInEvalScope, rt as evalColumnsSchema, sn as cacheRecordingOpSchema, sr as runInEvalScope, st as llmCallMetricPlacementSchema, tn as cacheEntrySchema, tr as getEvalStartTime, tt as defaultConfigKeySchema, un as serializedCacheSpanSchema, ur as setScopeCacheContext, ut as llmCallPricingSchema, vn as traceDisplayInputConfigSchema, vt as buildEvalKey, wn as columnDefSchema, wt as discoveryIssueSchema, xn as traceSpanSchema, xt as assertionFailureSchema, yn as 
traceSpanErrorSchema, yt as getCaseRowCaseKey, z as applyDerivedCallAttributes, zn as deserializeCacheRecording, zt as manualInputMultilineFieldSchema } from "./runOrchestration-ZpN7xty_.mjs";
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-huuJbDNb.mjs";
|
|
3
|
-
import "./src-1Qvuh0NH.mjs";
|
|
1
|
+
import { $ as apiCallMetricSchema, $n as getCurrentScope, $t as cacheDebugKeyEntrySchema, A as createRunRequestSchema, An as repoFileRefSchema, At as runLogEntrySchema, B as getNestedAttribute, Bn as deserializeCacheValue, Bt as manualInputNumberFieldSchema, Cn as cellValueSchema, Ct as caseRowSchema, Dn as fileRefSchema, Dt as evalStatItemSchema, En as columnKindSchema, Et as evalStatAggregateSchema, F as extractApiCalls, Fn as evalSpan, Ft as manualInputBooleanFieldSchema, G as deriveStatusFromChildStatuses, Gn as readManualInputFile, Gt as evalChartAxisSchema, H as getEvalDisplayStatus, Hn as serializeCacheValue, Ht as manualInputSelectOptionSchema, I as extractLlmCalls, In as evalTracer, It as manualInputDescriptorSchema, J as DEFAULT_API_CALLS_CONFIG, Jn as advanceEvalTime, Jt as evalChartConfigSchema, K as runManifestSchema, Kn as evalExpect, Kt as evalChartBuiltinMetricSchema, L as simulateLlmCallCost, Ln as hashCacheKey, Lt as manualInputFieldDescriptorSchema, M as sseEnvelopeSchema, Mn as z, Mt as runLogLocationSchema, N as extractCacheEntries, Nn as buildTraceTree, Nt as runLogPhaseSchema, O as configReloadStateSchema, On as jsonCellSchema, Ot as evalStatsConfigSchema, P as extractCacheHits, Pn as captureEvalSpanError, Pt as scoreTraceSchema, Q as apiCallMetricPlacementSchema, Qn as evalLog, Qt as evalChartsConfigSchema, R as simulateTokenAllocation, Rn as hashCacheKeySync, Rt as manualInputJsonFieldSchema, Sn as traceSpanWarningSchema, St as caseDetailSchema, Tn as columnFormatSchema, Tt as evalFreshnessStatusSchema, U as deriveScopedSummaryFromCases, Un as repoFile, Ut as manualInputTextFieldSchema, V as getEvalTitle, Vn as serializeCacheRecording, Vt as manualInputSelectFieldSchema, W as deriveStatusFromCaseRows, Wn as manualInputFileValueSchema, Wt as evalChartAggregateSchema, X as agentEvalsConfigSchema, Xt as evalChartTooltipExtraSchema, Y as DEFAULT_LLM_CALLS_CONFIG, Yn as appendToEvalOutput, Yt as evalChartMetricSchema, Z as 
apiCallMetricFormatSchema, Zn as evalAssert, Zt as evalChartTypeSchema, _n as traceDisplayConfigSchema, _t as buildCaseKey, an as cacheModeSchema, ar as nextEvalId, at as llmCallCostCurrencySchema, bn as traceSpanKindSchema, bt as getCaseRowEvalKey, cn as cacheRecordingSchema, cr as runInExistingEvalScope, ct as llmCallMetricSchema, dn as spanCacheOptionsSchema, dr as startEvalBackgroundJob, dt as llmCallsConfigSchema, en as cacheDebugKeyFileSchema, er as getEvalCaseInput, et as apiCallsConfigSchema, fn as traceCacheRefSchema, fr as defineEval, ft as removeDefaultConfigSchema, gn as traceAttributeDisplaySchema, gt as trialSelectionModeSchema, hn as traceAttributeDisplayPlacementSchema, ht as runLogsConfigSchema, in as cacheListItemSchema, ir as mergeEvalOutput, it as evalDeriveConfigSchema, j as updateManualScoreRequestSchema, jn as runArtifactRefSchema, jt as runLogLevelSchema, k as configReloadStatusSchema, kn as numberDisplayOptionsSchema, kt as evalSummarySchema, ln as cacheStatusSchema, lr as setEvalOutput, lt as llmCallPricingRateSchema, mn as traceAttributeDisplayInputSchema, mt as resolveLlmCallsConfig, nn as cacheEntryWithDebugKeySchema, nr as incrementEvalOutput, nt as evalColumnOverrideSchema, on as cacheOperationTypeSchema, or as runInEvalRuntimeScope, ot as llmCallMetricFormatSchema, pn as traceAttributeDisplayFormatSchema, pr as getEvalRegistry, pt as resolveApiCallsConfig, q as runSummarySchema, qn as EvalAssertionError, qt as evalChartColorSchema, rn as cacheFileSchema, rr as isInEvalScope, rt as evalColumnsSchema, sn as cacheRecordingOpSchema, sr as runInEvalScope, st as llmCallMetricPlacementSchema, tn as cacheEntrySchema, tr as getEvalStartTime, tt as defaultConfigKeySchema, un as serializedCacheSpanSchema, ur as setScopeCacheContext, ut as llmCallPricingSchema, vn as traceDisplayInputConfigSchema, vt as buildEvalKey, wn as columnDefSchema, wt as discoveryIssueSchema, xn as traceSpanSchema, xt as assertionFailureSchema, yn as 
traceSpanErrorSchema, yt as getCaseRowCaseKey, z as applyDerivedCallAttributes, zn as deserializeCacheRecording, zt as manualInputMultilineFieldSchema } from "./runOrchestration-DoslE_Oo.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-CVwIjcsX.mjs";
|
|
3
|
+
import "./src-Bcc2ZHK8.mjs";
|
|
4
4
|
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, cleanupStagedManualInputFiles, columnDefSchema, columnFormatSchema, columnKindSchema, configReloadStateSchema, configReloadStatusSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalColumnOverrideSchema, evalColumnsSchema, evalDeriveConfigSchema, evalExpect, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, jsonCellSchema, llmCallCostCurrencySchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingRateSchema, llmCallPricingSchema, 
llmCallsConfigSchema, manualInputBooleanFieldSchema, manualInputDescriptorSchema, manualInputFieldDescriptorSchema, manualInputFileValueSchema, manualInputJsonFieldSchema, manualInputMultilineFieldSchema, manualInputNumberFieldSchema, manualInputSelectFieldSchema, manualInputSelectOptionSchema, manualInputTextFieldSchema, materializeManualInputFiles, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, readManualInputFile, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, spanCacheOptionsSchema, sseEnvelopeSchema, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as createRunRequestSchema, C as loadConfig, D as createFsCacheStore, It as manualInputDescriptorSchema, K as runManifestSchema, Ot as evalStatsConfigSchema, Qt as evalChartsConfigSchema, Xn as configureEvalRunLogs, q as runSummarySchema, r as getTargetEvals$1, t as executeRun, vt as buildEvalKey, wn as columnDefSchema, x as parseEvalDiscovery } from "./runOrchestration-
|
|
1
|
+
import { A as createRunRequestSchema, C as loadConfig, D as createFsCacheStore, It as manualInputDescriptorSchema, K as runManifestSchema, Ot as evalStatsConfigSchema, Qt as evalChartsConfigSchema, Xn as configureEvalRunLogs, q as runSummarySchema, r as getTargetEvals$1, t as executeRun, vt as buildEvalKey, wn as columnDefSchema, x as parseEvalDiscovery } from "./runOrchestration-DoslE_Oo.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|
|
@@ -1688,7 +1688,7 @@ function createTraceCache(generateSpanId) {
|
|
|
1688
1688
|
const hit = await cacheCtx.adapter.lookup(namespace, keyHash);
|
|
1689
1689
|
if (hit) {
|
|
1690
1690
|
const storedAt = hit.storedAt;
|
|
1691
|
-
const age =
|
|
1691
|
+
const age = getRealDateNowMs() - new Date(storedAt).getTime();
|
|
1692
1692
|
recordCacheRef(scope, activeSpan, {
|
|
1693
1693
|
type: "value",
|
|
1694
1694
|
name: info.name,
|
|
@@ -2149,7 +2149,7 @@ async function traceSpanInternal(info, fn) {
|
|
|
2149
2149
|
mergeSpanAttributes(spanRecord, {
|
|
2150
2150
|
"cache.status": "hit",
|
|
2151
2151
|
"cache.storedAt": storedAt,
|
|
2152
|
-
"cache.age":
|
|
2152
|
+
"cache.age": getRealDateNowMs() - new Date(storedAt).getTime()
|
|
2153
2153
|
});
|
|
2154
2154
|
const recording = deserializeCacheRecording(hit.recording);
|
|
2155
2155
|
replayRecording(scope, spanRecord, recording, { generateSpanId });
|
|
@@ -5025,30 +5025,6 @@ function mergeOverrides(base, override) {
|
|
|
5025
5025
|
};
|
|
5026
5026
|
}
|
|
5027
5027
|
/**
|
|
5028
|
-
* Populate `target` with `ColumnDef` entries for any keys in `columns`
|
|
5029
|
-
* that aren't already present, applying user-supplied `overrides` and
|
|
5030
|
-
* flagging score columns declared via `scores`.
|
|
5031
|
-
*/
|
|
5032
|
-
function mergeColumnDefs(target, columns, overrides, scores, manualScores) {
|
|
5033
|
-
const scoreKeys = new Set(Object.keys(scores ?? {}));
|
|
5034
|
-
const manualScoreKeys = new Set(Object.keys(manualScores ?? {}));
|
|
5035
|
-
const overrideMap = overrides ?? {};
|
|
5036
|
-
for (const [key, value] of Object.entries(columns)) {
|
|
5037
|
-
if (target.has(key)) continue;
|
|
5038
|
-
const override = mergeOverrides(getScoreOverride(scores?.[key]) ?? manualScores?.[key], overrideMap[key]);
|
|
5039
|
-
const isScore = scoreKeys.has(key) || manualScoreKeys.has(key);
|
|
5040
|
-
target.set(key, createColumnDef({
|
|
5041
|
-
key,
|
|
5042
|
-
override,
|
|
5043
|
-
scoreDef: scores?.[key],
|
|
5044
|
-
manualScoreDef: manualScores?.[key],
|
|
5045
|
-
inferredKind: isScore ? "number" : inferKind(value),
|
|
5046
|
-
isScore,
|
|
5047
|
-
isManualScore: manualScoreKeys.has(key)
|
|
5048
|
-
}));
|
|
5049
|
-
}
|
|
5050
|
-
}
|
|
5051
|
-
/**
|
|
5052
5028
|
* Build the column definitions declared directly on an eval before any runtime
|
|
5053
5029
|
* output values exist. This lets discovery metadata describe authored rich
|
|
5054
5030
|
* output columns even for runs created by another process.
|
|
@@ -5092,29 +5068,18 @@ function buildDeclaredColumnDefs(overrides, scores, manualScores) {
|
|
|
5092
5068
|
}
|
|
5093
5069
|
return [...declaredDefs.values()];
|
|
5094
5070
|
}
|
|
5095
|
-
/** Infer a `ColumnKind` from a runtime value when no override is set. */
|
|
5096
|
-
function inferKind(value) {
|
|
5097
|
-
if (typeof value === "number") return "number";
|
|
5098
|
-
if (typeof value === "boolean") return "boolean";
|
|
5099
|
-
return "string";
|
|
5100
|
-
}
|
|
5101
5071
|
/**
|
|
5102
5072
|
* Coerce an arbitrary runtime value into a serializable `CellValue`.
|
|
5103
|
-
*
|
|
5073
|
+
* Runtime values use the SDK's tagged serializer so saved run artifacts keep
|
|
5074
|
+
* structured data instead of storing JSON strings. Native binary/file root
|
|
5075
|
+
* values are handled before this helper.
|
|
5104
5076
|
*/
|
|
5105
|
-
function toCellValue(value
|
|
5106
|
-
|
|
5107
|
-
if (
|
|
5108
|
-
|
|
5109
|
-
|
|
5110
|
-
|
|
5111
|
-
if (parsed.success) return parsed.data;
|
|
5112
|
-
}
|
|
5113
|
-
if (override?.format === "json") {
|
|
5114
|
-
const parsed = jsonCellSchema.safeParse(value);
|
|
5115
|
-
if (parsed.success) return parsed.data;
|
|
5116
|
-
}
|
|
5117
|
-
return JSON.stringify(value);
|
|
5077
|
+
async function toCellValue(value) {
|
|
5078
|
+
const fileRef = fileRefSchema.safeParse(value);
|
|
5079
|
+
if (fileRef.success) return fileRef.data;
|
|
5080
|
+
const serialized = await serializeCacheValue(value, { preserveUndefined: true });
|
|
5081
|
+
const parsed = jsonCellSchema.safeParse(serialized);
|
|
5082
|
+
if (parsed.success) return parsed.data;
|
|
5118
5083
|
}
|
|
5119
5084
|
function inferKindFromFormat(format) {
|
|
5120
5085
|
if (format === "boolean") return "boolean";
|
|
@@ -6586,7 +6551,7 @@ async function runDeriveFromTracingConfig(params) {
|
|
|
6586
6551
|
}
|
|
6587
6552
|
}
|
|
6588
6553
|
async function runCase(params) {
|
|
6589
|
-
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay,
|
|
6554
|
+
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
6590
6555
|
const scopedIdPrefix = buildScopedEvalIdPrefix({
|
|
6591
6556
|
evalId,
|
|
6592
6557
|
evalFilePath,
|
|
@@ -6739,12 +6704,6 @@ async function runCase(params) {
|
|
|
6739
6704
|
const status = nonAssertError ? "error" : passed ? "pass" : "fail";
|
|
6740
6705
|
const { trace: displayTrace, traceDisplay } = resolveTracePresentation(spansWithDerivedAttributes, globalTraceDisplay, evalDef.traceDisplay);
|
|
6741
6706
|
const columns = {};
|
|
6742
|
-
const columnOverrides = mergeDefaultColumns({
|
|
6743
|
-
globalColumns,
|
|
6744
|
-
columns: evalDef.columns,
|
|
6745
|
-
globalRemove: globalRemoveDefaultConfig,
|
|
6746
|
-
evalRemove: evalDef.removeDefaultConfig
|
|
6747
|
-
});
|
|
6748
6707
|
for (const [key, value] of Object.entries(scope.outputs)) {
|
|
6749
6708
|
const cell = isBlob(value) ? await persistInlineArtifact({
|
|
6750
6709
|
artifactDir,
|
|
@@ -6753,7 +6712,7 @@ async function runCase(params) {
|
|
|
6753
6712
|
outputKey: key,
|
|
6754
6713
|
trial,
|
|
6755
6714
|
value
|
|
6756
|
-
}) : toCellValue(value
|
|
6715
|
+
}) : await toCellValue(value);
|
|
6757
6716
|
if (cell !== void 0) columns[key] = cell;
|
|
6758
6717
|
}
|
|
6759
6718
|
for (const key of Object.keys(evalDef.manualScores ?? {})) columns[key] = null;
|
|
@@ -6989,7 +6948,6 @@ async function finalizePreparedCase(params) {
|
|
|
6989
6948
|
const artifactFileId = getCaseArtifactFileId(runState, winningTrial.caseRow);
|
|
6990
6949
|
runState.cases.push(winningTrial.caseRow);
|
|
6991
6950
|
runState.caseDetails.set(getCaseRowCaseKey(winningTrial.caseRow), winningTrial.caseDetail);
|
|
6992
|
-
preparedEval.mergeColumns(winningTrial.caseDetail.columns);
|
|
6993
6951
|
if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
|
|
6994
6952
|
else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
|
|
6995
6953
|
else runState.summary.failedCases++;
|
|
@@ -7106,13 +7064,13 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
7106
7064
|
globalRemove: config.removeDefaultConfig
|
|
7107
7065
|
});
|
|
7108
7066
|
const declaredColumnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
|
|
7109
|
-
const accumulatedColumns = new Map(declaredColumnDefs.map((def) => [def.key, def]));
|
|
7110
7067
|
const validatedCharts = validateCharts({
|
|
7111
7068
|
charts: defaultConfig.charts,
|
|
7112
7069
|
columnDefs: declaredColumnDefs,
|
|
7113
7070
|
evalId: evalMeta.id
|
|
7114
7071
|
});
|
|
7115
7072
|
for (const warning of validatedCharts.warnings) console.warn(warning);
|
|
7073
|
+
evalMeta.columnDefs = declaredColumnDefs;
|
|
7116
7074
|
evalMeta.stats = defaultConfig.stats;
|
|
7117
7075
|
evalMeta.charts = validatedCharts.charts;
|
|
7118
7076
|
const evalCaseRows = [];
|
|
@@ -7121,13 +7079,9 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
7121
7079
|
const manualScoreKeys = Object.freeze(Object.keys(evalDef.manualScores ?? {}));
|
|
7122
7080
|
const preparedEval = {
|
|
7123
7081
|
evalMeta,
|
|
7124
|
-
accumulatedColumns,
|
|
7125
7082
|
evalCaseRows,
|
|
7126
7083
|
preparedCases,
|
|
7127
|
-
scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys])
|
|
7128
|
-
mergeColumns: (columns) => {
|
|
7129
|
-
mergeColumnDefs(accumulatedColumns, columns, defaultConfig.columns, evalDef.scores, evalDef.manualScores);
|
|
7130
|
-
}
|
|
7084
|
+
scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys])
|
|
7131
7085
|
};
|
|
7132
7086
|
preparedEvals.push(preparedEval);
|
|
7133
7087
|
for (const evalCase of cases) {
|
|
@@ -7236,7 +7190,6 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
7236
7190
|
onCaseFinished,
|
|
7237
7191
|
emitEvent
|
|
7238
7192
|
});
|
|
7239
|
-
preparedEval.evalMeta.columnDefs = [...preparedEval.accumulatedColumns.values()];
|
|
7240
7193
|
lastRunStatusMap.set(preparedEval.evalMeta.key, toLastRunStatus(deriveStatusFromCaseRows({ caseRows: preparedEval.evalCaseRows })));
|
|
7241
7194
|
const latestStatus = lastRunStatusMap.get(preparedEval.evalMeta.key) ?? null;
|
|
7242
7195
|
latestRunInfoMap.set(preparedEval.evalMeta.key, {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-CVwIjcsX.mjs";
|
|
2
|
+
import "./src-Bcc2ZHK8.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-ChHgWruW.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.35.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -32,8 +32,8 @@
|
|
|
32
32
|
"@types/node": "^24.7.2",
|
|
33
33
|
"typescript": "^5.9.2",
|
|
34
34
|
"@agent-evals/runner": "0.0.1",
|
|
35
|
-
"@agent-evals/
|
|
36
|
-
"@agent-evals/
|
|
35
|
+
"@agent-evals/shared": "0.0.1",
|
|
36
|
+
"@agent-evals/sdk": "0.0.1"
|
|
37
37
|
},
|
|
38
38
|
"scripts": {
|
|
39
39
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|