@ls-stack/agent-eval 0.20.0 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-DsiLU65H.mjs → app-CmwmcUgG.mjs} +3 -3
- package/dist/apps/web/dist/assets/index-EXO08yya.js +118 -0
- package/dist/apps/web/dist/assets/{index-CvR6QCLa.css → index-r0dVFK0B.css} +1 -1
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-weogme5U.mjs → cli-DumvanQI.mjs} +3 -3
- package/dist/index.d.mts +10 -8
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-Cv1kiOAG.mjs → runOrchestration-zYAcAPtS.mjs} +53 -69
- package/dist/{runner-B25oRQxX.mjs → runner-BcwyX9CO.mjs} +1 -1
- package/dist/{runner-DzrMtgBu.mjs → runner-Dy_PECaf.mjs} +2 -2
- package/dist/src-BoAJb4wC.mjs +3 -0
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +7 -3
- package/dist/apps/web/dist/assets/index-Cba4MFa0.js +0 -118
- package/dist/src-B879LZfo.mjs +0 -3
|
@@ -1970,7 +1970,15 @@ const numberDisplayOptionsSchema = z.object({
|
|
|
1970
1970
|
compactDisplay: z.enum(["short", "long"]).optional(),
|
|
1971
1971
|
prefix: z.string().optional(),
|
|
1972
1972
|
suffix: z.string().optional(),
|
|
1973
|
-
|
|
1973
|
+
minDecimalPlaces: z.number().int().min(0).optional(),
|
|
1974
|
+
maxDecimalPlaces: z.number().int().min(0).optional()
|
|
1975
|
+
}).refine((options) => {
|
|
1976
|
+
if (options.minDecimalPlaces === void 0) return true;
|
|
1977
|
+
if (options.maxDecimalPlaces === void 0) return true;
|
|
1978
|
+
return options.minDecimalPlaces <= options.maxDecimalPlaces;
|
|
1979
|
+
}, {
|
|
1980
|
+
message: "minDecimalPlaces must be less than or equal to maxDecimalPlaces",
|
|
1981
|
+
path: ["minDecimalPlaces"]
|
|
1974
1982
|
});
|
|
1975
1983
|
/** Schema for the supported column rendering kinds in list views. */
|
|
1976
1984
|
const columnKindSchema = z.enum([
|
|
@@ -2005,7 +2013,6 @@ const columnDefSchema = z.object({
|
|
|
2005
2013
|
passThreshold: z.number().optional(),
|
|
2006
2014
|
maxStars: z.number().int().min(2).optional(),
|
|
2007
2015
|
hideInTable: z.boolean().optional(),
|
|
2008
|
-
sortable: z.boolean().optional(),
|
|
2009
2016
|
align: z.enum([
|
|
2010
2017
|
"left",
|
|
2011
2018
|
"center",
|
|
@@ -2403,6 +2410,8 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
|
|
|
2403
2410
|
label: z.string().optional(),
|
|
2404
2411
|
aggregate: evalStatAggregateSchema,
|
|
2405
2412
|
format: columnFormatSchema.optional(),
|
|
2413
|
+
/** Number presentation options applied when `format: 'number'`. */
|
|
2414
|
+
numberFormat: numberDisplayOptionsSchema.optional(),
|
|
2406
2415
|
accent: z.boolean().optional()
|
|
2407
2416
|
})
|
|
2408
2417
|
]);
|
|
@@ -4035,7 +4044,6 @@ function getScoreOverride(def) {
|
|
|
4035
4044
|
format: def.format,
|
|
4036
4045
|
numberFormat: def.numberFormat,
|
|
4037
4046
|
hideInTable: def.hideInTable,
|
|
4038
|
-
sortable: def.sortable,
|
|
4039
4047
|
align: def.align,
|
|
4040
4048
|
maxStars: def.maxStars
|
|
4041
4049
|
};
|
|
@@ -4048,7 +4056,6 @@ function mergeOverrides(base, override) {
|
|
|
4048
4056
|
format: override.format ?? base.format,
|
|
4049
4057
|
numberFormat: override.numberFormat ?? base.numberFormat,
|
|
4050
4058
|
hideInTable: override.hideInTable ?? base.hideInTable,
|
|
4051
|
-
sortable: override.sortable ?? base.sortable,
|
|
4052
4059
|
align: override.align ?? base.align,
|
|
4053
4060
|
maxStars: override.maxStars ?? base.maxStars
|
|
4054
4061
|
};
|
|
@@ -4163,7 +4170,6 @@ function createColumnDef(params) {
|
|
|
4163
4170
|
if (override?.numberFormat !== void 0) def.numberFormat = override.numberFormat;
|
|
4164
4171
|
if (override?.maxStars !== void 0) def.maxStars = override.maxStars;
|
|
4165
4172
|
if (override?.hideInTable !== void 0) def.hideInTable = override.hideInTable;
|
|
4166
|
-
if (override?.sortable !== void 0) def.sortable = override.sortable;
|
|
4167
4173
|
if (override?.align !== void 0) def.align = override.align;
|
|
4168
4174
|
if (!isScore) return def;
|
|
4169
4175
|
def.isScore = true;
|
|
@@ -4231,86 +4237,76 @@ const DEFAULT_CONFIG_KEYS = [
|
|
|
4231
4237
|
"totalTokens",
|
|
4232
4238
|
"cachedInputTokens",
|
|
4233
4239
|
"cacheCreationInputTokens",
|
|
4234
|
-
"reasoningTokens",
|
|
4235
4240
|
"llmLatencyMs"
|
|
4236
4241
|
];
|
|
4237
|
-
const tokenNumberFormat = {
|
|
4238
|
-
|
|
4239
|
-
|
|
4242
|
+
const tokenNumberFormat = { notation: "compact" };
|
|
4243
|
+
const countNumberFormat = {
|
|
4244
|
+
minDecimalPlaces: 0,
|
|
4245
|
+
maxDecimalPlaces: 0
|
|
4246
|
+
};
|
|
4247
|
+
const costNumberFormat = {
|
|
4248
|
+
prefix: "$",
|
|
4249
|
+
maxDecimalPlaces: 4
|
|
4240
4250
|
};
|
|
4241
|
-
const countNumberFormat = { decimalPlaces: 0 };
|
|
4242
4251
|
const DEFAULT_COLUMNS = {
|
|
4243
4252
|
apiCalls: {
|
|
4244
4253
|
label: "API Calls",
|
|
4245
4254
|
format: "number",
|
|
4246
4255
|
numberFormat: countNumberFormat,
|
|
4247
|
-
align: "right"
|
|
4248
|
-
sortable: true
|
|
4256
|
+
align: "right"
|
|
4249
4257
|
},
|
|
4250
4258
|
costUsd: {
|
|
4251
4259
|
label: "Cost",
|
|
4252
4260
|
format: "number",
|
|
4253
|
-
numberFormat:
|
|
4254
|
-
|
|
4255
|
-
decimalPlaces: 4
|
|
4256
|
-
},
|
|
4257
|
-
align: "right",
|
|
4258
|
-
sortable: true
|
|
4261
|
+
numberFormat: costNumberFormat,
|
|
4262
|
+
align: "right"
|
|
4259
4263
|
},
|
|
4260
4264
|
llmTurns: {
|
|
4261
4265
|
label: "LLM Turns",
|
|
4262
4266
|
format: "number",
|
|
4263
4267
|
numberFormat: countNumberFormat,
|
|
4264
|
-
align: "right"
|
|
4265
|
-
sortable: true
|
|
4268
|
+
align: "right"
|
|
4266
4269
|
},
|
|
4267
4270
|
inputTokens: {
|
|
4268
4271
|
label: "Input Tokens",
|
|
4269
4272
|
format: "number",
|
|
4270
4273
|
numberFormat: tokenNumberFormat,
|
|
4271
|
-
align: "right"
|
|
4272
|
-
sortable: true
|
|
4274
|
+
align: "right"
|
|
4273
4275
|
},
|
|
4274
4276
|
outputTokens: {
|
|
4275
4277
|
label: "Output Tokens",
|
|
4276
4278
|
format: "number",
|
|
4277
4279
|
numberFormat: tokenNumberFormat,
|
|
4278
|
-
align: "right"
|
|
4279
|
-
sortable: true
|
|
4280
|
+
align: "right"
|
|
4280
4281
|
},
|
|
4281
4282
|
totalTokens: {
|
|
4282
4283
|
label: "Total Tokens",
|
|
4283
4284
|
format: "number",
|
|
4284
4285
|
numberFormat: tokenNumberFormat,
|
|
4285
|
-
align: "right"
|
|
4286
|
-
sortable: true
|
|
4286
|
+
align: "right"
|
|
4287
4287
|
},
|
|
4288
4288
|
cachedInputTokens: {
|
|
4289
4289
|
label: "Cached Input Tokens",
|
|
4290
4290
|
format: "number",
|
|
4291
4291
|
numberFormat: tokenNumberFormat,
|
|
4292
|
-
align: "right"
|
|
4293
|
-
sortable: true
|
|
4292
|
+
align: "right"
|
|
4294
4293
|
},
|
|
4295
4294
|
cacheCreationInputTokens: {
|
|
4296
4295
|
label: "Cache Write Tokens",
|
|
4297
4296
|
format: "number",
|
|
4298
4297
|
numberFormat: tokenNumberFormat,
|
|
4299
|
-
align: "right"
|
|
4300
|
-
sortable: true
|
|
4298
|
+
align: "right"
|
|
4301
4299
|
},
|
|
4302
4300
|
reasoningTokens: {
|
|
4303
4301
|
label: "Reasoning Tokens",
|
|
4304
4302
|
format: "number",
|
|
4305
4303
|
numberFormat: tokenNumberFormat,
|
|
4306
|
-
align: "right"
|
|
4307
|
-
sortable: true
|
|
4304
|
+
align: "right"
|
|
4308
4305
|
},
|
|
4309
4306
|
llmLatencyMs: {
|
|
4310
4307
|
label: "LLM Latency",
|
|
4311
4308
|
format: "duration",
|
|
4312
|
-
align: "right"
|
|
4313
|
-
sortable: true
|
|
4309
|
+
align: "right"
|
|
4314
4310
|
}
|
|
4315
4311
|
};
|
|
4316
4312
|
function resolveRemovedKeys(globalRemove, evalRemove) {
|
|
@@ -4336,31 +4332,29 @@ function appendDefaultStats(params) {
|
|
|
4336
4332
|
kind: "column",
|
|
4337
4333
|
key: "apiCalls",
|
|
4338
4334
|
label: "API Calls",
|
|
4339
|
-
aggregate: "avg"
|
|
4335
|
+
aggregate: "avg",
|
|
4336
|
+
numberFormat: countNumberFormat
|
|
4340
4337
|
});
|
|
4341
4338
|
if (activeKeys.has("costUsd")) defaults.push({
|
|
4342
4339
|
kind: "column",
|
|
4343
4340
|
key: "costUsd",
|
|
4344
4341
|
label: "LLM Cost",
|
|
4345
|
-
aggregate: "
|
|
4342
|
+
aggregate: "avg",
|
|
4343
|
+
numberFormat: costNumberFormat
|
|
4346
4344
|
});
|
|
4347
4345
|
if (activeKeys.has("totalTokens")) defaults.push({
|
|
4348
4346
|
kind: "column",
|
|
4349
4347
|
key: "totalTokens",
|
|
4350
4348
|
label: "Tokens",
|
|
4351
|
-
aggregate: "
|
|
4349
|
+
aggregate: "avg",
|
|
4350
|
+
numberFormat: tokenNumberFormat
|
|
4352
4351
|
});
|
|
4353
4352
|
if (activeKeys.has("llmTurns")) defaults.push({
|
|
4354
4353
|
kind: "column",
|
|
4355
4354
|
key: "llmTurns",
|
|
4356
4355
|
label: "LLM Turns",
|
|
4357
|
-
aggregate: "avg"
|
|
4358
|
-
|
|
4359
|
-
if (activeKeys.has("llmLatencyMs")) defaults.push({
|
|
4360
|
-
kind: "column",
|
|
4361
|
-
key: "llmLatencyMs",
|
|
4362
|
-
label: "LLM Latency",
|
|
4363
|
-
aggregate: "avg"
|
|
4356
|
+
aggregate: "avg",
|
|
4357
|
+
numberFormat: countNumberFormat
|
|
4364
4358
|
});
|
|
4365
4359
|
const merged = [...params.stats ?? [], ...defaults];
|
|
4366
4360
|
return merged.length > 0 ? merged : void 0;
|
|
@@ -4368,24 +4362,13 @@ function appendDefaultStats(params) {
|
|
|
4368
4362
|
function appendDefaultCharts(params) {
|
|
4369
4363
|
const activeKeys = new Set(getActiveDefaultConfigKeys(params));
|
|
4370
4364
|
const defaults = [];
|
|
4371
|
-
if (activeKeys.has("apiCalls")) defaults.push({
|
|
4372
|
-
heading: "API Calls",
|
|
4373
|
-
type: "bar",
|
|
4374
|
-
metrics: [{
|
|
4375
|
-
source: "column",
|
|
4376
|
-
key: "apiCalls",
|
|
4377
|
-
aggregate: "sum",
|
|
4378
|
-
label: "API Calls",
|
|
4379
|
-
color: "accentDim"
|
|
4380
|
-
}]
|
|
4381
|
-
});
|
|
4382
4365
|
if (activeKeys.has("costUsd")) defaults.push({
|
|
4383
4366
|
heading: "LLM Cost",
|
|
4384
4367
|
type: "area",
|
|
4385
4368
|
metrics: [{
|
|
4386
4369
|
source: "column",
|
|
4387
4370
|
key: "costUsd",
|
|
4388
|
-
aggregate: "
|
|
4371
|
+
aggregate: "avg",
|
|
4389
4372
|
label: "Cost",
|
|
4390
4373
|
color: "warning"
|
|
4391
4374
|
}]
|
|
@@ -4394,23 +4377,30 @@ function appendDefaultCharts(params) {
|
|
|
4394
4377
|
activeKeys.has("inputTokens") ? {
|
|
4395
4378
|
source: "column",
|
|
4396
4379
|
key: "inputTokens",
|
|
4397
|
-
aggregate: "
|
|
4380
|
+
aggregate: "avg",
|
|
4398
4381
|
label: "Input",
|
|
4399
4382
|
color: "accent"
|
|
4400
4383
|
} : null,
|
|
4401
4384
|
activeKeys.has("outputTokens") ? {
|
|
4402
4385
|
source: "column",
|
|
4403
4386
|
key: "outputTokens",
|
|
4404
|
-
aggregate: "
|
|
4387
|
+
aggregate: "avg",
|
|
4405
4388
|
label: "Output",
|
|
4406
4389
|
color: "success"
|
|
4407
4390
|
} : null,
|
|
4408
|
-
activeKeys.has("
|
|
4391
|
+
activeKeys.has("cachedInputTokens") ? {
|
|
4409
4392
|
source: "column",
|
|
4410
|
-
key: "
|
|
4411
|
-
aggregate: "
|
|
4412
|
-
label: "
|
|
4393
|
+
key: "cachedInputTokens",
|
|
4394
|
+
aggregate: "avg",
|
|
4395
|
+
label: "Cached Input",
|
|
4413
4396
|
color: "error"
|
|
4397
|
+
} : null,
|
|
4398
|
+
activeKeys.has("cacheCreationInputTokens") ? {
|
|
4399
|
+
source: "column",
|
|
4400
|
+
key: "cacheCreationInputTokens",
|
|
4401
|
+
aggregate: "avg",
|
|
4402
|
+
label: "Cache Write",
|
|
4403
|
+
color: "warning"
|
|
4414
4404
|
} : null
|
|
4415
4405
|
].filter((metric) => metric !== null);
|
|
4416
4406
|
if (tokenMetrics.length > 0) defaults.push({
|
|
@@ -4420,7 +4410,7 @@ function appendDefaultCharts(params) {
|
|
|
4420
4410
|
tooltipExtras: activeKeys.has("totalTokens") ? [{
|
|
4421
4411
|
source: "column",
|
|
4422
4412
|
key: "totalTokens",
|
|
4423
|
-
aggregate: "
|
|
4413
|
+
aggregate: "avg",
|
|
4424
4414
|
label: "Total"
|
|
4425
4415
|
}] : void 0
|
|
4426
4416
|
});
|
|
@@ -4517,12 +4507,6 @@ function addDefaultOutputs(params) {
|
|
|
4517
4507
|
value: sumNullable(calls.map((call) => call.cacheCreationInputTokens)),
|
|
4518
4508
|
activeKeys
|
|
4519
4509
|
});
|
|
4520
|
-
assignIfMissing({
|
|
4521
|
-
outputs: params.outputs,
|
|
4522
|
-
key: "reasoningTokens",
|
|
4523
|
-
value: sumNullable(calls.map((call) => call.reasoningTokens)),
|
|
4524
|
-
activeKeys
|
|
4525
|
-
});
|
|
4526
4510
|
assignIfMissing({
|
|
4527
4511
|
outputs: params.outputs,
|
|
4528
4512
|
key: "llmLatencyMs",
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-Dy_PECaf.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-DumvanQI.mjs";
|
|
2
|
+
import "./src-BoAJb4wC.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
package/package.json
CHANGED
|
@@ -287,9 +287,13 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
287
287
|
are still captured.
|
|
288
288
|
|
|
289
289
|
Stats rows and history charts on the eval card can be authored via `stats` /
|
|
290
|
-
`charts` on the eval definition. Usage stats
|
|
291
|
-
unless removed with `removeDefaultConfig`.
|
|
292
|
-
|
|
290
|
+
`charts` on the eval definition. Usage stats and LLM usage charts are added by
|
|
291
|
+
default unless removed with `removeDefaultConfig`. Column stats can override
|
|
292
|
+
`format` and `numberFormat`, otherwise they inherit from the matching column.
|
|
293
|
+
Number formats use `maxDecimalPlaces` to cap decimals and `minDecimalPlaces`
|
|
294
|
+
to pad trailing zeroes. Without `maxDecimalPlaces`, they render up to 3 decimal
|
|
295
|
+
places.
|
|
296
|
+
Their shapes live in the types; no need to memorize the option set.
|
|
293
297
|
|
|
294
298
|
## Cached operations
|
|
295
299
|
|