@ls-stack/agent-eval 0.20.0 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1970,7 +1970,15 @@ const numberDisplayOptionsSchema = z.object({
1970
1970
  compactDisplay: z.enum(["short", "long"]).optional(),
1971
1971
  prefix: z.string().optional(),
1972
1972
  suffix: z.string().optional(),
1973
- decimalPlaces: z.number().int().min(0).optional()
1973
+ minDecimalPlaces: z.number().int().min(0).optional(),
1974
+ maxDecimalPlaces: z.number().int().min(0).optional()
1975
+ }).refine((options) => {
1976
+ if (options.minDecimalPlaces === void 0) return true;
1977
+ if (options.maxDecimalPlaces === void 0) return true;
1978
+ return options.minDecimalPlaces <= options.maxDecimalPlaces;
1979
+ }, {
1980
+ message: "minDecimalPlaces must be less than or equal to maxDecimalPlaces",
1981
+ path: ["minDecimalPlaces"]
1974
1982
  });
1975
1983
  /** Schema for the supported column rendering kinds in list views. */
1976
1984
  const columnKindSchema = z.enum([
@@ -2005,7 +2013,6 @@ const columnDefSchema = z.object({
2005
2013
  passThreshold: z.number().optional(),
2006
2014
  maxStars: z.number().int().min(2).optional(),
2007
2015
  hideInTable: z.boolean().optional(),
2008
- sortable: z.boolean().optional(),
2009
2016
  align: z.enum([
2010
2017
  "left",
2011
2018
  "center",
@@ -2403,6 +2410,8 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
2403
2410
  label: z.string().optional(),
2404
2411
  aggregate: evalStatAggregateSchema,
2405
2412
  format: columnFormatSchema.optional(),
2413
+ /** Number presentation options applied when `format: 'number'`. */
2414
+ numberFormat: numberDisplayOptionsSchema.optional(),
2406
2415
  accent: z.boolean().optional()
2407
2416
  })
2408
2417
  ]);
@@ -4035,7 +4044,6 @@ function getScoreOverride(def) {
4035
4044
  format: def.format,
4036
4045
  numberFormat: def.numberFormat,
4037
4046
  hideInTable: def.hideInTable,
4038
- sortable: def.sortable,
4039
4047
  align: def.align,
4040
4048
  maxStars: def.maxStars
4041
4049
  };
@@ -4048,7 +4056,6 @@ function mergeOverrides(base, override) {
4048
4056
  format: override.format ?? base.format,
4049
4057
  numberFormat: override.numberFormat ?? base.numberFormat,
4050
4058
  hideInTable: override.hideInTable ?? base.hideInTable,
4051
- sortable: override.sortable ?? base.sortable,
4052
4059
  align: override.align ?? base.align,
4053
4060
  maxStars: override.maxStars ?? base.maxStars
4054
4061
  };
@@ -4163,7 +4170,6 @@ function createColumnDef(params) {
4163
4170
  if (override?.numberFormat !== void 0) def.numberFormat = override.numberFormat;
4164
4171
  if (override?.maxStars !== void 0) def.maxStars = override.maxStars;
4165
4172
  if (override?.hideInTable !== void 0) def.hideInTable = override.hideInTable;
4166
- if (override?.sortable !== void 0) def.sortable = override.sortable;
4167
4173
  if (override?.align !== void 0) def.align = override.align;
4168
4174
  if (!isScore) return def;
4169
4175
  def.isScore = true;
@@ -4231,86 +4237,76 @@ const DEFAULT_CONFIG_KEYS = [
4231
4237
  "totalTokens",
4232
4238
  "cachedInputTokens",
4233
4239
  "cacheCreationInputTokens",
4234
- "reasoningTokens",
4235
4240
  "llmLatencyMs"
4236
4241
  ];
4237
- const tokenNumberFormat = {
4238
- notation: "compact",
4239
- decimalPlaces: 1
4242
+ const tokenNumberFormat = { notation: "compact" };
4243
+ const countNumberFormat = {
4244
+ minDecimalPlaces: 0,
4245
+ maxDecimalPlaces: 0
4246
+ };
4247
+ const costNumberFormat = {
4248
+ prefix: "$",
4249
+ maxDecimalPlaces: 4
4240
4250
  };
4241
- const countNumberFormat = { decimalPlaces: 0 };
4242
4251
  const DEFAULT_COLUMNS = {
4243
4252
  apiCalls: {
4244
4253
  label: "API Calls",
4245
4254
  format: "number",
4246
4255
  numberFormat: countNumberFormat,
4247
- align: "right",
4248
- sortable: true
4256
+ align: "right"
4249
4257
  },
4250
4258
  costUsd: {
4251
4259
  label: "Cost",
4252
4260
  format: "number",
4253
- numberFormat: {
4254
- prefix: "$",
4255
- decimalPlaces: 4
4256
- },
4257
- align: "right",
4258
- sortable: true
4261
+ numberFormat: costNumberFormat,
4262
+ align: "right"
4259
4263
  },
4260
4264
  llmTurns: {
4261
4265
  label: "LLM Turns",
4262
4266
  format: "number",
4263
4267
  numberFormat: countNumberFormat,
4264
- align: "right",
4265
- sortable: true
4268
+ align: "right"
4266
4269
  },
4267
4270
  inputTokens: {
4268
4271
  label: "Input Tokens",
4269
4272
  format: "number",
4270
4273
  numberFormat: tokenNumberFormat,
4271
- align: "right",
4272
- sortable: true
4274
+ align: "right"
4273
4275
  },
4274
4276
  outputTokens: {
4275
4277
  label: "Output Tokens",
4276
4278
  format: "number",
4277
4279
  numberFormat: tokenNumberFormat,
4278
- align: "right",
4279
- sortable: true
4280
+ align: "right"
4280
4281
  },
4281
4282
  totalTokens: {
4282
4283
  label: "Total Tokens",
4283
4284
  format: "number",
4284
4285
  numberFormat: tokenNumberFormat,
4285
- align: "right",
4286
- sortable: true
4286
+ align: "right"
4287
4287
  },
4288
4288
  cachedInputTokens: {
4289
4289
  label: "Cached Input Tokens",
4290
4290
  format: "number",
4291
4291
  numberFormat: tokenNumberFormat,
4292
- align: "right",
4293
- sortable: true
4292
+ align: "right"
4294
4293
  },
4295
4294
  cacheCreationInputTokens: {
4296
4295
  label: "Cache Write Tokens",
4297
4296
  format: "number",
4298
4297
  numberFormat: tokenNumberFormat,
4299
- align: "right",
4300
- sortable: true
4298
+ align: "right"
4301
4299
  },
4302
4300
  reasoningTokens: {
4303
4301
  label: "Reasoning Tokens",
4304
4302
  format: "number",
4305
4303
  numberFormat: tokenNumberFormat,
4306
- align: "right",
4307
- sortable: true
4304
+ align: "right"
4308
4305
  },
4309
4306
  llmLatencyMs: {
4310
4307
  label: "LLM Latency",
4311
4308
  format: "duration",
4312
- align: "right",
4313
- sortable: true
4309
+ align: "right"
4314
4310
  }
4315
4311
  };
4316
4312
  function resolveRemovedKeys(globalRemove, evalRemove) {
@@ -4336,31 +4332,29 @@ function appendDefaultStats(params) {
4336
4332
  kind: "column",
4337
4333
  key: "apiCalls",
4338
4334
  label: "API Calls",
4339
- aggregate: "avg"
4335
+ aggregate: "avg",
4336
+ numberFormat: countNumberFormat
4340
4337
  });
4341
4338
  if (activeKeys.has("costUsd")) defaults.push({
4342
4339
  kind: "column",
4343
4340
  key: "costUsd",
4344
4341
  label: "LLM Cost",
4345
- aggregate: "sum"
4342
+ aggregate: "avg",
4343
+ numberFormat: costNumberFormat
4346
4344
  });
4347
4345
  if (activeKeys.has("totalTokens")) defaults.push({
4348
4346
  kind: "column",
4349
4347
  key: "totalTokens",
4350
4348
  label: "Tokens",
4351
- aggregate: "sum"
4349
+ aggregate: "avg",
4350
+ numberFormat: tokenNumberFormat
4352
4351
  });
4353
4352
  if (activeKeys.has("llmTurns")) defaults.push({
4354
4353
  kind: "column",
4355
4354
  key: "llmTurns",
4356
4355
  label: "LLM Turns",
4357
- aggregate: "avg"
4358
- });
4359
- if (activeKeys.has("llmLatencyMs")) defaults.push({
4360
- kind: "column",
4361
- key: "llmLatencyMs",
4362
- label: "LLM Latency",
4363
- aggregate: "avg"
4356
+ aggregate: "avg",
4357
+ numberFormat: countNumberFormat
4364
4358
  });
4365
4359
  const merged = [...params.stats ?? [], ...defaults];
4366
4360
  return merged.length > 0 ? merged : void 0;
@@ -4368,24 +4362,13 @@ function appendDefaultStats(params) {
4368
4362
  function appendDefaultCharts(params) {
4369
4363
  const activeKeys = new Set(getActiveDefaultConfigKeys(params));
4370
4364
  const defaults = [];
4371
- if (activeKeys.has("apiCalls")) defaults.push({
4372
- heading: "API Calls",
4373
- type: "bar",
4374
- metrics: [{
4375
- source: "column",
4376
- key: "apiCalls",
4377
- aggregate: "sum",
4378
- label: "API Calls",
4379
- color: "accentDim"
4380
- }]
4381
- });
4382
4365
  if (activeKeys.has("costUsd")) defaults.push({
4383
4366
  heading: "LLM Cost",
4384
4367
  type: "area",
4385
4368
  metrics: [{
4386
4369
  source: "column",
4387
4370
  key: "costUsd",
4388
- aggregate: "sum",
4371
+ aggregate: "avg",
4389
4372
  label: "Cost",
4390
4373
  color: "warning"
4391
4374
  }]
@@ -4394,23 +4377,30 @@ function appendDefaultCharts(params) {
4394
4377
  activeKeys.has("inputTokens") ? {
4395
4378
  source: "column",
4396
4379
  key: "inputTokens",
4397
- aggregate: "sum",
4380
+ aggregate: "avg",
4398
4381
  label: "Input",
4399
4382
  color: "accent"
4400
4383
  } : null,
4401
4384
  activeKeys.has("outputTokens") ? {
4402
4385
  source: "column",
4403
4386
  key: "outputTokens",
4404
- aggregate: "sum",
4387
+ aggregate: "avg",
4405
4388
  label: "Output",
4406
4389
  color: "success"
4407
4390
  } : null,
4408
- activeKeys.has("reasoningTokens") ? {
4391
+ activeKeys.has("cachedInputTokens") ? {
4409
4392
  source: "column",
4410
- key: "reasoningTokens",
4411
- aggregate: "sum",
4412
- label: "Reasoning",
4393
+ key: "cachedInputTokens",
4394
+ aggregate: "avg",
4395
+ label: "Cached Input",
4413
4396
  color: "error"
4397
+ } : null,
4398
+ activeKeys.has("cacheCreationInputTokens") ? {
4399
+ source: "column",
4400
+ key: "cacheCreationInputTokens",
4401
+ aggregate: "avg",
4402
+ label: "Cache Write",
4403
+ color: "warning"
4414
4404
  } : null
4415
4405
  ].filter((metric) => metric !== null);
4416
4406
  if (tokenMetrics.length > 0) defaults.push({
@@ -4420,7 +4410,7 @@ function appendDefaultCharts(params) {
4420
4410
  tooltipExtras: activeKeys.has("totalTokens") ? [{
4421
4411
  source: "column",
4422
4412
  key: "totalTokens",
4423
- aggregate: "sum",
4413
+ aggregate: "avg",
4424
4414
  label: "Total"
4425
4415
  }] : void 0
4426
4416
  });
@@ -4517,12 +4507,6 @@ function addDefaultOutputs(params) {
4517
4507
  value: sumNullable(calls.map((call) => call.cacheCreationInputTokens)),
4518
4508
  activeKeys
4519
4509
  });
4520
- assignIfMissing({
4521
- outputs: params.outputs,
4522
- key: "reasoningTokens",
4523
- value: sumNullable(calls.map((call) => call.reasoningTokens)),
4524
- activeKeys
4525
- });
4526
4510
  assignIfMissing({
4527
4511
  outputs: params.outputs,
4528
4512
  key: "llmLatencyMs",
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-DzrMtgBu.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-Dy_PECaf.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-weogme5U.mjs";
2
- import "./src-B879LZfo.mjs";
1
+ import { n as createRunner } from "./cli-DumvanQI.mjs";
2
+ import "./src-BoAJb4wC.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -0,0 +1,3 @@
1
+ import "./runOrchestration-zYAcAPtS.mjs";
2
+ import "./cli-DumvanQI.mjs";
3
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.20.0",
3
+ "version": "0.21.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -287,9 +287,13 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
287
287
  are still captured.
288
288
 
289
289
  Stats rows and history charts on the eval card can be authored via `stats` /
290
- `charts` on the eval definition. Usage stats/charts are added by default
291
- unless removed with `removeDefaultConfig`. Their shapes live in the types; no
292
- need to memorize the option set.
290
+ `charts` on the eval definition. Usage stats and LLM usage charts are added by
291
+ default unless removed with `removeDefaultConfig`. Column stats can override
292
+ `format` and `numberFormat`, otherwise they inherit from the matching column.
293
+ Number formats use `maxDecimalPlaces` to cap decimals and `minDecimalPlaces`
294
+ to pad trailing zeroes. Without `maxDecimalPlaces`, they render up to 3 decimal
295
+ places.
296
+ Their shapes live in the types; no need to memorize the option set.
293
297
 
294
298
  ## Cached operations
295
299