@ls-stack/agent-eval 0.56.0 → 0.57.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-Bpe6Monh.mjs → app-Db_x-Rit.mjs} +4 -4
- package/dist/apps/web/dist/assets/{index-CfSiAVmi.js → index-o4o2EktS.js} +76 -76
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +1 -1
- package/dist/{cli-DQO2Fpt2.mjs → cli-Ck0mqxd-.mjs} +4 -4
- package/dist/index.d.mts +110 -110
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-6lrtj48K.mjs → runExecution-BH7DlMXl.mjs} +5 -1
- package/dist/{runOrchestration-BYaN2mzS.mjs → runOrchestration-C1Ex9QI-.mjs} +1 -1
- package/dist/{runner-DYlwuAT3.mjs → runner-B3hEOT_I.mjs} +2 -2
- package/dist/{runner-C3CiS2o7.mjs → runner-DbVYcapC.mjs} +1 -1
- package/dist/{src-DCGrFAmO.mjs → src-B3iq-tuv.mjs} +2 -2
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +5 -1
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as startEvalBackgroundJob, A as repoFile, B as getCurrentScope, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as appendToEvalOutput, G as mergeEvalOutput, H as incrementEvalOutput, J as runInEvalScope, K as nextEvalId, L as evalAssert, M as readManualInputFile, N as evalExpect, Nt as getEvalRegistry, O as serializeCacheRecording, P as EvalAssertionError, Q as setScopeCacheContext, R as evalLog, S as evalSpan, T as hashCacheKeySync, U as isInEvalScope, V as getEvalCaseInput, Y as runInExistingEvalScope, Z as setEvalOutput, at as extractLlmCalls, b as buildTraceTree, it as extractApiCalls, j as manualInputFileValueSchema, k as serializeCacheValue, lt as getNestedAttribute, nt as extractCacheEntries, ot as simulateLlmCallCost, q as runInEvalRuntimeScope, rt as extractCacheHits, st as simulateTokenAllocation, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalTime } from "./runExecution-
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-
|
|
3
|
-
import { n as matchesEvalTags, t as defineEval } from "./src-
|
|
1
|
+
import { $ as startEvalBackgroundJob, A as repoFile, B as getCurrentScope, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as appendToEvalOutput, G as mergeEvalOutput, H as incrementEvalOutput, J as runInEvalScope, K as nextEvalId, L as evalAssert, M as readManualInputFile, N as evalExpect, Nt as getEvalRegistry, O as serializeCacheRecording, P as EvalAssertionError, Q as setScopeCacheContext, R as evalLog, S as evalSpan, T as hashCacheKeySync, U as isInEvalScope, V as getEvalCaseInput, Y as runInExistingEvalScope, Z as setEvalOutput, at as extractLlmCalls, b as buildTraceTree, it as extractApiCalls, j as manualInputFileValueSchema, k as serializeCacheValue, lt as getNestedAttribute, nt as extractCacheEntries, ot as simulateLlmCallCost, q as runInEvalRuntimeScope, rt as extractCacheHits, st as simulateTokenAllocation, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalTime } from "./runExecution-BH7DlMXl.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Ck0mqxd-.mjs";
|
|
3
|
+
import { n as matchesEvalTags, t as defineEval } from "./src-B3iq-tuv.mjs";
|
|
4
4
|
export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { At as evalChartsConfigSchema, Ct as buildEvalKey, Dt as evalStatAggregateSchema, I as configureEvalRunLogs, Ot as evalStatsConfigSchema, bt as runSummarySchema, et as createRunRequestSchema, jt as columnDefSchema, kt as manualInputDescriptorSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, v as createFsCacheStore, yt as runManifestSchema } from "./runExecution-
|
|
2
|
-
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-
|
|
1
|
+
import { At as evalChartsConfigSchema, Ct as buildEvalKey, Dt as evalStatAggregateSchema, I as configureEvalRunLogs, Ot as evalStatsConfigSchema, bt as runSummarySchema, et as createRunRequestSchema, jt as columnDefSchema, kt as manualInputDescriptorSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, v as createFsCacheStore, yt as runManifestSchema } from "./runExecution-BH7DlMXl.mjs";
|
|
2
|
+
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-C1Ex9QI-.mjs";
|
|
3
3
|
import { z } from "zod/v4";
|
|
4
4
|
import { readFile } from "node:fs/promises";
|
|
5
5
|
import { relative } from "node:path";
|
|
@@ -6181,6 +6181,10 @@ function sumNullable(values) {
|
|
|
6181
6181
|
}
|
|
6182
6182
|
return hasValue ? total : void 0;
|
|
6183
6183
|
}
|
|
6184
|
+
/**
 * Returns the largest per-call turn count across the given LLM calls.
 *
 * Each call contributes its `stepCount`; a null/undefined `stepCount` counts
 * as 1, and every contribution is clamped to a minimum of 1.
 *
 * @param {Array<{ stepCount?: number | null }>} calls - Calls to inspect.
 * @returns {number | undefined} The maximum turn count, or `undefined` when
 *   `calls` is empty.
 */
function getMaxLlmTurns(calls) {
	if (calls.length === 0) return void 0;
	// Fold instead of `Math.max(...array)`: spreading a very large array as call
	// arguments can exceed the engine's argument limit and throw a RangeError.
	// Starting at 1 applies the "at least one turn" clamp to every call at once.
	let maxTurns = 1;
	for (const call of calls) {
		maxTurns = Math.max(maxTurns, call.stepCount ?? 1);
	}
	return maxTurns;
}
|
|
6184
6188
|
function assignIfMissing(params) {
|
|
6185
6189
|
if (!params.activeKeys.has(params.key)) return;
|
|
6186
6190
|
if (params.key in params.outputs) return;
|
|
@@ -6202,7 +6206,7 @@ function addDefaultOutputs(params) {
|
|
|
6202
6206
|
assignIfMissing({
|
|
6203
6207
|
outputs: params.outputs,
|
|
6204
6208
|
key: "llmTurns",
|
|
6205
|
-
value: calls
|
|
6209
|
+
value: getMaxLlmTurns(calls),
|
|
6206
6210
|
activeKeys
|
|
6207
6211
|
});
|
|
6208
6212
|
assignIfMissing({
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-
|
|
1
|
+
import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-BH7DlMXl.mjs";
|
|
2
2
|
import { readFile, readdir, rm, writeFile } from "node:fs/promises";
|
|
3
3
|
import { dirname, join } from "node:path";
|
|
4
4
|
import { existsSync } from "node:fs";
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-Ck0mqxd-.mjs";
|
|
2
|
+
import "./src-B3iq-tuv.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-B3hEOT_I.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-
|
|
2
|
-
import "./cli-
|
|
1
|
+
import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-BH7DlMXl.mjs";
|
|
2
|
+
import "./cli-Ck0mqxd-.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -393,7 +393,9 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
393
393
|
tokens, and output tokens separately and use `dedupeConsecutiveValues: true`
|
|
394
394
|
to skip repeated adjacent chart values. `totalTokens` is input + output only;
|
|
395
395
|
cache read/write tokens stay separate and affect `costUsd` at their own
|
|
396
|
-
rates.
|
|
396
|
+
rates. `llmTurns` is the maximum per-call turn count in the case run, using
|
|
397
|
+
configured steps when available and otherwise one turn per matched LLM call
|
|
398
|
+
span.
|
|
397
399
|
Derived base input cost uses `inputTokens - cachedInputTokens -
|
|
398
400
|
cacheCreationInputTokens` so cache details are not double-counted.
|
|
399
401
|
`cacheCreationInputTokens` is the total cache-write count; optional
|
|
@@ -568,6 +570,8 @@ Targeted evals in `run.json` are recorded by exact `evalKeys`
|
|
|
568
570
|
unambiguous in saved history.
|
|
569
571
|
Temporary runs use the same directory layout, but are removed before the next
|
|
570
572
|
run of any kind starts.
|
|
573
|
+
When a saved case needs to be handed to another agent, the app can copy the
|
|
574
|
+
saved case detail path or the saved run folder path directly.
|
|
571
575
|
|
|
572
576
|
Use `agent-evals show-runs` when you need stable file
|
|
573
577
|
paths before reading saved output:
|