@ls-stack/agent-eval 0.52.2 → 0.52.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-CzLj4ZX0.mjs → app-BsFcUIQp.mjs} +4 -4
- package/dist/apps/web/dist/assets/{index-gJHi1MdH.js → index-D9HUAH8K.js} +74 -74
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +1 -1
- package/dist/{cli-Cvs7tc2v.mjs → cli-DbVFgRO3.mjs} +3 -3
- package/dist/index.d.mts +48 -0
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-o38J7uZO.mjs → runOrchestration-DT6cje9E.mjs} +9 -3
- package/dist/{runner-iWtmKx9z.mjs → runner-CyTUvbHE.mjs} +1 -1
- package/dist/{runner-LdMiDmAN.mjs → runner-WRQdfG0r.mjs} +2 -2
- package/dist/{src-Jahivm6d.mjs → src-DlvYXPxG.mjs} +2 -2
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +4 -2
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-gJHi1MdH.js"></script>
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-D9HUAH8K.js"></script>
|
|
29
29
|
<link rel="stylesheet" crossorigin href="/assets/index-BHc4gfUO.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
package/dist/bin.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-o38J7uZO.mjs";
|
|
1
|
+
import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-DT6cje9E.mjs";
|
|
2
2
|
import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
3
3
|
import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
4
4
|
import { createHash, randomUUID } from "node:crypto";
|
|
@@ -2095,8 +2095,8 @@ async function commandApp(args) {
|
|
|
2095
2095
|
const { serve } = await import("@hono/node-server");
|
|
2096
2096
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
2097
2097
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
2098
|
-
const appModule = await import("./app-CzLj4ZX0.mjs");
|
|
2099
|
-
const runnerModule = await import("./runner-iWtmKx9z.mjs");
|
|
2098
|
+
const appModule = await import("./app-BsFcUIQp.mjs");
|
|
2099
|
+
const runnerModule = await import("./runner-CyTUvbHE.mjs");
|
|
2100
2100
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
2101
2101
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
2102
2102
|
await runnerModule.initRunner();
|
package/dist/index.d.mts
CHANGED
|
@@ -2568,6 +2568,22 @@ declare const scoreTraceSchema: z$1.ZodObject<{
|
|
|
2568
2568
|
}>>;
|
|
2569
2569
|
}, z$1.core.$strip>>>;
|
|
2570
2570
|
}, z$1.core.$strip>;
|
|
2571
|
+
cacheRefs: z$1.ZodDefault<z$1.ZodArray<z$1.ZodObject<{
|
|
2572
|
+
type: z$1.ZodLiteral<"value">;
|
|
2573
|
+
name: z$1.ZodString;
|
|
2574
|
+
namespace: z$1.ZodString;
|
|
2575
|
+
key: z$1.ZodString;
|
|
2576
|
+
status: z$1.ZodEnum<{
|
|
2577
|
+
hit: "hit";
|
|
2578
|
+
miss: "miss";
|
|
2579
|
+
refresh: "refresh";
|
|
2580
|
+
bypass: "bypass";
|
|
2581
|
+
}>;
|
|
2582
|
+
read: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2583
|
+
stored: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2584
|
+
storedAt: z$1.ZodOptional<z$1.ZodString>;
|
|
2585
|
+
age: z$1.ZodOptional<z$1.ZodNumber>;
|
|
2586
|
+
}, z$1.core.$strip>>>;
|
|
2571
2587
|
}, z$1.core.$strip>;
|
|
2572
2588
|
/** Trace payload captured while computing one score for a case. */
|
|
2573
2589
|
type ScoreTrace = z$1.infer<typeof scoreTraceSchema>;
|
|
@@ -2724,6 +2740,22 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
|
|
|
2724
2740
|
}>>;
|
|
2725
2741
|
}, z$1.core.$strip>>>;
|
|
2726
2742
|
}, z$1.core.$strip>;
|
|
2743
|
+
cacheRefs: z$1.ZodDefault<z$1.ZodArray<z$1.ZodObject<{
|
|
2744
|
+
type: z$1.ZodLiteral<"value">;
|
|
2745
|
+
name: z$1.ZodString;
|
|
2746
|
+
namespace: z$1.ZodString;
|
|
2747
|
+
key: z$1.ZodString;
|
|
2748
|
+
status: z$1.ZodEnum<{
|
|
2749
|
+
hit: "hit";
|
|
2750
|
+
miss: "miss";
|
|
2751
|
+
refresh: "refresh";
|
|
2752
|
+
bypass: "bypass";
|
|
2753
|
+
}>;
|
|
2754
|
+
read: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2755
|
+
stored: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2756
|
+
storedAt: z$1.ZodOptional<z$1.ZodString>;
|
|
2757
|
+
age: z$1.ZodOptional<z$1.ZodNumber>;
|
|
2758
|
+
}, z$1.core.$strip>>>;
|
|
2727
2759
|
}, z$1.core.$strip>>>;
|
|
2728
2760
|
columns: z$1.ZodRecord<z$1.ZodString, z$1.ZodUnion<readonly [z$1.ZodType<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown, z$1.core.$ZodTypeInternals<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown>>, z$1.ZodUnion<readonly [z$1.ZodObject<{
|
|
2729
2761
|
source: z$1.ZodLiteral<"repo">;
|
|
@@ -5566,6 +5598,22 @@ declare const caseDetailSchema: z$1.ZodObject<{
|
|
|
5566
5598
|
}>>;
|
|
5567
5599
|
}, z$1.core.$strip>>>;
|
|
5568
5600
|
}, z$1.core.$strip>;
|
|
5601
|
+
cacheRefs: z$1.ZodDefault<z$1.ZodArray<z$1.ZodObject<{
|
|
5602
|
+
type: z$1.ZodLiteral<"value">;
|
|
5603
|
+
name: z$1.ZodString;
|
|
5604
|
+
namespace: z$1.ZodString;
|
|
5605
|
+
key: z$1.ZodString;
|
|
5606
|
+
status: z$1.ZodEnum<{
|
|
5607
|
+
hit: "hit";
|
|
5608
|
+
miss: "miss";
|
|
5609
|
+
refresh: "refresh";
|
|
5610
|
+
bypass: "bypass";
|
|
5611
|
+
}>;
|
|
5612
|
+
read: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
5613
|
+
stored: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
5614
|
+
storedAt: z$1.ZodOptional<z$1.ZodString>;
|
|
5615
|
+
age: z$1.ZodOptional<z$1.ZodNumber>;
|
|
5616
|
+
}, z$1.core.$strip>>>;
|
|
5569
5617
|
}, z$1.core.$strip>>>;
|
|
5570
5618
|
columns: z$1.ZodRecord<z$1.ZodString, z$1.ZodUnion<readonly [z$1.ZodType<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown, z$1.core.$ZodTypeInternals<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown>>, z$1.ZodUnion<readonly [z$1.ZodObject<{
|
|
5571
5619
|
source: z$1.ZodLiteral<"repo">;
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, Lt as getEvalRegistry, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-o38J7uZO.mjs";
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Cvs7tc2v.mjs";
|
|
3
|
-
import { n as matchesEvalTags, t as defineEval } from "./src-Jahivm6d.mjs";
|
|
1
|
+
import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, Lt as getEvalRegistry, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-DT6cje9E.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-DbVFgRO3.mjs";
|
|
3
|
+
import { n as matchesEvalTags, t as defineEval } from "./src-DlvYXPxG.mjs";
|
|
4
4
|
export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as evalStatsConfigSchema, Nt as manualInputDescriptorSchema, Pt as evalChartsConfigSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-o38J7uZO.mjs";
|
|
1
|
+
import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as evalStatsConfigSchema, Nt as manualInputDescriptorSchema, Pt as evalChartsConfigSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-DT6cje9E.mjs";
|
|
2
2
|
import { z } from "zod/v4";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|
|
@@ -827,7 +827,12 @@ const runLogEntrySchema = z.object({
|
|
|
827
827
|
/** Trace payload captured while computing one score for a case. */
|
|
828
828
|
const scoreTraceSchema = z.object({
|
|
829
829
|
trace: z.array(traceSpanSchema),
|
|
830
|
-
traceDisplay: traceDisplayConfigSchema
|
|
830
|
+
traceDisplay: traceDisplayConfigSchema,
|
|
831
|
+
/**
|
|
832
|
+
* Value-cache refs recorded by `evalTracer.cache(...)` calls made directly
|
|
833
|
+
* from the score compute body, with no surrounding scorer span.
|
|
834
|
+
*/
|
|
835
|
+
cacheRefs: z.array(traceCacheRefSchema).default([])
|
|
831
836
|
});
|
|
832
837
|
/** Schema for the detailed payload shown when opening a specific case. */
|
|
833
838
|
const caseDetailSchema = z.object({
|
|
@@ -7134,9 +7139,10 @@ async function runCase(params) {
|
|
|
7134
7139
|
...entry,
|
|
7135
7140
|
source: key
|
|
7136
7141
|
})));
|
|
7137
|
-
if (trace.length > 0) scoringTraces[key] = {
|
|
7142
|
+
if (trace.length > 0 || scoreRun.scope.caseCacheRefs.length > 0) scoringTraces[key] = {
|
|
7138
7143
|
trace,
|
|
7139
|
-
traceDisplay
|
|
7144
|
+
traceDisplay,
|
|
7145
|
+
cacheRefs: scoreRun.scope.caseCacheRefs
|
|
7140
7146
|
};
|
|
7141
7147
|
const rawValue = scoreRun.result;
|
|
7142
7148
|
if (scoreRun.error) {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-LdMiDmAN.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-WRQdfG0r.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-Cvs7tc2v.mjs";
|
|
2
|
-
import "./src-Jahivm6d.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-DbVFgRO3.mjs";
|
|
2
|
+
import "./src-DlvYXPxG.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-o38J7uZO.mjs";
|
|
2
|
-
import "./cli-Cvs7tc2v.mjs";
|
|
1
|
+
import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-DT6cje9E.mjs";
|
|
2
|
+
import "./cli-DbVFgRO3.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -303,7 +303,8 @@ or if the case errors. Scores without `passThreshold` are informational.
|
|
|
303
303
|
Score functions run in their own trace scope, separate from the execution
|
|
304
304
|
trace, so LLM-as-judge scorers can use `evalTracer.span(...)` and cached spans
|
|
305
305
|
without polluting the agent trajectory. Outputs set inside a scorer stay
|
|
306
|
-
private to that score.
|
|
306
|
+
private to that score. Spanless `evalTracer.cache(...)` calls made directly
|
|
307
|
+
inside a scorer are stored on that score trace's `cacheRefs` payload.
|
|
307
308
|
|
|
308
309
|
`manualScores` declares score columns that reviewers fill in after a run.
|
|
309
310
|
Pending values keep the eval in an `unscored` state instead of failing.
|
|
@@ -473,7 +474,8 @@ Mental model:
|
|
|
473
474
|
span, that span gets a `cache.refs` entry with the value cache name, key,
|
|
474
475
|
namespace, and hit/miss status. When called directly from the case body
|
|
475
476
|
(no surrounding span), the ref is recorded on the case detail's `cacheRefs`
|
|
476
|
-
array.
|
|
477
|
+
array. When called directly from a scorer, the ref is recorded on that
|
|
478
|
+
scoring trace's `cacheRefs` array.
|
|
477
479
|
- Cache identity is the namespace plus the authored key. Source-file
|
|
478
480
|
fingerprints are tracked for run freshness separately, but do not participate
|
|
479
481
|
in cache-key hashing.
|