@ls-stack/agent-eval 0.60.0 → 0.60.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -25,7 +25,7 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-Dowobz-z.js"></script>
28
+ <script type="module" crossorigin src="/assets/index-CM6MDNqo.js"></script>
29
29
  <link rel="stylesheet" crossorigin href="/assets/index-CqWfzcFb.css">
30
30
  </head>
31
31
  <body>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-FOyPC8UD.mjs";
2
+ import { t as runCli } from "./cli-CbePEEua.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -1,4 +1,4 @@
1
- import { Ct as resolveLlmCallsConfig, It as runWithEvalRegistry, J as runInEvalRuntimeScope, L as configureEvalRunLogs, St as resolveApiCallsConfig, _ as createBufferedCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, r as runCase, v as createFsCacheStore, y as getCacheRetentionOptions } from "./runExecution-CjWJUUZ5.mjs";
1
+ import { Ct as resolveLlmCallsConfig, It as runWithEvalRegistry, J as runInEvalRuntimeScope, L as configureEvalRunLogs, St as resolveApiCallsConfig, _ as createBufferedCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, r as runCase, v as createFsCacheStore, y as getCacheRetentionOptions } from "./runExecution-Bq0Y3y_1.mjs";
2
2
  //#region ../runner/src/caseChild.ts
3
3
  let fatalErrorReported = false;
4
4
  let disconnectExpected = false;
@@ -1,5 +1,6 @@
1
- import { Ct as resolveLlmCallsConfig, Et as getCaseRowCaseKey, Ft as getEvalRegistry, Ot as caseRowSchema, St as resolveApiCallsConfig, Tt as buildEvalKey, _t as matchesTagsFilter, c as resolveArtifactPath, dt as getEvalTitle, f as resolveEvalDefaultConfig, ft as getEvalDisplayStatus, h as normalizeScoreDef, lt as applyDerivedCallAttributes, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, pt as deriveScopedSummaryFromCases, s as resolveTracePresentation, v as createFsCacheStore, xt as runSummarySchema, y as getCacheRetentionOptions } from "./runExecution-CjWJUUZ5.mjs";
2
- import { C as validateCharts, S as parseEvalDiscovery, _ as runTouchesEval, a as validateTagsFilters, b as deriveEvalFreshness, c as getLatestRunInfos, d as nextShortIdFromSnapshots, f as persistCaseDetail, g as recomputePersistedCaseStatus, h as recomputeEvalStatusesInRuns, i as resolveEvalTags, l as loadPersistedRunSnapshot, m as persistRunState, n as getTargetEvalKeys, o as generateRunId, p as deleteTemporaryRuns, s as getLastRunStatuses, u as loadPersistedRunSnapshots, v as buildManualInputDescriptor, x as loadIsolatedEvalRegistry, y as parseManualInputValues } from "./runOrchestration-DE2TFAS6.mjs";
1
+ import { Ct as resolveLlmCallsConfig, Et as getCaseRowCaseKey, Ft as getEvalRegistry, Ot as caseRowSchema, St as resolveApiCallsConfig, Tt as buildEvalKey, _t as matchesTagsFilter, c as resolveArtifactPath, dt as getEvalTitle, f as resolveEvalDefaultConfig, ft as getEvalDisplayStatus, h as normalizeScoreDef, lt as applyDerivedCallAttributes, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, pt as deriveScopedSummaryFromCases, s as resolveTracePresentation, v as createFsCacheStore, xt as runSummarySchema, y as getCacheRetentionOptions } from "./runExecution-Bq0Y3y_1.mjs";
2
+ import { C as validateCharts, S as parseEvalDiscovery, _ as runTouchesEval, a as validateTagsFilters, b as deriveEvalFreshness, c as getLatestRunInfos, d as nextShortIdFromSnapshots, f as persistCaseDetail, g as recomputePersistedCaseStatus, h as recomputeEvalStatusesInRuns, i as resolveEvalTags, l as loadPersistedRunSnapshot, m as persistRunState, n as getTargetEvalKeys, o as generateRunId, p as deleteTemporaryRuns, s as getLastRunStatuses, u as loadPersistedRunSnapshots, v as buildManualInputDescriptor, x as loadIsolatedEvalRegistry, y as parseManualInputValues } from "./runOrchestration-BpwW0AmB.mjs";
3
+ import { parseEnv } from "node:util";
3
4
  import { resultify } from "t-result";
4
5
  import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
5
6
  import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
@@ -33,7 +34,7 @@ function resolveCaseDetailLookup(run, caseId) {
33
34
  }
34
35
  //#endregion
35
36
  //#region ../runner/src/configReload.ts
36
- /** Coordinates idle-only reloads for `agent-evals.config.ts` in app mode. */
37
+ /** Coordinates idle-only reloads for workspace config and `.env` in app mode. */
37
38
  function createConfigReloadController({ getActiveRunCount, closeRunnerWatchers, loadRunnerState, emitToDiscoveryListeners }) {
38
39
  let watcher;
39
40
  let reloadTimer;
@@ -125,7 +126,7 @@ function createConfigReloadController({ getActiveRunCount, closeRunnerWatchers,
125
126
  await reloadConfig(state.lastChangedAt ?? (/* @__PURE__ */ new Date()).toISOString());
126
127
  }
127
128
  async function setupWatcher() {
128
- const nextWatcher = watch(resolve(process.cwd(), "agent-evals.config.ts"), {
129
+ const nextWatcher = watch([resolve(process.cwd(), "agent-evals.config.ts"), resolve(process.cwd(), ".env")], {
129
130
  awaitWriteFinish: {
130
131
  stabilityThreshold: 100,
131
132
  pollInterval: 20
@@ -904,10 +905,51 @@ function getWatchRootsForIncludePatterns(params) {
904
905
  return [...roots];
905
906
  }
906
907
  //#endregion
908
+ //#region ../runner/src/workspaceEnv.ts
909
+ const shellEnvKeys = new Set(Object.keys(process.env));
910
+ const appliedWorkspaceEnvValues = /* @__PURE__ */ new Map();
911
+ async function loadWorkspaceEnv(workspaceRoot) {
912
+ const envPath = resolve(workspaceRoot, ".env");
913
+ if (!existsSync(envPath)) {
914
+ applyWorkspaceEnv(/* @__PURE__ */ new Map());
915
+ return { error: null };
916
+ }
917
+ const readResult = await resultify(() => readFile(envPath, "utf-8"));
918
+ if (readResult.error) return { error: `Failed to read .env at ${envPath}: ${readResult.error.message}` };
919
+ const parseResult = resultify(() => parseEnv(readResult.value));
920
+ if (parseResult.error) return { error: `Failed to parse .env at ${envPath}: ${parseResult.error.message}` };
921
+ applyWorkspaceEnv(new Map(getEnvEntries(parseResult.value)));
922
+ return { error: null };
923
+ }
924
+ function getEnvEntries(env) {
925
+ const entries = [];
926
+ for (const [key, value] of Object.entries(env)) if (value !== void 0) entries.push([key, value]);
927
+ return entries;
928
+ }
929
+ function applyWorkspaceEnv(nextValues) {
930
+ for (const [key, previousValue] of appliedWorkspaceEnvValues) {
931
+ if (nextValues.has(key)) continue;
932
+ if (process.env[key] === previousValue) delete process.env[key];
933
+ appliedWorkspaceEnvValues.delete(key);
934
+ }
935
+ for (const [key, value] of nextValues) {
936
+ if (shellEnvKeys.has(key)) continue;
937
+ process.env[key] = value;
938
+ appliedWorkspaceEnvValues.set(key, value);
939
+ }
940
+ }
941
+ //#endregion
907
942
  //#region ../runner/src/runner.ts
908
943
  const defaultCachePruneIdleDelayMs = 5e3;
909
- /** Create an in-memory eval runner bound to the current workspace config. */
910
- function createRunner({ watchForChanges = true } = {}) {
944
+ /**
945
+ * Create an in-memory eval runner bound to the current workspace config.
946
+ *
947
+ * @param options.watchForChanges Watch eval files, run history, config, and
948
+ * workspace `.env` for live reloads.
949
+ * @param options.loadEnv Load `.env` from the current workspace before config,
950
+ * discovery, and runs. Shell-provided values keep precedence.
951
+ */
952
+ function createRunner({ watchForChanges = true, loadEnv = true } = {}) {
911
953
  let config;
912
954
  let workspaceRoot;
913
955
  let localStateDir;
@@ -1475,6 +1517,10 @@ function createRunner({ watchForChanges = true } = {}) {
1475
1517
  }
1476
1518
  };
1477
1519
  async function loadRunnerState() {
1520
+ if (loadEnv) {
1521
+ const envResult = await loadWorkspaceEnv(process.cwd());
1522
+ if (envResult.error !== null) throw new Error(envResult.error);
1523
+ }
1478
1524
  config = await loadConfig();
1479
1525
  workspaceRoot = config.workspaceRoot ?? process.cwd();
1480
1526
  localStateDir = resolve(workspaceRoot, ".agent-evals");
@@ -2079,7 +2125,6 @@ function parseArgs(argv) {
2079
2125
  */
2080
2126
  async function runCli(argv) {
2081
2127
  const args = parseArgs(argv);
2082
- if (args.loadEnv && !loadWorkspaceEnv()) process.exit(1);
2083
2128
  if (args.showHelp) {
2084
2129
  if (args.unknownHelpTarget !== void 0) {
2085
2130
  console.error(`No help found for "${args.unknownHelpTarget}".`);
@@ -2134,18 +2179,6 @@ function fileMatches(pattern, filePath) {
2134
2179
  const normalized = pattern.replaceAll("\\", "/");
2135
2180
  return normalized === filePath || globToRegex(normalized).test(filePath);
2136
2181
  }
2137
- function loadWorkspaceEnv() {
2138
- const envPath = resolve(process.cwd(), ".env");
2139
- if (!existsSync(envPath)) return true;
2140
- const loadResult = resultify(() => {
2141
- process.loadEnvFile(envPath);
2142
- });
2143
- if (loadResult.error) {
2144
- console.error(`Failed to load .env at ${envPath}: ${loadResult.error.message}`);
2145
- return false;
2146
- }
2147
- return true;
2148
- }
2149
2182
  function formatUnknownErrorDetails(error) {
2150
2183
  if (error instanceof Error) return error.stack ?? error.message;
2151
2184
  if (typeof error === "string") return error;
@@ -2199,19 +2232,22 @@ async function commandApp(args) {
2199
2232
  const { serve } = await import("@hono/node-server");
2200
2233
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
2201
2234
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
2202
- const appModule = await import("./app-opbcrpvt.mjs");
2203
- const runnerModule = await import("./runner-CIxj7jYj.mjs");
2235
+ const appModule = await import("./app-DPCFFkyQ.mjs");
2236
+ const runnerModule = await import("./runner-XEP21_u9.mjs");
2204
2237
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
2205
2238
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
2206
- await runnerModule.initRunner();
2239
+ await runnerModule.initRunner({ loadEnv: args.loadEnv });
2207
2240
  console.info(`Agent Evals app: http://localhost:${String(args.port)}`);
2208
2241
  serve({
2209
2242
  fetch: appModule.app.fetch,
2210
2243
  port: args.port
2211
2244
  });
2212
2245
  }
2213
- async function commandList(args_) {
2214
- const runner = createRunner({ watchForChanges: false });
2246
+ async function commandList(args) {
2247
+ const runner = createRunner({
2248
+ watchForChanges: false,
2249
+ loadEnv: args.loadEnv
2250
+ });
2215
2251
  await runner.init();
2216
2252
  const discoveryIssues = runner.getDiscoveryIssues();
2217
2253
  if (discoveryIssues.length > 0) {
@@ -2244,7 +2280,10 @@ async function commandList(args_) {
2244
2280
  if (discoveryIssues.length > 0) process.exit(1);
2245
2281
  }
2246
2282
  async function commandRun(args) {
2247
- const runner = createRunner({ watchForChanges: false });
2283
+ const runner = createRunner({
2284
+ watchForChanges: false,
2285
+ loadEnv: args.loadEnv
2286
+ });
2248
2287
  await runner.init();
2249
2288
  if (args.evalIds.length === 0 && args.caseIds.length === 0 && args.files.length === 0 && args.tagsFilter.length === 0 && !runner.getAllowCliRunAll()) {
2250
2289
  console.error("This workspace disables running all evals from the CLI. Pass --eval <id>, --file <path|glob>, --case <id>, or --tags-filter <expr> to run a targeted subset.");
@@ -2333,7 +2372,10 @@ async function commandRun(args) {
2333
2372
  if (summary.status === "error" || summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
2334
2373
  }
2335
2374
  async function commandShowRuns(args) {
2336
- const runner = createRunner({ watchForChanges: false });
2375
+ const runner = createRunner({
2376
+ watchForChanges: false,
2377
+ loadEnv: args.loadEnv
2378
+ });
2337
2379
  await runner.init();
2338
2380
  const runRef = args.positionals[0];
2339
2381
  if (runRef !== void 0) {
@@ -2358,7 +2400,10 @@ async function commandShowRuns(args) {
2358
2400
  printRunFileIndexes(indexes);
2359
2401
  }
2360
2402
  async function commandCache(args) {
2361
- const runner = createRunner({ watchForChanges: false });
2403
+ const runner = createRunner({
2404
+ watchForChanges: false,
2405
+ loadEnv: args.loadEnv
2406
+ });
2362
2407
  await runner.init();
2363
2408
  if (args.subcommand === "list" || args.subcommand === void 0) {
2364
2409
  const entries = await runner.listCache();