@ls-stack/agent-eval 0.36.0 → 0.38.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as apiCallMetricSchema, $n as getCurrentScope, $t as cacheDebugKeyEntrySchema, A as createRunRequestSchema, An as repoFileRefSchema, At as runLogEntrySchema, B as getNestedAttribute, Bn as deserializeCacheValue, Bt as manualInputNumberFieldSchema, Cn as cellValueSchema, Ct as caseRowSchema, Dn as fileRefSchema, Dt as evalStatItemSchema, En as columnKindSchema, Et as evalStatAggregateSchema, F as extractApiCalls, Fn as evalSpan, Ft as manualInputBooleanFieldSchema, G as deriveStatusFromChildStatuses, Gn as readManualInputFile, Gt as evalChartAxisSchema, H as getEvalDisplayStatus, Hn as serializeCacheValue, Ht as manualInputSelectOptionSchema, I as extractLlmCalls, In as evalTracer, It as manualInputDescriptorSchema, J as DEFAULT_API_CALLS_CONFIG, Jn as advanceEvalTime, Jt as evalChartConfigSchema, K as runManifestSchema, Kn as evalExpect, Kt as evalChartBuiltinMetricSchema, L as simulateLlmCallCost, Ln as hashCacheKey, Lt as manualInputFieldDescriptorSchema, M as sseEnvelopeSchema, Mn as z, Mt as runLogLocationSchema, N as extractCacheEntries, Nn as buildTraceTree, Nt as runLogPhaseSchema, O as configReloadStateSchema, On as jsonCellSchema, Ot as evalStatsConfigSchema, P as extractCacheHits, Pn as captureEvalSpanError, Pt as scoreTraceSchema, Q as apiCallMetricPlacementSchema, Qn as evalLog, Qt as evalChartsConfigSchema, R as simulateTokenAllocation, Rn as hashCacheKeySync, Rt as manualInputJsonFieldSchema, Sn as traceSpanWarningSchema, St as caseDetailSchema, Tn as columnFormatSchema, Tt as evalFreshnessStatusSchema, U as deriveScopedSummaryFromCases, Un as repoFile, Ut as manualInputTextFieldSchema, V as getEvalTitle, Vn as serializeCacheRecording, Vt as manualInputSelectFieldSchema, W as deriveStatusFromCaseRows, Wn as manualInputFileValueSchema, Wt as evalChartAggregateSchema, X as agentEvalsConfigSchema, Xt as evalChartTooltipExtraSchema, Y as DEFAULT_LLM_CALLS_CONFIG, Yn as appendToEvalOutput, Yt as evalChartMetricSchema, Z as apiCallMetricFormatSchema, Zn as evalAssert, Zt as evalChartTypeSchema, _n as traceDisplayConfigSchema, _t as buildCaseKey, an as cacheModeSchema, ar as nextEvalId, at as llmCallCostCurrencySchema, bn as traceSpanKindSchema, bt as getCaseRowEvalKey, cn as cacheRecordingSchema, cr as runInExistingEvalScope, ct as llmCallMetricSchema, dn as spanCacheOptionsSchema, dr as startEvalBackgroundJob, dt as llmCallsConfigSchema, en as cacheDebugKeyFileSchema, er as getEvalCaseInput, et as apiCallsConfigSchema, fn as traceCacheRefSchema, fr as defineEval, ft as removeDefaultConfigSchema, gn as traceAttributeDisplaySchema, gt as trialSelectionModeSchema, hn as traceAttributeDisplayPlacementSchema, ht as runLogsConfigSchema, in as cacheListItemSchema, ir as mergeEvalOutput, it as evalDeriveConfigSchema, j as updateManualScoreRequestSchema, jn as runArtifactRefSchema, jt as runLogLevelSchema, k as configReloadStatusSchema, kn as numberDisplayOptionsSchema, kt as evalSummarySchema, ln as cacheStatusSchema, lr as setEvalOutput, lt as llmCallPricingRateSchema, mn as traceAttributeDisplayInputSchema, mt as resolveLlmCallsConfig, nn as cacheEntryWithDebugKeySchema, nr as incrementEvalOutput, nt as evalColumnOverrideSchema, on as cacheOperationTypeSchema, or as runInEvalRuntimeScope, ot as llmCallMetricFormatSchema, pn as traceAttributeDisplayFormatSchema, pr as getEvalRegistry, pt as resolveApiCallsConfig, q as runSummarySchema, qn as EvalAssertionError, qt as evalChartColorSchema, rn as cacheFileSchema, rr as isInEvalScope, rt as evalColumnsSchema, sn as cacheRecordingOpSchema, sr as runInEvalScope, st as llmCallMetricPlacementSchema, tn as cacheEntrySchema, tr as getEvalStartTime, tt as defaultConfigKeySchema, un as serializedCacheSpanSchema, ur as setScopeCacheContext, ut as llmCallPricingSchema, vn as traceDisplayInputConfigSchema, vt as buildEvalKey, wn as columnDefSchema, wt as discoveryIssueSchema, xn as traceSpanSchema, xt as assertionFailureSchema, yn as traceSpanErrorSchema, yt as getCaseRowCaseKey, z as applyDerivedCallAttributes, zn as deserializeCacheRecording, zt as manualInputMultilineFieldSchema } from "./runOrchestration-V1TxX8es.mjs";
2
- import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Dg3abrOv.mjs";
3
- import "./src-DBypR4TV.mjs";
1
+ import { $ as apiCallMetricFormatSchema, $n as evalAssert, $t as evalChartTypeSchema, A as configReloadStateSchema, An as jsonCellSchema, At as evalStatsConfigSchema, B as simulateTokenAllocation, Bn as hashCacheKeySync, Bt as manualInputJsonFieldSchema, Cn as traceSpanSchema, Ct as assertionFailureSchema, Dn as columnFormatSchema, Dt as evalFreshnessStatusSchema, En as columnDefSchema, Et as discoveryIssueSchema, F as extractCacheEntries, Fn as buildTraceTree, Ft as runLogPhaseSchema, G as deriveScopedSummaryFromCases, Gn as repoFile, Gt as manualInputTextFieldSchema, H as getNestedAttribute, Hn as deserializeCacheValue, Ht as manualInputNumberFieldSchema, I as extractCacheHits, In as captureEvalSpanError, It as scoreTraceSchema, J as runManifestSchema, Jn as evalExpect, Jt as evalChartBuiltinMetricSchema, K as deriveStatusFromCaseRows, Kn as manualInputFileValueSchema, Kt as evalChartAggregateSchema, L as extractApiCalls, Ln as evalSpan, Lt as manualInputBooleanFieldSchema, M as createRunRequestSchema, Mn as repoFileRefSchema, Mt as runLogEntrySchema, N as updateManualScoreRequestSchema, Nn as runArtifactRefSchema, Nt as runLogLevelSchema, On as columnKindSchema, Ot as evalStatAggregateSchema, P as sseEnvelopeSchema, Pn as z, Pt as runLogLocationSchema, Q as agentEvalsConfigSchema, Qt as evalChartTooltipExtraSchema, R as extractLlmCalls, Rn as evalTracer, Rt as manualInputDescriptorSchema, Sn as traceSpanKindSchema, St as getCaseRowEvalKey, Tn as cellValueSchema, Tt as caseRowSchema, U as getEvalTitle, Un as serializeCacheRecording, Ut as manualInputSelectFieldSchema, V as applyDerivedCallAttributes, Vn as deserializeCacheRecording, Vt as manualInputMultilineFieldSchema, W as getEvalDisplayStatus, Wn as serializeCacheValue, Wt as manualInputSelectOptionSchema, X as DEFAULT_API_CALLS_CONFIG, Xn as advanceEvalTime, Xt as evalChartConfigSchema, Y as runSummarySchema, Yn as EvalAssertionError, Yt as evalChartColorSchema, Z as DEFAULT_LLM_CALLS_CONFIG, Zn as appendToEvalOutput, Zt as evalChartMetricSchema, _n as traceAttributeDisplayPlacementSchema, _t as runLogsConfigSchema, an as cacheFileSchema, ar as isInEvalScope, at as evalColumnsSchema, bn as traceDisplayInputConfigSchema, bt as buildEvalKey, cn as cacheOperationTypeSchema, cr as runInEvalRuntimeScope, ct as llmCallMetricFormatSchema, dn as cacheStatusSchema, dr as setEvalOutput, dt as llmCallPricingRateSchema, en as evalChartsConfigSchema, er as evalLog, et as apiCallMetricPlacementSchema, fn as serializedCacheSpanSchema, fr as setScopeCacheContext, ft as llmCallPricingSchema, gn as traceAttributeDisplayInputSchema, gt as resolveLlmCallsConfig, hn as traceAttributeDisplayFormatSchema, hr as getEvalRegistry, ht as resolveApiCallsConfig, in as cacheEntryWithDebugKeySchema, ir as incrementEvalOutput, it as evalColumnOverrideSchema, j as configReloadStatusSchema, jn as numberDisplayOptionsSchema, jt as evalSummarySchema, kn as fileRefSchema, kt as evalStatItemSchema, ln as cacheRecordingOpSchema, lr as runInEvalScope, lt as llmCallMetricPlacementSchema, mn as traceCacheRefSchema, mr as defineEval, mt as removeDefaultConfigSchema, nn as cacheDebugKeyFileSchema, nr as getEvalCaseInput, nt as apiCallsConfigSchema, on as cacheListItemSchema, or as mergeEvalOutput, ot as evalDeriveConfigSchema, pn as spanCacheOptionsSchema, pr as startEvalBackgroundJob, pt as llmCallsConfigSchema, q as deriveStatusFromChildStatuses, qn as readManualInputFile, qt as evalChartAxisSchema, rn as cacheEntrySchema, rr as getEvalStartTime, rt as defaultConfigKeySchema, sn as cacheModeSchema, sr as nextEvalId, st as llmCallCostCurrencySchema, tn as cacheDebugKeyEntrySchema, tr as getCurrentScope, tt as apiCallMetricSchema, un as cacheRecordingSchema, ur as runInExistingEvalScope, ut as llmCallMetricSchema, vn as traceAttributeDisplaySchema, vt as trialSelectionModeSchema, wn as traceSpanWarningSchema, wt as caseDetailSchema, xn as traceSpanErrorSchema, xt as getCaseRowCaseKey, yn as traceDisplayConfigSchema, yt as buildCaseKey, z as simulateLlmCallCost, zn as hashCacheKey, zt as manualInputFieldDescriptorSchema } from "./runOrchestration-BhUFWvq9.mjs";
2
+ import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-BUX6tr9J.mjs";
3
+ import "./src-BwJ5tod2.mjs";
4
4
  export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, cleanupStagedManualInputFiles, columnDefSchema, columnFormatSchema, columnKindSchema, configReloadStateSchema, configReloadStatusSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalColumnOverrideSchema, evalColumnsSchema, evalDeriveConfigSchema, evalExpect, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, jsonCellSchema, llmCallCostCurrencySchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingRateSchema, llmCallPricingSchema, llmCallsConfigSchema, manualInputBooleanFieldSchema, manualInputDescriptorSchema, manualInputFieldDescriptorSchema, manualInputFileValueSchema, manualInputJsonFieldSchema, manualInputMultilineFieldSchema, manualInputNumberFieldSchema, manualInputSelectFieldSchema, manualInputSelectOptionSchema, manualInputTextFieldSchema, materializeManualInputFiles, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, readManualInputFile, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, spanCacheOptionsSchema, sseEnvelopeSchema, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
package/dist/runChild.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { A as createRunRequestSchema, C as loadConfig, D as createFsCacheStore, It as manualInputDescriptorSchema, K as runManifestSchema, Ot as evalStatsConfigSchema, Qt as evalChartsConfigSchema, Xn as configureEvalRunLogs, q as runSummarySchema, r as getTargetEvals$1, t as executeRun, vt as buildEvalKey, wn as columnDefSchema, x as parseEvalDiscovery } from "./runOrchestration-V1TxX8es.mjs";
1
+ import { At as evalStatsConfigSchema, C as parseEvalDiscovery, En as columnDefSchema, J as runManifestSchema, M as createRunRequestSchema, Qn as configureEvalRunLogs, Rt as manualInputDescriptorSchema, T as loadConfig, Y as runSummarySchema, bt as buildEvalKey, en as evalChartsConfigSchema, k as createFsCacheStore, p as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-BhUFWvq9.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { relative } from "node:path";
@@ -14,6 +14,7 @@ const evalMetaSchema = z.object({
14
14
  sourceFingerprint: z.string().nullable(),
15
15
  columnDefs: z.array(columnDefSchema),
16
16
  caseCount: z.number().nullable(),
17
+ caseIds: z.array(z.string()).optional(),
17
18
  stats: evalStatsConfigSchema.optional(),
18
19
  charts: evalChartsConfigSchema.optional(),
19
20
  manualInputDescriptor: manualInputDescriptorSchema.optional(),
@@ -27,10 +28,21 @@ const runChildContextSchema = z.object({
27
28
  summary: runSummarySchema,
28
29
  evals: z.array(evalMetaSchema).optional()
29
30
  });
31
+ let activeContext;
32
+ let fatalErrorReported = false;
33
+ let disconnectExpected = false;
30
34
  function sendMessage(message) {
31
35
  if (process.send === void 0) return;
32
36
  process.send(message);
33
37
  }
38
+ function installFatalRunChildErrorHandlers() {
39
+ process.once("uncaughtException", (error) => {
40
+ reportFatalRunChildErrorAndExit(error);
41
+ });
42
+ process.once("unhandledRejection", (reason) => {
43
+ reportFatalRunChildErrorAndExit(toUnhandledRejectionError(reason));
44
+ });
45
+ }
34
46
  function getSourceFingerprint(source) {
35
47
  return createHash("sha256").update(source).digest("hex");
36
48
  }
@@ -90,9 +102,11 @@ async function readContext(contextPath) {
90
102
  }
91
103
  async function main() {
92
104
  process.on("disconnect", () => {
105
+ if (disconnectExpected) return;
93
106
  process.exit(1);
94
107
  });
95
108
  const context = await readContext(process.argv[2]);
109
+ activeContext = context;
96
110
  process.chdir(context.workspaceRoot);
97
111
  const config = await loadConfig();
98
112
  configureEvalRunLogs({ captureConsole: config.runLogs?.captureConsole !== false });
@@ -153,7 +167,64 @@ async function main() {
153
167
  evals: [...evals.values()]
154
168
  });
155
169
  }
156
- await main();
170
+ async function handleFatalRunChildError(error) {
171
+ if (fatalErrorReported) return;
172
+ fatalErrorReported = true;
173
+ const message = formatUnknownErrorDetails(error);
174
+ process.exitCode = 1;
175
+ console.error(message);
176
+ if (activeContext === void 0) return;
177
+ const endedAt = (/* @__PURE__ */ new Date()).toISOString();
178
+ await persistRunState({
179
+ runDir: activeContext.runDir,
180
+ manifest: {
181
+ ...activeContext.manifest,
182
+ status: "error",
183
+ endedAt
184
+ },
185
+ summary: {
186
+ ...activeContext.summary,
187
+ status: "error",
188
+ errorMessage: message
189
+ },
190
+ cases: [],
191
+ caseDetails: /* @__PURE__ */ new Map(),
192
+ listeners: /* @__PURE__ */ new Set()
193
+ });
194
+ sendMessage({
195
+ type: "event",
196
+ event: {
197
+ type: "run.error",
198
+ runId: activeContext.manifest.id,
199
+ timestamp: endedAt,
200
+ payload: { message }
201
+ }
202
+ });
203
+ }
204
+ function formatUnknownErrorDetails(error) {
205
+ if (error instanceof Error) return error.stack ?? error.message;
206
+ if (typeof error === "string") return error;
207
+ return String(error);
208
+ }
209
+ function toUnhandledRejectionError(reason) {
210
+ if (reason instanceof Error) return reason;
211
+ return /* @__PURE__ */ new Error(`Unhandled rejection: ${formatUnknownErrorDetails(reason)}`);
212
+ }
213
+ async function reportFatalRunChildErrorAndExit(error) {
214
+ try {
215
+ await handleFatalRunChildError(error);
216
+ } catch (reportError) {
217
+ console.error("Failed to report fatal run child error:");
218
+ console.error(formatUnknownErrorDetails(reportError));
219
+ } finally {
220
+ process.exit(1);
221
+ }
222
+ }
223
+ installFatalRunChildErrorHandlers();
224
+ await main().catch(async (error) => {
225
+ await handleFatalRunChildError(error);
226
+ });
227
+ disconnectExpected = true;
157
228
  process.disconnect();
158
229
  //#endregion
159
230
  export {};