@ls-stack/agent-eval 0.36.0 → 0.38.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-BlNzXWDM.mjs → app-DD-8kx5H.mjs} +50 -6
- package/dist/apps/web/dist/assets/index-C6PgBOfP.css +1 -0
- package/dist/apps/web/dist/assets/index-CO86PsY-.js +140 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +14 -3
- package/dist/{cli-Dg3abrOv.mjs → cli-BUX6tr9J.mjs} +106 -25
- package/dist/index.d.mts +159 -150
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +73 -2
- package/dist/{runOrchestration-V1TxX8es.mjs → runOrchestration-BhUFWvq9.mjs} +293 -121
- package/dist/{runner-BCs5rzej.mjs → runner-B1wfPKNH.mjs} +2 -2
- package/dist/{runner-znY6PY1M.mjs → runner-CoRf7Vef.mjs} +1 -1
- package/dist/src-BwJ5tod2.mjs +3 -0
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +18 -6
- package/dist/apps/web/dist/assets/index-BYtcGddU.js +0 -140
- package/dist/apps/web/dist/assets/index-D0rC5MSS.css +0 -1
- package/dist/src-DBypR4TV.mjs +0 -3
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-
|
|
3
|
-
import "./src-
|
|
1
|
+
import { $ as apiCallMetricFormatSchema, $n as evalAssert, $t as evalChartTypeSchema, A as configReloadStateSchema, An as jsonCellSchema, At as evalStatsConfigSchema, B as simulateTokenAllocation, Bn as hashCacheKeySync, Bt as manualInputJsonFieldSchema, Cn as traceSpanSchema, Ct as assertionFailureSchema, Dn as columnFormatSchema, Dt as evalFreshnessStatusSchema, En as columnDefSchema, Et as discoveryIssueSchema, F as extractCacheEntries, Fn as buildTraceTree, Ft as runLogPhaseSchema, G as deriveScopedSummaryFromCases, Gn as repoFile, Gt as manualInputTextFieldSchema, H as getNestedAttribute, Hn as deserializeCacheValue, Ht as manualInputNumberFieldSchema, I as extractCacheHits, In as captureEvalSpanError, It as scoreTraceSchema, J as runManifestSchema, Jn as evalExpect, Jt as evalChartBuiltinMetricSchema, K as deriveStatusFromCaseRows, Kn as manualInputFileValueSchema, Kt as evalChartAggregateSchema, L as extractApiCalls, Ln as evalSpan, Lt as manualInputBooleanFieldSchema, M as createRunRequestSchema, Mn as repoFileRefSchema, Mt as runLogEntrySchema, N as updateManualScoreRequestSchema, Nn as runArtifactRefSchema, Nt as runLogLevelSchema, On as columnKindSchema, Ot as evalStatAggregateSchema, P as sseEnvelopeSchema, Pn as z, Pt as runLogLocationSchema, Q as agentEvalsConfigSchema, Qt as evalChartTooltipExtraSchema, R as extractLlmCalls, Rn as evalTracer, Rt as manualInputDescriptorSchema, Sn as traceSpanKindSchema, St as getCaseRowEvalKey, Tn as cellValueSchema, Tt as caseRowSchema, U as getEvalTitle, Un as serializeCacheRecording, Ut as manualInputSelectFieldSchema, V as applyDerivedCallAttributes, Vn as deserializeCacheRecording, Vt as manualInputMultilineFieldSchema, W as getEvalDisplayStatus, Wn as serializeCacheValue, Wt as manualInputSelectOptionSchema, X as DEFAULT_API_CALLS_CONFIG, Xn as advanceEvalTime, Xt as evalChartConfigSchema, Y as runSummarySchema, Yn as EvalAssertionError, Yt as evalChartColorSchema, Z as DEFAULT_LLM_CALLS_CONFIG, Zn as 
appendToEvalOutput, Zt as evalChartMetricSchema, _n as traceAttributeDisplayPlacementSchema, _t as runLogsConfigSchema, an as cacheFileSchema, ar as isInEvalScope, at as evalColumnsSchema, bn as traceDisplayInputConfigSchema, bt as buildEvalKey, cn as cacheOperationTypeSchema, cr as runInEvalRuntimeScope, ct as llmCallMetricFormatSchema, dn as cacheStatusSchema, dr as setEvalOutput, dt as llmCallPricingRateSchema, en as evalChartsConfigSchema, er as evalLog, et as apiCallMetricPlacementSchema, fn as serializedCacheSpanSchema, fr as setScopeCacheContext, ft as llmCallPricingSchema, gn as traceAttributeDisplayInputSchema, gt as resolveLlmCallsConfig, hn as traceAttributeDisplayFormatSchema, hr as getEvalRegistry, ht as resolveApiCallsConfig, in as cacheEntryWithDebugKeySchema, ir as incrementEvalOutput, it as evalColumnOverrideSchema, j as configReloadStatusSchema, jn as numberDisplayOptionsSchema, jt as evalSummarySchema, kn as fileRefSchema, kt as evalStatItemSchema, ln as cacheRecordingOpSchema, lr as runInEvalScope, lt as llmCallMetricPlacementSchema, mn as traceCacheRefSchema, mr as defineEval, mt as removeDefaultConfigSchema, nn as cacheDebugKeyFileSchema, nr as getEvalCaseInput, nt as apiCallsConfigSchema, on as cacheListItemSchema, or as mergeEvalOutput, ot as evalDeriveConfigSchema, pn as spanCacheOptionsSchema, pr as startEvalBackgroundJob, pt as llmCallsConfigSchema, q as deriveStatusFromChildStatuses, qn as readManualInputFile, qt as evalChartAxisSchema, rn as cacheEntrySchema, rr as getEvalStartTime, rt as defaultConfigKeySchema, sn as cacheModeSchema, sr as nextEvalId, st as llmCallCostCurrencySchema, tn as cacheDebugKeyEntrySchema, tr as getCurrentScope, tt as apiCallMetricSchema, un as cacheRecordingSchema, ur as runInExistingEvalScope, ut as llmCallMetricSchema, vn as traceAttributeDisplaySchema, vt as trialSelectionModeSchema, wn as traceSpanWarningSchema, wt as caseDetailSchema, xn as traceSpanErrorSchema, xt as getCaseRowCaseKey, yn as 
traceDisplayConfigSchema, yt as buildCaseKey, z as simulateLlmCallCost, zn as hashCacheKey, zt as manualInputFieldDescriptorSchema } from "./runOrchestration-BhUFWvq9.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-BUX6tr9J.mjs";
|
|
3
|
+
import "./src-BwJ5tod2.mjs";
|
|
4
4
|
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, cleanupStagedManualInputFiles, columnDefSchema, columnFormatSchema, columnKindSchema, configReloadStateSchema, configReloadStatusSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalColumnOverrideSchema, evalColumnsSchema, evalDeriveConfigSchema, evalExpect, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, jsonCellSchema, llmCallCostCurrencySchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingRateSchema, llmCallPricingSchema, 
llmCallsConfigSchema, manualInputBooleanFieldSchema, manualInputDescriptorSchema, manualInputFieldDescriptorSchema, manualInputFileValueSchema, manualInputJsonFieldSchema, manualInputMultilineFieldSchema, manualInputNumberFieldSchema, manualInputSelectFieldSchema, manualInputSelectOptionSchema, manualInputTextFieldSchema, materializeManualInputFiles, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, readManualInputFile, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, spanCacheOptionsSchema, sseEnvelopeSchema, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { At as evalStatsConfigSchema, C as parseEvalDiscovery, En as columnDefSchema, J as runManifestSchema, M as createRunRequestSchema, Qn as configureEvalRunLogs, Rt as manualInputDescriptorSchema, T as loadConfig, Y as runSummarySchema, bt as buildEvalKey, en as evalChartsConfigSchema, k as createFsCacheStore, p as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-BhUFWvq9.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|
|
@@ -14,6 +14,7 @@ const evalMetaSchema = z.object({
|
|
|
14
14
|
sourceFingerprint: z.string().nullable(),
|
|
15
15
|
columnDefs: z.array(columnDefSchema),
|
|
16
16
|
caseCount: z.number().nullable(),
|
|
17
|
+
caseIds: z.array(z.string()).optional(),
|
|
17
18
|
stats: evalStatsConfigSchema.optional(),
|
|
18
19
|
charts: evalChartsConfigSchema.optional(),
|
|
19
20
|
manualInputDescriptor: manualInputDescriptorSchema.optional(),
|
|
@@ -27,10 +28,21 @@ const runChildContextSchema = z.object({
|
|
|
27
28
|
summary: runSummarySchema,
|
|
28
29
|
evals: z.array(evalMetaSchema).optional()
|
|
29
30
|
});
|
|
31
|
+
/** Parsed run context; assigned inside main() after the context file is read, undefined until then. */
let activeContext;
/** Latch flipped by handleFatalRunChildError so a fatal error is only reported once. */
let fatalErrorReported = false;
/** Set just before the intentional process.disconnect() so the "disconnect" listener skips its process.exit(1). */
let disconnectExpected = false;
|
|
30
34
|
/**
 * Forwards a message to the parent process through `process.send`.
 * Does nothing when `process.send` is undefined.
 *
 * @param {unknown} message - Payload handed to `process.send` unchanged.
 */
function sendMessage(message) {
	if (process.send === void 0) return;
	process.send(message);
}
|
|
38
|
+
/**
 * Registers one-shot `uncaughtException` and `unhandledRejection` listeners
 * that hand the failure to reportFatalRunChildErrorAndExit. Rejection reasons
 * are first passed through toUnhandledRejectionError.
 */
function installFatalRunChildErrorHandlers() {
	process.once("uncaughtException", (error) => {
		// The reporter is async; it is intentionally not awaited inside the listener.
		void reportFatalRunChildErrorAndExit(error);
	});
	process.once("unhandledRejection", (reason) => {
		void reportFatalRunChildErrorAndExit(toUnhandledRejectionError(reason));
	});
}
|
|
34
46
|
/**
 * Computes the SHA-256 digest of `source`, encoded as a hex string.
 *
 * @param {string | Buffer} source - Content to hash.
 * @returns {string} Hex-encoded SHA-256 digest.
 */
function getSourceFingerprint(source) {
	return createHash("sha256").update(source).digest("hex");
}
|
|
@@ -90,9 +102,11 @@ async function readContext(contextPath) {
|
|
|
90
102
|
}
|
|
91
103
|
async function main() {
|
|
92
104
|
process.on("disconnect", () => {
|
|
105
|
+
if (disconnectExpected) return;
|
|
93
106
|
process.exit(1);
|
|
94
107
|
});
|
|
95
108
|
const context = await readContext(process.argv[2]);
|
|
109
|
+
activeContext = context;
|
|
96
110
|
process.chdir(context.workspaceRoot);
|
|
97
111
|
const config = await loadConfig();
|
|
98
112
|
configureEvalRunLogs({ captureConsole: config.runLogs?.captureConsole !== false });
|
|
@@ -153,7 +167,64 @@ async function main() {
|
|
|
153
167
|
evals: [...evals.values()]
|
|
154
168
|
});
|
|
155
169
|
}
|
|
156
|
-
|
|
170
|
+
/**
 * Reports a fatal error at most once.
 *
 * Always logs the formatted error and sets `process.exitCode` to 1. When a run
 * context has already been loaded, it additionally hands an errored
 * manifest/summary to persistRunState and sends a `run.error` event to the
 * parent process.
 *
 * @param {unknown} error - The thrown value or rejection reason.
 * @returns {Promise<void>}
 */
async function handleFatalRunChildError(error) {
	if (fatalErrorReported) return;
	fatalErrorReported = true;
	const message = formatUnknownErrorDetails(error);
	process.exitCode = 1;
	console.error(message);
	// Without a loaded context there is no run directory or manifest to update.
	if (activeContext === void 0) return;
	const endedAt = new Date().toISOString();
	await persistRunState({
		runDir: activeContext.runDir,
		manifest: {
			...activeContext.manifest,
			status: "error",
			endedAt,
		},
		summary: {
			...activeContext.summary,
			status: "error",
			errorMessage: message,
		},
		cases: [],
		caseDetails: new Map(),
		listeners: new Set(),
	});
	sendMessage({
		type: "event",
		event: {
			type: "run.error",
			runId: activeContext.manifest.id,
			timestamp: endedAt,
			payload: { message },
		},
	});
}
|
|
204
|
+
/**
 * Turns an arbitrary thrown value into a printable string without ever throwing.
 *
 * @param {unknown} error - Any thrown value or rejection reason.
 * @returns {string} The stack (or message) for Error instances, the value
 *   itself for strings, otherwise its string conversion.
 */
function formatUnknownErrorDetails(error) {
	if (error instanceof Error) return error.stack ?? error.message;
	if (typeof error === "string") return error;
	try {
		return String(error);
	} catch {
		// String() throws for null-prototype objects and for objects whose
		// toString/valueOf throw; an error formatter must not fail itself.
		return Object.prototype.toString.call(error);
	}
}
|
|
209
|
+
/**
 * Normalizes a promise rejection reason into an Error.
 *
 * @param {unknown} reason - The rejection reason.
 * @returns {Error} `reason` itself when it already is an Error; otherwise a new
 *   Error whose message is "Unhandled rejection: " followed by the formatted reason.
 */
function toUnhandledRejectionError(reason) {
	if (reason instanceof Error) return reason;
	return new Error(`Unhandled rejection: ${formatUnknownErrorDetails(reason)}`);
}
|
|
213
|
+
/**
 * Reports a fatal error through handleFatalRunChildError and then terminates
 * the process with exit code 1. A failure of the reporting step itself is
 * logged to stderr and does not prevent the exit.
 *
 * @param {unknown} error - The fatal error to report.
 * @returns {Promise<void>} Never settles in practice, since the process exits.
 */
async function reportFatalRunChildErrorAndExit(error) {
	try {
		await handleFatalRunChildError(error);
	} catch (reportError) {
		console.error("Failed to report fatal run child error:");
		console.error(formatUnknownErrorDetails(reportError));
	} finally {
		process.exit(1);
	}
}
|
|
223
|
+
// Entry point: install the fatal-error listeners first so failures anywhere in
// main() are reported, then run and finally close the IPC channel on purpose.
installFatalRunChildErrorHandlers();
try {
	await main();
} catch (error) {
	await handleFatalRunChildError(error);
}
// Tell the "disconnect" listener registered in main() that this disconnect is intentional.
disconnectExpected = true;
// process.disconnect only exists when the process was started with an IPC
// channel; calling it unguarded would throw after an otherwise successful run.
if (process.connected) process.disconnect();
|
|
158
229
|
//#endregion
|
|
159
230
|
export {};
|