@ls-stack/agent-eval 0.23.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-Cw79dJDr.mjs → app-DYRmucgj.mjs} +3 -3
- package/dist/apps/web/dist/assets/{index-AUDD3rNB.js → index-KbbX3NYr.js} +35 -35
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +1 -1
- package/dist/{cli-D3QNOcPN.mjs → cli-Be0x8CS3.mjs} +3 -3
- package/dist/index.d.mts +51 -7
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-CimthgI7.mjs → runOrchestration-D697g6Qe.mjs} +143 -31
- package/dist/{runner-4yNYRvmF.mjs → runner-B4SosWgD.mjs} +2 -2
- package/dist/{runner-B-SYzW8w.mjs → runner-jSujaSKt.mjs} +1 -1
- package/dist/src-D6cettg0.mjs +3 -0
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +8 -5
- package/dist/src-CcXfWT4M.mjs +0 -3
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-AUDD3rNB.js"></script>
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-KbbX3NYr.js"></script>
|
|
29
29
|
<link rel="stylesheet" crossorigin href="/assets/index-r0dVFK0B.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
package/dist/bin.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { C as validateCharts, F as
|
|
1
|
+
import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, Rn as getEvalRegistry, S as normalizeScoreDef, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as loadConfig, c as persistCaseDetail, d as recomputePersistedCaseStatus, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, l as persistRunState, m as buildEvalSummary, n as generateRunId, nt as resolveLlmCallsConfig, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, tt as resolveApiCallsConfig, u as recomputeEvalStatusesInRuns, v as parseEvalMetas, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-D697g6Qe.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { dirname, join, relative, resolve } from "node:path";
|
|
@@ -959,8 +959,8 @@ async function commandApp(args) {
|
|
|
959
959
|
const { serve } = await import("@hono/node-server");
|
|
960
960
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
961
961
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
962
|
-
const appModule = await import("./app-Cw79dJDr.mjs");
|
|
963
|
-
const runnerModule = await import("./runner-B-SYzW8w.mjs");
|
|
962
|
+
const appModule = await import("./app-DYRmucgj.mjs");
|
|
963
|
+
const runnerModule = await import("./runner-jSujaSKt.mjs");
|
|
964
964
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
965
965
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
966
966
|
await runnerModule.initRunner();
|
package/dist/index.d.mts
CHANGED
|
@@ -1539,13 +1539,35 @@ declare const apiCallMetricPlacementSchema: z$1.ZodEnum<{
|
|
|
1539
1539
|
}>;
|
|
1540
1540
|
/** Placement option for an API-call metric. */
|
|
1541
1541
|
type ApiCallMetricPlacement = z$1.infer<typeof apiCallMetricPlacementSchema>;
|
|
1542
|
+
/** Context passed to an LLM/API-call derived attribute function. */
|
|
1543
|
+
type CallDerivedAttributeContext = {
|
|
1544
|
+
/** Raw attributes from the matching trace span. */attributes: Record<string, unknown> | undefined; /** Matching trace span. */
|
|
1545
|
+
span: EvalTraceSpan; /** Dot-path helper for reading from `span.attributes`. */
|
|
1546
|
+
get: (path: string) => unknown;
|
|
1547
|
+
};
|
|
1548
|
+
/**
|
|
1549
|
+
* Runner-side function used to derive one new span attribute from a matching
|
|
1550
|
+
* LLM/API-call span. Return `undefined` to omit the attribute for that span.
|
|
1551
|
+
*/
|
|
1552
|
+
type CallDerivedAttribute = (ctx: CallDerivedAttributeContext) => unknown;
|
|
1553
|
+
/** One resolved derived span attribute rule. */
|
|
1554
|
+
type ResolvedCallDerivedAttribute = {
|
|
1555
|
+
/** Dot-path where the derived value is persisted on `span.attributes`. */path: string;
|
|
1556
|
+
/**
|
|
1557
|
+
* Function that derives the persisted value for each matching span. Omitted
|
|
1558
|
+
* after this config is serialized to the browser.
|
|
1559
|
+
*/
|
|
1560
|
+
compute?: CallDerivedAttribute;
|
|
1561
|
+
};
|
|
1542
1562
|
/**
|
|
1543
1563
|
* Schema for a single user-defined metric attached to LLM call rows.
|
|
1544
1564
|
*
|
|
1545
1565
|
* Each metric reads `path` from the span's `attributes` and renders the value
|
|
1546
|
-
* with the configured `format` and `numberFormat`.
|
|
1547
|
-
*
|
|
1548
|
-
*
|
|
1566
|
+
* with the configured `format` and `numberFormat`. Use
|
|
1567
|
+
* `llmCalls.derivedAttributes` when a metric should read a value computed from
|
|
1568
|
+
* other attributes. `placements` controls whether the metric appears as a chip
|
|
1569
|
+
* on the collapsed row header, as a row inside the expanded body, or both.
|
|
1570
|
+
* Defaults to `['body']` when omitted.
|
|
1549
1571
|
*/
|
|
1550
1572
|
declare const llmCallMetricSchema: z$1.ZodObject<{
|
|
1551
1573
|
label: z$1.ZodString;
|
|
@@ -1570,9 +1592,11 @@ type LlmCallMetric = z$1.infer<typeof llmCallMetricSchema>;
|
|
|
1570
1592
|
* Schema for a single user-defined metric attached to API call rows.
|
|
1571
1593
|
*
|
|
1572
1594
|
* Each metric reads `path` from the span's `attributes` and renders the value
|
|
1573
|
-
* with the configured `format` and `numberFormat`.
|
|
1574
|
-
*
|
|
1575
|
-
*
|
|
1595
|
+
* with the configured `format` and `numberFormat`. Use
|
|
1596
|
+
* `apiCalls.derivedAttributes` when a metric should read a value computed from
|
|
1597
|
+
* other attributes. `placements` controls whether the metric appears as a chip
|
|
1598
|
+
* on the collapsed row header, as a row inside the expanded body, or both.
|
|
1599
|
+
* Defaults to `['body']` when omitted.
|
|
1576
1600
|
*/
|
|
1577
1601
|
declare const apiCallMetricSchema: z$1.ZodObject<{
|
|
1578
1602
|
label: z$1.ZodString;
|
|
@@ -1629,6 +1653,7 @@ declare const llmCallsConfigSchema: z$1.ZodObject<{
|
|
|
1629
1653
|
reasoning: z$1.ZodOptional<z$1.ZodString>;
|
|
1630
1654
|
toolCalls: z$1.ZodOptional<z$1.ZodString>;
|
|
1631
1655
|
}, z$1.core.$strip>>;
|
|
1656
|
+
derivedAttributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodCustom<CallDerivedAttribute, CallDerivedAttribute>>>;
|
|
1632
1657
|
pricing: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
1633
1658
|
model: z$1.ZodString;
|
|
1634
1659
|
provider: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -1674,6 +1699,7 @@ declare const apiCallsConfigSchema: z$1.ZodObject<{
|
|
|
1674
1699
|
durationMs: z$1.ZodOptional<z$1.ZodString>;
|
|
1675
1700
|
error: z$1.ZodOptional<z$1.ZodString>;
|
|
1676
1701
|
}, z$1.core.$strip>>;
|
|
1702
|
+
derivedAttributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodCustom<CallDerivedAttribute, CallDerivedAttribute>>>;
|
|
1677
1703
|
metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
1678
1704
|
label: z$1.ZodString;
|
|
1679
1705
|
tooltip: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -1720,6 +1746,7 @@ type ResolvedLlmCallsConfig = {
|
|
|
1720
1746
|
reasoning: string;
|
|
1721
1747
|
toolCalls: string;
|
|
1722
1748
|
};
|
|
1749
|
+
derivedAttributes: ResolvedCallDerivedAttribute[];
|
|
1723
1750
|
metrics: ResolvedLlmCallMetric[];
|
|
1724
1751
|
pricing: ResolvedLlmCallPricing[];
|
|
1725
1752
|
};
|
|
@@ -1738,6 +1765,7 @@ type ResolvedApiCallsConfig = {
|
|
|
1738
1765
|
durationMs: string;
|
|
1739
1766
|
error: string;
|
|
1740
1767
|
};
|
|
1768
|
+
derivedAttributes: ResolvedCallDerivedAttribute[];
|
|
1741
1769
|
metrics: ResolvedApiCallMetric[];
|
|
1742
1770
|
};
|
|
1743
1771
|
/** Fully-resolved LLM-call metric used by the runner and UI. */
|
|
@@ -1983,6 +2011,7 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
1983
2011
|
reasoning: z$1.ZodOptional<z$1.ZodString>;
|
|
1984
2012
|
toolCalls: z$1.ZodOptional<z$1.ZodString>;
|
|
1985
2013
|
}, z$1.core.$strip>>;
|
|
2014
|
+
derivedAttributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodCustom<CallDerivedAttribute, CallDerivedAttribute>>>;
|
|
1986
2015
|
pricing: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
1987
2016
|
model: z$1.ZodString;
|
|
1988
2017
|
provider: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -2037,6 +2066,7 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
2037
2066
|
durationMs: z$1.ZodOptional<z$1.ZodString>;
|
|
2038
2067
|
error: z$1.ZodOptional<z$1.ZodString>;
|
|
2039
2068
|
}, z$1.core.$strip>>;
|
|
2069
|
+
derivedAttributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodCustom<CallDerivedAttribute, CallDerivedAttribute>>>;
|
|
2040
2070
|
metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
2041
2071
|
label: z$1.ZodString;
|
|
2042
2072
|
tooltip: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -2067,6 +2097,20 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
2067
2097
|
}, z$1.core.$strip>>;
|
|
2068
2098
|
}, z$1.core.$strip>;
|
|
2069
2099
|
//#endregion
|
|
2100
|
+
//#region ../shared/src/utils/deriveCallAttributes.d.ts
|
|
2101
|
+
/**
|
|
2102
|
+
* Persist configured derived attributes onto matching LLM/API spans.
|
|
2103
|
+
*
|
|
2104
|
+
* These derived attributes are applied before trace consumers run, so
|
|
2105
|
+
* `deriveFromTracing`, default usage extraction, trace display, and call
|
|
2106
|
+
* metrics can all read them by normal dot-path lookup.
|
|
2107
|
+
*/
|
|
2108
|
+
declare function applyDerivedCallAttributes(params: {
|
|
2109
|
+
spans: EvalTraceSpan[];
|
|
2110
|
+
llmCallsConfig: ResolvedLlmCallsConfig;
|
|
2111
|
+
apiCallsConfig: ResolvedApiCallsConfig;
|
|
2112
|
+
}): EvalTraceSpan[];
|
|
2113
|
+
//#endregion
|
|
2070
2114
|
//#region ../shared/src/utils/extractLlmCalls.d.ts
|
|
2071
2115
|
/** Resolved value for one user-defined metric on an LLM call row. */
|
|
2072
2116
|
type LlmCallMetricValue = {
|
|
@@ -3623,4 +3667,4 @@ declare function createRunner({
|
|
|
3623
3667
|
*/
|
|
3624
3668
|
declare function runCli(argv: string[]): Promise<void>;
|
|
3625
3669
|
//#endregion
|
|
3626
|
-
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type 
RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, 
evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
3670
|
+
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallsConfigInput, type 
NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, 
evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as
|
|
2
|
-
import { n as createRunner, t as runCli } from "./cli-D3QNOcPN.mjs";
|
|
3
|
-
import "./src-CcXfWT4M.mjs";
|
|
4
|
-
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, 
runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
1
|
+
import { $ as llmCallsConfigSchema, $t as columnFormatSchema, A as extractApiCalls, An as runInEvalRuntimeScope, At as cacheEntryWithDebugKeySchema, B as runSummarySchema, Bt as traceCacheRefSchema, Cn as getCurrentScope, Ct as evalChartMetricSchema, D as sseEnvelopeSchema, Dn as isInEvalScope, Dt as cacheDebugKeyEntrySchema, E as updateManualScoreRequestSchema, En as incrementEvalOutput, Et as evalChartsConfigSchema, F as getEvalDisplayStatus, Fn as startEvalBackgroundJob, Ft as cacheRecordingOpSchema, G as apiCallMetricPlacementSchema, Gt as traceDisplayConfigSchema, H as DEFAULT_LLM_CALLS_CONFIG, Ht as traceAttributeDisplayInputSchema, I as deriveScopedSummaryFromCases, In as repoFile, It as cacheRecordingSchema, J as defaultConfigKeySchema, Jt as traceSpanKindSchema, K as apiCallMetricSchema, Kt as traceDisplayInputConfigSchema, L as deriveStatusFromCaseRows, Ln as defineEval, Lt as cacheStatusSchema, M as applyDerivedCallAttributes, Mn as runInExistingEvalScope, Mt as cacheListItemSchema, N as getNestedAttribute, Nn as setEvalOutput, Nt as cacheModeSchema, O as extractCacheEntries, On as mergeEvalOutput, Ot as cacheDebugKeyFileSchema, P as getEvalTitle, Pn as setScopeCacheContext, Pt as cacheOperationTypeSchema, Q as llmCallPricingSchema, Qt as columnDefSchema, R as deriveStatusFromChildStatuses, Rn as getEvalRegistry, Rt as serializedCacheSpanSchema, Sn as evalLog, St as evalChartConfigSchema, T as createRunRequestSchema, Tn as getEvalStartTime, Tt as evalChartTypeSchema, U as agentEvalsConfigSchema, Ut as traceAttributeDisplayPlacementSchema, V as DEFAULT_API_CALLS_CONFIG, Vt as traceAttributeDisplayFormatSchema, W as apiCallMetricFormatSchema, Wt as traceAttributeDisplaySchema, X as llmCallMetricPlacementSchema, Xt as traceSpanWarningSchema, Y as llmCallMetricFormatSchema, Yt as traceSpanSchema, Z as llmCallMetricSchema, Zt as cellValueSchema, _n as EvalAssertionError, _t as scoreTraceSchema, an as runArtifactRefSchema, at as assertionFailureSchema, bt as 
evalChartBuiltinMetricSchema, cn as captureEvalSpanError, ct as evalFreshnessStatusSchema, dn as hashCacheKey, dt as evalStatsConfigSchema, en as columnKindSchema, et as removeDefaultConfigSchema, fn as hashCacheKeySync, ft as evalSummarySchema, gn as serializeCacheValue, gt as runLogPhaseSchema, hn as serializeCacheRecording, ht as runLogLocationSchema, in as repoFileRefSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as runInEvalScope, jt as cacheFileSchema, k as extractCacheHits, kn as nextEvalId, kt as cacheEntrySchema, ln as evalSpan, lt as evalStatAggregateSchema, mn as deserializeCacheValue, mt as runLogLevelSchema, nn as jsonCellSchema, nt as resolveLlmCallsConfig, on as z, ot as caseDetailSchema, pn as deserializeCacheRecording, pt as runLogEntrySchema, q as apiCallsConfigSchema, qt as traceSpanErrorSchema, rn as numberDisplayOptionsSchema, rt as runLogsConfigSchema, sn as buildTraceTree, st as caseRowSchema, tn as fileRefSchema, tt as resolveApiCallsConfig, un as evalTracer, ut as evalStatItemSchema, vn as advanceEvalTime, vt as evalChartAggregateSchema, wn as getEvalCaseInput, wt as evalChartTooltipExtraSchema, xn as evalAssert, xt as evalChartColorSchema, yn as appendToEvalOutput, yt as evalChartAxisSchema, z as runManifestSchema, zt as spanCacheOptionsSchema } from "./runOrchestration-D697g6Qe.mjs";
|
|
2
|
+
import { n as createRunner, t as runCli } from "./cli-Be0x8CS3.mjs";
|
|
3
|
+
import "./src-D6cettg0.mjs";
|
|
4
|
+
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, 
runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { B as runSummarySchema, Et as evalChartsConfigSchema, Qt as columnDefSchema, T as createRunRequestSchema, b as loadConfig, bn as configureEvalRunLogs, dt as evalStatsConfigSchema, t as executeRun, v as parseEvalMetas, w as createFsCacheStore, z as runManifestSchema } from "./runOrchestration-D697g6Qe.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|
|
@@ -1330,7 +1330,7 @@ const errorCoreFields = new Set([
|
|
|
1330
1330
|
"stack",
|
|
1331
1331
|
"capturedAt"
|
|
1332
1332
|
]);
|
|
1333
|
-
function isRecord$
|
|
1333
|
+
function isRecord$5(value) {
|
|
1334
1334
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
1335
1335
|
}
|
|
1336
1336
|
function formatUnknownErrorMessage(error) {
|
|
@@ -1358,7 +1358,7 @@ function normalizeTraceError(error, capturedAt = void 0) {
|
|
|
1358
1358
|
stack: error.stack,
|
|
1359
1359
|
capturedAt
|
|
1360
1360
|
};
|
|
1361
|
-
if (isRecord$
|
|
1361
|
+
if (isRecord$5(error)) {
|
|
1362
1362
|
const extraFields = getErrorExtraFields(error);
|
|
1363
1363
|
const name = typeof error.name === "string" ? error.name : void 0;
|
|
1364
1364
|
const stack = typeof error.stack === "string" ? error.stack : void 0;
|
|
@@ -1383,7 +1383,7 @@ function normalizeTraceWarnings(warningOrWarnings, additionalWarnings, capturedA
|
|
|
1383
1383
|
return (additionalWarnings.length > 0 ? [warningOrWarnings, ...additionalWarnings] : Array.isArray(warningOrWarnings) ? warningOrWarnings : [warningOrWarnings]).map((warning) => normalizeTraceError(warning, capturedAt));
|
|
1384
1384
|
}
|
|
1385
1385
|
function isCaptureEvalSpanErrorOptions(value) {
|
|
1386
|
-
if (!isRecord$4(value)) return false;
|
|
1386
|
+
if (!isRecord$5(value)) return false;
|
|
1387
1387
|
const keys = Object.keys(value);
|
|
1388
1388
|
if (keys.length === 0) return false;
|
|
1389
1389
|
if (!keys.every((key) => key === "level")) return false;
|
|
@@ -2727,13 +2727,16 @@ const apiCallMetricFormatSchema = llmCallMetricFormatSchema;
|
|
|
2727
2727
|
const llmCallMetricPlacementSchema = z.enum(["header", "body"]);
|
|
2728
2728
|
/** Where an API-call metric is rendered inside the API calls tab. */
|
|
2729
2729
|
const apiCallMetricPlacementSchema = llmCallMetricPlacementSchema;
|
|
2730
|
+
const callDerivedAttributeSchema = z.custom((value) => typeof value === "function", { message: "Expected a derived attribute function" });
|
|
2730
2731
|
/**
|
|
2731
2732
|
* Schema for a single user-defined metric attached to LLM call rows.
|
|
2732
2733
|
*
|
|
2733
2734
|
* Each metric reads `path` from the span's `attributes` and renders the value
|
|
2734
|
-
* with the configured `format` and `numberFormat`.
|
|
2735
|
-
*
|
|
2736
|
-
*
|
|
2735
|
+
* with the configured `format` and `numberFormat`. Use
|
|
2736
|
+
* `llmCalls.derivedAttributes` when a metric should read a value computed from
|
|
2737
|
+
* other attributes. `placements` controls whether the metric appears as a chip
|
|
2738
|
+
* on the collapsed row header, as a row inside the expanded body, or both.
|
|
2739
|
+
* Defaults to `['body']` when omitted.
|
|
2737
2740
|
*/
|
|
2738
2741
|
const llmCallMetricSchema = z.object({
|
|
2739
2742
|
/** Display label for the metric row or header chip. */
|
|
@@ -2760,9 +2763,11 @@ const llmCallMetricSchema = z.object({
|
|
|
2760
2763
|
* Schema for a single user-defined metric attached to API call rows.
|
|
2761
2764
|
*
|
|
2762
2765
|
* Each metric reads `path` from the span's `attributes` and renders the value
|
|
2763
|
-
* with the configured `format` and `numberFormat`.
|
|
2764
|
-
*
|
|
2765
|
-
*
|
|
2766
|
+
* with the configured `format` and `numberFormat`. Use
|
|
2767
|
+
* `apiCalls.derivedAttributes` when a metric should read a value computed from
|
|
2768
|
+
* other attributes. `placements` controls whether the metric appears as a chip
|
|
2769
|
+
* on the collapsed row header, as a row inside the expanded body, or both.
|
|
2770
|
+
* Defaults to `['body']` when omitted.
|
|
2766
2771
|
*/
|
|
2767
2772
|
const apiCallMetricSchema = z.object({
|
|
2768
2773
|
/** Display label for the metric row or header chip. */
|
|
@@ -2839,6 +2844,13 @@ const llmCallsConfigSchema = z.object({
|
|
|
2839
2844
|
toolCalls: z.string().optional()
|
|
2840
2845
|
}).optional(),
|
|
2841
2846
|
/**
|
|
2847
|
+
* Derived attributes persisted onto every matching LLM span before
|
|
2848
|
+
* `deriveFromTracing`, default outputs, trace display, and call metrics read
|
|
2849
|
+
* the trace. Keys are dot-paths under `span.attributes`; return `undefined`
|
|
2850
|
+
* to skip writing the attribute for one span.
|
|
2851
|
+
*/
|
|
2852
|
+
derivedAttributes: z.record(z.string().min(1), callDerivedAttributeSchema).optional(),
|
|
2853
|
+
/**
|
|
2842
2854
|
* Model/provider pricing registry used to calculate LLM-call costs from
|
|
2843
2855
|
* token counts. Built-in LLM cost fields are only derived from this registry.
|
|
2844
2856
|
*/
|
|
@@ -2867,6 +2879,13 @@ const apiCallsConfigSchema = z.object({
|
|
|
2867
2879
|
durationMs: z.string().optional(),
|
|
2868
2880
|
error: z.string().optional()
|
|
2869
2881
|
}).optional(),
|
|
2882
|
+
/**
|
|
2883
|
+
* Derived attributes persisted onto every matching API span before trace
|
|
2884
|
+
* display and call metrics read the trace. Keys are dot-paths under
|
|
2885
|
+
* `span.attributes`; return `undefined` to skip writing the attribute for
|
|
2886
|
+
* one span.
|
|
2887
|
+
*/
|
|
2888
|
+
derivedAttributes: z.record(z.string().min(1), callDerivedAttributeSchema).optional(),
|
|
2870
2889
|
/** Custom user-defined metrics surfaced on each API call. */
|
|
2871
2890
|
metrics: z.array(apiCallMetricSchema).optional()
|
|
2872
2891
|
});
|
|
@@ -2898,6 +2917,7 @@ const DEFAULT_LLM_CALLS_CONFIG = {
|
|
|
2898
2917
|
reasoning: "reasoning",
|
|
2899
2918
|
toolCalls: "toolCalls"
|
|
2900
2919
|
},
|
|
2920
|
+
derivedAttributes: [],
|
|
2901
2921
|
metrics: [],
|
|
2902
2922
|
pricing: []
|
|
2903
2923
|
};
|
|
@@ -2921,8 +2941,35 @@ const DEFAULT_API_CALLS_CONFIG = {
|
|
|
2921
2941
|
durationMs: "durationMs",
|
|
2922
2942
|
error: "error"
|
|
2923
2943
|
},
|
|
2944
|
+
derivedAttributes: [],
|
|
2924
2945
|
metrics: []
|
|
2925
2946
|
};
|
|
2947
|
+
function resolveDerivedAttributes(input) {
|
|
2948
|
+
return Object.entries(input ?? {}).map(([path, compute]) => ({
|
|
2949
|
+
path,
|
|
2950
|
+
compute
|
|
2951
|
+
}));
|
|
2952
|
+
}
|
|
2953
|
+
function resolveLlmCallMetric(metric) {
|
|
2954
|
+
return {
|
|
2955
|
+
label: metric.label,
|
|
2956
|
+
tooltip: metric.tooltip,
|
|
2957
|
+
path: metric.path,
|
|
2958
|
+
format: metric.format ?? "string",
|
|
2959
|
+
numberFormat: metric.numberFormat,
|
|
2960
|
+
placements: metric.placements ? [...metric.placements] : ["body"]
|
|
2961
|
+
};
|
|
2962
|
+
}
|
|
2963
|
+
function resolveApiCallMetric(metric) {
|
|
2964
|
+
return {
|
|
2965
|
+
label: metric.label,
|
|
2966
|
+
tooltip: metric.tooltip,
|
|
2967
|
+
path: metric.path,
|
|
2968
|
+
format: metric.format ?? "string",
|
|
2969
|
+
numberFormat: metric.numberFormat,
|
|
2970
|
+
placements: metric.placements ? [...metric.placements] : ["body"]
|
|
2971
|
+
};
|
|
2972
|
+
}
|
|
2926
2973
|
/**
|
|
2927
2974
|
* Resolve the user-authored LLM-calls config to a fully-defaulted shape used
|
|
2928
2975
|
* by the UI to derive the LLM calls tab.
|
|
@@ -2942,14 +2989,8 @@ function resolveLlmCallsConfig(input) {
|
|
|
2942
2989
|
...DEFAULT_LLM_CALLS_CONFIG.attributes,
|
|
2943
2990
|
...input?.attributes
|
|
2944
2991
|
},
|
|
2945
|
-
|
|
2946
|
-
|
|
2947
|
-
tooltip: m.tooltip,
|
|
2948
|
-
path: m.path,
|
|
2949
|
-
format: m.format ?? "string",
|
|
2950
|
-
numberFormat: m.numberFormat,
|
|
2951
|
-
placements: m.placements ? [...m.placements] : ["body"]
|
|
2952
|
-
})),
|
|
2992
|
+
derivedAttributes: resolveDerivedAttributes(input?.derivedAttributes),
|
|
2993
|
+
metrics: (input?.metrics ?? []).map(resolveLlmCallMetric),
|
|
2953
2994
|
pricing: (input?.pricing ?? []).map((p) => ({
|
|
2954
2995
|
model: p.model,
|
|
2955
2996
|
provider: p.provider,
|
|
@@ -2979,14 +3020,8 @@ function resolveApiCallsConfig(input) {
|
|
|
2979
3020
|
...DEFAULT_API_CALLS_CONFIG.attributes,
|
|
2980
3021
|
...input?.attributes
|
|
2981
3022
|
},
|
|
2982
|
-
|
|
2983
|
-
|
|
2984
|
-
tooltip: m.tooltip,
|
|
2985
|
-
path: m.path,
|
|
2986
|
-
format: m.format ?? "string",
|
|
2987
|
-
numberFormat: m.numberFormat,
|
|
2988
|
-
placements: m.placements ? [...m.placements] : ["body"]
|
|
2989
|
-
}))
|
|
3023
|
+
derivedAttributes: resolveDerivedAttributes(input?.derivedAttributes),
|
|
3024
|
+
metrics: (input?.metrics ?? []).map(resolveApiCallMetric)
|
|
2990
3025
|
};
|
|
2991
3026
|
}
|
|
2992
3027
|
/** Zod schema for validating `agent-evals.config.ts` input. */
|
|
@@ -3206,7 +3241,7 @@ function getEvalTitle(evalLike) {
|
|
|
3206
3241
|
}
|
|
3207
3242
|
//#endregion
|
|
3208
3243
|
//#region ../shared/src/utils/getNestedAttribute.ts
|
|
3209
|
-
function isRecord$3(value) {
|
|
3244
|
+
function isRecord$4(value) {
|
|
3210
3245
|
return typeof value === "object" && value !== null;
|
|
3211
3246
|
}
|
|
3212
3247
|
/**
|
|
@@ -3221,12 +3256,84 @@ function getNestedAttribute(value, path) {
|
|
|
3221
3256
|
const parts = path.split(".");
|
|
3222
3257
|
let current = value;
|
|
3223
3258
|
for (const part of parts) {
|
|
3224
|
-
if (!isRecord$3(current) || !(part in current)) return;
|
|
3259
|
+
if (!isRecord$4(current) || !(part in current)) return;
|
|
3225
3260
|
current = current[part];
|
|
3226
3261
|
}
|
|
3227
3262
|
return current;
|
|
3228
3263
|
}
|
|
3229
3264
|
//#endregion
|
|
3265
|
+
//#region ../shared/src/utils/deriveCallAttributes.ts
|
|
3266
|
+
function isRecord$3(value) {
|
|
3267
|
+
return typeof value === "object" && value !== null;
|
|
3268
|
+
}
|
|
3269
|
+
function mergeNestedAttribute$1(value, path, attributeValue) {
|
|
3270
|
+
const root = value === void 0 ? {} : { ...value };
|
|
3271
|
+
const parts = path.split(".");
|
|
3272
|
+
let current = root;
|
|
3273
|
+
for (const [index, part] of parts.entries()) {
|
|
3274
|
+
if (index === parts.length - 1) {
|
|
3275
|
+
current[part] = attributeValue;
|
|
3276
|
+
continue;
|
|
3277
|
+
}
|
|
3278
|
+
const nextValue = current[part];
|
|
3279
|
+
const nextRecord = isRecord$3(nextValue) ? { ...nextValue } : {};
|
|
3280
|
+
current[part] = nextRecord;
|
|
3281
|
+
current = nextRecord;
|
|
3282
|
+
}
|
|
3283
|
+
return root;
|
|
3284
|
+
}
|
|
3285
|
+
function applyDerivedAttributesForKind(params) {
|
|
3286
|
+
let attributes = params.span.attributes;
|
|
3287
|
+
for (const derivedAttribute of params.derivedAttributes) {
|
|
3288
|
+
if (derivedAttribute.compute === void 0) continue;
|
|
3289
|
+
const span = {
|
|
3290
|
+
...params.span,
|
|
3291
|
+
attributes
|
|
3292
|
+
};
|
|
3293
|
+
const value = (() => {
|
|
3294
|
+
try {
|
|
3295
|
+
return derivedAttribute.compute({
|
|
3296
|
+
attributes,
|
|
3297
|
+
span,
|
|
3298
|
+
get: (path) => getNestedAttribute(attributes, path)
|
|
3299
|
+
});
|
|
3300
|
+
} catch {
|
|
3301
|
+
return;
|
|
3302
|
+
}
|
|
3303
|
+
})();
|
|
3304
|
+
if (value === void 0) continue;
|
|
3305
|
+
attributes = mergeNestedAttribute$1(attributes, derivedAttribute.path, value);
|
|
3306
|
+
}
|
|
3307
|
+
if (attributes === params.span.attributes) return params.span;
|
|
3308
|
+
return {
|
|
3309
|
+
...params.span,
|
|
3310
|
+
attributes
|
|
3311
|
+
};
|
|
3312
|
+
}
|
|
3313
|
+
/**
|
|
3314
|
+
* Persist configured derived attributes onto matching LLM/API spans.
|
|
3315
|
+
*
|
|
3316
|
+
* These derived attributes are applied before trace consumers run, so
|
|
3317
|
+
* `deriveFromTracing`, default usage extraction, trace display, and call
|
|
3318
|
+
* metrics can all read them by normal dot-path lookup.
|
|
3319
|
+
*/
|
|
3320
|
+
function applyDerivedCallAttributes(params) {
|
|
3321
|
+
const llmKinds = new Set(params.llmCallsConfig.kinds);
|
|
3322
|
+
const apiKinds = new Set(params.apiCallsConfig.kinds);
|
|
3323
|
+
return params.spans.map((span) => {
|
|
3324
|
+
let nextSpan = span;
|
|
3325
|
+
if (llmKinds.has(span.kind)) nextSpan = applyDerivedAttributesForKind({
|
|
3326
|
+
span: nextSpan,
|
|
3327
|
+
derivedAttributes: params.llmCallsConfig.derivedAttributes
|
|
3328
|
+
});
|
|
3329
|
+
if (apiKinds.has(span.kind)) nextSpan = applyDerivedAttributesForKind({
|
|
3330
|
+
span: nextSpan,
|
|
3331
|
+
derivedAttributes: params.apiCallsConfig.derivedAttributes
|
|
3332
|
+
});
|
|
3333
|
+
return nextSpan;
|
|
3334
|
+
});
|
|
3335
|
+
}
|
|
3336
|
+
//#endregion
|
|
3230
3337
|
//#region ../shared/src/utils/extractLlmCalls.ts
|
|
3231
3338
|
function readNumber$2(attributes, path) {
|
|
3232
3339
|
const raw = getNestedAttribute(attributes, path);
|
|
@@ -5361,7 +5468,12 @@ async function runCase(params) {
|
|
|
5361
5468
|
startTime: evalDef.startTime,
|
|
5362
5469
|
freezeTime: evalDef.freezeTime
|
|
5363
5470
|
});
|
|
5364
|
-
const traceTree = buildTraceTree(scope.spans, scope.checkpoints);
|
|
5471
|
+
const spansWithDerivedAttributes = applyDerivedCallAttributes({
|
|
5472
|
+
spans: scope.spans,
|
|
5473
|
+
llmCallsConfig,
|
|
5474
|
+
apiCallsConfig
|
|
5475
|
+
});
|
|
5476
|
+
const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
|
|
5365
5477
|
const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
|
|
5366
5478
|
if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
|
|
5367
5479
|
if (!nonAssertError && evalDef.deriveFromTracing) {
|
|
@@ -5383,7 +5495,7 @@ async function runCase(params) {
|
|
|
5383
5495
|
}
|
|
5384
5496
|
if (!nonAssertError) addDefaultOutputs({
|
|
5385
5497
|
outputs: scope.outputs,
|
|
5386
|
-
spans: scope.spans,
|
|
5498
|
+
spans: spansWithDerivedAttributes,
|
|
5387
5499
|
llmCallsConfig,
|
|
5388
5500
|
apiCallsConfig,
|
|
5389
5501
|
globalRemove: globalRemoveDefaultConfig,
|
|
@@ -5471,7 +5583,7 @@ async function runCase(params) {
|
|
|
5471
5583
|
}
|
|
5472
5584
|
}
|
|
5473
5585
|
const status = nonAssertError ? "error" : passed ? "pass" : "fail";
|
|
5474
|
-
const { trace: displayTrace, traceDisplay } = resolveTracePresentation(scope.spans, globalTraceDisplay, evalDef.traceDisplay);
|
|
5586
|
+
const { trace: displayTrace, traceDisplay } = resolveTracePresentation(spansWithDerivedAttributes, globalTraceDisplay, evalDef.traceDisplay);
|
|
5475
5587
|
const columns = {};
|
|
5476
5588
|
const columnOverrides = mergeDefaultColumns({
|
|
5477
5589
|
columns: evalDef.columns,
|
|
@@ -5938,4 +6050,4 @@ function toLastRunStatus(status) {
|
|
|
5938
6050
|
return status === "pending" ? null : status;
|
|
5939
6051
|
}
|
|
5940
6052
|
//#endregion
|
|
5941
|
-
export {
|
|
6053
|
+
export { llmCallsConfigSchema as $, columnFormatSchema as $t, extractApiCalls as A, runInEvalRuntimeScope as An, cacheEntryWithDebugKeySchema as At, runSummarySchema as B, traceCacheRefSchema as Bt, validateCharts as C, getCurrentScope as Cn, evalChartMetricSchema as Ct, sseEnvelopeSchema as D, isInEvalScope as Dn, cacheDebugKeyEntrySchema as Dt, updateManualScoreRequestSchema as E, incrementEvalOutput as En, evalChartsConfigSchema as Et, getEvalDisplayStatus as F, startEvalBackgroundJob as Fn, cacheRecordingOpSchema as Ft, apiCallMetricPlacementSchema as G, traceDisplayConfigSchema as Gt, DEFAULT_LLM_CALLS_CONFIG as H, traceAttributeDisplayInputSchema as Ht, deriveScopedSummaryFromCases as I, repoFile as In, cacheRecordingSchema as It, defaultConfigKeySchema as J, traceSpanKindSchema as Jt, apiCallMetricSchema as K, traceDisplayInputConfigSchema as Kt, deriveStatusFromCaseRows as L, defineEval as Ln, cacheStatusSchema as Lt, applyDerivedCallAttributes as M, runInExistingEvalScope as Mn, cacheListItemSchema as Mt, getNestedAttribute as N, setEvalOutput as Nn, cacheModeSchema as Nt, extractCacheEntries as O, mergeEvalOutput as On, cacheDebugKeyFileSchema as Ot, getEvalTitle as P, setScopeCacheContext as Pn, cacheOperationTypeSchema as Pt, llmCallPricingSchema as Q, columnDefSchema as Qt, deriveStatusFromChildStatuses as R, getEvalRegistry as Rn, serializedCacheSpanSchema as Rt, normalizeScoreDef as S, evalLog as Sn, evalChartConfigSchema as St, createRunRequestSchema as T, getEvalStartTime as Tn, evalChartTypeSchema as Tt, agentEvalsConfigSchema as U, traceAttributeDisplayPlacementSchema as Ut, DEFAULT_API_CALLS_CONFIG as V, traceAttributeDisplayFormatSchema as Vt, apiCallMetricFormatSchema as W, traceAttributeDisplaySchema as Wt, llmCallMetricPlacementSchema as X, traceSpanWarningSchema as Xt, llmCallMetricFormatSchema as Y, traceSpanSchema as Yt, llmCallMetricSchema as Z, cellValueSchema as Zt, loadEvalModule as _, EvalAssertionError as _n, scoreTraceSchema as _t, 
loadPersistedRunSnapshot as a, runArtifactRefSchema as an, assertionFailureSchema as at, loadConfig as b, configureEvalRunLogs as bn, evalChartBuiltinMetricSchema as bt, persistCaseDetail as c, captureEvalSpanError as cn, evalFreshnessStatusSchema as ct, recomputePersistedCaseStatus as d, hashCacheKey as dn, evalStatsConfigSchema as dt, columnKindSchema as en, removeDefaultConfigSchema as et, runTouchesEval as f, hashCacheKeySync as fn, evalSummarySchema as ft, setLatestRunInfoMap as g, serializeCacheValue as gn, runLogPhaseSchema as gt, getTargetEvalIds as h, serializeCacheRecording as hn, runLogLocationSchema as ht, getLatestRunInfos as i, repoFileRefSchema as in, trialSelectionModeSchema as it, extractLlmCalls as j, runInEvalScope as jn, cacheFileSchema as jt, extractCacheHits as k, nextEvalId as kn, cacheEntrySchema as kt, persistRunState as l, evalSpan as ln, evalStatAggregateSchema as lt, buildEvalSummary as m, deserializeCacheValue as mn, runLogLevelSchema as mt, generateRunId as n, jsonCellSchema as nn, resolveLlmCallsConfig as nt, loadPersistedRunSnapshots as o, z$1 as on, caseDetailSchema as ot, resolveArtifactPath as p, deserializeCacheRecording as pn, runLogEntrySchema as pt, apiCallsConfigSchema as q, traceSpanErrorSchema as qt, getLastRunStatuses as r, numberDisplayOptionsSchema as rn, runLogsConfigSchema as rt, nextShortIdFromSnapshots as s, buildTraceTree as sn, caseRowSchema as st, executeRun as t, fileRefSchema as tn, resolveApiCallsConfig as tt, recomputeEvalStatusesInRuns as u, evalTracer as un, evalStatItemSchema as ut, parseEvalMetas as v, advanceEvalTime as vn, evalChartAggregateSchema as vt, createFsCacheStore as w, getEvalCaseInput as wn, evalChartTooltipExtraSchema as wt, buildDeclaredColumnDefs as x, evalAssert as xn, evalChartColorSchema as xt, resolveEvalDefaultConfig as y, appendToEvalOutput as yn, evalChartAxisSchema as yt, runManifestSchema as z, spanCacheOptionsSchema as zt };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-D3QNOcPN.mjs";
|
|
2
|
-
import "./src-CcXfWT4M.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-Be0x8CS3.mjs";
|
|
2
|
+
import "./src-D6cettg0.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-4yNYRvmF.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-B4SosWgD.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.23.0",
|
|
3
|
+
"version": "0.24.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -59,8 +59,8 @@
|
|
|
59
59
|
"@types/node": "^24.7.2",
|
|
60
60
|
"typescript": "^5.9.2",
|
|
61
61
|
"@agent-evals/runner": "0.0.1",
|
|
62
|
-
"@agent-evals/sdk": "0.0.1",
|
|
63
|
-
"@agent-evals/shared": "0.0.1"
|
|
62
|
+
"@agent-evals/shared": "0.0.1",
|
|
63
|
+
"@agent-evals/sdk": "0.0.1"
|
|
64
64
|
},
|
|
65
65
|
"scripts": {
|
|
66
66
|
"build": "pnpm --filter @agent-evals/web build && tsdown",
|
|
@@ -274,10 +274,12 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
274
274
|
attribute paths. `latencyMs` is time to first token; duration, total tokens,
|
|
275
275
|
tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
|
|
276
276
|
override `attributes.<field>` for non-default primitive span shapes, configure
|
|
277
|
-
`pricing` to derive USD costs from token counts by model/provider,
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
'
|
|
277
|
+
`pricing` to derive USD costs from token counts by model/provider, add
|
|
278
|
+
`derivedAttributes` to persist computed values back onto matching LLM spans
|
|
279
|
+
before trace consumers run, and add entries to `metrics` to surface arbitrary
|
|
280
|
+
user metrics (`format: 'string' | 'number' | 'duration' | 'json' |
|
|
281
|
+
'boolean'`, `placements: ['header' | 'body']`). `derivedAttributes` keys are
|
|
282
|
+
dot-paths under `span.attributes`; return `undefined` to skip one span.
|
|
281
283
|
- Default usage config derives missing eval outputs from matching LLM/API spans
|
|
282
284
|
before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`,
|
|
283
285
|
`inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`,
|
|
@@ -298,7 +300,8 @@ cacheCreationInputTokens` so cache details are not double-counted.
|
|
|
298
300
|
and `'fetch'` spans with `method`, `url`, `statusCode`, `request`,
|
|
299
301
|
`response`, `requestBody`, `responseBody`, `headers`, `durationMs`, and
|
|
300
302
|
`error` read from conventional attribute paths. Override `kinds` or
|
|
301
|
-
`attributes.<field>` for external tracers,
|
|
303
|
+
`attributes.<field>` for external tracers, add `derivedAttributes` for
|
|
304
|
+
computed persisted API span attributes, and add `metrics` with the same
|
|
302
305
|
formats and placements as LLM-call metrics.
|
|
303
306
|
- `runLogs` (in `agent-evals.config.ts`) controls case log capture. Use
|
|
304
307
|
`runLogs: { captureConsole: false }` to keep console output in the terminal
|
package/dist/src-CcXfWT4M.mjs
DELETED