@ls-stack/agent-eval 0.58.5 → 0.59.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-DLNmRUqH.mjs → app-B3PEtWqH.mjs} +4 -4
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +1 -1
- package/dist/{cli-ClAkjTvo.mjs → cli-Dkp2-rBm.mjs} +4 -4
- package/dist/index.d.mts +45 -9
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-BMnJXWhN.mjs → runExecution-C3XVZHRC.mjs} +94 -6
- package/dist/{runOrchestration-CvbTAoEb.mjs → runOrchestration-B5An-AEi.mjs} +1 -1
- package/dist/{runner-DJJekv9f.mjs → runner-BJXz_V_V.mjs} +1 -1
- package/dist/{runner-BfHgVhGS.mjs → runner-C9J-1fkp.mjs} +2 -2
- package/dist/{src-DfzidkYr.mjs → src-8dGXUULC.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +10 -2
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { Dt as getCaseRowCaseKey, ft as getEvalTitle, it as extractCacheEntries, kt as caseRowSchema, nt as createRunRequestSchema, rt as updateManualScoreRequestSchema } from "./runExecution-
|
|
2
|
-
import { o as stageManualInputFile } from "./cli-
|
|
3
|
-
import "./src-
|
|
4
|
-
import { t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { Dt as getCaseRowCaseKey, ft as getEvalTitle, it as extractCacheEntries, kt as caseRowSchema, nt as createRunRequestSchema, rt as updateManualScoreRequestSchema } from "./runExecution-C3XVZHRC.mjs";
|
|
2
|
+
import { o as stageManualInputFile } from "./cli-Dkp2-rBm.mjs";
|
|
3
|
+
import "./src-8dGXUULC.mjs";
|
|
4
|
+
import { t as getRunnerInstance } from "./runner-C9J-1fkp.mjs";
|
|
5
5
|
import { z } from "zod/v4";
|
|
6
6
|
import { resultify } from "t-result";
|
|
7
7
|
import { readFile } from "node:fs/promises";
|
package/dist/bin.mjs
CHANGED
package/dist/caseChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Ct as resolveApiCallsConfig, Lt as runWithEvalRegistry, R as configureEvalRunLogs, Y as runInEvalRuntimeScope, _ as createBufferedCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, r as runCase, v as createFsCacheStore, wt as resolveLlmCallsConfig, y as getCacheRetentionOptions } from "./runExecution-
|
|
1
|
+
import { Ct as resolveApiCallsConfig, Lt as runWithEvalRegistry, R as configureEvalRunLogs, Y as runInEvalRuntimeScope, _ as createBufferedCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, r as runCase, v as createFsCacheStore, wt as resolveLlmCallsConfig, y as getCacheRetentionOptions } from "./runExecution-C3XVZHRC.mjs";
|
|
2
2
|
//#region ../runner/src/caseChild.ts
|
|
3
3
|
let fatalErrorReported = false;
|
|
4
4
|
let disconnectExpected = false;
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Ct as resolveApiCallsConfig, Dt as getCaseRowCaseKey, Et as buildEvalKey, It as getEvalRegistry, St as runSummarySchema, c as resolveArtifactPath, f as resolveEvalDefaultConfig, ft as getEvalTitle, h as normalizeScoreDef, kt as caseRowSchema, m as buildDeclaredColumnDefs, mt as deriveScopedSummaryFromCases, o as stripTerminalControlCodes, p as loadConfig, pt as getEvalDisplayStatus, s as resolveTracePresentation, ut as applyDerivedCallAttributes, v as createFsCacheStore, vt as matchesTagsFilter, wt as resolveLlmCallsConfig, y as getCacheRetentionOptions } from "./runExecution-
|
|
2
|
-
import { C as validateCharts, S as parseEvalDiscovery, _ as runTouchesEval, a as validateTagsFilters, b as deriveEvalFreshness, c as getLatestRunInfos, d as nextShortIdFromSnapshots, f as persistCaseDetail, g as recomputePersistedCaseStatus, h as recomputeEvalStatusesInRuns, i as resolveEvalTags, l as loadPersistedRunSnapshot, m as persistRunState, n as getTargetEvalKeys, o as generateRunId, p as deleteTemporaryRuns, s as getLastRunStatuses, u as loadPersistedRunSnapshots, v as buildManualInputDescriptor, x as loadIsolatedEvalRegistry, y as parseManualInputValues } from "./runOrchestration-
|
|
1
|
+
import { Ct as resolveApiCallsConfig, Dt as getCaseRowCaseKey, Et as buildEvalKey, It as getEvalRegistry, St as runSummarySchema, c as resolveArtifactPath, f as resolveEvalDefaultConfig, ft as getEvalTitle, h as normalizeScoreDef, kt as caseRowSchema, m as buildDeclaredColumnDefs, mt as deriveScopedSummaryFromCases, o as stripTerminalControlCodes, p as loadConfig, pt as getEvalDisplayStatus, s as resolveTracePresentation, ut as applyDerivedCallAttributes, v as createFsCacheStore, vt as matchesTagsFilter, wt as resolveLlmCallsConfig, y as getCacheRetentionOptions } from "./runExecution-C3XVZHRC.mjs";
|
|
2
|
+
import { C as validateCharts, S as parseEvalDiscovery, _ as runTouchesEval, a as validateTagsFilters, b as deriveEvalFreshness, c as getLatestRunInfos, d as nextShortIdFromSnapshots, f as persistCaseDetail, g as recomputePersistedCaseStatus, h as recomputeEvalStatusesInRuns, i as resolveEvalTags, l as loadPersistedRunSnapshot, m as persistRunState, n as getTargetEvalKeys, o as generateRunId, p as deleteTemporaryRuns, s as getLastRunStatuses, u as loadPersistedRunSnapshots, v as buildManualInputDescriptor, x as loadIsolatedEvalRegistry, y as parseManualInputValues } from "./runOrchestration-B5An-AEi.mjs";
|
|
3
3
|
import { resultify } from "t-result";
|
|
4
4
|
import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
5
5
|
import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
@@ -2199,8 +2199,8 @@ async function commandApp(args) {
|
|
|
2199
2199
|
const { serve } = await import("@hono/node-server");
|
|
2200
2200
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
2201
2201
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
2202
|
-
const appModule = await import("./app-
|
|
2203
|
-
const runnerModule = await import("./runner-
|
|
2202
|
+
const appModule = await import("./app-B3PEtWqH.mjs");
|
|
2203
|
+
const runnerModule = await import("./runner-BJXz_V_V.mjs");
|
|
2204
2204
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
2205
2205
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
2206
2206
|
await runnerModule.initRunner();
|
package/dist/index.d.mts
CHANGED
|
@@ -398,6 +398,18 @@ type EvalCase$1$1<TInput = unknown> = {
|
|
|
398
398
|
input: TInput;
|
|
399
399
|
tags?: string[];
|
|
400
400
|
};
|
|
401
|
+
/** Normalized view of one tool-call span and its common tool metadata. */
|
|
402
|
+
type EvalToolCallSpan = {
|
|
403
|
+
/** Preferred tool name, using GenAI/Mastra identity metadata when present. */name: string; /** Original trace span display name. */
|
|
404
|
+
spanName: string; /** Original trace span kind. */
|
|
405
|
+
kind: string; /** Parsed tool-call arguments, or the raw value when parsing is not possible. */
|
|
406
|
+
arguments: unknown; /** Parsed tool-call result, or the raw value when parsing is not possible. */
|
|
407
|
+
result: unknown; /** Tool description from GenAI/Mastra metadata when present. */
|
|
408
|
+
description: string | undefined; /** Tool type from GenAI/Mastra metadata when present. */
|
|
409
|
+
toolType: string | undefined; /** Original span attributes. */
|
|
410
|
+
attributes: Record<string, unknown> | undefined; /** Original trace span for fields not normalized above. */
|
|
411
|
+
span: EvalTraceSpan$2;
|
|
412
|
+
};
|
|
401
413
|
/** Query helpers built from the flattened trace recorded for one eval case. */
|
|
402
414
|
type EvalTraceTree = {
|
|
403
415
|
/** Flat span list in creation order. */spans: EvalTraceSpan$2[]; /** Top-level spans whose `parentId` is `null`. */
|
|
@@ -406,10 +418,16 @@ type EvalTraceTree = {
|
|
|
406
418
|
findSpans: (name: string) => EvalTraceSpan$2[]; /** Return whether any span name exactly matches `name`. */
|
|
407
419
|
hasSpan: (name: string) => boolean; /** Return every span whose kind exactly matches `kind`. */
|
|
408
420
|
findSpansByKind: (kind: string) => EvalTraceSpan$2[]; /** Return every span with `kind: 'tool'` or `kind: 'tool_call'`. */
|
|
409
|
-
findToolCallSpans: () => EvalTraceSpan$2[];
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
421
|
+
findToolCallSpans: () => EvalTraceSpan$2[];
|
|
422
|
+
/**
|
|
423
|
+
* Return tool-call names, preferring GenAI/Mastra tool identity attributes
|
|
424
|
+
* when available.
|
|
425
|
+
*/
|
|
426
|
+
listToolCallSpanNames: () => string[]; /** Return whether a tool-call span name or tool identity matches `name`. */
|
|
427
|
+
hasToolCallSpan: (name: string) => boolean; /** Return normalized tool-call spans whose name or tool identity matches `name`. */
|
|
428
|
+
getToolCallSpans: (name: string) => EvalToolCallSpan[]; /** Return how many tool-call spans have a name or tool identity matching `toolName`. */
|
|
429
|
+
getToolCallSpanCount: (toolName: string) => number; /** Return whether a tool-call span name or tool identity appears exactly `expectedCalls` times. */
|
|
430
|
+
hasToolCallSpanCount: (toolName: string, expectedCalls: number) => boolean; /** Return span names in creation order, optionally filtered by kind. */
|
|
413
431
|
listSpanNames: (kind?: string) => string[]; /** Return span names in depth-first tree order, optionally filtered by kind. */
|
|
414
432
|
listSpanNamesDfs: (kind?: string) => string[]; /** Return all spans in depth-first tree order. */
|
|
415
433
|
flattenDfs: () => EvalTraceSpan$2[];
|
|
@@ -3688,6 +3706,18 @@ type EvalCase$1<TInput = unknown> = {
|
|
|
3688
3706
|
input: TInput;
|
|
3689
3707
|
tags?: string[];
|
|
3690
3708
|
};
|
|
3709
|
+
/** Normalized view of one tool-call span and its common tool metadata. */
|
|
3710
|
+
type EvalToolCallSpan$1 = {
|
|
3711
|
+
/** Preferred tool name, using GenAI/Mastra identity metadata when present. */name: string; /** Original trace span display name. */
|
|
3712
|
+
spanName: string; /** Original trace span kind. */
|
|
3713
|
+
kind: string; /** Parsed tool-call arguments, or the raw value when parsing is not possible. */
|
|
3714
|
+
arguments: unknown; /** Parsed tool-call result, or the raw value when parsing is not possible. */
|
|
3715
|
+
result: unknown; /** Tool description from GenAI/Mastra metadata when present. */
|
|
3716
|
+
description: string | undefined; /** Tool type from GenAI/Mastra metadata when present. */
|
|
3717
|
+
toolType: string | undefined; /** Original span attributes. */
|
|
3718
|
+
attributes: Record<string, unknown> | undefined; /** Original trace span for fields not normalized above. */
|
|
3719
|
+
span: EvalTraceSpan$1;
|
|
3720
|
+
};
|
|
3691
3721
|
/** Query helpers built from the flattened trace recorded for one eval case. */
|
|
3692
3722
|
type EvalTraceTree$1 = {
|
|
3693
3723
|
/** Flat span list in creation order. */spans: EvalTraceSpan$1[]; /** Top-level spans whose `parentId` is `null`. */
|
|
@@ -3696,10 +3726,16 @@ type EvalTraceTree$1 = {
|
|
|
3696
3726
|
findSpans: (name: string) => EvalTraceSpan$1[]; /** Return whether any span name exactly matches `name`. */
|
|
3697
3727
|
hasSpan: (name: string) => boolean; /** Return every span whose kind exactly matches `kind`. */
|
|
3698
3728
|
findSpansByKind: (kind: string) => EvalTraceSpan$1[]; /** Return every span with `kind: 'tool'` or `kind: 'tool_call'`. */
|
|
3699
|
-
findToolCallSpans: () => EvalTraceSpan$1[];
|
|
3700
|
-
|
|
3701
|
-
|
|
3702
|
-
|
|
3729
|
+
findToolCallSpans: () => EvalTraceSpan$1[];
|
|
3730
|
+
/**
|
|
3731
|
+
* Return tool-call names, preferring GenAI/Mastra tool identity attributes
|
|
3732
|
+
* when available.
|
|
3733
|
+
*/
|
|
3734
|
+
listToolCallSpanNames: () => string[]; /** Return whether a tool-call span name or tool identity matches `name`. */
|
|
3735
|
+
hasToolCallSpan: (name: string) => boolean; /** Return normalized tool-call spans whose name or tool identity matches `name`. */
|
|
3736
|
+
getToolCallSpans: (name: string) => EvalToolCallSpan$1[]; /** Return how many tool-call spans have a name or tool identity matching `toolName`. */
|
|
3737
|
+
getToolCallSpanCount: (toolName: string) => number; /** Return whether a tool-call span name or tool identity appears exactly `expectedCalls` times. */
|
|
3738
|
+
hasToolCallSpanCount: (toolName: string, expectedCalls: number) => boolean; /** Return span names in creation order, optionally filtered by kind. */
|
|
3703
3739
|
listSpanNames: (kind?: string) => string[]; /** Return span names in depth-first tree order, optionally filtered by kind. */
|
|
3704
3740
|
listSpanNamesDfs: (kind?: string) => string[]; /** Return all spans in depth-first tree order. */
|
|
3705
3741
|
flattenDfs: () => EvalTraceSpan$1[];
|
|
@@ -7294,4 +7330,4 @@ declare function defineEval<TInput = unknown, TOutputs extends EvalOutputs = Eva
|
|
|
7294
7330
|
/** Return whether the active eval case has tags matching the typed input. */
|
|
7295
7331
|
declare function matchesEvalTags(input: EvalTagMatchInput): boolean;
|
|
7296
7332
|
//#endregion
|
|
7297
|
-
export { AgentEvalTagRegistry, AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheRepairSummary, type CacheScopeContext, type CacheSerializationOptions, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CallDerivedAttributesConfig, type CallDerivedAttributesFn, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type ConfigReloadState, type ConfigReloadStatus, type CreateRunRequest, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCacheConfig, EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, EvalDefinition, type EvalDeriveConfig, type EvalDeriveContext, type EvalDeriveFn, type EvalDeriveMap, type EvalDeriveValueFn, type EvalDisplayStatus, type EvalExecuteContext, type EvalExpectation, type EvalFreshnessStatus, type EvalManualInputConfig, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, EvalRuntimeUsageError, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type 
EvalSummary, EvalTag, EvalTagMatchInput, type EvalTraceTree, type EvalTracingAssertionsConfig, type EvalTracingAssertionsFn, type JsonCell, type LlmCallCostBreakdown, type LlmCallCostCurrency, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallPricingRate, type LlmCallPricingRegistry, type LlmCallSimulatedTokens, type LlmCallsConfigInput, type LlmCostScenario, type ManualInputDescriptor, type ManualInputFieldDescriptor, type ManualInputFieldKind, type ManualInputFieldOverride, type ManualInputFieldsConfig, type ManualInputFileValue, type ManualInputSelectOption, type MaterializeManualInputFilesResult, type NumberDisplayOptions, type ReadManualInputFileResult, type RemoveDefaultConfig, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallCostCurrency, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, 
extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
|
7333
|
+
export { AgentEvalTagRegistry, AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheRepairSummary, type CacheScopeContext, type CacheSerializationOptions, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CallDerivedAttributesConfig, type CallDerivedAttributesFn, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type ConfigReloadState, type ConfigReloadStatus, type CreateRunRequest, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCacheConfig, EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, EvalDefinition, type EvalDeriveConfig, type EvalDeriveContext, type EvalDeriveFn, type EvalDeriveMap, type EvalDeriveValueFn, type EvalDisplayStatus, type EvalExecuteContext, type EvalExpectation, type EvalFreshnessStatus, type EvalManualInputConfig, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, EvalRuntimeUsageError, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type 
EvalSummary, EvalTag, EvalTagMatchInput, type EvalToolCallSpan, type EvalTraceTree, type EvalTracingAssertionsConfig, type EvalTracingAssertionsFn, type JsonCell, type LlmCallCostBreakdown, type LlmCallCostCurrency, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallPricingRate, type LlmCallPricingRegistry, type LlmCallSimulatedTokens, type LlmCallsConfigInput, type LlmCostScenario, type ManualInputDescriptor, type ManualInputFieldDescriptor, type ManualInputFieldKind, type ManualInputFieldOverride, type ManualInputFieldsConfig, type ManualInputFileValue, type ManualInputSelectOption, type MaterializeManualInputFilesResult, type NumberDisplayOptions, type ReadManualInputFileResult, type RemoveDefaultConfig, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallCostCurrency, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, 
extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as setEvalOutput, A as serializeCacheValue, B as evalLog, C as evalSpan, D as deserializeCacheRecording, E as hashCacheKeySync, F as EvalAssertionError, G as isInEvalScope, H as getCurrentScope, I as EvalRuntimeUsageError, It as getEvalRegistry, J as nextEvalId, L as appendToEvalOutput, M as manualInputFileValueSchema, N as readManualInputFile, O as deserializeCacheValue, P as evalExpect, S as captureEvalSpanError, T as hashCacheKey, U as getEvalCaseInput, V as evalTime, W as incrementEvalOutput, X as runInEvalScope, Y as runInEvalRuntimeScope, Z as runInExistingEvalScope, at as extractCacheHits, b as z, ct as simulateLlmCallCost, dt as getNestedAttribute, et as setScopeCacheContext, it as extractCacheEntries, j as repoFile, k as serializeCacheRecording, lt as simulateTokenAllocation, ot as extractApiCalls, q as mergeEvalOutput, st as extractLlmCalls, tt as startEvalBackgroundJob, w as evalTracer, x as buildTraceTree, z as evalAssert } from "./runExecution-
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-
|
|
3
|
-
import { n as matchesEvalTags, t as defineEval } from "./src-
|
|
1
|
+
import { $ as setEvalOutput, A as serializeCacheValue, B as evalLog, C as evalSpan, D as deserializeCacheRecording, E as hashCacheKeySync, F as EvalAssertionError, G as isInEvalScope, H as getCurrentScope, I as EvalRuntimeUsageError, It as getEvalRegistry, J as nextEvalId, L as appendToEvalOutput, M as manualInputFileValueSchema, N as readManualInputFile, O as deserializeCacheValue, P as evalExpect, S as captureEvalSpanError, T as hashCacheKey, U as getEvalCaseInput, V as evalTime, W as incrementEvalOutput, X as runInEvalScope, Y as runInEvalRuntimeScope, Z as runInExistingEvalScope, at as extractCacheHits, b as z, ct as simulateLlmCallCost, dt as getNestedAttribute, et as setScopeCacheContext, it as extractCacheEntries, j as repoFile, k as serializeCacheRecording, lt as simulateTokenAllocation, ot as extractApiCalls, q as mergeEvalOutput, st as extractLlmCalls, tt as startEvalBackgroundJob, w as evalTracer, x as buildTraceTree, z as evalAssert } from "./runExecution-C3XVZHRC.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Dkp2-rBm.mjs";
|
|
3
|
+
import { n as matchesEvalTags, t as defineEval } from "./src-8dGXUULC.mjs";
|
|
4
4
|
export { EvalAssertionError, EvalRuntimeUsageError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { At as evalStatAggregateSchema, Et as buildEvalKey, Mt as manualInputDescriptorSchema, Nt as evalChartsConfigSchema, Pt as columnDefSchema, R as configureEvalRunLogs, St as runSummarySchema, jt as evalStatsConfigSchema, l as registerAgentEvalsPackageResolutionHooks, nt as createRunRequestSchema, p as loadConfig, v as createFsCacheStore, xt as runManifestSchema, y as getCacheRetentionOptions } from "./runExecution-
|
|
2
|
-
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-
|
|
1
|
+
import { At as evalStatAggregateSchema, Et as buildEvalKey, Mt as manualInputDescriptorSchema, Nt as evalChartsConfigSchema, Pt as columnDefSchema, R as configureEvalRunLogs, St as runSummarySchema, jt as evalStatsConfigSchema, l as registerAgentEvalsPackageResolutionHooks, nt as createRunRequestSchema, p as loadConfig, v as createFsCacheStore, xt as runManifestSchema, y as getCacheRetentionOptions } from "./runExecution-C3XVZHRC.mjs";
|
|
2
|
+
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-B5An-AEi.mjs";
|
|
3
3
|
import { z } from "zod/v4";
|
|
4
4
|
import { readFile } from "node:fs/promises";
|
|
5
5
|
import { relative } from "node:path";
|
|
@@ -5142,8 +5142,88 @@ function buildTraceTree(spans, checkpoints) {
|
|
|
5142
5142
|
visit(null);
|
|
5143
5143
|
return result;
|
|
5144
5144
|
};
|
|
5145
|
+
const isRecord = (value) => {
|
|
5146
|
+
return typeof value === "object" && value !== null;
|
|
5147
|
+
};
|
|
5148
|
+
const readRecordValue = (value, key) => {
|
|
5149
|
+
if (!isRecord(value)) return void 0;
|
|
5150
|
+
const child = value[key];
|
|
5151
|
+
return isRecord(child) ? child : void 0;
|
|
5152
|
+
};
|
|
5153
|
+
const readStringValue = (value, key) => {
|
|
5154
|
+
if (!isRecord(value)) return void 0;
|
|
5155
|
+
const child = value[key];
|
|
5156
|
+
return typeof child === "string" && child.length > 0 ? child : void 0;
|
|
5157
|
+
};
|
|
5158
|
+
const readValue = (value, key) => {
|
|
5159
|
+
if (!isRecord(value)) return void 0;
|
|
5160
|
+
return value[key];
|
|
5161
|
+
};
|
|
5162
|
+
const parseMaybeJson = (value) => {
|
|
5163
|
+
if (typeof value !== "string") return value;
|
|
5164
|
+
const parsed = resultify(() => JSON.parse(value));
|
|
5165
|
+
return parsed.error ? value : parsed.value;
|
|
5166
|
+
};
|
|
5167
|
+
const firstDefined = (values) => {
|
|
5168
|
+
return values.find((value) => value !== void 0);
|
|
5169
|
+
};
|
|
5170
|
+
const getToolCallMetadata = (span) => {
|
|
5171
|
+
const attributes = span.attributes;
|
|
5172
|
+
return {
|
|
5173
|
+
attributes,
|
|
5174
|
+
genAI: readRecordValue(attributes, "genAI"),
|
|
5175
|
+
mastra: readRecordValue(attributes, "mastra"),
|
|
5176
|
+
toolAttributes: readRecordValue(attributes, "attributes")
|
|
5177
|
+
};
|
|
5178
|
+
};
|
|
5145
5179
|
const isToolCallSpan = (span) => {
|
|
5146
|
-
|
|
5180
|
+
const { attributes, genAI, mastra } = getToolCallMetadata(span);
|
|
5181
|
+
return span.kind === "tool" || span.kind === "tool_call" || readStringValue(attributes, "gen_ai.tool.type") === "tool" || readStringValue(genAI, "gen_ai.tool.type") === "tool" || readStringValue(genAI, "mastra.span.type") === "tool_call" || readStringValue(mastra, "type") === "tool_call" || readStringValue(mastra, "entityType") === "tool";
|
|
5182
|
+
};
|
|
5183
|
+
const getToolCallIdentityNames = (span) => {
|
|
5184
|
+
const { attributes, genAI, mastra } = getToolCallMetadata(span);
|
|
5185
|
+
return [
|
|
5186
|
+
readStringValue(attributes, "gen_ai.tool.name"),
|
|
5187
|
+
readStringValue(genAI, "gen_ai.tool.name"),
|
|
5188
|
+
readStringValue(mastra, "entityName"),
|
|
5189
|
+
readStringValue(mastra, "entityId"),
|
|
5190
|
+
span.name
|
|
5191
|
+
].filter((name) => name !== void 0);
|
|
5192
|
+
};
|
|
5193
|
+
const getPreferredToolCallName = (span) => {
|
|
5194
|
+
return getToolCallIdentityNames(span)[0] ?? span.name;
|
|
5195
|
+
};
|
|
5196
|
+
const toolCallSpanMatchesName = (span, toolName) => {
|
|
5197
|
+
return getToolCallIdentityNames(span).includes(toolName);
|
|
5198
|
+
};
|
|
5199
|
+
const countToolCallSpans = (toolName) => {
|
|
5200
|
+
return spans.filter((span) => {
|
|
5201
|
+
return isToolCallSpan(span) && toolCallSpanMatchesName(span, toolName);
|
|
5202
|
+
}).length;
|
|
5203
|
+
};
|
|
5204
|
+
const buildToolCallSpan = (span) => {
|
|
5205
|
+
const { attributes, genAI, toolAttributes } = getToolCallMetadata(span);
|
|
5206
|
+
return {
|
|
5207
|
+
name: getPreferredToolCallName(span),
|
|
5208
|
+
spanName: span.name,
|
|
5209
|
+
kind: span.kind,
|
|
5210
|
+
arguments: parseMaybeJson(firstDefined([
|
|
5211
|
+
readValue(attributes, "gen_ai.tool.call.arguments"),
|
|
5212
|
+
readValue(genAI, "gen_ai.tool.call.arguments"),
|
|
5213
|
+
readValue(attributes, "arguments"),
|
|
5214
|
+
readValue(attributes, "input")
|
|
5215
|
+
])),
|
|
5216
|
+
result: parseMaybeJson(firstDefined([
|
|
5217
|
+
readValue(attributes, "gen_ai.tool.call.result"),
|
|
5218
|
+
readValue(genAI, "gen_ai.tool.call.result"),
|
|
5219
|
+
readValue(attributes, "result"),
|
|
5220
|
+
readValue(attributes, "output")
|
|
5221
|
+
])),
|
|
5222
|
+
description: readStringValue(attributes, "gen_ai.tool.description") ?? readStringValue(genAI, "gen_ai.tool.description") ?? readStringValue(toolAttributes, "toolDescription"),
|
|
5223
|
+
toolType: readStringValue(attributes, "gen_ai.tool.type") ?? readStringValue(genAI, "gen_ai.tool.type") ?? readStringValue(toolAttributes, "toolType"),
|
|
5224
|
+
attributes,
|
|
5225
|
+
span
|
|
5226
|
+
};
|
|
5147
5227
|
};
|
|
5148
5228
|
const filterSpanNames = (sourceSpans, kind) => {
|
|
5149
5229
|
return sourceSpans.filter((span) => kind === void 0 || span.kind === kind).map((span) => span.name);
|
|
@@ -5167,15 +5247,23 @@ function buildTraceTree(spans, checkpoints) {
|
|
|
5167
5247
|
return spans.filter(isToolCallSpan);
|
|
5168
5248
|
},
|
|
5169
5249
|
listToolCallSpanNames() {
|
|
5170
|
-
return spans.filter(isToolCallSpan).map(
|
|
5250
|
+
return spans.filter(isToolCallSpan).map(getPreferredToolCallName);
|
|
5171
5251
|
},
|
|
5172
5252
|
hasToolCallSpan(name) {
|
|
5173
|
-
return spans.some((s) =>
|
|
5253
|
+
return spans.some((s) => {
|
|
5254
|
+
return isToolCallSpan(s) && toolCallSpanMatchesName(s, name);
|
|
5255
|
+
});
|
|
5174
5256
|
},
|
|
5175
|
-
|
|
5257
|
+
getToolCallSpans(name) {
|
|
5176
5258
|
return spans.filter((span) => {
|
|
5177
|
-
return isToolCallSpan(span) && span
|
|
5178
|
-
}).
|
|
5259
|
+
return isToolCallSpan(span) && toolCallSpanMatchesName(span, name);
|
|
5260
|
+
}).map(buildToolCallSpan);
|
|
5261
|
+
},
|
|
5262
|
+
getToolCallSpanCount(toolName) {
|
|
5263
|
+
return countToolCallSpans(toolName);
|
|
5264
|
+
},
|
|
5265
|
+
hasToolCallSpanCount(toolName, expectedCalls) {
|
|
5266
|
+
return countToolCallSpans(toolName) === expectedCalls;
|
|
5179
5267
|
},
|
|
5180
5268
|
listSpanNames(kind) {
|
|
5181
5269
|
return filterSpanNames(spans, kind);
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Dt as getCaseRowCaseKey, Lt as runWithEvalRegistry, Ot as caseDetailSchema, Q as runWithEvalClock, St as runSummarySchema, Tt as buildCaseKey, Y as runInEvalRuntimeScope, _t as dedupeEvalTags, bt as validateTagsFilterExpression, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as deriveStatusFromChildStatuses, ht as deriveStatusFromCaseRows, i as isCaseChildMessage, kt as caseRowSchema, m as buildDeclaredColumnDefs, mt as deriveScopedSummaryFromCases, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, t as filterEvalCases, u as runWithModuleIsolation, vt as matchesTagsFilter, xt as runManifestSchema, yt as validateEvalTagName } from "./runExecution-BMnJXWhN.mjs";
|
|
1
|
+
import { Dt as getCaseRowCaseKey, Lt as runWithEvalRegistry, Ot as caseDetailSchema, Q as runWithEvalClock, St as runSummarySchema, Tt as buildCaseKey, Y as runInEvalRuntimeScope, _t as dedupeEvalTags, bt as validateTagsFilterExpression, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as deriveStatusFromChildStatuses, ht as deriveStatusFromCaseRows, i as isCaseChildMessage, kt as caseRowSchema, m as buildDeclaredColumnDefs, mt as deriveScopedSummaryFromCases, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, t as filterEvalCases, u as runWithModuleIsolation, vt as matchesTagsFilter, xt as runManifestSchema, yt as validateEvalTagName } from "./runExecution-C3XVZHRC.mjs";
|
|
2
2
|
import { Result, resultify } from "t-result";
|
|
3
3
|
import { readFile, readdir, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { dirname, join } from "node:path";
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-BfHgVhGS.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-C9J-1fkp.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-ClAkjTvo.mjs";
|
|
2
|
-
import "./src-DfzidkYr.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-Dkp2-rBm.mjs";
|
|
2
|
+
import "./src-8dGXUULC.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Ft as defineEval$1, K as matchesEvalTags$1 } from "./runExecution-BMnJXWhN.mjs";
|
|
2
|
-
import "./cli-ClAkjTvo.mjs";
|
|
1
|
+
import { Ft as defineEval$1, K as matchesEvalTags$1 } from "./runExecution-C3XVZHRC.mjs";
|
|
2
|
+
import "./cli-Dkp2-rBm.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.58.5",
|
|
3
|
+
"version": "0.59.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -32,9 +32,9 @@
|
|
|
32
32
|
"devDependencies": {
|
|
33
33
|
"@types/node": "^24.7.2",
|
|
34
34
|
"typescript": "^5.9.2",
|
|
35
|
-
"@agent-evals/sdk": "0.0.1",
|
|
36
35
|
"@agent-evals/runner": "0.0.1",
|
|
37
|
-
"@agent-evals/shared": "0.0.1"
|
|
36
|
+
"@agent-evals/shared": "0.0.1",
|
|
37
|
+
"@agent-evals/sdk": "0.0.1"
|
|
38
38
|
},
|
|
39
39
|
"scripts": {
|
|
40
40
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
@@ -375,11 +375,19 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
375
375
|
`trace.hasSpan(name)`, `trace.findSpansByKind(kind)`,
|
|
376
376
|
`trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`,
|
|
377
377
|
`trace.hasToolCallSpan(name)`,
|
|
378
|
-
`trace.
|
|
378
|
+
`trace.getToolCallSpans(name)`,
|
|
379
|
+
`trace.getToolCallSpanCount(toolName)`,
|
|
380
|
+
`trace.hasToolCallSpanCount(toolName, expectedCalls)`,
|
|
379
381
|
`trace.listSpanNames(kind?)`, `trace.listSpanNamesDfs(kind?)`, and
|
|
380
382
|
`trace.flattenDfs()`.
|
|
381
383
|
The tool-call helpers include both `kind: 'tool'` spans and imported
|
|
382
|
-
execution spans recorded as `kind: 'tool_call'`.
|
|
384
|
+
execution spans recorded as `kind: 'tool_call'`. Tool-name checks and counts
|
|
385
|
+
match the span `name` as well as GenAI/Mastra identity attributes such as
|
|
386
|
+
`genAI["gen_ai.tool.name"]` and `mastra.entityName`; list helpers prefer
|
|
387
|
+
those tool identity attributes when present. `getToolCallSpans(name)`
|
|
388
|
+
returns one normalized object per matching call, including parsed
|
|
389
|
+
`arguments`, parsed `result`, `description`, `toolType`, `attributes`, and
|
|
390
|
+
the original `span`.
|
|
383
391
|
- `traceDisplay` promotes selected span attributes into the trace tree and
|
|
384
392
|
detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
|
|
385
393
|
user-defined `transform(...)` for derived views (e.g. currency conversion).
|