@ls-stack/agent-eval 0.30.0 → 0.32.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-CbOZBHju.mjs → app-Dc6vvHRL.mjs} +4 -4
- package/dist/apps/web/dist/assets/index-BNQnbfi0.js +118 -0
- package/dist/apps/web/dist/assets/index-BPMMRktE.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-CiFOqMwS.mjs → cli-huuJbDNb.mjs} +3 -3
- package/dist/index.d.mts +176 -53
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-CO3Vf0cQ.mjs → runOrchestration-ZpN7xty_.mjs} +127 -3
- package/dist/{runner-4pF_Qrc9.mjs → runner-BPXPvinB.mjs} +1 -1
- package/dist/{runner-CXHkf7ih.mjs → runner-Dkol2ukD.mjs} +2 -2
- package/dist/src-1Qvuh0NH.mjs +3 -0
- package/package.json +2 -2
- package/skills/agent-eval/SKILL.md +4 -3
- package/dist/apps/web/dist/assets/index-DEikHy2a.js +0 -118
- package/dist/apps/web/dist/assets/index-DjUTm3M-.css +0 -1
- package/dist/src-BiPLv9ya.mjs +0 -3
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-
|
|
3
|
-
import "./src-
|
|
4
|
-
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, cleanupStagedManualInputFiles, columnDefSchema, columnFormatSchema, columnKindSchema, configReloadStateSchema, configReloadStatusSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalColumnOverrideSchema, evalColumnsSchema, evalDeriveConfigSchema, evalExpect, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingRateSchema, llmCallPricingSchema, llmCallsConfigSchema, manualInputBooleanFieldSchema, manualInputDescriptorSchema, manualInputFieldDescriptorSchema, manualInputFileValueSchema, manualInputJsonFieldSchema, manualInputMultilineFieldSchema, manualInputNumberFieldSchema, manualInputSelectFieldSchema, manualInputSelectOptionSchema, manualInputTextFieldSchema, materializeManualInputFiles, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, readManualInputFile, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
1
|
+
import { $ as apiCallMetricSchema, $n as getCurrentScope, $t as cacheDebugKeyEntrySchema, A as createRunRequestSchema, An as repoFileRefSchema, At as runLogEntrySchema, B as getNestedAttribute, Bn as deserializeCacheValue, Bt as manualInputNumberFieldSchema, Cn as cellValueSchema, Ct as caseRowSchema, Dn as fileRefSchema, Dt as evalStatItemSchema, En as columnKindSchema, Et as evalStatAggregateSchema, F as extractApiCalls, Fn as evalSpan, Ft as manualInputBooleanFieldSchema, G as deriveStatusFromChildStatuses, Gn as readManualInputFile, Gt as evalChartAxisSchema, H as getEvalDisplayStatus, Hn as serializeCacheValue, Ht as manualInputSelectOptionSchema, I as extractLlmCalls, In as evalTracer, It as manualInputDescriptorSchema, J as DEFAULT_API_CALLS_CONFIG, Jn as advanceEvalTime, Jt as evalChartConfigSchema, K as runManifestSchema, Kn as evalExpect, Kt as evalChartBuiltinMetricSchema, L as simulateLlmCallCost, Ln as hashCacheKey, Lt as manualInputFieldDescriptorSchema, M as sseEnvelopeSchema, Mn as z, Mt as runLogLocationSchema, N as extractCacheEntries, Nn as buildTraceTree, Nt as runLogPhaseSchema, O as configReloadStateSchema, On as jsonCellSchema, Ot as evalStatsConfigSchema, P as extractCacheHits, Pn as captureEvalSpanError, Pt as scoreTraceSchema, Q as apiCallMetricPlacementSchema, Qn as evalLog, Qt as evalChartsConfigSchema, R as simulateTokenAllocation, Rn as hashCacheKeySync, Rt as manualInputJsonFieldSchema, Sn as traceSpanWarningSchema, St as caseDetailSchema, Tn as columnFormatSchema, Tt as evalFreshnessStatusSchema, U as deriveScopedSummaryFromCases, Un as repoFile, Ut as manualInputTextFieldSchema, V as getEvalTitle, Vn as serializeCacheRecording, Vt as manualInputSelectFieldSchema, W as deriveStatusFromCaseRows, Wn as manualInputFileValueSchema, Wt as evalChartAggregateSchema, X as agentEvalsConfigSchema, Xt as evalChartTooltipExtraSchema, Y as DEFAULT_LLM_CALLS_CONFIG, Yn as appendToEvalOutput, Yt as evalChartMetricSchema, Z as apiCallMetricFormatSchema, Zn as evalAssert, Zt as evalChartTypeSchema, _n as traceDisplayConfigSchema, _t as buildCaseKey, an as cacheModeSchema, ar as nextEvalId, at as llmCallCostCurrencySchema, bn as traceSpanKindSchema, bt as getCaseRowEvalKey, cn as cacheRecordingSchema, cr as runInExistingEvalScope, ct as llmCallMetricSchema, dn as spanCacheOptionsSchema, dr as startEvalBackgroundJob, dt as llmCallsConfigSchema, en as cacheDebugKeyFileSchema, er as getEvalCaseInput, et as apiCallsConfigSchema, fn as traceCacheRefSchema, fr as defineEval, ft as removeDefaultConfigSchema, gn as traceAttributeDisplaySchema, gt as trialSelectionModeSchema, hn as traceAttributeDisplayPlacementSchema, ht as runLogsConfigSchema, in as cacheListItemSchema, ir as mergeEvalOutput, it as evalDeriveConfigSchema, j as updateManualScoreRequestSchema, jn as runArtifactRefSchema, jt as runLogLevelSchema, k as configReloadStatusSchema, kn as numberDisplayOptionsSchema, kt as evalSummarySchema, ln as cacheStatusSchema, lr as setEvalOutput, lt as llmCallPricingRateSchema, mn as traceAttributeDisplayInputSchema, mt as resolveLlmCallsConfig, nn as cacheEntryWithDebugKeySchema, nr as incrementEvalOutput, nt as evalColumnOverrideSchema, on as cacheOperationTypeSchema, or as runInEvalRuntimeScope, ot as llmCallMetricFormatSchema, pn as traceAttributeDisplayFormatSchema, pr as getEvalRegistry, pt as resolveApiCallsConfig, q as runSummarySchema, qn as EvalAssertionError, qt as evalChartColorSchema, rn as cacheFileSchema, rr as isInEvalScope, rt as evalColumnsSchema, sn as cacheRecordingOpSchema, sr as runInEvalScope, st as llmCallMetricPlacementSchema, tn as cacheEntrySchema, tr as getEvalStartTime, tt as defaultConfigKeySchema, un as serializedCacheSpanSchema, ur as setScopeCacheContext, ut as llmCallPricingSchema, vn as traceDisplayInputConfigSchema, vt as buildEvalKey, wn as columnDefSchema, wt as discoveryIssueSchema, xn as traceSpanSchema, xt as assertionFailureSchema, yn as traceSpanErrorSchema, yt as getCaseRowCaseKey, z as applyDerivedCallAttributes, zn as deserializeCacheRecording, zt as manualInputMultilineFieldSchema } from "./runOrchestration-ZpN7xty_.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-huuJbDNb.mjs";
|
|
3
|
+
import "./src-1Qvuh0NH.mjs";
|
|
4
|
+
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, cleanupStagedManualInputFiles, columnDefSchema, columnFormatSchema, columnKindSchema, configReloadStateSchema, configReloadStatusSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalColumnOverrideSchema, evalColumnsSchema, evalDeriveConfigSchema, evalExpect, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, jsonCellSchema, llmCallCostCurrencySchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingRateSchema, llmCallPricingSchema, llmCallsConfigSchema, manualInputBooleanFieldSchema, manualInputDescriptorSchema, manualInputFieldDescriptorSchema, manualInputFileValueSchema, manualInputJsonFieldSchema, manualInputMultilineFieldSchema, manualInputNumberFieldSchema, manualInputSelectFieldSchema, manualInputSelectOptionSchema, manualInputTextFieldSchema, materializeManualInputFiles, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, readManualInputFile, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, spanCacheOptionsSchema, sseEnvelopeSchema, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as createRunRequestSchema, C as loadConfig, D as createFsCacheStore,
|
|
1
|
+
import { A as createRunRequestSchema, C as loadConfig, D as createFsCacheStore, It as manualInputDescriptorSchema, K as runManifestSchema, Ot as evalStatsConfigSchema, Qt as evalChartsConfigSchema, Xn as configureEvalRunLogs, q as runSummarySchema, r as getTargetEvals$1, t as executeRun, vt as buildEvalKey, wn as columnDefSchema, x as parseEvalDiscovery } from "./runOrchestration-ZpN7xty_.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|
|
@@ -3316,6 +3316,20 @@ const llmCallPricingSchema = llmCallPricingRateSchema.extend({
|
|
|
3316
3316
|
*/
|
|
3317
3317
|
providers: z.record(z.string().min(1), llmCallPricingRateSchema).optional()
|
|
3318
3318
|
});
|
|
3319
|
+
/**
|
|
3320
|
+
* Schema for extra currencies displayed in the LLM calls breakdown table.
|
|
3321
|
+
* Costs are still derived in USD, then multiplied by `usdToCurrencyRate`.
|
|
3322
|
+
*/
|
|
3323
|
+
const llmCallCostCurrencySchema = z.object({
|
|
3324
|
+
/** Currency code or short display token, such as `BRL` or `EUR`. */
|
|
3325
|
+
code: z.string().min(1),
|
|
3326
|
+
/** Optional display label for tooltips and future UI surfaces. */
|
|
3327
|
+
label: z.string().min(1).optional(),
|
|
3328
|
+
/** Multiplier used to convert one USD to this currency. */
|
|
3329
|
+
usdToCurrencyRate: z.number().nonnegative(),
|
|
3330
|
+
/** Number presentation options for the converted value. */
|
|
3331
|
+
numberFormat: numberDisplayOptionsSchema.optional()
|
|
3332
|
+
});
|
|
3319
3333
|
/** Schema for the global LLM calls config block in `agent-evals.config.ts`. */
|
|
3320
3334
|
const llmCallsConfigSchema = z.object({
|
|
3321
3335
|
/** Span kinds treated as LLM calls. Defaults to `['llm']`. */
|
|
@@ -3357,6 +3371,11 @@ const llmCallsConfigSchema = z.object({
|
|
|
3357
3371
|
* counts. Built-in LLM cost fields are only derived from this registry.
|
|
3358
3372
|
*/
|
|
3359
3373
|
pricing: z.record(z.string().min(1), llmCallPricingSchema).optional(),
|
|
3374
|
+
/**
|
|
3375
|
+
* Additional currencies shown as columns in the LLM calls breakdown table.
|
|
3376
|
+
* These do not change persisted `costUsd` outputs, stats, or charts.
|
|
3377
|
+
*/
|
|
3378
|
+
costCurrencies: z.array(llmCallCostCurrencySchema).optional(),
|
|
3360
3379
|
/** Custom user-defined metrics surfaced on each LLM call. */
|
|
3361
3380
|
metrics: z.array(llmCallMetricSchema).optional()
|
|
3362
3381
|
});
|
|
@@ -3422,7 +3441,8 @@ const DEFAULT_LLM_CALLS_CONFIG = {
|
|
|
3422
3441
|
},
|
|
3423
3442
|
derivedAttributes: [],
|
|
3424
3443
|
metrics: [],
|
|
3425
|
-
pricing: []
|
|
3444
|
+
pricing: [],
|
|
3445
|
+
costCurrencies: []
|
|
3426
3446
|
};
|
|
3427
3447
|
/** Default API-calls config the UI uses before the workspace fetch resolves. */
|
|
3428
3448
|
const DEFAULT_API_CALLS_CONFIG = {
|
|
@@ -3502,6 +3522,14 @@ function resolveLlmCallPricingEntries(model, pricing) {
|
|
|
3502
3522
|
});
|
|
3503
3523
|
return entries;
|
|
3504
3524
|
}
|
|
3525
|
+
function resolveLlmCallCostCurrency(currency) {
|
|
3526
|
+
return {
|
|
3527
|
+
code: currency.code,
|
|
3528
|
+
label: currency.label,
|
|
3529
|
+
usdToCurrencyRate: currency.usdToCurrencyRate,
|
|
3530
|
+
numberFormat: currency.numberFormat
|
|
3531
|
+
};
|
|
3532
|
+
}
|
|
3505
3533
|
/**
|
|
3506
3534
|
* Resolve the user-authored LLM-calls config to a fully-defaulted shape used
|
|
3507
3535
|
* by the UI to derive the LLM calls tab.
|
|
@@ -3513,6 +3541,8 @@ function resolveLlmCallPricingEntries(model, pricing) {
|
|
|
3513
3541
|
* - Missing `metrics[].placements` defaults to `['body']`.
|
|
3514
3542
|
* - Missing `pricing` defaults to an empty registry; built-in costs are only
|
|
3515
3543
|
* derived from configured model-keyed pricing and token counts.
|
|
3544
|
+
* - Missing `costCurrencies` defaults to an empty list; extra currencies only
|
|
3545
|
+
* affect the expanded LLM calls breakdown table.
|
|
3516
3546
|
*/
|
|
3517
3547
|
function resolveLlmCallsConfig(input) {
|
|
3518
3548
|
return {
|
|
@@ -3523,7 +3553,8 @@ function resolveLlmCallsConfig(input) {
|
|
|
3523
3553
|
},
|
|
3524
3554
|
derivedAttributes: resolveDerivedAttributes(input?.derivedAttributes),
|
|
3525
3555
|
metrics: (input?.metrics ?? []).map(resolveLlmCallMetric),
|
|
3526
|
-
pricing: Object.entries(input?.pricing ?? {}).flatMap(([model, pricing]) => resolveLlmCallPricingEntries(model, pricing))
|
|
3556
|
+
pricing: Object.entries(input?.pricing ?? {}).flatMap(([model, pricing]) => resolveLlmCallPricingEntries(model, pricing)),
|
|
3557
|
+
costCurrencies: (input?.costCurrencies ?? []).map(resolveLlmCallCostCurrency)
|
|
3527
3558
|
};
|
|
3528
3559
|
}
|
|
3529
3560
|
/**
|
|
@@ -3964,6 +3995,99 @@ function computeTotalCost({ inputTokens, inputCostUsd, outputTokens, outputCostU
|
|
|
3964
3995
|
if (hasCost) return total;
|
|
3965
3996
|
return hasReportedTokens ? 0 : null;
|
|
3966
3997
|
}
|
|
3998
|
+
/**
|
|
3999
|
+
* Recompute the LLM-call cost breakdown for a hypothetical billing scenario,
|
|
4000
|
+
* using the call's recorded token counts and the resolved pricing registry.
|
|
4001
|
+
*
|
|
4002
|
+
* The `actual` scenario returns the costs already stored on `entry`. Other
|
|
4003
|
+
* scenarios re-derive each cost component from `pricing` so users can compare
|
|
4004
|
+
* what the same usage would have cost under different cache strategies. When
|
|
4005
|
+
* pricing is missing for the model/provider, simulated cost components fall
|
|
4006
|
+
* back to `null` exactly like the original extractor.
|
|
4007
|
+
*/
|
|
4008
|
+
function simulateLlmCallCost({ entry, pricing, scenario }) {
|
|
4009
|
+
if (scenario === "actual") return {
|
|
4010
|
+
inputCostUsd: entry.inputCostUsd,
|
|
4011
|
+
outputCostUsd: entry.outputCostUsd,
|
|
4012
|
+
cachedInputCostUsd: entry.cachedInputCostUsd,
|
|
4013
|
+
cacheCreationInputCostUsd: entry.cacheCreationInputCostUsd,
|
|
4014
|
+
reasoningCostUsd: entry.reasoningCostUsd,
|
|
4015
|
+
totalCostUsd: entry.costUsd
|
|
4016
|
+
};
|
|
4017
|
+
const pricingEntry = pickPricingEntry({
|
|
4018
|
+
pricing,
|
|
4019
|
+
model: entry.model,
|
|
4020
|
+
provider: entry.provider
|
|
4021
|
+
});
|
|
4022
|
+
const outputCostUsd = computeTokenCost(entry.outputTokens, pricingEntry?.outputUsdPerMillion);
|
|
4023
|
+
const reasoningCostUsd = computeTokenCost(entry.reasoningTokens, pricingEntry?.reasoningUsdPerMillion);
|
|
4024
|
+
const simulatedTokens = simulateTokenAllocation({
|
|
4025
|
+
entry,
|
|
4026
|
+
scenario
|
|
4027
|
+
});
|
|
4028
|
+
const writeRate = scenario === "withExtendedCachingWrite" ? pricingEntry?.cacheCreationInput1hUsdPerMillion : pricingEntry?.cacheCreationInputUsdPerMillion;
|
|
4029
|
+
const inputCostUsd = computeTokenCost(simulatedTokens.baseInputTokens, pricingEntry?.inputUsdPerMillion);
|
|
4030
|
+
const cachedInputCostUsd = computeTokenCost(simulatedTokens.cachedInputTokens, pricingEntry?.cachedInputUsdPerMillion);
|
|
4031
|
+
const cacheCreationInputCostUsd = computeTokenCost(simulatedTokens.cacheCreationInputTokens, writeRate);
|
|
4032
|
+
return {
|
|
4033
|
+
inputCostUsd,
|
|
4034
|
+
outputCostUsd,
|
|
4035
|
+
cachedInputCostUsd,
|
|
4036
|
+
cacheCreationInputCostUsd,
|
|
4037
|
+
reasoningCostUsd,
|
|
4038
|
+
totalCostUsd: computeTotalCost({
|
|
4039
|
+
inputTokens: simulatedTokens.baseInputTokens,
|
|
4040
|
+
inputCostUsd,
|
|
4041
|
+
outputTokens: entry.outputTokens,
|
|
4042
|
+
outputCostUsd,
|
|
4043
|
+
cachedInputTokens: simulatedTokens.cachedInputTokens,
|
|
4044
|
+
cachedInputCostUsd,
|
|
4045
|
+
cacheCreationInputTokens: simulatedTokens.cacheCreationInputTokens,
|
|
4046
|
+
cacheCreationInputCostUsd,
|
|
4047
|
+
reasoningTokens: entry.reasoningTokens,
|
|
4048
|
+
reasoningCostUsd
|
|
4049
|
+
})
|
|
4050
|
+
};
|
|
4051
|
+
}
|
|
4052
|
+
/**
|
|
4053
|
+
* Project the call's recorded token allocation onto a hypothetical billing
|
|
4054
|
+
* scenario. Cacheable tokens shift between rows so the breakdown reflects the
|
|
4055
|
+
* simulated billing model: `noCache` folds reads/writes into base input,
|
|
4056
|
+
* `withBaseCaching` (warmed) treats every cacheable token as a cache read, and
|
|
4057
|
+
* the first-call write scenarios treat every cacheable token as a cache write.
|
|
4058
|
+
*
|
|
4059
|
+
* The returned counts are what the UI renders on each row and what
|
|
4060
|
+
* {@link simulateLlmCallCost} prices, so display and totals never drift.
|
|
4061
|
+
*/
|
|
4062
|
+
function simulateTokenAllocation({ entry, scenario }) {
|
|
4063
|
+
const baseInputTokens = computeBaseInputTokens({
|
|
4064
|
+
inputTokens: entry.inputTokens,
|
|
4065
|
+
cachedInputTokens: entry.cachedInputTokens,
|
|
4066
|
+
cacheCreationInputTokens: entry.cacheCreationInputTokens
|
|
4067
|
+
});
|
|
4068
|
+
if (scenario === "actual" || entry.inputTokens === null) return {
|
|
4069
|
+
baseInputTokens,
|
|
4070
|
+
cachedInputTokens: entry.cachedInputTokens,
|
|
4071
|
+
cacheCreationInputTokens: entry.cacheCreationInputTokens
|
|
4072
|
+
};
|
|
4073
|
+
const cacheableTokens = (entry.cachedInputTokens ?? 0) + (entry.cacheCreationInputTokens ?? 0);
|
|
4074
|
+
const hasCacheable = cacheableTokens > 0;
|
|
4075
|
+
if (scenario === "noCache") return {
|
|
4076
|
+
baseInputTokens: entry.inputTokens,
|
|
4077
|
+
cachedInputTokens: 0,
|
|
4078
|
+
cacheCreationInputTokens: 0
|
|
4079
|
+
};
|
|
4080
|
+
if (scenario === "withBaseCaching") return {
|
|
4081
|
+
baseInputTokens: hasCacheable ? baseInputTokens : 0,
|
|
4082
|
+
cachedInputTokens: hasCacheable ? cacheableTokens : entry.inputTokens,
|
|
4083
|
+
cacheCreationInputTokens: 0
|
|
4084
|
+
};
|
|
4085
|
+
return {
|
|
4086
|
+
baseInputTokens: hasCacheable ? baseInputTokens : 0,
|
|
4087
|
+
cachedInputTokens: 0,
|
|
4088
|
+
cacheCreationInputTokens: hasCacheable ? cacheableTokens : entry.inputTokens
|
|
4089
|
+
};
|
|
4090
|
+
}
|
|
3967
4091
|
function computeDurationMs$1(span) {
|
|
3968
4092
|
if (span.endedAt === null) return null;
|
|
3969
4093
|
const started = Date.parse(span.startedAt);
|
|
@@ -7187,4 +7311,4 @@ function toLastRunStatus(status) {
|
|
|
7187
7311
|
return status === "pending" ? null : status;
|
|
7188
7312
|
}
|
|
7189
7313
|
//#endregion
|
|
7190
|
-
export {
|
|
7314
|
+
export { apiCallMetricSchema as $, getCurrentScope as $n, cacheDebugKeyEntrySchema as $t, createRunRequestSchema as A, repoFileRefSchema as An, runLogEntrySchema as At, getNestedAttribute as B, deserializeCacheValue as Bn, manualInputNumberFieldSchema as Bt, loadConfig as C, cellValueSchema as Cn, caseRowSchema as Ct, createFsCacheStore as D, fileRefSchema as Dn, evalStatItemSchema as Dt, validateCharts as E, columnKindSchema as En, evalStatAggregateSchema as Et, extractApiCalls as F, evalSpan as Fn, manualInputBooleanFieldSchema as Ft, deriveStatusFromChildStatuses as G, readManualInputFile as Gn, evalChartAxisSchema as Gt, getEvalDisplayStatus as H, serializeCacheValue as Hn, manualInputSelectOptionSchema as Ht, extractLlmCalls as I, evalTracer as In, manualInputDescriptorSchema as It, DEFAULT_API_CALLS_CONFIG as J, advanceEvalTime as Jn, evalChartConfigSchema as Jt, runManifestSchema as K, evalExpect as Kn, evalChartBuiltinMetricSchema as Kt, simulateLlmCallCost as L, hashCacheKey as Ln, manualInputFieldDescriptorSchema as Lt, sseEnvelopeSchema as M, z$1 as Mn, runLogLocationSchema as Mt, extractCacheEntries as N, buildTraceTree as Nn, runLogPhaseSchema as Nt, configReloadStateSchema as O, jsonCellSchema as On, evalStatsConfigSchema as Ot, extractCacheHits as P, captureEvalSpanError as Pn, scoreTraceSchema as Pt, apiCallMetricPlacementSchema as Q, evalLog as Qn, evalChartsConfigSchema as Qt, simulateTokenAllocation as R, hashCacheKeySync as Rn, manualInputJsonFieldSchema as Rt, resolveEvalDefaultConfig as S, traceSpanWarningSchema as Sn, caseDetailSchema as St, normalizeScoreDef as T, columnFormatSchema as Tn, evalFreshnessStatusSchema as Tt, deriveScopedSummaryFromCases as U, repoFile as Un, manualInputTextFieldSchema as Ut, getEvalTitle as V, serializeCacheRecording as Vn, manualInputSelectFieldSchema as Vt, deriveStatusFromCaseRows as W, manualInputFileValueSchema as Wn, evalChartAggregateSchema as Wt, agentEvalsConfigSchema as X, configureEvalRunLogs as Xn, evalChartTooltipExtraSchema as Xt, DEFAULT_LLM_CALLS_CONFIG as Y, appendToEvalOutput as Yn, evalChartMetricSchema as Yt, apiCallMetricFormatSchema as Z, evalAssert as Zn, evalChartTypeSchema as Zt, buildManualInputDescriptor as _, traceDisplayConfigSchema as _n, buildCaseKey as _t, getLastRunStatuses as a, cacheModeSchema as an, nextEvalId as ar, llmCallCostCurrencySchema as at, loadEvalModule as b, traceSpanKindSchema as bn, getCaseRowEvalKey as bt, loadPersistedRunSnapshots as c, cacheRecordingSchema as cn, runInExistingEvalScope as cr, llmCallMetricSchema as ct, persistRunState as d, spanCacheOptionsSchema as dn, startEvalBackgroundJob as dr, llmCallsConfigSchema as dt, cacheDebugKeyFileSchema as en, getEvalCaseInput as er, apiCallsConfigSchema as et, recomputeEvalStatusesInRuns as f, traceCacheRefSchema as fn, defineEval as fr, removeDefaultConfigSchema as ft, resolveArtifactPath as g, traceAttributeDisplaySchema as gn, trialSelectionModeSchema as gt, resolveTracePresentation as h, traceAttributeDisplayPlacementSchema as hn, runLogsConfigSchema as ht, generateRunId as i, cacheListItemSchema as in, mergeEvalOutput as ir, evalDeriveConfigSchema as it, updateManualScoreRequestSchema as j, runArtifactRefSchema as jn, runLogLevelSchema as jt, configReloadStatusSchema as k, numberDisplayOptionsSchema as kn, evalSummarySchema as kt, nextShortIdFromSnapshots as l, cacheStatusSchema as ln, setEvalOutput as lr, llmCallPricingRateSchema as lt, runTouchesEval as m, traceAttributeDisplayInputSchema as mn, resolveLlmCallsConfig as mt, getTargetEvalKeys as n, cacheEntryWithDebugKeySchema as nn, incrementEvalOutput as nr, evalColumnOverrideSchema as nt, getLatestRunInfos as o, cacheOperationTypeSchema as on, runInEvalRuntimeScope as or, llmCallMetricFormatSchema as ot, recomputePersistedCaseStatus as p, traceAttributeDisplayFormatSchema as pn, getEvalRegistry as pr, resolveApiCallsConfig as pt, runSummarySchema as q, EvalAssertionError as qn, evalChartColorSchema as qt, getTargetEvals as r, cacheFileSchema as rn, isInEvalScope as rr, evalColumnsSchema as rt, loadPersistedRunSnapshot as s, cacheRecordingOpSchema as sn, runInEvalScope as sr, llmCallMetricPlacementSchema as st, executeRun as t, cacheEntrySchema as tn, getEvalStartTime as tr, defaultConfigKeySchema as tt, persistCaseDetail as u, serializedCacheSpanSchema as un, setScopeCacheContext as ur, llmCallPricingSchema as ut, parseManualInputValues as v, traceDisplayInputConfigSchema as vn, buildEvalKey as vt, buildDeclaredColumnDefs as w, columnDefSchema as wn, discoveryIssueSchema as wt, parseEvalDiscovery as x, traceSpanSchema as xn, assertionFailureSchema as xt, deriveEvalFreshness as y, traceSpanErrorSchema as yn, getCaseRowCaseKey as yt, applyDerivedCallAttributes as z, deserializeCacheRecording as zn, manualInputMultilineFieldSchema as zt };
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-Dkol2ukD.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-huuJbDNb.mjs";
|
|
2
|
+
import "./src-1Qvuh0NH.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.32.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -31,8 +31,8 @@
|
|
|
31
31
|
"devDependencies": {
|
|
32
32
|
"@types/node": "^24.7.2",
|
|
33
33
|
"typescript": "^5.9.2",
|
|
34
|
-
"@agent-evals/runner": "0.0.1",
|
|
35
34
|
"@agent-evals/sdk": "0.0.1",
|
|
35
|
+
"@agent-evals/runner": "0.0.1",
|
|
36
36
|
"@agent-evals/shared": "0.0.1"
|
|
37
37
|
},
|
|
38
38
|
"scripts": {
|
|
@@ -341,9 +341,10 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
341
341
|
tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
|
|
342
342
|
override `attributes.<field>` for non-default primitive span shapes, configure
|
|
343
343
|
model-keyed `pricing` to derive USD costs from token counts, with nested
|
|
344
|
-
`providers` entries for provider-specific rates, add `
|
|
345
|
-
|
|
346
|
-
|
|
344
|
+
`providers` entries for provider-specific rates, add `costCurrencies` to show
|
|
345
|
+
converted cost columns in the expanded breakdown table only, add
|
|
346
|
+
`derivedAttributes` to persist computed values back onto matching LLM spans
|
|
347
|
+
before trace consumers run, and add entries to `metrics` to surface arbitrary user metrics
|
|
347
348
|
(`format: 'string' | 'number' | 'duration' | 'json' | 'boolean'`,
|
|
348
349
|
`placements: ['header' | 'body']`). `derivedAttributes` can be a keyed map
|
|
349
350
|
for one-off fields or one callback that returns multiple path/value pairs.
|