@ls-stack/agent-eval 0.23.0 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-Cw79dJDr.mjs → app-DS3j_AyX.mjs} +6 -3
- package/dist/apps/web/dist/assets/index-DNsZjOms.css +1 -0
- package/dist/apps/web/dist/assets/index-DqR1YaMG.js +118 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-D3QNOcPN.mjs → cli-ETfZ15RB.mjs} +151 -42
- package/dist/index.d.mts +111 -12
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +28 -18
- package/dist/{runOrchestration-CimthgI7.mjs → runOrchestration-B31SV_Bq.mjs} +364 -117
- package/dist/{runner-B-SYzW8w.mjs → runner-B2f2TEjp.mjs} +1 -1
- package/dist/{runner-4yNYRvmF.mjs → runner-cj1TkR-H.mjs} +2 -2
- package/dist/src-CyNb2ycA.mjs +3 -0
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +14 -6
- package/dist/apps/web/dist/assets/index-AUDD3rNB.js +0 -118
- package/dist/apps/web/dist/assets/index-r0dVFK0B.css +0 -1
- package/dist/src-CcXfWT4M.mjs +0 -3
package/dist/index.d.mts
CHANGED
|
@@ -457,6 +457,7 @@ declare const evalStatsConfigSchema: z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1
|
|
|
457
457
|
type EvalStatsConfig = z$1.infer<typeof evalStatsConfigSchema>;
|
|
458
458
|
/** Schema summarizing a discovered eval for list and overview screens. */
|
|
459
459
|
declare const evalSummarySchema: z$1.ZodObject<{
|
|
460
|
+
key: z$1.ZodDefault<z$1.ZodString>;
|
|
460
461
|
id: z$1.ZodString;
|
|
461
462
|
title: z$1.ZodOptional<z$1.ZodString>;
|
|
462
463
|
filePath: z$1.ZodString;
|
|
@@ -635,6 +636,8 @@ declare const evalSummarySchema: z$1.ZodObject<{
|
|
|
635
636
|
type EvalSummary = z$1.infer<typeof evalSummarySchema>;
|
|
636
637
|
/** Schema for one case row in an eval run result table. */
|
|
637
638
|
declare const caseRowSchema: z$1.ZodObject<{
|
|
639
|
+
evalKey: z$1.ZodOptional<z$1.ZodString>;
|
|
640
|
+
caseKey: z$1.ZodOptional<z$1.ZodString>;
|
|
638
641
|
caseId: z$1.ZodString;
|
|
639
642
|
evalId: z$1.ZodString;
|
|
640
643
|
status: z$1.ZodEnum<{
|
|
@@ -796,6 +799,8 @@ declare const scoreTraceSchema: z$1.ZodObject<{
|
|
|
796
799
|
type ScoreTrace = z$1.infer<typeof scoreTraceSchema>;
|
|
797
800
|
/** Schema for the detailed payload shown when opening a specific case. */
|
|
798
801
|
declare const caseDetailSchema: z$1.ZodObject<{
|
|
802
|
+
evalKey: z$1.ZodOptional<z$1.ZodString>;
|
|
803
|
+
caseKey: z$1.ZodOptional<z$1.ZodString>;
|
|
799
804
|
caseId: z$1.ZodString;
|
|
800
805
|
evalId: z$1.ZodString;
|
|
801
806
|
status: z$1.ZodEnum<{
|
|
@@ -1009,6 +1014,43 @@ declare const caseDetailSchema: z$1.ZodObject<{
|
|
|
1009
1014
|
}, z$1.core.$strip>;
|
|
1010
1015
|
/** Full case payload including inputs, trace, outputs, and failures. */
|
|
1011
1016
|
type CaseDetail = z$1.infer<typeof caseDetailSchema>;
|
|
1017
|
+
/** Schema for discovery problems that should be shown before running evals. */
|
|
1018
|
+
declare const discoveryIssueSchema: z$1.ZodObject<{
|
|
1019
|
+
type: z$1.ZodEnum<{
|
|
1020
|
+
"duplicate-eval-id": "duplicate-eval-id";
|
|
1021
|
+
}>;
|
|
1022
|
+
severity: z$1.ZodEnum<{
|
|
1023
|
+
error: "error";
|
|
1024
|
+
}>;
|
|
1025
|
+
filePath: z$1.ZodString;
|
|
1026
|
+
evalId: z$1.ZodString;
|
|
1027
|
+
message: z$1.ZodString;
|
|
1028
|
+
}, z$1.core.$strip>;
|
|
1029
|
+
/** Discovery problem found while scanning eval files. */
|
|
1030
|
+
type DiscoveryIssue = z$1.infer<typeof discoveryIssueSchema>;
|
|
1031
|
+
//#endregion
|
|
1032
|
+
//#region ../shared/src/evalIdentity.d.ts
|
|
1033
|
+
/** Build the stable identity for one eval inside a workspace. */
|
|
1034
|
+
declare function buildEvalKey(params: {
|
|
1035
|
+
filePath: string;
|
|
1036
|
+
evalId: string;
|
|
1037
|
+
}): string;
|
|
1038
|
+
/** Build the stable identity for one eval case inside a workspace. */
|
|
1039
|
+
declare function buildCaseKey(params: {
|
|
1040
|
+
filePath: string;
|
|
1041
|
+
evalId: string;
|
|
1042
|
+
caseId: string;
|
|
1043
|
+
}): string;
|
|
1044
|
+
/** Return the collision-safe eval key stored on a row, falling back for legacy data. */
|
|
1045
|
+
declare function getCaseRowEvalKey(row: {
|
|
1046
|
+
evalKey?: string;
|
|
1047
|
+
evalId: string;
|
|
1048
|
+
}): string;
|
|
1049
|
+
/** Return the collision-safe case key stored on a row, falling back for legacy data. */
|
|
1050
|
+
declare function getCaseRowCaseKey(row: {
|
|
1051
|
+
caseKey?: string;
|
|
1052
|
+
caseId: string;
|
|
1053
|
+
}): string;
|
|
1012
1054
|
//#endregion
|
|
1013
1055
|
//#region ../shared/src/schemas/chart.d.ts
|
|
1014
1056
|
/** Chart type rendered for a single eval history chart. */
|
|
@@ -1338,6 +1380,8 @@ declare const runManifestSchema: z$1.ZodObject<{
|
|
|
1338
1380
|
evalIds: "evalIds";
|
|
1339
1381
|
caseIds: "caseIds";
|
|
1340
1382
|
}>;
|
|
1383
|
+
evalKeys: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
1384
|
+
files: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
1341
1385
|
evalIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
1342
1386
|
caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
1343
1387
|
}, z$1.core.$strip>;
|
|
@@ -1539,13 +1583,35 @@ declare const apiCallMetricPlacementSchema: z$1.ZodEnum<{
|
|
|
1539
1583
|
}>;
|
|
1540
1584
|
/** Placement option for an API-call metric. */
|
|
1541
1585
|
type ApiCallMetricPlacement = z$1.infer<typeof apiCallMetricPlacementSchema>;
|
|
1586
|
+
/** Context passed to an LLM/API-call derived attribute function. */
|
|
1587
|
+
type CallDerivedAttributeContext = {
|
|
1588
|
+
/** Raw attributes from the matching trace span. */attributes: Record<string, unknown> | undefined; /** Matching trace span. */
|
|
1589
|
+
span: EvalTraceSpan; /** Dot-path helper for reading from `span.attributes`. */
|
|
1590
|
+
get: (path: string) => unknown;
|
|
1591
|
+
};
|
|
1592
|
+
/**
|
|
1593
|
+
* Runner-side function used to derive one new span attribute from a matching
|
|
1594
|
+
* LLM/API-call span. Return `undefined` to omit the attribute for that span.
|
|
1595
|
+
*/
|
|
1596
|
+
type CallDerivedAttribute = (ctx: CallDerivedAttributeContext) => unknown;
|
|
1597
|
+
/** One resolved derived span attribute rule. */
|
|
1598
|
+
type ResolvedCallDerivedAttribute = {
|
|
1599
|
+
/** Dot-path where the derived value is persisted on `span.attributes`. */path: string;
|
|
1600
|
+
/**
|
|
1601
|
+
* Function that derives the persisted value for each matching span. Omitted
|
|
1602
|
+
* after this config is serialized to the browser.
|
|
1603
|
+
*/
|
|
1604
|
+
compute?: CallDerivedAttribute;
|
|
1605
|
+
};
|
|
1542
1606
|
/**
|
|
1543
1607
|
* Schema for a single user-defined metric attached to LLM call rows.
|
|
1544
1608
|
*
|
|
1545
1609
|
* Each metric reads `path` from the span's `attributes` and renders the value
|
|
1546
|
-
* with the configured `format` and `numberFormat`.
|
|
1547
|
-
*
|
|
1548
|
-
*
|
|
1610
|
+
* with the configured `format` and `numberFormat`. Use
|
|
1611
|
+
* `llmCalls.derivedAttributes` when a metric should read a value computed from
|
|
1612
|
+
* other attributes. `placements` controls whether the metric appears as a chip
|
|
1613
|
+
* on the collapsed row header, as a row inside the expanded body, or both.
|
|
1614
|
+
* Defaults to `['body']` when omitted.
|
|
1549
1615
|
*/
|
|
1550
1616
|
declare const llmCallMetricSchema: z$1.ZodObject<{
|
|
1551
1617
|
label: z$1.ZodString;
|
|
@@ -1570,9 +1636,11 @@ type LlmCallMetric = z$1.infer<typeof llmCallMetricSchema>;
|
|
|
1570
1636
|
* Schema for a single user-defined metric attached to API call rows.
|
|
1571
1637
|
*
|
|
1572
1638
|
* Each metric reads `path` from the span's `attributes` and renders the value
|
|
1573
|
-
* with the configured `format` and `numberFormat`.
|
|
1574
|
-
*
|
|
1575
|
-
*
|
|
1639
|
+
* with the configured `format` and `numberFormat`. Use
|
|
1640
|
+
* `apiCalls.derivedAttributes` when a metric should read a value computed from
|
|
1641
|
+
* other attributes. `placements` controls whether the metric appears as a chip
|
|
1642
|
+
* on the collapsed row header, as a row inside the expanded body, or both.
|
|
1643
|
+
* Defaults to `['body']` when omitted.
|
|
1576
1644
|
*/
|
|
1577
1645
|
declare const apiCallMetricSchema: z$1.ZodObject<{
|
|
1578
1646
|
label: z$1.ZodString;
|
|
@@ -1629,6 +1697,7 @@ declare const llmCallsConfigSchema: z$1.ZodObject<{
|
|
|
1629
1697
|
reasoning: z$1.ZodOptional<z$1.ZodString>;
|
|
1630
1698
|
toolCalls: z$1.ZodOptional<z$1.ZodString>;
|
|
1631
1699
|
}, z$1.core.$strip>>;
|
|
1700
|
+
derivedAttributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodCustom<CallDerivedAttribute, CallDerivedAttribute>>>;
|
|
1632
1701
|
pricing: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
1633
1702
|
model: z$1.ZodString;
|
|
1634
1703
|
provider: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -1674,6 +1743,7 @@ declare const apiCallsConfigSchema: z$1.ZodObject<{
|
|
|
1674
1743
|
durationMs: z$1.ZodOptional<z$1.ZodString>;
|
|
1675
1744
|
error: z$1.ZodOptional<z$1.ZodString>;
|
|
1676
1745
|
}, z$1.core.$strip>>;
|
|
1746
|
+
derivedAttributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodCustom<CallDerivedAttribute, CallDerivedAttribute>>>;
|
|
1677
1747
|
metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
1678
1748
|
label: z$1.ZodString;
|
|
1679
1749
|
tooltip: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -1720,6 +1790,7 @@ type ResolvedLlmCallsConfig = {
|
|
|
1720
1790
|
reasoning: string;
|
|
1721
1791
|
toolCalls: string;
|
|
1722
1792
|
};
|
|
1793
|
+
derivedAttributes: ResolvedCallDerivedAttribute[];
|
|
1723
1794
|
metrics: ResolvedLlmCallMetric[];
|
|
1724
1795
|
pricing: ResolvedLlmCallPricing[];
|
|
1725
1796
|
};
|
|
@@ -1738,6 +1809,7 @@ type ResolvedApiCallsConfig = {
|
|
|
1738
1809
|
durationMs: string;
|
|
1739
1810
|
error: string;
|
|
1740
1811
|
};
|
|
1812
|
+
derivedAttributes: ResolvedCallDerivedAttribute[];
|
|
1741
1813
|
metrics: ResolvedApiCallMetric[];
|
|
1742
1814
|
};
|
|
1743
1815
|
/** Fully-resolved LLM-call metric used by the runner and UI. */
|
|
@@ -1983,6 +2055,7 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
1983
2055
|
reasoning: z$1.ZodOptional<z$1.ZodString>;
|
|
1984
2056
|
toolCalls: z$1.ZodOptional<z$1.ZodString>;
|
|
1985
2057
|
}, z$1.core.$strip>>;
|
|
2058
|
+
derivedAttributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodCustom<CallDerivedAttribute, CallDerivedAttribute>>>;
|
|
1986
2059
|
pricing: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
1987
2060
|
model: z$1.ZodString;
|
|
1988
2061
|
provider: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -2037,6 +2110,7 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
2037
2110
|
durationMs: z$1.ZodOptional<z$1.ZodString>;
|
|
2038
2111
|
error: z$1.ZodOptional<z$1.ZodString>;
|
|
2039
2112
|
}, z$1.core.$strip>>;
|
|
2113
|
+
derivedAttributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodCustom<CallDerivedAttribute, CallDerivedAttribute>>>;
|
|
2040
2114
|
metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
2041
2115
|
label: z$1.ZodString;
|
|
2042
2116
|
tooltip: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -2067,6 +2141,20 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
2067
2141
|
}, z$1.core.$strip>>;
|
|
2068
2142
|
}, z$1.core.$strip>;
|
|
2069
2143
|
//#endregion
|
|
2144
|
+
//#region ../shared/src/utils/deriveCallAttributes.d.ts
|
|
2145
|
+
/**
|
|
2146
|
+
* Persist configured derived attributes onto matching LLM/API spans.
|
|
2147
|
+
*
|
|
2148
|
+
* These derived attributes are applied before trace consumers run, so
|
|
2149
|
+
* `deriveFromTracing`, default usage extraction, trace display, and call
|
|
2150
|
+
* metrics can all read them by normal dot-path lookup.
|
|
2151
|
+
*/
|
|
2152
|
+
declare function applyDerivedCallAttributes(params: {
|
|
2153
|
+
spans: EvalTraceSpan[];
|
|
2154
|
+
llmCallsConfig: ResolvedLlmCallsConfig;
|
|
2155
|
+
apiCallsConfig: ResolvedApiCallsConfig;
|
|
2156
|
+
}): EvalTraceSpan[];
|
|
2157
|
+
//#endregion
|
|
2070
2158
|
//#region ../shared/src/utils/extractLlmCalls.d.ts
|
|
2071
2159
|
/** Resolved value for one user-defined metric on an LLM call row. */
|
|
2072
2160
|
type LlmCallMetricValue = {
|
|
@@ -2728,6 +2816,8 @@ declare const createRunRequestSchema: z$1.ZodObject<{
|
|
|
2728
2816
|
evalIds: "evalIds";
|
|
2729
2817
|
caseIds: "caseIds";
|
|
2730
2818
|
}>;
|
|
2819
|
+
evalKeys: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
2820
|
+
files: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
2731
2821
|
evalIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
2732
2822
|
caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
2733
2823
|
}, z$1.core.$strip>;
|
|
@@ -3520,7 +3610,8 @@ type CacheClearFilter = {
|
|
|
3520
3610
|
type EvalRunner = {
|
|
3521
3611
|
/** Load workspace config, discover evals, and start file watching when enabled. */init(): Promise<void>; /** Return the currently discovered eval summaries for the active workspace. */
|
|
3522
3612
|
getEvals(): EvalSummary[]; /** Look up one discovered eval by id. */
|
|
3523
|
-
getEval(id: string): EvalSummary | undefined; /**
|
|
3613
|
+
getEval(id: string): EvalSummary | undefined; /** Return discovery errors that should be shown before running evals. */
|
|
3614
|
+
getDiscoveryIssues(): DiscoveryIssue[]; /** Re-scan configured eval files and emit a discovery update to listeners. */
|
|
3524
3615
|
refreshDiscovery(): Promise<void>;
|
|
3525
3616
|
startRun(request: CreateRunRequest): Promise<{
|
|
3526
3617
|
manifest: RunManifest;
|
|
@@ -3573,11 +3664,19 @@ type EvalRunner = {
|
|
|
3573
3664
|
* Remove cache entries matching `filter`, or all entries when no filter is
|
|
3574
3665
|
* supplied.
|
|
3575
3666
|
*/
|
|
3576
|
-
clearCache(filter?: CacheClearFilter): Promise<void>;
|
|
3577
|
-
|
|
3667
|
+
clearCache(filter?: CacheClearFilter): Promise<void>;
|
|
3668
|
+
/**
|
|
3669
|
+
* Recompute persisted case and run statuses for terminal runs touching one
|
|
3670
|
+
* eval. Accepts the exact eval key, with a legacy fallback for unique eval ids.
|
|
3671
|
+
*/
|
|
3672
|
+
recomputeStatusesForEval(evalKey: string): Promise<{
|
|
3578
3673
|
updatedRuns: number;
|
|
3579
|
-
}>;
|
|
3580
|
-
|
|
3674
|
+
}>;
|
|
3675
|
+
/**
|
|
3676
|
+
* Delete terminal persisted runs that touch one eval from memory and disk.
|
|
3677
|
+
* Accepts the exact eval key, with a legacy fallback for unique eval ids.
|
|
3678
|
+
*/
|
|
3679
|
+
cleanRunsForEval(evalKey: string): Promise<{
|
|
3581
3680
|
deletedRuns: number;
|
|
3582
3681
|
}>; /** Persist a UI-authored manual score for one case and recompute affected summaries. */
|
|
3583
3682
|
updateManualScore(params: {
|
|
@@ -3623,4 +3722,4 @@ declare function createRunner({
|
|
|
3623
3722
|
*/
|
|
3624
3723
|
declare function runCli(argv: string[]): Promise<void>;
|
|
3625
3724
|
//#endregion
|
|
3626
|
-
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
3725
|
+
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as
|
|
2
|
-
import { n as createRunner, t as runCli } from "./cli-
|
|
3
|
-
import "./src-
|
|
4
|
-
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
1
|
+
import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as getEvalStartTime, At as evalChartTypeSchema, B as runSummarySchema, Bn as startEvalBackgroundJob, Bt as cacheRecordingOpSchema, Cn as advanceEvalTime, Ct as evalChartAggregateSchema, D as sseEnvelopeSchema, Dn as evalLog, Dt as evalChartConfigSchema, E as updateManualScoreRequestSchema, En as evalAssert, Et as evalChartColorSchema, F as getEvalDisplayStatus, Fn as runInEvalRuntimeScope, Ft as cacheEntryWithDebugKeySchema, G as apiCallMetricPlacementSchema, Gt as traceCacheRefSchema, H as DEFAULT_LLM_CALLS_CONFIG, Hn as defineEval, Ht as cacheStatusSchema, I as deriveScopedSummaryFromCases, In as runInEvalScope, It as cacheFileSchema, J as defaultConfigKeySchema, Jt as traceAttributeDisplayPlacementSchema, K as apiCallMetricSchema, Kt as traceAttributeDisplayFormatSchema, L as deriveStatusFromCaseRows, Ln as runInExistingEvalScope, Lt as cacheListItemSchema, M as applyDerivedCallAttributes, Mn as isInEvalScope, Mt as cacheDebugKeyEntrySchema, N as getNestedAttribute, Nn as mergeEvalOutput, Nt as cacheDebugKeyFileSchema, O as extractCacheEntries, On as getCurrentScope, Ot as evalChartMetricSchema, P as getEvalTitle, Pn as nextEvalId, Pt as cacheEntrySchema, Q as llmCallPricingSchema, Qt as traceSpanErrorSchema, R as deriveStatusFromChildStatuses, Rn as setEvalOutput, Rt as cacheModeSchema, Sn as EvalAssertionError, St as scoreTraceSchema, T as createRunRequestSchema, Tt as evalChartBuiltinMetricSchema, U as agentEvalsConfigSchema, Un as getEvalRegistry, Ut as serializedCacheSpanSchema, V as DEFAULT_API_CALLS_CONFIG, Vn as repoFile, Vt as cacheRecordingSchema, W as apiCallMetricFormatSchema, Wt as spanCacheOptionsSchema, X as llmCallMetricPlacementSchema, Xt as traceDisplayConfigSchema, Y as llmCallMetricFormatSchema, Yt as traceAttributeDisplaySchema, Z as llmCallMetricSchema, Zt as traceDisplayInputConfigSchema, _n as hashCacheKeySync, _t as evalSummarySchema, an as columnKindSchema, at as buildCaseKey, bn as serializeCacheRecording, bt as runLogLocationSchema, cn as numberDisplayOptionsSchema, ct as getCaseRowEvalKey, dn as z, dt as caseRowSchema, en as traceSpanSchema, et as removeDefaultConfigSchema, fn as buildTraceTree, ft as discoveryIssueSchema, gn as hashCacheKey, gt as evalStatsConfigSchema, hn as evalTracer, ht as evalStatItemSchema, in as columnFormatSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as incrementEvalOutput, jt as evalChartsConfigSchema, k as extractCacheHits, kn as getEvalCaseInput, kt as evalChartTooltipExtraSchema, ln as repoFileRefSchema, lt as assertionFailureSchema, mn as evalSpan, mt as evalStatAggregateSchema, nn as cellValueSchema, nt as resolveLlmCallsConfig, on as fileRefSchema, ot as buildEvalKey, pn as captureEvalSpanError, pt as evalFreshnessStatusSchema, q as apiCallsConfigSchema, qt as traceAttributeDisplayInputSchema, rn as columnDefSchema, rt as runLogsConfigSchema, sn as jsonCellSchema, st as getCaseRowCaseKey, tn as traceSpanWarningSchema, tt as resolveApiCallsConfig, un as runArtifactRefSchema, ut as caseDetailSchema, vn as deserializeCacheRecording, vt as runLogEntrySchema, wn as appendToEvalOutput, wt as evalChartAxisSchema, xn as serializeCacheValue, xt as runLogPhaseSchema, yn as deserializeCacheValue, yt as runLogLevelSchema, z as runManifestSchema, zn as setScopeCacheContext, zt as cacheOperationTypeSchema } from "./runOrchestration-B31SV_Bq.mjs";
|
|
2
|
+
import { n as createRunner, t as runCli } from "./cli-ETfZ15RB.mjs";
|
|
3
|
+
import "./src-CyNb2ycA.mjs";
|
|
4
|
+
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { B as runSummarySchema, T as createRunRequestSchema, Tn as configureEvalRunLogs, b as loadConfig, gt as evalStatsConfigSchema, jt as evalChartsConfigSchema, ot as buildEvalKey, r as getTargetEvals$1, rn as columnDefSchema, t as executeRun, v as parseEvalDiscovery, w as createFsCacheStore, z as runManifestSchema } from "./runOrchestration-B31SV_Bq.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|
|
@@ -6,6 +6,7 @@ import { z } from "zod/v4";
|
|
|
6
6
|
import { glob } from "glob";
|
|
7
7
|
//#region ../runner/src/runChild.ts
|
|
8
8
|
const evalMetaSchema = z.object({
|
|
9
|
+
key: z.string(),
|
|
9
10
|
id: z.string(),
|
|
10
11
|
title: z.string().optional(),
|
|
11
12
|
filePath: z.string(),
|
|
@@ -36,8 +37,10 @@ function getConfiguredConcurrency(configConcurrency) {
|
|
|
36
37
|
return Math.max(1, Math.floor(configConcurrency));
|
|
37
38
|
}
|
|
38
39
|
function getTargetEvals(params) {
|
|
39
|
-
|
|
40
|
-
|
|
40
|
+
return getTargetEvals$1({
|
|
41
|
+
evals: params.evals.values(),
|
|
42
|
+
request: params.request
|
|
43
|
+
});
|
|
41
44
|
}
|
|
42
45
|
function toWorkspaceRelativePath(params) {
|
|
43
46
|
return relative(params.workspaceRoot, params.filePath).replaceAll("\\", "/");
|
|
@@ -55,21 +58,29 @@ async function discoverRunEvals(params) {
|
|
|
55
58
|
for (const filePath of discovered) {
|
|
56
59
|
const source = await readFile(filePath, "utf-8");
|
|
57
60
|
const sourceFingerprint = getSourceFingerprint(source);
|
|
58
|
-
const metas =
|
|
59
|
-
for (const meta of metas)
|
|
60
|
-
|
|
61
|
-
title: meta.title,
|
|
62
|
-
filePath: toWorkspaceRelativePath({
|
|
61
|
+
const metas = parseEvalDiscovery(filePath, source).metas;
|
|
62
|
+
for (const meta of metas) {
|
|
63
|
+
const relativeFilePath = toWorkspaceRelativePath({
|
|
63
64
|
filePath: meta.filePath,
|
|
64
65
|
workspaceRoot: params.workspaceRoot
|
|
65
|
-
})
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
66
|
+
});
|
|
67
|
+
const key = buildEvalKey({
|
|
68
|
+
filePath: relativeFilePath,
|
|
69
|
+
evalId: meta.id
|
|
70
|
+
});
|
|
71
|
+
evals.set(key, {
|
|
72
|
+
key,
|
|
73
|
+
id: meta.id,
|
|
74
|
+
title: meta.title,
|
|
75
|
+
filePath: relativeFilePath,
|
|
76
|
+
sourceFilePath: meta.filePath,
|
|
77
|
+
sourceFingerprint,
|
|
78
|
+
columnDefs: [],
|
|
79
|
+
caseCount: null
|
|
80
|
+
});
|
|
81
|
+
}
|
|
71
82
|
}
|
|
72
|
-
return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
|
|
83
|
+
return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath) || a.id.localeCompare(b.id));
|
|
73
84
|
}
|
|
74
85
|
async function readContext(contextPath) {
|
|
75
86
|
if (contextPath === void 0) throw new Error("Missing run child context path");
|
|
@@ -93,7 +104,7 @@ async function main() {
|
|
|
93
104
|
config,
|
|
94
105
|
workspaceRoot: context.workspaceRoot
|
|
95
106
|
});
|
|
96
|
-
const evals = new Map(evalMetas.map((evalMeta) => [evalMeta.
|
|
107
|
+
const evals = new Map(evalMetas.map((evalMeta) => [evalMeta.key, evalMeta]));
|
|
97
108
|
const lastRunStatusMap = /* @__PURE__ */ new Map();
|
|
98
109
|
const latestRunInfoMap = /* @__PURE__ */ new Map();
|
|
99
110
|
await executeRun({
|
|
@@ -108,7 +119,6 @@ async function main() {
|
|
|
108
119
|
request: context.request,
|
|
109
120
|
runDir: context.runDir,
|
|
110
121
|
config,
|
|
111
|
-
evals,
|
|
112
122
|
cacheStore,
|
|
113
123
|
lastRunStatusMap,
|
|
114
124
|
latestRunInfoMap,
|
|
@@ -123,7 +133,7 @@ async function main() {
|
|
|
123
133
|
workspaceRoot: context.workspaceRoot,
|
|
124
134
|
getSourceFingerprint,
|
|
125
135
|
getConfiguredConcurrency: () => getConfiguredConcurrency(config.concurrency),
|
|
126
|
-
getSortedEvalMetas: () => [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath)),
|
|
136
|
+
getSortedEvalMetas: () => [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath) || a.id.localeCompare(b.id)),
|
|
127
137
|
getTargetEvals: (request) => getTargetEvals({
|
|
128
138
|
evals,
|
|
129
139
|
request
|