@tangle-network/agent-eval 0.23.1 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +80 -0
- package/README.md +141 -79
- package/dist/baseline-4R5deP0N.d.ts +108 -0
- package/dist/benchmarks/index.d.ts +3 -2
- package/dist/benchmarks/index.js +1 -1
- package/dist/builder-eval/index.d.ts +249 -0
- package/dist/builder-eval/index.js +391 -0
- package/dist/builder-eval/index.js.map +1 -0
- package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
- package/dist/chunk-2A5XJB43.js.map +1 -0
- package/dist/chunk-47X6LRCE.js +76 -0
- package/dist/chunk-47X6LRCE.js.map +1 -0
- package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
- package/dist/chunk-4F5DQN55.js.map +1 -0
- package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
- package/dist/chunk-4S4BM3QQ.js.map +1 -0
- package/dist/chunk-5BKGXME7.js +65 -0
- package/dist/chunk-5BKGXME7.js.map +1 -0
- package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
- package/dist/chunk-6QDKWHLS.js.map +1 -0
- package/dist/chunk-I4MBDTY5.js +272 -0
- package/dist/chunk-I4MBDTY5.js.map +1 -0
- package/dist/chunk-K2TPS5LB.js +569 -0
- package/dist/chunk-K2TPS5LB.js.map +1 -0
- package/dist/chunk-KKHDIONI.js +414 -0
- package/dist/chunk-KKHDIONI.js.map +1 -0
- package/dist/chunk-KMPRBJK4.js +74 -0
- package/dist/chunk-KMPRBJK4.js.map +1 -0
- package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
- package/dist/chunk-KTGTIOFD.js.map +1 -0
- package/dist/chunk-LSH4MMOZ.js +838 -0
- package/dist/chunk-LSH4MMOZ.js.map +1 -0
- package/dist/chunk-NG236HPC.js +57 -0
- package/dist/chunk-NG236HPC.js.map +1 -0
- package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
- package/dist/chunk-NLMNWKVM.js.map +1 -0
- package/dist/chunk-NU65VQ7M.js +99 -0
- package/dist/chunk-NU65VQ7M.js.map +1 -0
- package/dist/chunk-OHEPNJQN.js +554 -0
- package/dist/chunk-OHEPNJQN.js.map +1 -0
- package/dist/chunk-OWLAAMME.js +250 -0
- package/dist/chunk-OWLAAMME.js.map +1 -0
- package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
- package/dist/chunk-PC4UYEBM.js.map +1 -0
- package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
- package/dist/chunk-RAF443UI.js.map +1 -0
- package/dist/chunk-RZTMDUO7.js +49 -0
- package/dist/chunk-RZTMDUO7.js.map +1 -0
- package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
- package/dist/chunk-SESZDQPX.js.map +1 -0
- package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
- package/dist/chunk-SY6WAAAD.js.map +1 -0
- package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
- package/dist/chunk-TVVP3ZZQ.js.map +1 -0
- package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
- package/dist/chunk-VRJVTXRV.js.map +1 -0
- package/dist/chunk-WWYCWKUM.js +196 -0
- package/dist/chunk-WWYCWKUM.js.map +1 -0
- package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
- package/dist/chunk-YRZ4M5GS.js.map +1 -0
- package/dist/chunk-ZN274SWR.js +613 -0
- package/dist/chunk-ZN274SWR.js.map +1 -0
- package/dist/cli.js +10 -6
- package/dist/cli.js.map +1 -1
- package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
- package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
- package/dist/control.d.ts +8 -6
- package/dist/control.js +10 -7
- package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
- package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
- package/dist/errors-BZ9sTdz7.d.ts +70 -0
- package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
- package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
- package/dist/governance/index.d.ts +5 -0
- package/dist/governance/index.js +18 -0
- package/dist/governance/index.js.map +1 -0
- package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
- package/dist/index-Oj9fAPPN.d.ts +270 -0
- package/dist/index.d.ts +1866 -3151
- package/dist/index.js +5457 -7809
- package/dist/index.js.map +1 -1
- package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
- package/dist/knowledge/index.d.ts +102 -0
- package/dist/knowledge/index.js +18 -0
- package/dist/knowledge/index.js.map +1 -0
- package/dist/meta-eval/index.d.ts +99 -0
- package/dist/meta-eval/index.js +324 -0
- package/dist/meta-eval/index.js.map +1 -0
- package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +11 -8
- package/dist/optimization.js +11 -9
- package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
- package/dist/pipelines/index.d.ts +172 -0
- package/dist/pipelines/index.js +409 -0
- package/dist/pipelines/index.js.map +1 -0
- package/dist/prm/index.d.ts +99 -0
- package/dist/prm/index.js +222 -0
- package/dist/prm/index.js.map +1 -0
- package/dist/query-DODUYdPg.d.ts +30 -0
- package/dist/release-report-TDPn1cxq.d.ts +292 -0
- package/dist/replay-BL96gCEP.d.ts +226 -0
- package/dist/reporting.d.ts +10 -295
- package/dist/reporting.js +10 -6
- package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
- package/dist/rl.d.ts +1762 -8
- package/dist/rl.js +2035 -58
- package/dist/rl.js.map +1 -1
- package/dist/rubric-D5tjHNJQ.d.ts +72 -0
- package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
- package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
- package/dist/sequential-Dgz1n51-.d.ts +139 -0
- package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
- package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
- package/dist/telemetry/file.js +4 -1
- package/dist/telemetry/file.js.map +1 -1
- package/dist/telemetry/index.js +57 -57
- package/dist/telemetry/index.js.map +1 -1
- package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
- package/dist/traces.d.ts +142 -387
- package/dist/traces.js +1302 -40
- package/dist/traces.js.map +1 -1
- package/dist/trajectory-CnoBo-JY.d.ts +32 -0
- package/dist/wire/index.d.ts +22 -22
- package/dist/wire/index.js +4 -3
- package/package.json +44 -18
- package/dist/chunk-42I2QC2L.js.map +0 -1
- package/dist/chunk-5IIQKMD5.js.map +0 -1
- package/dist/chunk-6KQG5HAH.js.map +0 -1
- package/dist/chunk-6M774GY6.js.map +0 -1
- package/dist/chunk-7EAUOUQS.js.map +0 -1
- package/dist/chunk-AXHNWLIX.js.map +0 -1
- package/dist/chunk-EXGR4XEM.js.map +0 -1
- package/dist/chunk-IOXMGMHQ.js.map +0 -1
- package/dist/chunk-KAO3Q65R.js.map +0 -1
- package/dist/chunk-LZKIOBG2.js +0 -2026
- package/dist/chunk-LZKIOBG2.js.map +0 -1
- package/dist/chunk-QBW3YBTR.js.map +0 -1
- package/dist/chunk-QUKKGHTZ.js.map +0 -1
- package/dist/chunk-SQQLHODJ.js.map +0 -1
- package/dist/chunk-V5QSWN7L.js +0 -1310
- package/dist/chunk-V5QSWN7L.js.map +0 -1
- package/dist/chunk-VQQSPGSM.js.map +0 -1
- package/dist/chunk-XPHOZPOM.js +0 -1947
- package/dist/chunk-XPHOZPOM.js.map +0 -1
- package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
- package/dist/index-ekBXweiQ.d.ts +0 -1894
- package/dist/sequential-DgU2mFsE.d.ts +0 -304
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult, T as TestGradedScenario, b as TestGradedRunResult } from '../test-graded-scenario-B2kWEdh9.js';
|
|
2
|
+
import { T as TraceEmitter } from '../emitter-DP_cSSiw.js';
|
|
3
|
+
import { T as TraceStore, R as Run } from '../store-Db2Bv8Cf.js';
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* BuilderSession — ties a builder-of-builders workflow together.
|
|
7
|
+
*
|
|
8
|
+
* Models agent-builder's shape: Project → Chat → Edit → Ship → App →
|
|
9
|
+
* AppAgent. Each layer is a Run (linked via parentRunId). The
|
|
10
|
+
* framework-enforced invariants:
|
|
11
|
+
*
|
|
12
|
+
* - One Project → many Chats; chatId scopes runs within a project.
|
|
13
|
+
* - One Chat = one builder Run with `layer='builder'`.
|
|
14
|
+
* - One Ship = one child Run with `layer='app-build'` + SandboxHarness.
|
|
15
|
+
* - One AppScenario = one grandchild Run with `layer='app-runtime'`.
|
|
16
|
+
*
|
|
17
|
+
* Consumers obtain a BuilderSession, call `startChat`, drive the
|
|
18
|
+
* builder agent (emitting spans), and call `ship` / `runAppScenario`
|
|
19
|
+
* as the workflow progresses. The session reconstructs itself from
|
|
20
|
+
* trace data via `resumeBuilderSession(store, projectId)`.
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
interface BuilderSessionInit {
|
|
24
|
+
projectId: string;
|
|
25
|
+
chatId?: string;
|
|
26
|
+
/** Free-form: user's task description, project name, etc. Stored on the builder Run. */
|
|
27
|
+
tags?: Record<string, string>;
|
|
28
|
+
}
|
|
29
|
+
interface ShipOptions {
|
|
30
|
+
harness: HarnessConfig;
|
|
31
|
+
driver?: SandboxDriver;
|
|
32
|
+
/** scenarioId of this app-build run. Defaults to `${projectId}/build`. */
|
|
33
|
+
scenarioId?: string;
|
|
34
|
+
}
|
|
35
|
+
interface RunAppScenarioOptions {
|
|
36
|
+
scenario: TestGradedScenario;
|
|
37
|
+
/** Harness driver override; defaults to the one the session was created with. */
|
|
38
|
+
driver?: SandboxDriver;
|
|
39
|
+
}
|
|
40
|
+
declare class BuilderSession {
|
|
41
|
+
private store;
|
|
42
|
+
private builderEmitter;
|
|
43
|
+
readonly projectId: string;
|
|
44
|
+
readonly chatId: string;
|
|
45
|
+
private builderRunId?;
|
|
46
|
+
private lastBuildRunId?;
|
|
47
|
+
private defaultDriver?;
|
|
48
|
+
constructor(store: TraceStore, init: BuilderSessionInit, driver?: SandboxDriver);
|
|
49
|
+
/** Start the builder (L0) run for this chat. Returns the runId. */
|
|
50
|
+
startChat(scenarioId?: string): Promise<string>;
|
|
51
|
+
/** The emitter for builder-level spans (edits, LLM calls, tool invocations). */
|
|
52
|
+
get emitter(): TraceEmitter;
|
|
53
|
+
/**
|
|
54
|
+
* Ship the project's generated app: run the sandbox harness as a child
|
|
55
|
+
* Run (`layer='app-build'`). Returns the build result + runId.
|
|
56
|
+
*/
|
|
57
|
+
ship(options: ShipOptions): Promise<{
|
|
58
|
+
runId: string;
|
|
59
|
+
result: SandboxHarnessResult;
|
|
60
|
+
}>;
|
|
61
|
+
/**
|
|
62
|
+
* Run a domain scenario against the just-built app as a grandchild Run
|
|
63
|
+
* (`layer='app-runtime'`). The `ship` call must precede this so the
|
|
64
|
+
* parent is set correctly; if no build exists yet the session attaches
|
|
65
|
+
* directly to the builder run (useful for prototypes).
|
|
66
|
+
*/
|
|
67
|
+
runAppScenario(options: RunAppScenarioOptions): Promise<TestGradedRunResult>;
|
|
68
|
+
/** Record an end-of-chat meta score (judge verdict on whether the builder
|
|
69
|
+
* served the user's intent). Accepts a numeric score + optional rationale. */
|
|
70
|
+
recordMetaScore(score: number, rationale?: string): Promise<void>;
|
|
71
|
+
/** Close the builder Run with a final outcome. */
|
|
72
|
+
endChat(outcome: {
|
|
73
|
+
pass: boolean;
|
|
74
|
+
score?: number;
|
|
75
|
+
notes?: string;
|
|
76
|
+
}): Promise<void>;
|
|
77
|
+
/**
|
|
78
|
+
* Inline app-runtime run — for cases where the "scenario" isn't a
|
|
79
|
+
* SWE-bench-style test suite but a live agent interaction (LLM chat,
|
|
80
|
+
* domain flow). Returns an emitter bound to a fresh Run in the
|
|
81
|
+
* `app-runtime` layer; caller emits spans inside and calls
|
|
82
|
+
* `.endRun()` with the final verdict.
|
|
83
|
+
*/
|
|
84
|
+
startAppRuntime(scenarioId: string): Promise<TraceEmitter>;
|
|
85
|
+
/**
|
|
86
|
+
* Lightweight "ship marker" — record an app-build Run with a caller-
|
|
87
|
+
* provided verdict. Use when there isn't a sandbox harness to run but
|
|
88
|
+
* you still want to mark the build state at publish time.
|
|
89
|
+
*/
|
|
90
|
+
recordShipMarker(args: {
|
|
91
|
+
pass: boolean;
|
|
92
|
+
score: number;
|
|
93
|
+
scenarioId?: string;
|
|
94
|
+
notes?: string;
|
|
95
|
+
}): Promise<string>;
|
|
96
|
+
get lastBuildRunIdValue(): string | undefined;
|
|
97
|
+
get builderRunIdValue(): string | undefined;
|
|
98
|
+
}
|
|
99
|
+
/**
|
|
100
|
+
* Reconstruct the most recent BuilderSession state for a given project —
|
|
101
|
+
* returns { projectId, chatRuns, lastBuilderRun, lastBuildRun, lastAppRuntimeRuns }. For chat-first UIs
|
|
102
|
+
* this is how a resumed session finds its place in the edit history.
|
|
103
|
+
*/
|
|
104
|
+
declare function resumeBuilderSession(store: TraceStore, projectId: string): Promise<{
|
|
105
|
+
projectId: string;
|
|
106
|
+
chatRuns: Run[];
|
|
107
|
+
lastBuilderRun?: Run;
|
|
108
|
+
lastBuildRun?: Run;
|
|
109
|
+
lastAppRuntimeRuns: Run[];
|
|
110
|
+
}>;
|
|
111
|
+
|
|
112
|
+
/**
|
|
113
|
+
* Three-layer evaluation — the canonical scoring breakdown for
|
|
114
|
+
* builder-of-builders workflows.
|
|
115
|
+
*
|
|
116
|
+
* meta_score: did the builder understand + satisfy user intent?
|
|
117
|
+
* (judge verdict attached to the builder run)
|
|
118
|
+
* build_score: did the generated scaffold build + pass its own tests?
|
|
119
|
+
* (outcome.score on the app-build child run)
|
|
120
|
+
* runtime_score: did the generated agent pass its domain scenarios?
|
|
121
|
+
* (mean outcome.score over app-runtime grandchild runs)
|
|
122
|
+
*
|
|
123
|
+
* Returns a structured report per project. The cross-layer correlation
|
|
124
|
+
* is the highest-leverage signal the framework computes — if
|
|
125
|
+
* meta_score doesn't predict runtime_score, the builder's self-scoring
|
|
126
|
+
* is broken.
|
|
127
|
+
*
|
|
128
|
+
* Scaffold-only mode: when a project has no `app-runtime` runs (e.g. a
|
|
129
|
+
* scaffold-builder eval that grades compose + build without driving a
|
|
130
|
+
* runtime scenario), `kind` is `'scaffold-only'` and `complete` measures
|
|
131
|
+
* meta + build only. Consumers can tell the two apart without having to
|
|
132
|
+
* interpret null-runtime as either "not yet computed" or "N/A for this
|
|
133
|
+
* project shape".
|
|
134
|
+
*/
|
|
135
|
+
|
|
136
|
+
type ProjectKind = 'full' | 'scaffold-only';
|
|
137
|
+
interface ThreeLayerProjectReport {
|
|
138
|
+
projectId: string;
|
|
139
|
+
/**
|
|
140
|
+
* `'full'` when the project has at least one `app-runtime` run;
|
|
141
|
+
* `'scaffold-only'` when it only has meta + build layers. Lets
|
|
142
|
+
* downstream consumers treat a null runtime score as expected
|
|
143
|
+
* (scaffold-only) vs. missing (full, pipeline broke).
|
|
144
|
+
*/
|
|
145
|
+
kind: ProjectKind;
|
|
146
|
+
builderRunId?: string;
|
|
147
|
+
/** Judge-verdict score on the builder run (0..1 after normalization). */
|
|
148
|
+
metaScore: number | null;
|
|
149
|
+
buildRunId?: string;
|
|
150
|
+
/** 0..1 from the sandbox harness (testsPassed / testsTotal). */
|
|
151
|
+
buildScore: number | null;
|
|
152
|
+
appRuntimeRunIds: string[];
|
|
153
|
+
/** Mean of outcome.score over app-runtime runs, 0..1. Always null in scaffold-only mode. */
|
|
154
|
+
runtimeScore: number | null;
|
|
155
|
+
runtimePassRate: number | null;
|
|
156
|
+
/**
|
|
157
|
+
* Layer-aware completeness:
|
|
158
|
+
* - `kind='full'`: all three layers scored
|
|
159
|
+
* - `kind='scaffold-only'`: meta + build scored (runtime not applicable)
|
|
160
|
+
*/
|
|
161
|
+
complete: boolean;
|
|
162
|
+
}
|
|
163
|
+
declare function scoreProject(store: TraceStore, projectId: string): Promise<ThreeLayerProjectReport>;
|
|
164
|
+
/** Aggregate scoring across every project in a corpus. */
|
|
165
|
+
declare function scoreAllProjects(store: TraceStore): Promise<ThreeLayerProjectReport[]>;
|
|
166
|
+
|
|
167
|
+
/**
|
|
168
|
+
* Meta-eval correlation — the highest-leverage signal in the framework.
|
|
169
|
+
*
|
|
170
|
+
* Given a corpus of three-layer project reports, compute how well each
|
|
171
|
+
* pair of layers correlates. The question we care about most:
|
|
172
|
+
*
|
|
173
|
+
* Does `metaScore` (what the builder thinks it did) predict
|
|
174
|
+
* `runtimeScore` (what the user actually gets)?
|
|
175
|
+
*
|
|
176
|
+
* If r < ~0.4, the builder's self-scoring is broken — it's optimizing
|
|
177
|
+
* for something other than real-world success. If r > 0.7, meta_score
|
|
178
|
+
* is a usable proxy and can drive CI gates cheaply.
|
|
179
|
+
*
|
|
180
|
+
* Non-parametric rank correlation (Spearman) is also reported because
|
|
181
|
+
* meta scores are often ordinal-ish.
|
|
182
|
+
*/
|
|
183
|
+
|
|
184
|
+
interface LayerCorrelation {
|
|
185
|
+
n: number;
|
|
186
|
+
pearson: number;
|
|
187
|
+
spearman: number;
|
|
188
|
+
}
|
|
189
|
+
interface CorrelationReport {
|
|
190
|
+
/** Pairs present in the corpus (layers with ≥ 2 matched data points). */
|
|
191
|
+
metaVsBuild?: LayerCorrelation;
|
|
192
|
+
metaVsRuntime?: LayerCorrelation;
|
|
193
|
+
buildVsRuntime?: LayerCorrelation;
|
|
194
|
+
/** Number of reports with `complete === true`: all 3 scores present for `'full'` projects, meta + build for `'scaffold-only'` projects. */
|
|
195
|
+
completeProjects: number;
|
|
196
|
+
}
|
|
197
|
+
declare function correlateLayers(reports: ThreeLayerProjectReport[]): CorrelationReport;
|
|
198
|
+
|
|
199
|
+
/**
|
|
200
|
+
* ProjectRegistry — project-level aggregation over the trace corpus.
|
|
201
|
+
*
|
|
202
|
+
* Thin reader over TraceStore that answers the questions a chat-first,
|
|
203
|
+
* resumable UI needs:
|
|
204
|
+
* - listProjects() → project IDs with latest activity
|
|
205
|
+
* - projectTimeline(id) → chats + builds + runtime runs, chronological
|
|
206
|
+
* - projectChats(id) → chat-level summaries (turn count, outcome)
|
|
207
|
+
*
|
|
208
|
+
* All queries are pure reads; no state duplication.
|
|
209
|
+
*/
|
|
210
|
+
|
|
211
|
+
interface ProjectSummary {
|
|
212
|
+
projectId: string;
|
|
213
|
+
chatCount: number;
|
|
214
|
+
buildCount: number;
|
|
215
|
+
appRuntimeCount: number;
|
|
216
|
+
lastActivityAt: number;
|
|
217
|
+
latestChatId?: string;
|
|
218
|
+
latestOutcome?: {
|
|
219
|
+
pass: boolean;
|
|
220
|
+
score?: number;
|
|
221
|
+
};
|
|
222
|
+
}
|
|
223
|
+
interface ChatSummary {
|
|
224
|
+
chatId: string;
|
|
225
|
+
projectId: string;
|
|
226
|
+
builderRunId: string;
|
|
227
|
+
startedAt: number;
|
|
228
|
+
endedAt?: number;
|
|
229
|
+
status: Run['status'];
|
|
230
|
+
outcome?: Run['outcome'];
|
|
231
|
+
/** Counts of spans emitted during the chat. */
|
|
232
|
+
llmTurns?: number;
|
|
233
|
+
toolCalls?: number;
|
|
234
|
+
buildRunId?: string;
|
|
235
|
+
appRuntimeRunIds: string[];
|
|
236
|
+
}
|
|
237
|
+
interface ProjectTimelineEntry {
|
|
238
|
+
run: Run;
|
|
239
|
+
layerBucket: 'chat' | 'build' | 'runtime' | 'other';
|
|
240
|
+
}
|
|
241
|
+
declare class ProjectRegistry {
|
|
242
|
+
private store;
|
|
243
|
+
constructor(store: TraceStore);
|
|
244
|
+
listProjects(): Promise<ProjectSummary[]>;
|
|
245
|
+
projectTimeline(projectId: string): Promise<ProjectTimelineEntry[]>;
|
|
246
|
+
projectChats(projectId: string): Promise<ChatSummary[]>;
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
export { BuilderSession, type BuilderSessionInit, type ChatSummary, type CorrelationReport, type LayerCorrelation, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type RunAppScenarioOptions, type ShipOptions, type ThreeLayerProjectReport, correlateLayers, resumeBuilderSession, scoreAllProjects, scoreProject };
|
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
import {
|
|
2
|
+
SandboxHarness,
|
|
3
|
+
runTestGradedScenario
|
|
4
|
+
} from "../chunk-OWLAAMME.js";
|
|
5
|
+
import {
|
|
6
|
+
judgeSpans
|
|
7
|
+
} from "../chunk-47X6LRCE.js";
|
|
8
|
+
import "../chunk-5BKGXME7.js";
|
|
9
|
+
import {
|
|
10
|
+
TraceEmitter
|
|
11
|
+
} from "../chunk-TVVP3ZZQ.js";
|
|
12
|
+
import "../chunk-NG236HPC.js";
|
|
13
|
+
import "../chunk-PZ5AY32C.js";
|
|
14
|
+
|
|
15
|
+
// src/builder-eval/builder-session.ts
|
|
16
|
+
/**
 * BuilderSession — drives the linked runs of one builder chat.
 *
 * Every run the session starts is tagged with this session's `projectId`
 * and `chatId`:
 *   - `startChat()` opens the builder run (`layer: "builder"`);
 *   - `ship()` / `recordShipMarker()` open an `app-build` run whose parent
 *     is the builder run;
 *   - `runAppScenario()` / `startAppRuntime()` produce `app-runtime` runs
 *     parented to the most recent build run, or to the builder run when
 *     nothing has been built yet.
 */
var BuilderSession = class {
  store;
  builderEmitter;
  projectId;
  chatId;
  builderRunId;
  lastBuildRunId;
  defaultDriver;

  /**
   * @param store  Trace store handed to every TraceEmitter this session creates.
   * @param init   `{ projectId, chatId? }`; a chat id is generated when omitted.
   * @param driver Optional default sandbox driver for `ship` / `runAppScenario`.
   */
  constructor(store, init, driver) {
    this.store = store;
    this.projectId = init.projectId;
    this.chatId = init.chatId ?? cryptoId();
    this.defaultDriver = driver;
    this.builderEmitter = new TraceEmitter(store);
    // NOTE(review): `init.tags` is declared on BuilderSessionInit but is never
    // read here or in startChat() — confirm whether it should be forwarded to
    // the builder run.
  }

  /**
   * Start the builder (L0) run for this chat.
   *
   * @param scenarioId Defaults to `${projectId}/chat`.
   * @returns The builder run id.
   */
  async startChat(scenarioId = `${this.projectId}/chat`) {
    await this.builderEmitter.startRun({
      scenarioId,
      projectId: this.projectId,
      chatId: this.chatId,
      layer: "builder"
    });
    this.builderRunId = this.builderEmitter.runId;
    return this.builderRunId;
  }

  /**
   * The emitter for builder-level spans (edits, LLM calls, tool invocations).
   *
   * @throws {Error} When `startChat()` has not been called yet.
   */
  get emitter() {
    if (!this.builderRunId) throw new Error("BuilderSession.emitter: call startChat() first");
    return this.builderEmitter;
  }

  /**
   * Ship the project's generated app: run the sandbox harness as a child
   * Run (`layer='app-build'`). Returns the build result + runId.
   *
   * If constructing or running the harness throws, the already-started
   * build run is closed as a failure before the error is rethrown, so the
   * run is not left open. `lastBuildRunId` is only updated when the harness
   * returned a result.
   *
   * @param options `{ harness, driver?, scenarioId? }`; `scenarioId`
   *                defaults to `${projectId}/build`.
   * @returns `{ runId, result }`.
   * @throws {Error} When `startChat()` has not been called yet, or whatever
   *                 the harness throws.
   * @throws {AggregateError} When the harness throws and closing the build
   *                 run fails as well (both errors are preserved).
   */
  async ship(options) {
    if (!this.builderRunId) throw new Error("BuilderSession.ship: call startChat() first");
    const buildEmitter = new TraceEmitter(this.store);
    await buildEmitter.startRun({
      scenarioId: options.scenarioId ?? `${this.projectId}/build`,
      projectId: this.projectId,
      chatId: this.chatId,
      parentRunId: this.builderRunId,
      layer: "app-build"
    });
    let result;
    try {
      const harness = new SandboxHarness(options.driver ?? this.defaultDriver);
      result = await harness.run(options.harness, buildEmitter);
    } catch (err) {
      try {
        await buildEmitter.endRun({
          pass: false,
          score: 0,
          failureClass: "sandbox_failure",
          notes: err instanceof Error ? err.message : String(err)
        });
      } catch (closeErr) {
        // Keep both failures visible instead of masking the harness error.
        throw new AggregateError(
          [err, closeErr],
          "BuilderSession.ship: harness failed and the build run could not be closed"
        );
      }
      throw err;
    }
    await buildEmitter.endRun({
      pass: result.passed,
      score: result.score,
      failureClass: result.passed ? "success" : "sandbox_failure"
    });
    this.lastBuildRunId = buildEmitter.runId;
    return { runId: buildEmitter.runId, result };
  }

  /**
   * Run a domain scenario as an `app-runtime` Run. The run is parented to
   * the latest build run when one exists, otherwise directly to the builder
   * run (useful for prototypes).
   *
   * The scenario is executed first; its run record is then updated with the
   * parent, project, chat and layer.
   *
   * @param options `{ scenario, driver? }`; `driver` falls back to the
   *                session's default driver.
   * @returns The result of `runTestGradedScenario`.
   * @throws {Error} When neither a build run nor a builder run exists.
   */
  async runAppScenario(options) {
    const parentRunId = this.lastBuildRunId ?? this.builderRunId;
    if (!parentRunId)
      throw new Error("BuilderSession.runAppScenario: call startChat() + ship() first");
    const { scenario, driver } = options;
    const result = await runTestGradedScenario(scenario, this.store, {
      driver: driver ?? this.defaultDriver,
      provenance: { codeSha: void 0, promptSha: void 0, modelFingerprint: void 0 }
    });
    await this.store.updateRun(result.runId, {
      parentRunId,
      projectId: this.projectId,
      chatId: this.chatId,
      layer: "app-runtime"
    });
    return result;
  }

  /**
   * Record an end-of-chat meta score (judge verdict on whether the builder
   * served the user's intent). Recorded through the builder emitter with
   * judge id `builder-meta` and dimension `user_intent_satisfaction`.
   *
   * @param score     Numeric score.
   * @param rationale Optional free-text rationale.
   * @throws {Error} When `startChat()` has not been called yet.
   */
  async recordMetaScore(score, rationale) {
    if (!this.builderRunId)
      throw new Error("BuilderSession.recordMetaScore: call startChat() first");
    await this.builderEmitter.recordJudge({
      judgeId: "builder-meta",
      // attach to the builder run itself
      targetSpanId: this.builderRunId,
      dimension: "user_intent_satisfaction",
      score,
      rationale,
      name: "builder-meta"
    });
  }

  /**
   * Close the builder Run with a final outcome.
   *
   * @param outcome `{ pass, score?, notes? }`.
   * @throws {Error} When `startChat()` has not been called yet (there is no
   *                 builder run to close) — same guard as the other methods.
   */
  async endChat(outcome) {
    if (!this.builderRunId) throw new Error("BuilderSession.endChat: call startChat() first");
    await this.builderEmitter.endRun({
      pass: outcome.pass,
      score: outcome.score,
      notes: outcome.notes
    });
  }

  /**
   * Inline app-runtime run — for cases where the "scenario" is a live agent
   * interaction rather than a test suite. Returns an emitter bound to a
   * fresh Run in the `app-runtime` layer; the caller emits spans and calls
   * `.endRun()` with the final verdict.
   *
   * @param scenarioId Scenario id for the new run.
   * @returns The emitter for the new run.
   * @throws {Error} When neither a build run nor a builder run exists.
   */
  async startAppRuntime(scenarioId) {
    const parentRunId = this.lastBuildRunId ?? this.builderRunId;
    if (!parentRunId)
      throw new Error(
        "BuilderSession.startAppRuntime: call startChat() + (optionally) ship() first"
      );
    const emitter = new TraceEmitter(this.store);
    await emitter.startRun({
      scenarioId,
      projectId: this.projectId,
      chatId: this.chatId,
      parentRunId,
      layer: "app-runtime"
    });
    return emitter;
  }

  /**
   * Lightweight "ship marker" — record an app-build Run with a caller-
   * provided verdict, without running a sandbox harness. The run is opened
   * and closed immediately and becomes the session's latest build run.
   *
   * @param args `{ pass, score, scenarioId?, notes? }`; `scenarioId`
   *             defaults to `${projectId}/ship`.
   * @returns The id of the recorded build run.
   * @throws {Error} When `startChat()` has not been called yet.
   */
  async recordShipMarker(args) {
    if (!this.builderRunId)
      throw new Error("BuilderSession.recordShipMarker: call startChat() first");
    const emitter = new TraceEmitter(this.store);
    await emitter.startRun({
      scenarioId: args.scenarioId ?? `${this.projectId}/ship`,
      projectId: this.projectId,
      chatId: this.chatId,
      parentRunId: this.builderRunId,
      layer: "app-build"
    });
    await emitter.endRun({
      pass: args.pass,
      score: args.score,
      failureClass: args.pass ? "success" : "sandbox_failure",
      notes: args.notes
    });
    this.lastBuildRunId = emitter.runId;
    return emitter.runId;
  }

  /** Id of the most recent build run recorded by this session, if any. */
  get lastBuildRunIdValue() {
    return this.lastBuildRunId;
  }

  /** Id of the builder run, or `undefined` before `startChat()`. */
  get builderRunIdValue() {
    return this.builderRunId;
  }
};
|
|
172
|
+
/**
 * Load a project's runs from the store and group them by layer so a
 * resumed session can find where it left off.
 *
 * @param store     Object exposing `listRuns(filter)`.
 * @param projectId Project whose runs are loaded.
 * @returns `{ projectId, chatRuns, lastBuilderRun, lastBuildRun,
 *          lastAppRuntimeRuns }`; every list is ordered newest-first by
 *          `startedAt`, and the `last*Run` entries are `undefined` when the
 *          layer has no runs.
 */
async function resumeBuilderSession(store, projectId) {
  const projectRuns = await store.listRuns({ projectId });
  const newestFirst = (left, right) => right.startedAt - left.startedAt;
  const runsInLayer = (layer) =>
    projectRuns.filter((run) => run.layer === layer).sort(newestFirst);

  const chatRuns = runsInLayer("builder");
  const [lastBuildRun] = runsInLayer("app-build");
  const lastAppRuntimeRuns = runsInLayer("app-runtime");

  return {
    projectId,
    chatRuns,
    lastBuilderRun: chatRuns[0],
    lastBuildRun,
    lastAppRuntimeRuns
  };
}
|
|
185
|
+
/**
 * Generate an id string: `crypto.randomUUID()` when the global provides it,
 * otherwise a base-36 timestamp joined with a short `Math.random()` suffix.
 *
 * @returns {string} The generated id.
 */
function cryptoId() {
  const webCrypto = globalThis.crypto;
  if (typeof webCrypto?.randomUUID === "function") {
    return webCrypto.randomUUID();
  }
  const timePart = Date.now().toString(36);
  const randomPart = Math.random().toString(36).slice(2, 10);
  return `${timePart}-${randomPart}`;
}
|
|
189
|
+
|
|
190
|
+
// src/builder-eval/correlation.ts
|
|
191
|
+
/**
 * Correlate each pair of layer scores across a set of project reports.
 *
 * @param reports Reports carrying `metaScore`, `buildScore`, `runtimeScore`
 *                and `complete`.
 * @returns `{ metaVsBuild, metaVsRuntime, buildVsRuntime, completeProjects }`.
 *          Each pair entry is whatever `pairwise` yields for that pair
 *          (`undefined` when too few matched points exist);
 *          `completeProjects` counts reports whose `complete` is truthy.
 */
function correlateLayers(reports) {
  const metaOf = (report) => report.metaScore;
  const buildOf = (report) => report.buildScore;
  const runtimeOf = (report) => report.runtimeScore;

  let completeProjects = 0;
  for (const report of reports) {
    if (report.complete) completeProjects += 1;
  }

  const metaVsBuild = pairwise(reports, metaOf, buildOf);
  const metaVsRuntime = pairwise(reports, metaOf, runtimeOf);
  const buildVsRuntime = pairwise(reports, buildOf, runtimeOf);

  return { metaVsBuild, metaVsRuntime, buildVsRuntime, completeProjects };
}
|
|
212
|
+
/**
 * Correlate two scores picked from each report.
 *
 * Reports where either picked value is not a finite number (including
 * `null`) are left out of the sample.
 *
 * @param reports Reports to sample.
 * @param a       Selector for the first score.
 * @param b       Selector for the second score.
 * @returns `{ n, pearson, spearman }`, or `undefined` when fewer than two
 *          reports contribute a matched pair.
 */
function pairwise(reports, a, b) {
  const matched = reports
    .map((report) => [a(report), b(report)])
    // Number.isFinite is false for null/undefined/NaN, so one check suffices.
    .filter(([first, second]) => Number.isFinite(first) && Number.isFinite(second));

  if (matched.length < 2) return void 0;

  const firsts = matched.map((pair) => pair[0]);
  const seconds = matched.map((pair) => pair[1]);
  return {
    n: firsts.length,
    pearson: pearsonR(firsts, seconds),
    spearman: spearmanR(firsts, seconds)
  };
}
|
|
230
|
+
/**
 * Pearson correlation coefficient of two equally long numeric arrays.
 *
 * When a series has zero variance the coefficient is undefined; this
 * function then returns 1 if both series are constant and 0 if only one is.
 *
 * @param {number[]} a First series.
 * @param {number[]} b Second series (indexed in step with `a`).
 * @returns {number} The coefficient, or the 1 / 0 convention described above.
 */
function pearsonR(a, b) {
  const meanOf = (values) => values.reduce((total, value) => total + value, 0) / values.length;
  const meanA = meanOf(a);
  const meanB = meanOf(b);

  let crossSum = 0;
  let squaresA = 0;
  let squaresB = 0;
  for (let idx = 0; idx < a.length; idx += 1) {
    const offsetA = a[idx] - meanA;
    const offsetB = b[idx] - meanB;
    crossSum += offsetA * offsetB;
    squaresA += offsetA * offsetA;
    squaresB += offsetB * offsetB;
  }

  const flatA = squaresA === 0;
  const flatB = squaresB === 0;
  if (flatA && flatB) return 1;
  if (flatA || flatB) return 0;
  return crossSum / Math.sqrt(squaresA * squaresB);
}
|
|
244
|
+
/**
 * Spearman rank correlation: the Pearson coefficient of the two series'
 * ranks.
 *
 * @param {number[]} a First series.
 * @param {number[]} b Second series.
 * @returns {number} Rank correlation of `a` and `b`.
 */
function spearmanR(a, b) {
  const rankedA = ranks(a);
  const rankedB = ranks(b);
  return pearsonR(rankedA, rankedB);
}
|
|
247
|
+
/**
 * Convert values to 1-based ascending ranks. Equal values share the average
 * of the ranks they span.
 *
 * @param {number[]} xs Values to rank.
 * @returns {number[]} Ranks, in the same positions as the input values.
 */
function ranks(xs) {
  const ordered = xs.map((value, position) => ({ value, position }));
  ordered.sort((left, right) => left.value - right.value);

  const result = new Array(xs.length);
  let start = 0;
  while (start < ordered.length) {
    // Extend `end` across the run of values equal to ordered[start].
    let end = start;
    while (end + 1 < ordered.length && ordered[end + 1].value === ordered[start].value) {
      end += 1;
    }
    // Ranks are 1-based, so positions start..end hold ranks start+1..end+1.
    const sharedRank = (start + end + 2) / 2;
    for (let k = start; k <= end; k += 1) {
      result[ordered[k].position] = sharedRank;
    }
    start = end + 1;
  }
  return result;
}
|
|
259
|
+
|
|
260
|
+
// src/builder-eval/project-registry.ts
|
|
261
|
+
/**
 * ProjectRegistry — project-level views over the runs in a trace store.
 *
 * Only calls `store.listRuns(filter?)` and `store.spans({ runId })`;
 * arrays returned by the store are copied before sorting.
 */
var ProjectRegistry = class {
  store;

  /** @param store Trace store to query. */
  constructor(store) {
    this.store = store;
  }

  /**
   * Summarize every project that has at least one run.
   *
   * Runs without a `projectId` are ignored. `latestChatId` is the chat id of
   * the most recently started builder run; `lastActivityAt` and
   * `latestOutcome` come from the most recently started run of any layer.
   *
   * @returns Project summaries, most recent activity first.
   */
  async listProjects() {
    const runs = await this.store.listRuns();
    const byProject = /* @__PURE__ */ new Map();
    for (const r of runs) {
      if (!r.projectId) continue;
      const arr = byProject.get(r.projectId) ?? [];
      arr.push(r);
      byProject.set(r.projectId, arr);
    }
    const summaries = [];
    for (const [projectId, projectRuns] of byProject) {
      const sorted = projectRuns.slice().sort((a, b) => b.startedAt - a.startedAt);
      const latest = sorted[0];
      if (!latest) continue;
      // Filter the newest-first list so chats[0] really is the latest chat;
      // filtering the unsorted runs gave the first chat in store order.
      const chats = sorted.filter((r) => r.layer === "builder");
      const builds = sorted.filter((r) => r.layer === "app-build");
      const runtimes = sorted.filter((r) => r.layer === "app-runtime");
      summaries.push({
        projectId,
        chatCount: chats.length,
        buildCount: builds.length,
        appRuntimeCount: runtimes.length,
        lastActivityAt: latest.startedAt,
        latestChatId: chats[0]?.chatId,
        latestOutcome: latest.outcome
          ? { pass: latest.outcome.pass ?? false, score: latest.outcome.score }
          : void 0
      });
    }
    return summaries.sort((a, b) => b.lastActivityAt - a.lastActivityAt);
  }

  /**
   * All runs of a project in chronological order, each labelled with a
   * coarse bucket derived from its `layer`.
   *
   * @param projectId Project to load.
   * @returns `{ run, layerBucket }` entries, oldest first.
   */
  async projectTimeline(projectId) {
    const runs = await this.store.listRuns({ projectId });
    const ordered = runs.slice().sort((a, b) => a.startedAt - b.startedAt);
    return ordered.map((run) => ({
      run,
      layerBucket:
        run.layer === "builder"
          ? "chat"
          : run.layer === "app-build"
            ? "build"
            : run.layer === "app-runtime"
              ? "runtime"
              : "other"
    }));
  }

  /**
   * Per-chat summaries for a project, newest chat first.
   *
   * For each builder run this loads its spans and child runs. A chat may
   * have several `app-build` children (repeated ships); `buildRunId` is the
   * most recently started one, and `appRuntimeRunIds` collects the
   * `app-runtime` runs under every build plus those attached directly to
   * the builder run.
   *
   * @param projectId Project to load.
   * @returns Chat summaries.
   */
  async projectChats(projectId) {
    const newestFirst = (a, b) => b.startedAt - a.startedAt;
    const builderRuns = (await this.store.listRuns({ projectId, layer: "builder" }))
      .slice()
      .sort(newestFirst);
    const childrenFor = async (runId) => this.store.listRuns({ parentRunId: runId });
    const out = [];
    for (const run of builderRuns) {
      const spans = await this.store.spans({ runId: run.runId });
      const children = await childrenFor(run.runId);
      const builds = children.filter((c) => c.layer === "app-build").sort(newestFirst);
      const runtime = [];
      for (const build of builds) {
        const grands = await childrenFor(build.runId);
        for (const g of grands) if (g.layer === "app-runtime") runtime.push(g.runId);
      }
      for (const c of children) if (c.layer === "app-runtime") runtime.push(c.runId);
      out.push({
        chatId: run.chatId ?? run.runId,
        projectId,
        builderRunId: run.runId,
        startedAt: run.startedAt,
        endedAt: run.endedAt,
        status: run.status,
        outcome: run.outcome,
        llmTurns: spans.filter((s) => s.kind === "llm").length,
        toolCalls: spans.filter((s) => s.kind === "tool").length,
        buildRunId: builds[0]?.runId,
        appRuntimeRunIds: runtime
      });
    }
    return out;
  }
};
|
|
336
|
+
|
|
337
|
+
// src/builder-eval/three-layer-eval.ts
|
|
338
|
+
/**
 * Build the three-layer score report for one project.
 *
 * - meta score: taken from the latest builder run via `extractMetaScore`;
 * - build score: `outcome.score` of the latest `app-build` run;
 * - runtime score: mean `outcome.score` over the project's `app-runtime`
 *   runs that carry a numeric score; the pass rate is computed over all
 *   `app-runtime` runs.
 *
 * A project without `app-runtime` runs is reported as `'scaffold-only'` and
 * is complete once meta and build scores exist; a `'full'` project also
 * needs a runtime score.
 *
 * @param store     Trace store exposing `listRuns`.
 * @param projectId Project to score.
 * @returns The project report.
 */
async function scoreProject(store, projectId) {
  const projectRuns = await store.listRuns({ projectId });
  const builderRun = latestByLayer(projectRuns, "builder");
  const buildRun = latestByLayer(projectRuns, "app-build");
  const runtimeRuns = projectRuns.filter((run) => run.layer === "app-runtime");

  let metaScore = null;
  if (builderRun) {
    metaScore = await extractMetaScore(store, builderRun.runId);
  }
  const buildScore = buildRun?.outcome?.score ?? null;

  let scoreTotal = 0;
  let scoredRuns = 0;
  let passedRuns = 0;
  for (const run of runtimeRuns) {
    const score = run.outcome?.score;
    if (typeof score === "number") {
      scoreTotal += score;
      scoredRuns += 1;
    }
    if (run.outcome?.pass === true) passedRuns += 1;
  }

  const hasRuntime = runtimeRuns.length > 0;
  const runtimeScore = scoredRuns > 0 ? scoreTotal / scoredRuns : null;
  const runtimePassRate = hasRuntime ? passedRuns / runtimeRuns.length : null;
  const kind = hasRuntime ? "full" : "scaffold-only";

  const scaffoldScored = metaScore !== null && buildScore !== null;
  const complete = scaffoldScored && (kind === "scaffold-only" || runtimeScore !== null);

  return {
    projectId,
    kind,
    builderRunId: builderRun?.runId,
    metaScore,
    buildRunId: buildRun?.runId,
    buildScore,
    appRuntimeRunIds: runtimeRuns.map((run) => run.runId),
    runtimeScore,
    runtimePassRate,
    complete
  };
}
|
|
364
|
+
/**
 * Score every project that appears in the store.
 *
 * Project ids are collected from all runs (runs without one are skipped),
 * de-duplicated in first-seen order, and scored concurrently.
 *
 * @param store Trace store exposing `listRuns`.
 * @returns One report per distinct project id.
 */
async function scoreAllProjects(store) {
  const runs = await store.listRuns();
  const distinctIds = new Set();
  for (const run of runs) {
    if (run.projectId) distinctIds.add(run.projectId);
  }
  return Promise.all(Array.from(distinctIds, (id) => scoreProject(store, id)));
}
|
|
369
|
+
/**
 * Pick the most recently started run of a layer.
 *
 * @param runs  Runs to search; the array itself is not reordered.
 * @param layer Layer name to match against `run.layer`.
 * @returns The matching run with the greatest `startedAt`, or `undefined`
 *          when no run is in that layer.
 */
function latestByLayer(runs, layer) {
  const inLayer = runs.filter((run) => run.layer === layer);
  const [newest] = inLayer.sort((left, right) => right.startedAt - left.startedAt);
  return newest;
}
|
|
373
|
+
/**
 * Read the builder's meta score from the judge spans of a builder run.
 *
 * Uses the first judge span with judge id `builder-meta` and dimension
 * `user_intent_satisfaction`. Scores in 0..1 are returned unchanged; scores
 * above 1 and up to 10 are divided by 10.
 *
 * @param store        Trace store passed through to `judgeSpans`.
 * @param builderRunId Builder run whose judge spans are inspected.
 * @returns The score on a 0..1 scale, or `null` when there is no such judge
 *          span or its score is outside 0..10.
 */
async function extractMetaScore(store, builderRunId) {
  const verdicts = await judgeSpans(store, builderRunId);
  const verdict = verdicts.find(
    (span) => span.judgeId === "builder-meta" && span.dimension === "user_intent_satisfaction"
  );
  if (!verdict) return null;

  const { score } = verdict;
  // Negative and NaN scores fail this comparison and are rejected.
  if (!(score >= 0)) return null;
  if (score <= 1) return score;
  return score <= 10 ? score / 10 : null;
}
|
|
383
|
+
export {
|
|
384
|
+
BuilderSession,
|
|
385
|
+
ProjectRegistry,
|
|
386
|
+
correlateLayers,
|
|
387
|
+
resumeBuilderSession,
|
|
388
|
+
scoreAllProjects,
|
|
389
|
+
scoreProject
|
|
390
|
+
};
|
|
391
|
+
//# sourceMappingURL=index.js.map
|