@tangle-network/agent-eval 0.23.1 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/CHANGELOG.md +80 -0
  2. package/README.md +141 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  20. package/dist/chunk-6QDKWHLS.js.map +1 -0
  21. package/dist/chunk-I4MBDTY5.js +272 -0
  22. package/dist/chunk-I4MBDTY5.js.map +1 -0
  23. package/dist/chunk-K2TPS5LB.js +569 -0
  24. package/dist/chunk-K2TPS5LB.js.map +1 -0
  25. package/dist/chunk-KKHDIONI.js +414 -0
  26. package/dist/chunk-KKHDIONI.js.map +1 -0
  27. package/dist/chunk-KMPRBJK4.js +74 -0
  28. package/dist/chunk-KMPRBJK4.js.map +1 -0
  29. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  30. package/dist/chunk-KTGTIOFD.js.map +1 -0
  31. package/dist/chunk-LSH4MMOZ.js +838 -0
  32. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  33. package/dist/chunk-NG236HPC.js +57 -0
  34. package/dist/chunk-NG236HPC.js.map +1 -0
  35. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  36. package/dist/chunk-NLMNWKVM.js.map +1 -0
  37. package/dist/chunk-NU65VQ7M.js +99 -0
  38. package/dist/chunk-NU65VQ7M.js.map +1 -0
  39. package/dist/chunk-OHEPNJQN.js +554 -0
  40. package/dist/chunk-OHEPNJQN.js.map +1 -0
  41. package/dist/chunk-OWLAAMME.js +250 -0
  42. package/dist/chunk-OWLAAMME.js.map +1 -0
  43. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  44. package/dist/chunk-PC4UYEBM.js.map +1 -0
  45. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  46. package/dist/chunk-RAF443UI.js.map +1 -0
  47. package/dist/chunk-RZTMDUO7.js +49 -0
  48. package/dist/chunk-RZTMDUO7.js.map +1 -0
  49. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  50. package/dist/chunk-SESZDQPX.js.map +1 -0
  51. package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
  52. package/dist/chunk-SY6WAAAD.js.map +1 -0
  53. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  54. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  55. package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
  56. package/dist/chunk-VRJVTXRV.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +1866 -3151
  80. package/dist/index.js +5457 -7809
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +1 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +409 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-TDPn1cxq.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +22 -22
  125. package/dist/wire/index.js +4 -3
  126. package/package.json +44 -18
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-5IIQKMD5.js.map +0 -1
  129. package/dist/chunk-6KQG5HAH.js.map +0 -1
  130. package/dist/chunk-6M774GY6.js.map +0 -1
  131. package/dist/chunk-7EAUOUQS.js.map +0 -1
  132. package/dist/chunk-AXHNWLIX.js.map +0 -1
  133. package/dist/chunk-EXGR4XEM.js.map +0 -1
  134. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  135. package/dist/chunk-KAO3Q65R.js.map +0 -1
  136. package/dist/chunk-LZKIOBG2.js +0 -2026
  137. package/dist/chunk-LZKIOBG2.js.map +0 -1
  138. package/dist/chunk-QBW3YBTR.js.map +0 -1
  139. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  140. package/dist/chunk-SQQLHODJ.js.map +0 -1
  141. package/dist/chunk-V5QSWN7L.js +0 -1310
  142. package/dist/chunk-V5QSWN7L.js.map +0 -1
  143. package/dist/chunk-VQQSPGSM.js.map +0 -1
  144. package/dist/chunk-XPHOZPOM.js +0 -1947
  145. package/dist/chunk-XPHOZPOM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
@@ -0,0 +1,249 @@
1
+ import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult, T as TestGradedScenario, b as TestGradedRunResult } from '../test-graded-scenario-B2kWEdh9.js';
2
+ import { T as TraceEmitter } from '../emitter-DP_cSSiw.js';
3
+ import { T as TraceStore, R as Run } from '../store-Db2Bv8Cf.js';
4
+
5
+ /**
6
+ * BuilderSession — ties a builder-of-builders workflow together.
7
+ *
8
+ * Models agent-builder's shape: Project → Chat → Edit → Ship → App →
9
+ * AppAgent. Each layer is a Run (linked via parentRunId). The
10
+ * framework-enforced invariants:
11
+ *
12
+ * - One Project → many Chats; chatId scopes runs within a project.
13
+ * - One Chat = one builder Run with `layer='builder'`.
14
+ * - One Ship = one child Run with `layer='app-build'` + SandboxHarness.
15
+ * - One AppScenario = one grandchild Run with `layer='app-runtime'`.
16
+ *
17
+ * Consumers obtain a BuilderSession, call `startChat`, drive the
18
+ * builder agent (emitting spans), and call `ship` / `runAppScenario`
19
+ * as the workflow progresses. The session reconstructs itself from
20
+ * trace data via `resumeBuilderSession(store, projectId)`.
21
+ */
22
+
23
+ interface BuilderSessionInit {
24
+ projectId: string;
25
+ chatId?: string;
26
+ /** Free-form: user's task description, project name, etc. Stored on the builder Run. */
27
+ tags?: Record<string, string>;
28
+ }
29
+ interface ShipOptions {
30
+ harness: HarnessConfig;
31
+ driver?: SandboxDriver;
32
+ /** scenarioId of this app-build run. Defaults to `${projectId}/build`. */
33
+ scenarioId?: string;
34
+ }
35
+ interface RunAppScenarioOptions {
36
+ scenario: TestGradedScenario;
37
+ /** Harness driver override; defaults to the one the session was created with. */
38
+ driver?: SandboxDriver;
39
+ }
40
+ declare class BuilderSession {
41
+ private store;
42
+ private builderEmitter;
43
+ readonly projectId: string;
44
+ readonly chatId: string;
45
+ private builderRunId?;
46
+ private lastBuildRunId?;
47
+ private defaultDriver?;
48
+ constructor(store: TraceStore, init: BuilderSessionInit, driver?: SandboxDriver);
49
+ /** Start the builder (L0) run for this chat. Returns the runId. */
50
+ startChat(scenarioId?: string): Promise<string>;
51
+ /** The emitter for builder-level spans (edits, LLM calls, tool invocations). */
52
+ get emitter(): TraceEmitter;
53
+ /**
54
+ * Ship the project's generated app: run the sandbox harness as a child
55
+ * Run (`layer='app-build'`). Returns the build result + runId.
56
+ */
57
+ ship(options: ShipOptions): Promise<{
58
+ runId: string;
59
+ result: SandboxHarnessResult;
60
+ }>;
61
+ /**
62
+ * Run a domain scenario against the just-built app as a grandchild Run
63
+ * (`layer='app-runtime'`). The `ship` call must precede this so the
64
+ * parent is set correctly; if no build exists yet the session attaches
65
+ * directly to the builder run (useful for prototypes).
66
+ */
67
+ runAppScenario(options: RunAppScenarioOptions): Promise<TestGradedRunResult>;
68
+ /** Record an end-of-chat meta score (judge verdict on whether the builder
69
+ * served the user's intent). Accepts a numeric score + optional rationale. */
70
+ recordMetaScore(score: number, rationale?: string): Promise<void>;
71
+ /** Close the builder Run with a final outcome. */
72
+ endChat(outcome: {
73
+ pass: boolean;
74
+ score?: number;
75
+ notes?: string;
76
+ }): Promise<void>;
77
+ /**
78
+ * Inline app-runtime run — for cases where the "scenario" isn't a
79
+ * SWE-bench-style test suite but a live agent interaction (LLM chat,
80
+ * domain flow). Returns an emitter bound to a fresh Run in the
81
+ * `app-runtime` layer; caller emits spans inside and calls
82
+ * `.endRun()` with the final verdict.
83
+ */
84
+ startAppRuntime(scenarioId: string): Promise<TraceEmitter>;
85
+ /**
86
+ * Lightweight "ship marker" — record an app-build Run with a caller-
87
+ * provided verdict. Use when there isn't a sandbox harness to run but
88
+ * you still want to mark the build state at publish time.
89
+ */
90
+ recordShipMarker(args: {
91
+ pass: boolean;
92
+ score: number;
93
+ scenarioId?: string;
94
+ notes?: string;
95
+ }): Promise<string>;
96
+ get lastBuildRunIdValue(): string | undefined;
97
+ get builderRunIdValue(): string | undefined;
98
+ }
99
+ /**
100
+ * Reconstruct the most recent BuilderSession state for a given project —
101
+ * returns { projectId, chatRuns, lastBuilderRun, lastBuildRun, lastAppRuntimeRuns }. For chat-first UIs
102
+ * this is how a resumed session finds its place in the edit history.
103
+ */
104
+ declare function resumeBuilderSession(store: TraceStore, projectId: string): Promise<{
105
+ projectId: string;
106
+ chatRuns: Run[];
107
+ lastBuilderRun?: Run;
108
+ lastBuildRun?: Run;
109
+ lastAppRuntimeRuns: Run[];
110
+ }>;
111
+
112
+ /**
113
+ * Three-layer evaluation — the canonical scoring breakdown for
114
+ * builder-of-builders workflows.
115
+ *
116
+ * meta_score: did the builder understand + satisfy user intent?
117
+ * (judge verdict attached to the builder run)
118
+ * build_score: did the generated scaffold build + pass its own tests?
119
+ * (outcome.score on the app-build child run)
120
+ * runtime_score: did the generated agent pass its domain scenarios?
121
+ * (mean outcome.score over app-runtime grandchild runs)
122
+ *
123
+ * Returns a structured report per project. The cross-layer correlation
124
+ * is the highest-leverage signal the framework computes — if
125
+ * meta_score doesn't predict runtime_score, the builder's self-scoring
126
+ * is broken.
127
+ *
128
+ * Scaffold-only mode: when a project has no `app-runtime` runs (e.g. a
129
+ * scaffold-builder eval that grades compose + build without driving a
130
+ * runtime scenario), `kind` is `'scaffold-only'` and `complete` measures
131
+ * meta + build only. Consumers can tell the two apart without having to
132
+ * interpret null-runtime as either "not yet computed" or "N/A for this
133
+ * project shape".
134
+ */
135
+
136
+ type ProjectKind = 'full' | 'scaffold-only';
137
+ interface ThreeLayerProjectReport {
138
+ projectId: string;
139
+ /**
140
+ * `'full'` when the project has at least one `app-runtime` run;
141
+ * `'scaffold-only'` when it only has meta + build layers. Lets
142
+ * downstream consumers treat a null runtime score as expected
143
+ * (scaffold-only) vs. missing (full, pipeline broke).
144
+ */
145
+ kind: ProjectKind;
146
+ builderRunId?: string;
147
+ /** Judge-verdict score on the builder run (0..1 after normalization). */
148
+ metaScore: number | null;
149
+ buildRunId?: string;
150
+ /** 0..1 from the sandbox harness (testsPassed / testsTotal). */
151
+ buildScore: number | null;
152
+ appRuntimeRunIds: string[];
153
+ /** Mean of outcome.score over app-runtime runs, 0..1. Always null in scaffold-only mode. */
154
+ runtimeScore: number | null;
155
+ runtimePassRate: number | null;
156
+ /**
157
+ * Layer-aware completeness:
158
+ * - `kind='full'`: all three layers scored
159
+ * - `kind='scaffold-only'`: meta + build scored (runtime not applicable)
160
+ */
161
+ complete: boolean;
162
+ }
163
+ declare function scoreProject(store: TraceStore, projectId: string): Promise<ThreeLayerProjectReport>;
164
+ /** Aggregate scoring across every project in a corpus. */
165
+ declare function scoreAllProjects(store: TraceStore): Promise<ThreeLayerProjectReport[]>;
166
+
167
+ /**
168
+ * Meta-eval correlation — the highest-leverage signal in the framework.
169
+ *
170
+ * Given a corpus of three-layer project reports, compute how well each
171
+ * pair of layers correlates. The question we care about most:
172
+ *
173
+ * Does `metaScore` (what the builder thinks it did) predict
174
+ * `runtimeScore` (what the user actually gets)?
175
+ *
176
+ * If r < ~0.4, the builder's self-scoring is broken — it's optimizing
177
+ * for something other than real-world success. If r > 0.7, meta_score
178
+ * is a usable proxy and can drive CI gates cheaply.
179
+ *
180
+ * Non-parametric rank correlation (Spearman) is also reported because
181
+ * meta scores are often ordinal-ish.
182
+ */
183
+
184
+ interface LayerCorrelation {
185
+ n: number;
186
+ pearson: number;
187
+ spearman: number;
188
+ }
189
+ interface CorrelationReport {
190
+ /** Pairs present in the corpus (layers with ≥ 2 matched data points). */
191
+ metaVsBuild?: LayerCorrelation;
192
+ metaVsRuntime?: LayerCorrelation;
193
+ buildVsRuntime?: LayerCorrelation;
194
+ /** Number of reports flagged `complete` (all 3 scores for 'full' projects; meta + build for 'scaffold-only'). */
195
+ completeProjects: number;
196
+ }
197
+ declare function correlateLayers(reports: ThreeLayerProjectReport[]): CorrelationReport;
198
+
199
+ /**
200
+ * ProjectRegistry — project-level aggregation over the trace corpus.
201
+ *
202
+ * Thin reader over TraceStore that answers the questions a chat-first,
203
+ * resumable UI needs:
204
+ * - listProjects() → project IDs with latest activity
205
+ * - projectTimeline(id) → chats + builds + runtime runs, chronological
206
+ * - projectChats(id) → chat-level summaries (turn count, outcome)
207
+ *
208
+ * All queries are pure reads; no state duplication.
209
+ */
210
+
211
+ interface ProjectSummary {
212
+ projectId: string;
213
+ chatCount: number;
214
+ buildCount: number;
215
+ appRuntimeCount: number;
216
+ lastActivityAt: number;
217
+ latestChatId?: string;
218
+ latestOutcome?: {
219
+ pass: boolean;
220
+ score?: number;
221
+ };
222
+ }
223
+ interface ChatSummary {
224
+ chatId: string;
225
+ projectId: string;
226
+ builderRunId: string;
227
+ startedAt: number;
228
+ endedAt?: number;
229
+ status: Run['status'];
230
+ outcome?: Run['outcome'];
231
+ /** Counts of spans emitted during the chat. */
232
+ llmTurns?: number;
233
+ toolCalls?: number;
234
+ buildRunId?: string;
235
+ appRuntimeRunIds: string[];
236
+ }
237
+ interface ProjectTimelineEntry {
238
+ run: Run;
239
+ layerBucket: 'chat' | 'build' | 'runtime' | 'other';
240
+ }
241
+ declare class ProjectRegistry {
242
+ private store;
243
+ constructor(store: TraceStore);
244
+ listProjects(): Promise<ProjectSummary[]>;
245
+ projectTimeline(projectId: string): Promise<ProjectTimelineEntry[]>;
246
+ projectChats(projectId: string): Promise<ChatSummary[]>;
247
+ }
248
+
249
+ export { BuilderSession, type BuilderSessionInit, type ChatSummary, type CorrelationReport, type LayerCorrelation, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type RunAppScenarioOptions, type ShipOptions, type ThreeLayerProjectReport, correlateLayers, resumeBuilderSession, scoreAllProjects, scoreProject };
@@ -0,0 +1,391 @@
1
+ import {
2
+ SandboxHarness,
3
+ runTestGradedScenario
4
+ } from "../chunk-OWLAAMME.js";
5
+ import {
6
+ judgeSpans
7
+ } from "../chunk-47X6LRCE.js";
8
+ import "../chunk-5BKGXME7.js";
9
+ import {
10
+ TraceEmitter
11
+ } from "../chunk-TVVP3ZZQ.js";
12
+ import "../chunk-NG236HPC.js";
13
+ import "../chunk-PZ5AY32C.js";
14
+
15
+ // src/builder-eval/builder-session.ts
16
/**
 * BuilderSession — ties the runs of one builder chat together.
 *
 * Every run the session opens is tagged with this session's `projectId`
 * and `chatId`:
 *   - the builder run is started with `layer: "builder"`;
 *   - build runs (`layer: "app-build"`) are parented to the builder run;
 *   - runtime runs (`layer: "app-runtime"`) are parented to the most recent
 *     build run, or to the builder run when no build has been recorded yet.
 */
var BuilderSession = class {
  store;
  builderEmitter;
  projectId;
  chatId;
  builderRunId;
  lastBuildRunId;
  defaultDriver;

  /**
   * @param store  Trace store handed to every emitter this session creates.
   * @param init   `{ projectId, chatId? }`; a chat id is generated when omitted.
   * @param driver Optional default sandbox driver for `ship` / `runAppScenario`.
   */
  constructor(store, init, driver) {
    this.store = store;
    this.projectId = init.projectId;
    this.chatId = init.chatId ?? cryptoId();
    this.defaultDriver = driver;
    this.builderEmitter = new TraceEmitter(store);
  }

  /**
   * Start the builder (L0) run for this chat.
   *
   * @param scenarioId Defaults to `${projectId}/chat`.
   * @returns The builder run id.
   */
  async startChat(scenarioId = `${this.projectId}/chat`) {
    await this.builderEmitter.startRun({
      scenarioId,
      projectId: this.projectId,
      chatId: this.chatId,
      layer: "builder"
    });
    this.builderRunId = this.builderEmitter.runId;
    return this.builderRunId;
  }

  /**
   * The emitter for builder-level spans (edits, LLM calls, tool invocations).
   *
   * @throws {Error} When `startChat()` has not been called yet.
   */
  get emitter() {
    if (!this.builderRunId) throw new Error("BuilderSession.emitter: call startChat() first");
    return this.builderEmitter;
  }

  /**
   * Ship the project's generated app: run the sandbox harness inside a child
   * run (`layer: "app-build"`) of the builder run.
   *
   * @param options `{ harness, driver?, scenarioId? }`; `scenarioId` defaults
   *                to `${projectId}/build`.
   * @returns `{ runId, result }` for the finished build.
   * @throws {Error} When `startChat()` has not been called yet; any error
   *                 thrown by the harness is rethrown after the build run has
   *                 been closed as failed.
   */
  async ship(options) {
    if (!this.builderRunId) throw new Error("BuilderSession.ship: call startChat() first");
    const buildEmitter = new TraceEmitter(this.store);
    await buildEmitter.startRun({
      scenarioId: options.scenarioId ?? `${this.projectId}/build`,
      projectId: this.projectId,
      chatId: this.chatId,
      parentRunId: this.builderRunId,
      layer: "app-build"
    });
    const harness = new SandboxHarness(options.driver ?? this.defaultDriver);
    let result;
    try {
      result = await harness.run(options.harness, buildEmitter);
    } catch (err) {
      // The build run is already started; close it as failed so a harness
      // crash does not leave a run that never ends.
      try {
        await buildEmitter.endRun({
          pass: false,
          score: 0,
          failureClass: "sandbox_failure",
          notes: err instanceof Error ? err.message : String(err)
        });
      } catch {
        // Deliberately ignored: the harness error below is the root cause
        // and must not be masked by a secondary failure while closing.
      }
      throw err;
    }
    await buildEmitter.endRun({
      pass: result.passed,
      score: result.score,
      failureClass: result.passed ? "success" : "sandbox_failure"
    });
    this.lastBuildRunId = buildEmitter.runId;
    return { runId: buildEmitter.runId, result };
  }

  /**
   * Run a test-graded scenario and attach the resulting run to this session
   * as an `app-runtime` run. The parent is the last build run when one
   * exists, otherwise the builder run (useful for prototypes).
   *
   * @param options `{ scenario, driver? }`; `driver` overrides the session default.
   * @returns The result of `runTestGradedScenario`.
   * @throws {Error} When neither a build run nor a builder run exists.
   */
  async runAppScenario(options) {
    const parentRunId = this.lastBuildRunId ?? this.builderRunId;
    if (!parentRunId) {
      throw new Error("BuilderSession.runAppScenario: call startChat() + ship() first");
    }
    const { scenario, driver } = options;
    const result = await runTestGradedScenario(scenario, this.store, {
      driver: driver ?? this.defaultDriver,
      provenance: { codeSha: undefined, promptSha: undefined, modelFingerprint: undefined }
    });
    // The scenario runner creates its own run; link it into the session
    // hierarchy after the fact.
    await this.store.updateRun(result.runId, {
      parentRunId,
      projectId: this.projectId,
      chatId: this.chatId,
      layer: "app-runtime"
    });
    return result;
  }

  /**
   * Record an end-of-chat meta score (judge verdict on whether the builder
   * served the user's intent) under judge id `builder-meta`, dimension
   * `user_intent_satisfaction`.
   *
   * @param score     Numeric score.
   * @param rationale Optional free-form explanation.
   * @throws {Error} When `startChat()` has not been called yet.
   */
  async recordMetaScore(score, rationale) {
    if (!this.builderRunId) {
      throw new Error("BuilderSession.recordMetaScore: call startChat() first");
    }
    await this.builderEmitter.recordJudge({
      judgeId: "builder-meta",
      // attach to the builder run itself
      targetSpanId: this.builderRunId,
      dimension: "user_intent_satisfaction",
      score,
      rationale,
      name: "builder-meta"
    });
  }

  /**
   * Close the builder run with a final outcome.
   *
   * @param outcome `{ pass, score?, notes? }`.
   * @throws {Error} When `startChat()` has not been called yet (same guard
   *                 as the other run-dependent methods).
   */
  async endChat(outcome) {
    if (!this.builderRunId) {
      throw new Error("BuilderSession.endChat: call startChat() first");
    }
    await this.builderEmitter.endRun({
      pass: outcome.pass,
      score: outcome.score,
      notes: outcome.notes
    });
  }

  /**
   * Inline app-runtime run — for cases where the "scenario" is a live agent
   * interaction rather than a test suite. Starts a fresh `app-runtime` run
   * and returns its emitter; the caller emits spans and is responsible for
   * calling `.endRun()` with the final verdict.
   *
   * @param scenarioId Scenario id for the new run.
   * @returns The emitter bound to the new run.
   * @throws {Error} When neither a build run nor a builder run exists.
   */
  async startAppRuntime(scenarioId) {
    const parentRunId = this.lastBuildRunId ?? this.builderRunId;
    if (!parentRunId) {
      throw new Error(
        "BuilderSession.startAppRuntime: call startChat() + (optionally) ship() first"
      );
    }
    const emitter = new TraceEmitter(this.store);
    await emitter.startRun({
      scenarioId,
      projectId: this.projectId,
      chatId: this.chatId,
      parentRunId,
      layer: "app-runtime"
    });
    return emitter;
  }

  /**
   * Lightweight "ship marker" — record an `app-build` run with a
   * caller-provided verdict, without running a sandbox harness. The run is
   * started and ended immediately and becomes the session's last build run.
   *
   * @param args `{ pass, score, scenarioId?, notes? }`; `scenarioId` defaults
   *             to `${projectId}/ship`.
   * @returns The id of the recorded build run.
   * @throws {Error} When `startChat()` has not been called yet.
   */
  async recordShipMarker(args) {
    if (!this.builderRunId) {
      throw new Error("BuilderSession.recordShipMarker: call startChat() first");
    }
    const emitter = new TraceEmitter(this.store);
    await emitter.startRun({
      scenarioId: args.scenarioId ?? `${this.projectId}/ship`,
      projectId: this.projectId,
      chatId: this.chatId,
      parentRunId: this.builderRunId,
      layer: "app-build"
    });
    await emitter.endRun({
      pass: args.pass,
      score: args.score,
      failureClass: args.pass ? "success" : "sandbox_failure",
      notes: args.notes
    });
    this.lastBuildRunId = emitter.runId;
    return emitter.runId;
  }

  /** Id of the most recent build run recorded by `ship` / `recordShipMarker`, if any. */
  get lastBuildRunIdValue() {
    return this.lastBuildRunId;
  }

  /** Id of the builder run started by `startChat`, if any. */
  get builderRunIdValue() {
    return this.builderRunId;
  }
};
172
/**
 * Load every run of a project and bucket them by layer, newest first.
 *
 * @param store     Trace store exposing `listRuns({ projectId })`.
 * @param projectId Project whose runs are inspected.
 * @returns `{ projectId, chatRuns, lastBuilderRun, lastBuildRun, lastAppRuntimeRuns }`
 *          where the `last*Run` fields are `undefined` when the layer has no runs.
 */
async function resumeBuilderSession(store, projectId) {
  const projectRuns = await store.listRuns({ projectId });
  // `filter` yields a fresh array, so sorting never touches the store's list.
  const newestFirst = (layer) =>
    projectRuns
      .filter((run) => run.layer === layer)
      .sort((lhs, rhs) => rhs.startedAt - lhs.startedAt);

  const chatRuns = newestFirst("builder");
  const [lastBuildRun] = newestFirst("app-build");
  const lastAppRuntimeRuns = newestFirst("app-runtime");

  return {
    projectId,
    chatRuns,
    lastBuilderRun: chatRuns[0],
    lastBuildRun,
    lastAppRuntimeRuns
  };
}
185
/**
 * Generate an identifier string.
 *
 * Uses `globalThis.crypto.randomUUID()` when that function exists; otherwise
 * falls back to a base-36 timestamp joined with up to eight base-36
 * characters derived from `Math.random()`.
 *
 * @returns {string} The generated id.
 */
function cryptoId() {
  const webCrypto = globalThis.crypto;
  if (typeof webCrypto?.randomUUID === "function") {
    return webCrypto.randomUUID();
  }
  const timePart = Date.now().toString(36);
  const randomPart = Math.random().toString(36).slice(2, 10);
  return `${timePart}-${randomPart}`;
}
189
+
190
+ // src/builder-eval/correlation.ts
191
/**
 * Correlate each pair of layer scores across a set of project reports.
 *
 * @param reports Project reports carrying `metaScore`, `buildScore`,
 *                `runtimeScore` and `complete`.
 * @returns `{ metaVsBuild, metaVsRuntime, buildVsRuntime, completeProjects }`;
 *          a pair is `undefined` when `pairwise` finds fewer than two matched
 *          data points, and `completeProjects` counts reports whose
 *          `complete` flag is truthy.
 */
function correlateLayers(reports) {
  const metaOf = (report) => report.metaScore;
  const buildOf = (report) => report.buildScore;
  const runtimeOf = (report) => report.runtimeScore;

  let completeProjects = 0;
  for (const report of reports) {
    if (report.complete) completeProjects += 1;
  }

  const metaVsBuild = pairwise(reports, metaOf, buildOf);
  const metaVsRuntime = pairwise(reports, metaOf, runtimeOf);
  const buildVsRuntime = pairwise(reports, buildOf, runtimeOf);

  return { metaVsBuild, metaVsRuntime, buildVsRuntime, completeProjects };
}
212
/**
 * Correlate two score accessors over the reports where both yield a
 * finite, non-null number.
 *
 * @param reports Reports to sample.
 * @param a       Accessor for the first score.
 * @param b       Accessor for the second score.
 * @returns `{ n, pearson, spearman }`, or `undefined` when fewer than two
 *          reports have both scores.
 */
function pairwise(reports, a, b) {
  const usable = (value) => value !== null && Number.isFinite(value);
  const matched = reports
    .map((report) => [a(report), b(report)])
    .filter(([first, second]) => usable(first) && usable(second));

  if (matched.length < 2) return undefined;

  const firsts = matched.map(([first]) => first);
  const seconds = matched.map(([, second]) => second);
  return {
    n: matched.length,
    pearson: pearsonR(firsts, seconds),
    spearman: spearmanR(firsts, seconds)
  };
}
230
/**
 * Pearson correlation coefficient of two equally long numeric arrays.
 *
 * Degenerate inputs: when both series have zero variance the result is 1;
 * when exactly one has zero variance the result is 0.
 *
 * @param {number[]} a First series.
 * @param {number[]} b Second series (read at the same indices as `a`).
 * @returns {number} The correlation coefficient.
 */
function pearsonR(a, b) {
  const mean = (values) => values.reduce((total, value) => total + value, 0) / values.length;
  const meanA = mean(a);
  const meanB = mean(b);

  let crossSum = 0;
  let squaresA = 0;
  let squaresB = 0;
  for (let idx = 0; idx < a.length; idx += 1) {
    const deltaA = a[idx] - meanA;
    const deltaB = b[idx] - meanB;
    crossSum += deltaA * deltaB;
    squaresA += deltaA * deltaA;
    squaresB += deltaB * deltaB;
  }

  const flatA = squaresA === 0;
  const flatB = squaresB === 0;
  if (flatA && flatB) return 1;
  if (flatA || flatB) return 0;
  return crossSum / Math.sqrt(squaresA * squaresB);
}
244
/**
 * Spearman rank correlation: the Pearson coefficient of the two series'
 * ranks as computed by `ranks`.
 *
 * @param {number[]} a First series.
 * @param {number[]} b Second series.
 * @returns {number} The rank correlation coefficient.
 */
function spearmanR(a, b) {
  const ranksOfA = ranks(a);
  const ranksOfB = ranks(b);
  return pearsonR(ranksOfA, ranksOfB);
}
247
/**
 * Assign 1-based ranks to the values of `xs`, in input order. Equal values
 * share the average of the rank positions they occupy.
 *
 * @param {number[]} xs Values to rank.
 * @returns {number[]} Rank of each value, aligned with `xs`.
 */
function ranks(xs) {
  const ordered = xs
    .map((v, i) => ({ v, i }))
    .sort((left, right) => left.v - right.v);
  const result = new Array(xs.length);

  let start = 0;
  while (start < ordered.length) {
    // Extend `end` over the run of values equal to ordered[start].
    let end = start;
    while (end + 1 < ordered.length && ordered[end + 1].v === ordered[start].v) {
      end += 1;
    }
    // Positions start..end are 0-based; their mean 1-based rank is (start + end + 2) / 2.
    const sharedRank = (start + end + 2) / 2;
    for (let pos = start; pos <= end; pos += 1) {
      result[ordered[pos].i] = sharedRank;
    }
    start = end + 1;
  }
  return result;
}
259
+
260
+ // src/builder-eval/project-registry.ts
261
/**
 * ProjectRegistry — read-only, project-level views over the runs in a
 * trace store. Uses only `store.listRuns(filter?)` and
 * `store.spans({ runId })`.
 */
var ProjectRegistry = class {
  store;

  /** @param store Trace store to query. */
  constructor(store) {
    this.store = store;
  }

  /**
   * Summarize every project that has at least one run.
   *
   * @returns One summary per project, most recently active first. Activity
   *          is measured by the newest run's `startedAt`; `latestChatId`
   *          is the chat id of the newest builder run and `latestOutcome`
   *          comes from the newest run of any layer.
   */
  async listProjects() {
    const runs = await this.store.listRuns();
    const byProject = new Map();
    for (const run of runs) {
      if (!run.projectId) continue;
      const bucket = byProject.get(run.projectId);
      if (bucket) bucket.push(run);
      else byProject.set(run.projectId, [run]);
    }

    const summaries = [];
    for (const [projectId, projectRuns] of byProject) {
      const newestFirst = [...projectRuns].sort((a, b) => b.startedAt - a.startedAt);
      const latest = newestFirst[0];
      // Filter the *sorted* list so chats[0] really is the newest chat;
      // filtering the raw store order would return an arbitrary chat.
      const chats = newestFirst.filter((r) => r.layer === "builder");
      const builds = newestFirst.filter((r) => r.layer === "app-build");
      const runtimes = newestFirst.filter((r) => r.layer === "app-runtime");
      summaries.push({
        projectId,
        chatCount: chats.length,
        buildCount: builds.length,
        appRuntimeCount: runtimes.length,
        lastActivityAt: latest.startedAt,
        latestChatId: chats[0]?.chatId,
        latestOutcome: latest.outcome
          ? { pass: latest.outcome.pass ?? false, score: latest.outcome.score }
          : undefined
      });
    }
    return summaries.sort((a, b) => b.lastActivityAt - a.lastActivityAt);
  }

  /**
   * All runs of a project in chronological order, each tagged with a coarse
   * bucket derived from its layer.
   *
   * @param projectId Project to inspect.
   * @returns `{ run, layerBucket }` entries, oldest first; layers other than
   *          builder / app-build / app-runtime map to `"other"`.
   */
  async projectTimeline(projectId) {
    const bucketByLayer = new Map([
      ["builder", "chat"],
      ["app-build", "build"],
      ["app-runtime", "runtime"]
    ]);
    const runs = await this.store.listRuns({ projectId });
    return [...runs]
      .sort((a, b) => a.startedAt - b.startedAt)
      .map((run) => ({ run, layerBucket: bucketByLayer.get(run.layer) ?? "other" }));
  }

  /**
   * Chat-level summaries for a project, newest chat first.
   *
   * For each builder run this reports span counts, the most recent build
   * run among its children, and every app-runtime run attached either to
   * one of its builds or directly to the builder run.
   *
   * @param projectId Project to inspect.
   * @returns One summary per builder run.
   */
  async projectChats(projectId) {
    const newestFirst = (a, b) => b.startedAt - a.startedAt;
    const listed = await this.store.listRuns({ projectId, layer: "builder" });
    const builderRuns = [...listed].sort(newestFirst);

    const out = [];
    for (const run of builderRuns) {
      const [spans, children] = await Promise.all([
        this.store.spans({ runId: run.runId }),
        this.store.listRuns({ parentRunId: run.runId })
      ]);
      // A chat may ship several times; report the newest build and collect
      // runtime runs from every build rather than only the first one found.
      const builds = children.filter((c) => c.layer === "app-build").sort(newestFirst);
      const grandchildren = await Promise.all(
        builds.map((b) => this.store.listRuns({ parentRunId: b.runId }))
      );
      const runtime = [];
      for (const g of grandchildren.flat()) {
        if (g.layer === "app-runtime") runtime.push(g.runId);
      }
      // Runtime runs started before any build are parented to the builder run.
      for (const c of children) {
        if (c.layer === "app-runtime") runtime.push(c.runId);
      }
      out.push({
        chatId: run.chatId ?? run.runId,
        projectId,
        builderRunId: run.runId,
        startedAt: run.startedAt,
        endedAt: run.endedAt,
        status: run.status,
        outcome: run.outcome,
        llmTurns: spans.filter((s) => s.kind === "llm").length,
        toolCalls: spans.filter((s) => s.kind === "tool").length,
        buildRunId: builds[0]?.runId,
        appRuntimeRunIds: runtime
      });
    }
    return out;
  }
};
336
+
337
+ // src/builder-eval/three-layer-eval.ts
338
/**
 * Build the three-layer score report for one project.
 *
 * - `metaScore`: from `extractMetaScore` on the newest builder run, else null.
 * - `buildScore`: `outcome.score` of the newest app-build run, else null.
 * - `runtimeScore`: mean `outcome.score` over app-runtime runs that have a
 *   numeric score, else null; `runtimePassRate` is the share of app-runtime
 *   runs whose `outcome.pass` is exactly `true`.
 * - `kind` is `"scaffold-only"` when the project has no app-runtime runs;
 *   `complete` then requires meta + build only, otherwise all three scores.
 *
 * @param store     Trace store exposing `listRuns({ projectId })`.
 * @param projectId Project to score.
 * @returns The project report.
 */
async function scoreProject(store, projectId) {
  const projectRuns = await store.listRuns({ projectId });
  const builderRun = latestByLayer(projectRuns, "builder");
  const buildRun = latestByLayer(projectRuns, "app-build");
  const runtimeRuns = projectRuns.filter((run) => run.layer === "app-runtime");

  let metaScore = null;
  if (builderRun) {
    metaScore = await extractMetaScore(store, builderRun.runId);
  }
  const buildScore = buildRun?.outcome?.score ?? null;

  let scoreTotal = 0;
  let scoredRuns = 0;
  let passedRuns = 0;
  for (const run of runtimeRuns) {
    const score = run.outcome?.score;
    if (typeof score === "number") {
      scoreTotal += score;
      scoredRuns += 1;
    }
    if (run.outcome?.pass === true) passedRuns += 1;
  }
  const runtimeScore = scoredRuns > 0 ? scoreTotal / scoredRuns : null;
  const runtimePassRate = runtimeRuns.length > 0 ? passedRuns / runtimeRuns.length : null;

  const kind = runtimeRuns.length === 0 ? "scaffold-only" : "full";
  const metaAndBuildScored = metaScore !== null && buildScore !== null;
  const complete =
    kind === "scaffold-only" ? metaAndBuildScored : metaAndBuildScored && runtimeScore !== null;

  return {
    projectId,
    kind,
    builderRunId: builderRun?.runId,
    metaScore,
    buildRunId: buildRun?.runId,
    buildScore,
    appRuntimeRunIds: runtimeRuns.map((run) => run.runId),
    runtimeScore,
    runtimePassRate,
    complete
  };
}
364
/**
 * Score every project that appears on at least one run in the store.
 *
 * @param store Trace store exposing `listRuns()`.
 * @returns One report per distinct, truthy `projectId`, in first-seen order.
 */
async function scoreAllProjects(store) {
  const everyRun = await store.listRuns();
  const projectIds = new Set();
  for (const run of everyRun) {
    if (run.projectId) projectIds.add(run.projectId);
  }
  const pending = [...projectIds].map((id) => scoreProject(store, id));
  return Promise.all(pending);
}
369
/**
 * Pick the run with the greatest `startedAt` among the runs of one layer.
 *
 * @param runs  Runs to search (not mutated).
 * @param layer Layer name to match exactly.
 * @returns The newest matching run, or `undefined` when none match.
 */
function latestByLayer(runs, layer) {
  const sameLayer = runs.filter((run) => run.layer === layer);
  sameLayer.sort((lhs, rhs) => rhs.startedAt - lhs.startedAt);
  const [newest] = sameLayer;
  return newest;
}
373
/**
 * Read the builder's meta score from the judge spans of a builder run and
 * normalize it to 0..1.
 *
 * Uses the first judge span with `judgeId === "builder-meta"` and
 * `dimension === "user_intent_satisfaction"`. A score within 0..1 is
 * returned as-is; a score above 1 and up to 10 is divided by 10.
 *
 * @param store        Trace store passed through to `judgeSpans`.
 * @param builderRunId Builder run whose judge spans are read.
 * @returns The normalized score, or `null` when no matching span exists or
 *          the score lies outside 0..10.
 */
async function extractMetaScore(store, builderRunId) {
  const verdicts = await judgeSpans(store, builderRunId);
  const metaVerdict = verdicts.find(
    (span) => span.judgeId === "builder-meta" && span.dimension === "user_intent_satisfaction"
  );
  if (!metaVerdict) return null;

  const { score } = metaVerdict;
  const withinKnownScale = score >= 0 && score <= 10;
  if (!withinKnownScale) return null;
  return score <= 1 ? score : score / 10;
}
383
+ export {
384
+ BuilderSession,
385
+ ProjectRegistry,
386
+ correlateLayers,
387
+ resumeBuilderSession,
388
+ scoreAllProjects,
389
+ scoreProject
390
+ };
391
+ //# sourceMappingURL=index.js.map