@valescoagency/runway 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/cli.js +1 -0
- package/dist/commands/run.js +47 -0
- package/dist/config.js +8 -0
- package/dist/dashboard/otlp.js +16 -2
- package/dist/dashboard/projector.js +12 -0
- package/dist/dashboard/server.js +60 -4
- package/dist/dashboard/storage.js +233 -17
- package/dist/dashboard/views.js +18 -1
- package/dist/finalize.js +34 -2
- package/dist/git.js +170 -22
- package/dist/implement.js +6 -0
- package/dist/linear.js +35 -9
- package/dist/orchestrator.js +99 -18
- package/dist/prompts.js +40 -0
- package/dist/review.js +32 -18
- package/package.json +1 -1
- package/prompts/implement.md +11 -0
- package/prompts/review.md +48 -6
package/README.md
CHANGED
|
@@ -392,7 +392,7 @@ These are tractable, just not v1.
|
|
|
392
392
|
|
|
393
393
|
## Status
|
|
394
394
|
|
|
395
|
-
0.
|
|
395
|
+
0.10.0 — production-shaped and dogfooded against live Linear queues.
|
|
396
396
|
The end-to-end pipeline (init → run → review → PR) is stable; surface
|
|
397
397
|
may still shift as the orchestrator's policy and iteration mechanics
|
|
398
398
|
mature. See [CHANGELOG.md](./CHANGELOG.md) for per-release detail.
|
package/dist/cli.js
CHANGED
package/dist/commands/run.js
CHANGED
|
@@ -68,6 +68,25 @@ export function parseRunArgs(argv) {
|
|
|
68
68
|
}
|
|
69
69
|
opts.implTurns = n;
|
|
70
70
|
}
|
|
71
|
+
else if (a === "--review-retries") {
|
|
72
|
+
const v = argv[i + 1];
|
|
73
|
+
if (!v)
|
|
74
|
+
throw new Error("--review-retries requires a number");
|
|
75
|
+
const n = Number.parseInt(v, 10);
|
|
76
|
+
if (!Number.isFinite(n) || n < 0) {
|
|
77
|
+
throw new Error(`--review-retries must be a non-negative integer, got "${v}"`);
|
|
78
|
+
}
|
|
79
|
+
opts.reviewRetries = n;
|
|
80
|
+
i += 1;
|
|
81
|
+
}
|
|
82
|
+
else if (a?.startsWith("--review-retries=")) {
|
|
83
|
+
const v = a.slice("--review-retries=".length);
|
|
84
|
+
const n = Number.parseInt(v, 10);
|
|
85
|
+
if (!Number.isFinite(n) || n < 0) {
|
|
86
|
+
throw new Error(`--review-retries must be a non-negative integer, got "${v}"`);
|
|
87
|
+
}
|
|
88
|
+
opts.reviewRetries = n;
|
|
89
|
+
}
|
|
71
90
|
else if (a === "--help" || a === "-h") {
|
|
72
91
|
printRunUsage();
|
|
73
92
|
process.exit(0);
|
|
@@ -102,6 +121,14 @@ OPTIONS
|
|
|
102
121
|
(how many turns the Claude agent gets per attempt
|
|
103
122
|
before it has to signal IMPL: DONE / BLOCKED).
|
|
104
123
|
Overrides RUNWAY_IMPL_TURNS. Default: 3.
|
|
124
|
+
--review-retries N
|
|
125
|
+
In-run review-rejection retry budget. When the
|
|
126
|
+
reviewer emits REVIEW: REJECTED-RETRY — <reason>
|
|
127
|
+
(mechanically fixable), runway re-runs the impl
|
|
128
|
+
agent with the reason in {{IN_RUN_REVIEWER_FEEDBACK}}
|
|
129
|
+
and re-runs review. N caps the extra impl+review
|
|
130
|
+
pairs per drain pickup. 0 disables retries entirely.
|
|
131
|
+
Overrides RUNWAY_REVIEW_RETRIES. Default: 1.
|
|
105
132
|
--help, -h Show this help.
|
|
106
133
|
|
|
107
134
|
ENVIRONMENT
|
|
@@ -122,6 +149,11 @@ ENVIRONMENT
|
|
|
122
149
|
RUNWAY_IMPL_TURNS default 3 — sandcastle inner turn
|
|
123
150
|
budget per impl phase. Overridden by
|
|
124
151
|
--impl-turns.
|
|
152
|
+
RUNWAY_REVIEW_RETRIES default 1 — review-rejection retry
|
|
153
|
+
loop. On REVIEW: REJECTED-RETRY, runway
|
|
154
|
+
re-runs impl with the rejection in the
|
|
155
|
+
prompt, then re-runs review. 0 disables
|
|
156
|
+
entirely. Overridden by --review-retries.
|
|
125
157
|
`);
|
|
126
158
|
}
|
|
127
159
|
export async function runCommand(argv) {
|
|
@@ -152,6 +184,9 @@ export async function runCommand(argv) {
|
|
|
152
184
|
...baseConfig,
|
|
153
185
|
...(opts.project ? { linearProject: opts.project } : {}),
|
|
154
186
|
...(opts.implTurns !== undefined ? { implTurns: opts.implTurns } : {}),
|
|
187
|
+
...(opts.reviewRetries !== undefined
|
|
188
|
+
? { reviewRetries: opts.reviewRetries }
|
|
189
|
+
: {}),
|
|
155
190
|
};
|
|
156
191
|
const scope = config.linearProject
|
|
157
192
|
? `team ${config.linearTeam} / project ${config.linearProject}`
|
|
@@ -167,4 +202,16 @@ export async function runCommand(argv) {
|
|
|
167
202
|
}).pipe(Effect.scoped, Effect.provide(MainLayer));
|
|
168
203
|
const result = await Effect.runPromise(program);
|
|
169
204
|
console.log(`[runway] done — attempts=${result.attempts} opened=${result.opened} hitl=${result.hitl} errored=${result.errored}`);
|
|
205
|
+
// Single-line, parser-friendly completion marker. Background
|
|
206
|
+
// watchers (Claude Code's `run_in_background` bash task, CI,
|
|
207
|
+
// scripts) can grep for `[runway:exit]` instead of guessing
|
|
208
|
+
// whether the drain is still in flight.
|
|
209
|
+
console.log(`[runway:exit] status=success attempts=${result.attempts} opened=${result.opened} hitl=${result.hitl} errored=${result.errored}`);
|
|
210
|
+
// Hard exit so any lingering handle (OTel BatchSpanProcessor's
|
|
211
|
+
// interval when OTEL_EXPORTER_OTLP_ENDPOINT is set, a Docker
|
|
212
|
+
// stream Sandcastle left open, etc.) can't keep the process — and
|
|
213
|
+
// the background task that launched it — alive after the drain is
|
|
214
|
+
// logically done. By this point `Effect.scoped` has already torn
|
|
215
|
+
// down its finalizers.
|
|
216
|
+
process.exit(0);
|
|
170
217
|
}
|
package/dist/config.js
CHANGED
|
@@ -24,6 +24,13 @@ const configEffect = EConfig.all({
|
|
|
24
24
|
message: "RUNWAY_IMPL_TURNS must be a positive integer",
|
|
25
25
|
validation: (n) => n > 0,
|
|
26
26
|
})),
|
|
27
|
+
// VA-418: zero is a valid value here (operator kill-switch) so the
|
|
28
|
+
// validation accepts >= 0, unlike implTurns/maxIterations which
|
|
29
|
+
// both require >= 1.
|
|
30
|
+
reviewRetries: EConfig.integer("RUNWAY_REVIEW_RETRIES").pipe(EConfig.withDefault(1), EConfig.validate({
|
|
31
|
+
message: "RUNWAY_REVIEW_RETRIES must be a non-negative integer",
|
|
32
|
+
validation: (n) => n >= 0,
|
|
33
|
+
})),
|
|
27
34
|
commentAuthorAllowlist: EConfig.option(EConfig.string("RUNWAY_COMMENT_AUTHOR_ALLOWLIST")),
|
|
28
35
|
}).pipe(Effect.map((raw) => ({
|
|
29
36
|
linearApiKey: raw.linearApiKey,
|
|
@@ -37,6 +44,7 @@ const configEffect = EConfig.all({
|
|
|
37
44
|
hitlLabel: raw.hitlLabel,
|
|
38
45
|
maxIterations: raw.maxIterations,
|
|
39
46
|
implTurns: raw.implTurns,
|
|
47
|
+
reviewRetries: raw.reviewRetries,
|
|
40
48
|
commentAuthorAllowlist: Option.getOrUndefined(raw.commentAuthorAllowlist)
|
|
41
49
|
?.split(",")
|
|
42
50
|
.map((s) => s.trim())
|
package/dist/dashboard/otlp.js
CHANGED
|
@@ -11,12 +11,18 @@
|
|
|
11
11
|
/**
|
|
12
12
|
* Coerce an OTLP attribute value to a plain JS scalar. We collapse
|
|
13
13
|
* the typed wire variants (`stringValue` / `intValue` / `boolValue` /
|
|
14
|
-
* `doubleValue`) into one return path so callers
|
|
15
|
-
* pattern-match without knowing the OTLP shape.
|
|
14
|
+
* `doubleValue` / `arrayValue`) into one return path so callers
|
|
15
|
+
* downstream can pattern-match without knowing the OTLP shape.
|
|
16
16
|
*
|
|
17
17
|
* `intValue` round-trips as a string to preserve int64 precision.
|
|
18
18
|
* Callers that want a `number` (e.g. for counters under 2^53) should
|
|
19
19
|
* `Number(...)` it themselves.
|
|
20
|
+
*
|
|
21
|
+
* VA-387: `arrayValue` collapses to a `readonly string[]` so the
|
|
22
|
+
* dashboard's label-style attributes (`runway.issue.labels`) survive
|
|
23
|
+
* the wire trip with their structure intact. Non-string array
|
|
24
|
+
* elements drop silently — projector callers only ever ask for
|
|
25
|
+
* string arrays today.
|
|
20
26
|
*/
|
|
21
27
|
export function attrValue(attr) {
|
|
22
28
|
if (!attr)
|
|
@@ -35,6 +41,14 @@ export function attrValue(attr) {
|
|
|
35
41
|
? v.intValue
|
|
36
42
|
: v.intValue;
|
|
37
43
|
}
|
|
44
|
+
if (v.arrayValue !== undefined) {
|
|
45
|
+
const items = [];
|
|
46
|
+
for (const inner of v.arrayValue.values) {
|
|
47
|
+
if (inner.stringValue !== undefined)
|
|
48
|
+
items.push(inner.stringValue);
|
|
49
|
+
}
|
|
50
|
+
return items;
|
|
51
|
+
}
|
|
38
52
|
return undefined;
|
|
39
53
|
}
|
|
40
54
|
/**
|
|
@@ -78,9 +78,13 @@ function projectIssueProcess(span) {
|
|
|
78
78
|
parentSpanId: span.parentSpanId ?? null,
|
|
79
79
|
issueIdentifier: identifier,
|
|
80
80
|
issueId: strAttr(m["runway.issue.id"]) ?? null,
|
|
81
|
+
issueTitle: strAttr(m["runway.issue.title"]) ?? null,
|
|
82
|
+
issueLabels: strArrayAttr(m["runway.issue.labels"]),
|
|
81
83
|
branch: strAttr(m["runway.branch"]) ?? null,
|
|
82
84
|
outcomeKind: strAttr(m["runway.outcome.kind"]) ?? null,
|
|
83
85
|
outcomeDetail: strAttr(m["runway.outcome.detail"]) ?? null,
|
|
86
|
+
prUrl: strAttr(m["runway.pr.url"]) ?? null,
|
|
87
|
+
hitlReason: strAttr(m["runway.hitl.reason"]) ?? null,
|
|
84
88
|
startTimeUnixNano: span.startTimeUnixNano,
|
|
85
89
|
endTimeUnixNano: span.endTimeUnixNano,
|
|
86
90
|
statusCode: span.status?.code ?? null,
|
|
@@ -125,3 +129,11 @@ function numAttr(v) {
|
|
|
125
129
|
}
|
|
126
130
|
return null;
|
|
127
131
|
}
|
|
132
|
+
/**
 * VA-387: read a decoded attribute value that is expected to be a list.
 *
 * Spans that never carried the attribute (older spans, or a runway that
 * did not set it) yield an empty list, so callers never need a null check.
 *
 * @param {unknown} v - Decoded attribute value; may be absent or a scalar.
 * @returns {unknown[]} `v` itself when it is an array, otherwise a new
 *   empty array. Elements are not inspected or copied here.
 */
function strArrayAttr(v) {
    if (!Array.isArray(v)) {
        return [];
    }
    return v;
}
|
package/dist/dashboard/server.js
CHANGED
|
@@ -6,6 +6,10 @@ import { renderDetailView, renderListView } from "./views.js";
|
|
|
6
6
|
// Anything else stays in raw_spans for debugging but isn't rendered.
|
|
7
7
|
const DETAIL_PHASE_NAMES = ["review", "pushBranch", "openPullRequest"];
|
|
8
8
|
const ISSUE_DETAIL_RE = /^\/issue\/([^/?#]+)\/([^/?#]+)\/?$/;
|
|
9
|
+
// VA-387: canonical detail route. `:id` is the issue process span_id.
|
|
10
|
+
// The older two-segment `/issue/...` route stays available for
|
|
11
|
+
// back-compat and resolves rows by the (trace_id, span_id) pair.
|
|
12
|
+
const ISSUE_PROCESS_DETAIL_RE = /^\/issue-processes\/([^/?#]+)\/?$/;
|
|
9
13
|
const MAX_BODY_BYTES = 10 * 1024 * 1024; // 10 MiB — generous; a runway drain is ~kilobytes per emit.
|
|
10
14
|
/**
|
|
11
15
|
* Construct a Node HTTP server wired to the given storage. The server
|
|
@@ -58,7 +62,14 @@ async function handle(req, res, storage) {
|
|
|
58
62
|
return;
|
|
59
63
|
}
|
|
60
64
|
if (method === "GET") {
|
|
61
|
-
const
|
|
65
|
+
const pathOnly = url.split("?")[0] ?? "";
|
|
66
|
+
const issueProcessMatch = ISSUE_PROCESS_DETAIL_RE.exec(pathOnly);
|
|
67
|
+
if (issueProcessMatch) {
|
|
68
|
+
const spanId = decodeURIComponent(issueProcessMatch[1] ?? "");
|
|
69
|
+
handleIssueProcessDetailView(res, storage, spanId);
|
|
70
|
+
return;
|
|
71
|
+
}
|
|
72
|
+
const detailMatch = ISSUE_DETAIL_RE.exec(pathOnly);
|
|
62
73
|
if (detailMatch) {
|
|
63
74
|
const traceId = decodeURIComponent(detailMatch[1] ?? "");
|
|
64
75
|
const spanId = decodeURIComponent(detailMatch[2] ?? "");
|
|
@@ -66,6 +77,10 @@ async function handle(req, res, storage) {
|
|
|
66
77
|
return;
|
|
67
78
|
}
|
|
68
79
|
}
|
|
80
|
+
if (method === "GET" && (url === "/api/aggregates" || url.startsWith("/api/aggregates?"))) {
|
|
81
|
+
handleAggregates(res, storage);
|
|
82
|
+
return;
|
|
83
|
+
}
|
|
69
84
|
if (method === "GET" && url === "/healthz") {
|
|
70
85
|
res.writeHead(200, { "content-type": "text/plain" });
|
|
71
86
|
res.end("ok");
|
|
@@ -136,8 +151,24 @@ function handleDetailView(res, storage, traceId, spanId) {
|
|
|
136
151
|
writeError(res, 404, "not_found", `no issue process for trace=${traceId} span=${spanId}`);
|
|
137
152
|
return;
|
|
138
153
|
}
|
|
139
|
-
|
|
140
|
-
|
|
154
|
+
renderDetailFor(res, storage, ip);
|
|
155
|
+
}
|
|
156
|
+
/**
 * VA-387: serve the detail page for an issue process addressed by its
 * span_id alone.
 *
 * Once the row is resolved, rendering is delegated to `renderDetailFor`,
 * the same renderer the older two-segment route uses.
 *
 * @param res - HTTP response the page (or the error) is written to.
 * @param storage - Storage handle exposing `getIssueProcessBySpanId`.
 * @param {string} spanId - Span id taken from the request path.
 * @returns {void} Writes a 404 `not_found` error when no row matches.
 */
function handleIssueProcessDetailView(res, storage, spanId) {
    const issueProcess = storage.getIssueProcessBySpanId(spanId);
    if (issueProcess) {
        renderDetailFor(res, storage, issueProcess);
        return;
    }
    // Unknown span id: report it the same way the two-segment route does.
    writeError(res, 404, "not_found", `no issue process for span=${spanId}`);
}
|
|
169
|
+
function renderDetailFor(res, storage, ip) {
|
|
170
|
+
const iterations = storage.listAgentIterations(ip.traceId, ip.spanId);
|
|
171
|
+
const phaseSpans = storage.listPhaseSpans(ip.traceId, ip.spanId, [
|
|
141
172
|
...DETAIL_PHASE_NAMES,
|
|
142
173
|
]);
|
|
143
174
|
const html = renderDetailView({
|
|
@@ -148,6 +179,18 @@ function handleDetailView(res, storage, traceId, spanId) {
|
|
|
148
179
|
res.writeHead(200, { "content-type": "text/html; charset=utf-8" });
|
|
149
180
|
res.end(html);
|
|
150
181
|
}
|
|
182
|
+
/**
 * VA-399: respond with a JSON snapshot of the evaluator-facing
 * aggregates read-model.
 *
 * The rows come straight from `storage.listAggregates()` and are wrapped
 * in an envelope naming the view (`evaluator_aggregates_v1`). See
 * `read-model.md` for the field contract and versioning policy.
 *
 * @param res - HTTP response; receives a 200 with an `application/json` body.
 * @param storage - Storage handle exposing `listAggregates()`.
 * @returns {void}
 */
function handleAggregates(res, storage) {
    // Query first so a storage failure surfaces before any header is written.
    const payload = {
        view: "evaluator_aggregates_v1",
        rows: storage.listAggregates(),
    };
    res.writeHead(200, { "content-type": "application/json" });
    res.end(JSON.stringify(payload));
}
|
|
151
194
|
async function readBody(req) {
|
|
152
195
|
const chunks = [];
|
|
153
196
|
let total = 0;
|
|
@@ -184,7 +227,14 @@ export async function main() {
|
|
|
184
227
|
const sqlitePath = process.env.SQLITE_PATH ?? "/data/runway.sqlite";
|
|
185
228
|
const otlpPort = parsePort("OTLP_PORT", "4318");
|
|
186
229
|
const dashboardPort = parsePort("DASHBOARD_PORT", "3001");
|
|
187
|
-
|
|
230
|
+
// VA-399: rolling-window size for the evaluator aggregates view.
|
|
231
|
+
// Defaults to 30 drains; operators bump it for longer-baseline IRA
|
|
232
|
+
// comparisons. Missing/invalid → fall through to the storage layer's
|
|
233
|
+
// default rather than crashing the dashboard at boot.
|
|
234
|
+
const aggregateWindow = parsePositiveInt(process.env.DASHBOARD_AGGREGATE_WINDOW);
|
|
235
|
+
const storage = createStorage(sqlitePath, {
|
|
236
|
+
aggregateWindowDrains: aggregateWindow,
|
|
237
|
+
});
|
|
188
238
|
const otlp = await startServer({ storage, port: otlpPort });
|
|
189
239
|
const dashboard = dashboardPort === otlpPort
|
|
190
240
|
? otlp
|
|
@@ -209,6 +259,12 @@ function parsePort(envName, fallback) {
|
|
|
209
259
|
}
|
|
210
260
|
return n;
|
|
211
261
|
}
|
|
262
|
+
/**
 * Parse an optional environment-style value as a strictly positive integer.
 *
 * Only a plain decimal integer (optional leading `+`, surrounding
 * whitespace allowed) is accepted. Anything else — missing, empty,
 * zero, negative, fractional, trailing junk such as "30abc", or a value
 * outside the safe-integer range — yields `undefined` so the caller can
 * fall back to its own default instead of crashing at boot.
 *
 * @param {string | undefined} raw - Raw value, e.g. from `process.env`.
 * @returns {number | undefined} The parsed integer, or `undefined` when
 *   the input is absent or invalid.
 */
function parsePositiveInt(raw) {
    if (!raw) {
        return undefined;
    }
    const text = String(raw).trim();
    // `Number.parseInt` alone would silently accept "30abc" or "1.5";
    // require the whole string to be digits before converting.
    if (!/^\+?\d+$/.test(text)) {
        return undefined;
    }
    const n = Number.parseInt(text, 10);
    return Number.isSafeInteger(n) && n > 0 ? n : undefined;
}
|
|
212
268
|
// Run as a script when executed directly (e.g. inside the Docker
|
|
213
269
|
// container's CMD). Skipped when imported by tests.
|
|
214
270
|
const isMain = (() => {
|
|
@@ -21,9 +21,13 @@ const SCHEMA = `
|
|
|
21
21
|
parent_span_id TEXT,
|
|
22
22
|
issue_identifier TEXT NOT NULL,
|
|
23
23
|
issue_id TEXT,
|
|
24
|
+
issue_title TEXT,
|
|
25
|
+
issue_labels TEXT,
|
|
24
26
|
branch TEXT,
|
|
25
27
|
outcome_kind TEXT,
|
|
26
28
|
outcome_detail TEXT,
|
|
29
|
+
pr_url TEXT,
|
|
30
|
+
hitl_reason TEXT,
|
|
27
31
|
start_time_unix_nano TEXT NOT NULL,
|
|
28
32
|
end_time_unix_nano TEXT NOT NULL,
|
|
29
33
|
status_code INTEGER,
|
|
@@ -38,6 +42,9 @@ const SCHEMA = `
|
|
|
38
42
|
CREATE INDEX IF NOT EXISTS idx_issue_processes_trace_id
|
|
39
43
|
ON issue_processes(trace_id);
|
|
40
44
|
|
|
45
|
+
CREATE INDEX IF NOT EXISTS idx_issue_processes_span_id
|
|
46
|
+
ON issue_processes(span_id);
|
|
47
|
+
|
|
41
48
|
CREATE TABLE IF NOT EXISTS raw_spans (
|
|
42
49
|
trace_id TEXT NOT NULL,
|
|
43
50
|
span_id TEXT NOT NULL,
|
|
@@ -63,6 +70,128 @@ const SCHEMA = `
|
|
|
63
70
|
CREATE INDEX IF NOT EXISTS idx_agent_iterations_issue_process
|
|
64
71
|
ON agent_iterations(trace_id, issue_process_id, iteration_index);
|
|
65
72
|
`;
|
|
73
|
+
const DEFAULT_AGGREGATE_WINDOW = 30;
/**
 * VA-399: build the DDL that (re)creates the `evaluator_aggregates_v1`
 * SQL VIEW, which computes the evaluator-facing aggregates over the
 * last N drains.
 *
 * N is interpolated at view-creation time because SQLite views can't
 * take parameters. The DDL starts with `DROP VIEW IF EXISTS`, so running
 * it again with a different window replaces the previous view.
 *
 * Per category (the part of `issue_identifier` before the first `-`, or
 * the whole identifier when it has none) the view reports:
 * - sample size;
 * - median and p95 of iteration count and wall time in milliseconds.
 *   Median is the middle value for an odd count and the average of the
 *   two middle values for an even count; p95 uses the nearest-rank
 *   method (smallest observed value whose rank meets or exceeds 95%);
 * - reviewer-rejection, revert, HITL-escape and infra-error rates.
 *
 * `reviewer_rejection_rate` keys on the detail prefix emitted by
 * `src/review.ts` ("Sub-agent review rejected: ..."). It's a subset of
 * `hitl_escape_rate` — a review rejection routes to HITL, so both rates
 * count the same row. See `read-model.md` for the field-by-field contract.
 *
 * @param {number} windowDrains - Requested number of most recent drains.
 *   Fractions are floored and values below 1 are raised to 1. Non-finite
 *   or non-numeric input falls back to `DEFAULT_AGGREGATE_WINDOW`.
 * @returns {string} SQL script: `DROP VIEW IF EXISTS` then `CREATE VIEW`.
 */
function aggregatesViewDdl(windowDrains) {
    // windowDrains is the only spot we interpolate rather than
    // parameter-bind (CREATE VIEW can't take params). Reduce it to a
    // plain positive safe integer so neither a hostile value nor
    // NaN/Infinity can reach the SQL text.
    const requested = Math.floor(Number(windowDrains));
    const n = Number.isFinite(requested)
        ? Math.min(Math.max(1, requested), Number.MAX_SAFE_INTEGER)
        : DEFAULT_AGGREGATE_WINDOW;
    return `
    DROP VIEW IF EXISTS evaluator_aggregates_v1;
    CREATE VIEW evaluator_aggregates_v1 AS
    WITH recent_drains AS (
      SELECT trace_id
      FROM drains
      ORDER BY CAST(start_time_unix_nano AS INTEGER) DESC
      LIMIT ${n}
    ),
    process_rows AS (
      SELECT
        ip.trace_id,
        ip.span_id,
        ip.outcome_kind,
        COALESCE(ip.outcome_detail, '') AS outcome_detail,
        CASE
          WHEN instr(ip.issue_identifier, '-') > 0
            THEN substr(ip.issue_identifier, 1, instr(ip.issue_identifier, '-') - 1)
          ELSE ip.issue_identifier
        END AS category,
        (CAST(ip.end_time_unix_nano AS INTEGER) - CAST(ip.start_time_unix_nano AS INTEGER)) / 1000000 AS wall_time_ms,
        (
          SELECT COUNT(*) FROM agent_iterations a
          WHERE a.trace_id = ip.trace_id AND a.issue_process_id = ip.span_id
        ) AS iteration_count
      FROM issue_processes ip
      WHERE ip.trace_id IN (SELECT trace_id FROM recent_drains)
    ),
    wt_ranked AS (
      SELECT
        category,
        wall_time_ms,
        ROW_NUMBER() OVER (PARTITION BY category ORDER BY wall_time_ms) AS rn,
        COUNT(*) OVER (PARTITION BY category) AS cnt
      FROM process_rows
    ),
    it_ranked AS (
      SELECT
        category,
        iteration_count,
        ROW_NUMBER() OVER (PARTITION BY category ORDER BY iteration_count) AS rn,
        COUNT(*) OVER (PARTITION BY category) AS cnt
      FROM process_rows
    ),
    wt_median AS (
      SELECT category, AVG(wall_time_ms * 1.0) AS value
      FROM wt_ranked
      WHERE rn IN ((cnt + 1) / 2, (cnt / 2) + 1)
      GROUP BY category
    ),
    wt_p95 AS (
      SELECT category, MIN(wall_time_ms) AS value
      FROM wt_ranked
      WHERE rn >= (cnt * 95 + 99) / 100
      GROUP BY category
    ),
    it_median AS (
      SELECT category, AVG(iteration_count * 1.0) AS value
      FROM it_ranked
      WHERE rn IN ((cnt + 1) / 2, (cnt / 2) + 1)
      GROUP BY category
    ),
    it_p95 AS (
      SELECT category, MIN(iteration_count) AS value
      FROM it_ranked
      WHERE rn >= (cnt * 95 + 99) / 100
      GROUP BY category
    ),
    rates AS (
      SELECT
        category,
        COUNT(*) AS sample_size,
        AVG(CASE WHEN outcome_kind = 'hitl' AND outcome_detail LIKE 'Sub-agent review rejected%'
                 THEN 1.0 ELSE 0.0 END) AS reviewer_rejection_rate,
        AVG(CASE WHEN outcome_kind = 'reverted' THEN 1.0 ELSE 0.0 END) AS revert_rate,
        AVG(CASE WHEN outcome_kind = 'hitl' THEN 1.0 ELSE 0.0 END) AS hitl_escape_rate,
        AVG(CASE WHEN outcome_kind = 'errored' THEN 1.0 ELSE 0.0 END) AS infra_error_rate
      FROM process_rows
      GROUP BY category
    )
    SELECT
      r.category AS category,
      r.sample_size AS sample_size,
      itm.value AS median_iteration_count,
      itp.value AS p95_iteration_count,
      wtm.value AS median_wall_time_ms,
      wtp.value AS p95_wall_time_ms,
      r.reviewer_rejection_rate AS reviewer_rejection_rate,
      r.revert_rate AS revert_rate,
      r.hitl_escape_rate AS hitl_escape_rate,
      r.infra_error_rate AS infra_error_rate
    FROM rates r
    LEFT JOIN wt_median wtm ON wtm.category = r.category
    LEFT JOIN wt_p95 wtp ON wtp.category = r.category
    LEFT JOIN it_median itm ON itm.category = r.category
    LEFT JOIN it_p95 itp ON itp.category = r.category
    ORDER BY r.category;
  `;
}
|
|
66
195
|
/**
|
|
67
196
|
* Open (or create) a SQLite database at `path` and return a typed
|
|
68
197
|
* `Storage` handle. Pass `:memory:` for tests — the in-memory db
|
|
@@ -72,9 +201,35 @@ const SCHEMA = `
|
|
|
72
201
|
* OTel SDK retrying a flush) don't blow up the receiver — last writer
|
|
73
202
|
* wins on (trace_id, span_id).
|
|
74
203
|
*/
|
|
75
|
-
export function createStorage(path) {
|
|
204
|
+
export function createStorage(path, opts = {}) {
|
|
76
205
|
const db = new DatabaseSync(path);
|
|
77
206
|
db.exec(SCHEMA);
|
|
207
|
+
// VA-387: idempotent column adds for DBs created against an older
|
|
208
|
+
// schema. `CREATE TABLE IF NOT EXISTS` won't migrate an existing
|
|
209
|
+
// table; SQLite has no `ADD COLUMN IF NOT EXISTS`, so we swallow
|
|
210
|
+
// the duplicate-column error individually. Runs BEFORE VA-399's
|
|
211
|
+
// view install — `evaluator_aggregates_v1` reads from
|
|
212
|
+
// `issue_processes`, so the columns it may query must exist first.
|
|
213
|
+
for (const sql of [
|
|
214
|
+
`ALTER TABLE issue_processes ADD COLUMN issue_title TEXT`,
|
|
215
|
+
`ALTER TABLE issue_processes ADD COLUMN issue_labels TEXT`,
|
|
216
|
+
`ALTER TABLE issue_processes ADD COLUMN pr_url TEXT`,
|
|
217
|
+
`ALTER TABLE issue_processes ADD COLUMN hitl_reason TEXT`,
|
|
218
|
+
]) {
|
|
219
|
+
try {
|
|
220
|
+
db.exec(sql);
|
|
221
|
+
}
|
|
222
|
+
catch {
|
|
223
|
+
// Column already present — fresh CREATE TABLE path, or a prior
|
|
224
|
+
// dashboard boot ran the same migration.
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
// VA-399: install the evaluator-facing read-model view after the
|
|
228
|
+
// base tables exist (and after VA-387's column migrations above),
|
|
229
|
+
// but before any prepared statement is created — a
|
|
230
|
+
// `SELECT FROM evaluator_aggregates_v1` would otherwise race the
|
|
231
|
+
// DDL on first use.
|
|
232
|
+
db.exec(aggregatesViewDdl(opts.aggregateWindowDrains ?? DEFAULT_AGGREGATE_WINDOW));
|
|
78
233
|
const insertDrain = db.prepare(`
|
|
79
234
|
INSERT INTO drains (
|
|
80
235
|
trace_id, span_id, start_time_unix_nano, end_time_unix_nano,
|
|
@@ -93,16 +248,21 @@ export function createStorage(path) {
|
|
|
93
248
|
const insertIssueProcess = db.prepare(`
|
|
94
249
|
INSERT INTO issue_processes (
|
|
95
250
|
trace_id, span_id, parent_span_id, issue_identifier, issue_id,
|
|
96
|
-
branch, outcome_kind, outcome_detail,
|
|
251
|
+
issue_title, issue_labels, branch, outcome_kind, outcome_detail,
|
|
252
|
+
pr_url, hitl_reason,
|
|
97
253
|
start_time_unix_nano, end_time_unix_nano, status_code, status_message
|
|
98
|
-
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
254
|
+
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
99
255
|
ON CONFLICT (trace_id, span_id) DO UPDATE SET
|
|
100
256
|
parent_span_id = excluded.parent_span_id,
|
|
101
257
|
issue_identifier = excluded.issue_identifier,
|
|
102
258
|
issue_id = excluded.issue_id,
|
|
259
|
+
issue_title = excluded.issue_title,
|
|
260
|
+
issue_labels = excluded.issue_labels,
|
|
103
261
|
branch = excluded.branch,
|
|
104
262
|
outcome_kind = excluded.outcome_kind,
|
|
105
263
|
outcome_detail = excluded.outcome_detail,
|
|
264
|
+
pr_url = excluded.pr_url,
|
|
265
|
+
hitl_reason = excluded.hitl_reason,
|
|
106
266
|
start_time_unix_nano = excluded.start_time_unix_nano,
|
|
107
267
|
end_time_unix_nano = excluded.end_time_unix_nano,
|
|
108
268
|
status_code = excluded.status_code,
|
|
@@ -130,35 +290,40 @@ export function createStorage(path) {
|
|
|
130
290
|
`);
|
|
131
291
|
// Two list variants instead of one with conditional SQL — keeps
|
|
132
292
|
// each prepared statement static.
|
|
133
|
-
const
|
|
134
|
-
SELECT
|
|
293
|
+
const ISSUE_PROCESS_COLUMNS = `
|
|
135
294
|
trace_id, span_id, parent_span_id, issue_identifier, issue_id,
|
|
136
|
-
branch, outcome_kind, outcome_detail,
|
|
295
|
+
issue_title, issue_labels, branch, outcome_kind, outcome_detail,
|
|
296
|
+
pr_url, hitl_reason,
|
|
137
297
|
start_time_unix_nano, end_time_unix_nano, status_code, status_message,
|
|
138
298
|
inserted_at
|
|
299
|
+
`;
|
|
300
|
+
const listAll = db.prepare(`
|
|
301
|
+
SELECT ${ISSUE_PROCESS_COLUMNS}
|
|
139
302
|
FROM issue_processes
|
|
140
303
|
ORDER BY inserted_at DESC, span_id DESC
|
|
141
304
|
LIMIT ?
|
|
142
305
|
`);
|
|
143
306
|
const listByTrace = db.prepare(`
|
|
144
|
-
SELECT
|
|
145
|
-
trace_id, span_id, parent_span_id, issue_identifier, issue_id,
|
|
146
|
-
branch, outcome_kind, outcome_detail,
|
|
147
|
-
start_time_unix_nano, end_time_unix_nano, status_code, status_message,
|
|
148
|
-
inserted_at
|
|
307
|
+
SELECT ${ISSUE_PROCESS_COLUMNS}
|
|
149
308
|
FROM issue_processes
|
|
150
309
|
WHERE trace_id = ?
|
|
151
310
|
ORDER BY inserted_at DESC, span_id DESC
|
|
152
311
|
LIMIT ?
|
|
153
312
|
`);
|
|
154
313
|
const getProcessStmt = db.prepare(`
|
|
155
|
-
SELECT
|
|
156
|
-
trace_id, span_id, parent_span_id, issue_identifier, issue_id,
|
|
157
|
-
branch, outcome_kind, outcome_detail,
|
|
158
|
-
start_time_unix_nano, end_time_unix_nano, status_code, status_message,
|
|
159
|
-
inserted_at
|
|
314
|
+
SELECT ${ISSUE_PROCESS_COLUMNS}
|
|
160
315
|
FROM issue_processes
|
|
161
316
|
WHERE trace_id = ? AND span_id = ?
|
|
317
|
+
`);
|
|
318
|
+
// VA-387: span_id is unique in practice (random 64-bit ids); the
|
|
319
|
+
// detail route at `/issue-processes/:id` keys on span_id alone so
|
|
320
|
+
// operators don't have to type the trace_id in URLs.
|
|
321
|
+
const getProcessBySpanStmt = db.prepare(`
|
|
322
|
+
SELECT ${ISSUE_PROCESS_COLUMNS}
|
|
323
|
+
FROM issue_processes
|
|
324
|
+
WHERE span_id = ?
|
|
325
|
+
ORDER BY inserted_at DESC
|
|
326
|
+
LIMIT 1
|
|
162
327
|
`);
|
|
163
328
|
const listIterations = db.prepare(`
|
|
164
329
|
SELECT
|
|
@@ -169,11 +334,16 @@ export function createStorage(path) {
|
|
|
169
334
|
WHERE trace_id = ? AND issue_process_id = ?
|
|
170
335
|
ORDER BY iteration_index ASC
|
|
171
336
|
`);
|
|
337
|
+
const selectAggregates = db.prepare(`SELECT * FROM evaluator_aggregates_v1`);
|
|
172
338
|
const saveDrain = (d) => {
|
|
173
339
|
insertDrain.run(d.traceId, d.spanId, d.startTimeUnixNano, d.endTimeUnixNano, asInt(d.attempts), asInt(d.opened), asInt(d.hitl), asInt(d.errored), asInt(d.statusCode), d.statusMessage);
|
|
174
340
|
};
|
|
175
341
|
const saveIssueProcess = (p) => {
|
|
176
|
-
insertIssueProcess.run(p.traceId, p.spanId, p.parentSpanId, p.issueIdentifier, p.issueId, p.
|
|
342
|
+
insertIssueProcess.run(p.traceId, p.spanId, p.parentSpanId, p.issueIdentifier, p.issueId, p.issueTitle,
|
|
343
|
+
// VA-387: labels round-trip as a JSON array string. Keeping them
|
|
344
|
+
// in one column avoids a label-many-to-many table for a feature
|
|
345
|
+
// that's read-only on the dashboard side.
|
|
346
|
+
p.issueLabels.length === 0 ? null : JSON.stringify(p.issueLabels), p.branch, p.outcomeKind, p.outcomeDetail, p.prUrl, p.hitlReason, p.startTimeUnixNano, p.endTimeUnixNano, asInt(p.statusCode), p.statusMessage);
|
|
177
347
|
};
|
|
178
348
|
const saveAgentIteration = (a) => {
|
|
179
349
|
insertAgentIteration.run(a.traceId, a.spanId, a.issueProcessSpanId, asInt(a.iterationIndex), a.startTimeUnixNano, a.endTimeUnixNano, a.sandcastleRunId, a.exitStatus);
|
|
@@ -192,6 +362,10 @@ export function createStorage(path) {
|
|
|
192
362
|
const row = getProcessStmt.get(traceId, spanId);
|
|
193
363
|
return row ? rowToIssueProcess(row) : undefined;
|
|
194
364
|
};
|
|
365
|
+
const getIssueProcessBySpanId = (spanId) => {
|
|
366
|
+
const row = getProcessBySpanStmt.get(spanId);
|
|
367
|
+
return row ? rowToIssueProcess(row) : undefined;
|
|
368
|
+
};
|
|
195
369
|
const listAgentIterations = (traceId, issueProcessSpanId) => {
|
|
196
370
|
const rows = listIterations.all(traceId, issueProcessSpanId);
|
|
197
371
|
return rows.map(rowToAgentIteration);
|
|
@@ -227,6 +401,7 @@ export function createStorage(path) {
|
|
|
227
401
|
.all(traceId, issueProcessSpanId, ...names);
|
|
228
402
|
return rows.map(rowToPhaseSpan);
|
|
229
403
|
};
|
|
404
|
+
const listAggregates = () => selectAggregates.all().map(rowToAggregate);
|
|
230
405
|
const close = () => {
|
|
231
406
|
db.close();
|
|
232
407
|
};
|
|
@@ -237,8 +412,10 @@ export function createStorage(path) {
|
|
|
237
412
|
saveRawSpan,
|
|
238
413
|
listIssueProcesses,
|
|
239
414
|
getIssueProcess,
|
|
415
|
+
getIssueProcessBySpanId,
|
|
240
416
|
listAgentIterations,
|
|
241
417
|
listPhaseSpans,
|
|
418
|
+
listAggregates,
|
|
242
419
|
close,
|
|
243
420
|
};
|
|
244
421
|
}
|
|
@@ -256,9 +433,13 @@ function rowToIssueProcess(row) {
|
|
|
256
433
|
parentSpanId: nullableStr(r.parent_span_id),
|
|
257
434
|
issueIdentifier: String(r.issue_identifier ?? ""),
|
|
258
435
|
issueId: nullableStr(r.issue_id),
|
|
436
|
+
issueTitle: nullableStr(r.issue_title),
|
|
437
|
+
issueLabels: parseLabels(r.issue_labels),
|
|
259
438
|
branch: nullableStr(r.branch),
|
|
260
439
|
outcomeKind: nullableStr(r.outcome_kind),
|
|
261
440
|
outcomeDetail: nullableStr(r.outcome_detail),
|
|
441
|
+
prUrl: nullableStr(r.pr_url),
|
|
442
|
+
hitlReason: nullableStr(r.hitl_reason),
|
|
262
443
|
startTimeUnixNano: String(r.start_time_unix_nano ?? ""),
|
|
263
444
|
endTimeUnixNano: String(r.end_time_unix_nano ?? ""),
|
|
264
445
|
statusCode: nullableNum(r.status_code),
|
|
@@ -266,6 +447,26 @@ function rowToIssueProcess(row) {
|
|
|
266
447
|
insertedAt: String(r.inserted_at ?? ""),
|
|
267
448
|
};
|
|
268
449
|
}
|
|
450
|
+
/**
 * VA-387: turn the JSON-encoded `issue_labels` column value back into a
 * list of label strings.
 *
 * Any value that isn't a JSON array encoded as a string — NULL from a
 * row stored before the column existed, a non-string value, malformed
 * JSON, or JSON that isn't an array — collapses to an empty list.
 * Non-string elements inside a valid array are dropped.
 *
 * @param {unknown} v - Raw column value read from SQLite.
 * @returns {string[]} The decoded labels; empty when nothing usable is stored.
 */
function parseLabels(v) {
    // Covers null and undefined as well: neither has typeof "string".
    if (typeof v !== "string") {
        return [];
    }
    let decoded;
    try {
        decoded = JSON.parse(v);
    }
    catch {
        return [];
    }
    return Array.isArray(decoded)
        ? decoded.filter((item) => typeof item === "string")
        : [];
}
|
|
269
470
|
function rowToAgentIteration(row) {
|
|
270
471
|
const r = row;
|
|
271
472
|
return {
|
|
@@ -302,3 +503,18 @@ function nullableNum(v) {
|
|
|
302
503
|
const n = Number(v);
|
|
303
504
|
return Number.isFinite(n) ? n : null;
|
|
304
505
|
}
|
|
506
|
+
/**
 * Map one row of the `evaluator_aggregates_v1` view (snake_case columns)
 * onto the camelCase aggregate object returned to callers.
 *
 * Missing `category` becomes "" and missing `sample_size` or rate columns
 * become 0. The median/p95 columns go through `nullableNum`.
 *
 * @param row - Row object as returned by the prepared SELECT.
 * @returns Aggregate record with camelCase keys.
 */
function rowToAggregate(row) {
    // Counts and rates default to 0 when the column is NULL or absent.
    const orZero = (value) => Number(value ?? 0);
    return {
        category: String(row.category ?? ""),
        sampleSize: orZero(row.sample_size),
        medianIterationCount: nullableNum(row.median_iteration_count),
        p95IterationCount: nullableNum(row.p95_iteration_count),
        medianWallTimeMs: nullableNum(row.median_wall_time_ms),
        p95WallTimeMs: nullableNum(row.p95_wall_time_ms),
        reviewerRejectionRate: orZero(row.reviewer_rejection_rate),
        revertRate: orZero(row.revert_rate),
        hitlEscapeRate: orZero(row.hitl_escape_rate),
        infraErrorRate: orZero(row.infra_error_rate),
    };
}
|