nodebench-mcp 2.31.2 → 2.33.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. package/README.md +14 -6
  2. package/dist/engine/server.js +14 -4
  3. package/dist/engine/server.js.map +1 -1
  4. package/dist/index.js +1581 -670
  5. package/dist/index.js.map +1 -1
  6. package/dist/security/SecurityError.d.ts +18 -0
  7. package/dist/security/SecurityError.js +22 -0
  8. package/dist/security/SecurityError.js.map +1 -0
  9. package/dist/security/__tests__/security.test.d.ts +8 -0
  10. package/dist/security/__tests__/security.test.js +295 -0
  11. package/dist/security/__tests__/security.test.js.map +1 -0
  12. package/dist/security/auditLog.d.ts +36 -0
  13. package/dist/security/auditLog.js +178 -0
  14. package/dist/security/auditLog.js.map +1 -0
  15. package/dist/security/commandSandbox.d.ts +33 -0
  16. package/dist/security/commandSandbox.js +159 -0
  17. package/dist/security/commandSandbox.js.map +1 -0
  18. package/dist/security/config.d.ts +23 -0
  19. package/dist/security/config.js +43 -0
  20. package/dist/security/config.js.map +1 -0
  21. package/dist/security/credentialRedactor.d.ts +22 -0
  22. package/dist/security/credentialRedactor.js +118 -0
  23. package/dist/security/credentialRedactor.js.map +1 -0
  24. package/dist/security/index.d.ts +20 -0
  25. package/dist/security/index.js +21 -0
  26. package/dist/security/index.js.map +1 -0
  27. package/dist/security/pathSandbox.d.ts +23 -0
  28. package/dist/security/pathSandbox.js +160 -0
  29. package/dist/security/pathSandbox.js.map +1 -0
  30. package/dist/security/urlValidator.d.ts +23 -0
  31. package/dist/security/urlValidator.js +125 -0
  32. package/dist/security/urlValidator.js.map +1 -0
  33. package/dist/tools/agentBootstrapTools.js +22 -29
  34. package/dist/tools/agentBootstrapTools.js.map +1 -1
  35. package/dist/tools/contextSandboxTools.js +7 -9
  36. package/dist/tools/contextSandboxTools.js.map +1 -1
  37. package/dist/tools/deepSimTools.d.ts +2 -0
  38. package/dist/tools/deepSimTools.js +404 -0
  39. package/dist/tools/deepSimTools.js.map +1 -0
  40. package/dist/tools/dimensionTools.d.ts +2 -0
  41. package/dist/tools/dimensionTools.js +246 -0
  42. package/dist/tools/dimensionTools.js.map +1 -0
  43. package/dist/tools/executionTraceTools.d.ts +2 -0
  44. package/dist/tools/executionTraceTools.js +446 -0
  45. package/dist/tools/executionTraceTools.js.map +1 -0
  46. package/dist/tools/founderTools.d.ts +13 -0
  47. package/dist/tools/founderTools.js +595 -0
  48. package/dist/tools/founderTools.js.map +1 -0
  49. package/dist/tools/founderTrackingTools.d.ts +9 -0
  50. package/dist/tools/founderTrackingTools.js +644 -0
  51. package/dist/tools/founderTrackingTools.js.map +1 -0
  52. package/dist/tools/gitWorkflowTools.js +14 -10
  53. package/dist/tools/gitWorkflowTools.js.map +1 -1
  54. package/dist/tools/githubTools.js +19 -2
  55. package/dist/tools/githubTools.js.map +1 -1
  56. package/dist/tools/index.d.ts +87 -0
  57. package/dist/tools/index.js +102 -0
  58. package/dist/tools/index.js.map +1 -0
  59. package/dist/tools/localFileTools.js +24 -12
  60. package/dist/tools/localFileTools.js.map +1 -1
  61. package/dist/tools/memoryDecay.d.ts +70 -0
  62. package/dist/tools/memoryDecay.js +247 -0
  63. package/dist/tools/memoryDecay.js.map +1 -0
  64. package/dist/tools/missionHarnessTools.d.ts +32 -0
  65. package/dist/tools/missionHarnessTools.js +972 -0
  66. package/dist/tools/missionHarnessTools.js.map +1 -0
  67. package/dist/tools/observabilityTools.d.ts +15 -0
  68. package/dist/tools/observabilityTools.js +787 -0
  69. package/dist/tools/observabilityTools.js.map +1 -0
  70. package/dist/tools/openclawTools.js +151 -36
  71. package/dist/tools/openclawTools.js.map +1 -1
  72. package/dist/tools/progressiveDiscoveryTools.js +5 -4
  73. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  74. package/dist/tools/qualityGateTools.js +118 -2
  75. package/dist/tools/qualityGateTools.js.map +1 -1
  76. package/dist/tools/rssTools.js +3 -0
  77. package/dist/tools/rssTools.js.map +1 -1
  78. package/dist/tools/scraplingTools.js +15 -0
  79. package/dist/tools/scraplingTools.js.map +1 -1
  80. package/dist/tools/seoTools.js +66 -1
  81. package/dist/tools/seoTools.js.map +1 -1
  82. package/dist/tools/sessionMemoryTools.js +50 -11
  83. package/dist/tools/sessionMemoryTools.js.map +1 -1
  84. package/dist/tools/temporalIntelligenceTools.d.ts +12 -0
  85. package/dist/tools/temporalIntelligenceTools.js +1068 -0
  86. package/dist/tools/temporalIntelligenceTools.js.map +1 -0
  87. package/dist/tools/toolRegistry.d.ts +19 -0
  88. package/dist/tools/toolRegistry.js +956 -31
  89. package/dist/tools/toolRegistry.js.map +1 -1
  90. package/dist/tools/webTools.js +14 -1
  91. package/dist/tools/webTools.js.map +1 -1
  92. package/dist/tools/webmcpTools.js +13 -2
  93. package/dist/tools/webmcpTools.js.map +1 -1
  94. package/dist/toolsetRegistry.js +14 -0
  95. package/dist/toolsetRegistry.js.map +1 -1
  96. package/dist/types.d.ts +10 -0
  97. package/package.json +124 -124
@@ -0,0 +1,972 @@
1
+ /**
2
+ * Mission Harness Tools — Hierarchical mission execution for verifiable work
3
+ *
4
+ * NodeBench is not a single-agent assistant. It is a hierarchical mission
5
+ * execution harness for verifiable work.
6
+ *
7
+ * Architecture: Planner → Worker → Judge → Human Sniff-Check → Merge
8
+ *
9
+ * 5 first-class tools:
10
+ * plan.decompose_mission — Break mission into subtasks with verifiability routing
11
+ * judge.verify_subtask — Machine/expert verification with retry budget
12
+ * judge.request_retry — Retry, re-plan, escalate, or stop
13
+ * merge.compose_output — Judge-gated merge of subtask artifacts
14
+ * sniff.record_human_review — Human pass/concern/block with issue tags
15
+ *
16
+ * Persistence: SQLite-backed runs, taskPlans, subtaskAssignments, runSteps,
17
+ * artifacts, evidence, judgeReviews, retryAttempts, mergeBoundaries,
18
+ * sniffChecks, approvals.
19
+ *
20
+ * Verifiability tiers:
21
+ * Tier 1 — Machine-checkable (deterministic, automated judge)
22
+ * Tier 2 — Expert-checkable (requires human sniff-check)
23
+ *
24
+ * Anti-flat-coordination rules enforced:
25
+ * - One owner per subtask
26
+ * - Bounded input package
27
+ * - Explicit output contract
28
+ * - Judge-gated merge only
29
+ * - No shared free-for-all editing
30
+ */
31
+ import { getDb, genId } from "../db.js";
32
+ // ── Constants ─────────────────────────────────────────────────────────────
33
// Upper bound on the number of subtasks plan_decompose_mission accepts for one mission.
const MAX_SUBTASKS = 50;
// Ceiling applied to a subtask's requested retryBudget before it is stored.
const MAX_RETRY_BUDGET = 5;
// Evidence entries past this count are sliced off when a judge review is recorded.
const MAX_EVIDENCE_PER_REVIEW = 20;
// Artifact entries past this count are sliced off when judge_verify_subtask stores artifacts.
const MAX_ARTIFACTS_PER_MERGE = 100;
37
+ // ── DB Setup ──────────────────────────────────────────────────────────────
38
/**
 * Creates the mission-harness tables and their indexes on the database
 * returned by getDb(), using a single db.exec() call.
 *
 * Every statement is CREATE ... IF NOT EXISTS, so calling this repeatedly
 * (each tool handler calls it before touching the tables) does not fail
 * when the schema is already present.
 *
 * NOTE(review): the ON DELETE CASCADE / SET NULL clauses below only take
 * effect when foreign-key enforcement is enabled on the SQLite connection —
 * confirm that getDb() turns on PRAGMA foreign_keys.
 */
function ensureMissionTables() {
    const db = getDb();
    db.exec(`
    -- ═══════════════════════════════════════════
    -- MISSION HARNESS — Hierarchical execution
    -- Planner → Worker → Judge → Sniff → Merge
    -- ═══════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS mission_runs (
      id TEXT PRIMARY KEY,
      title TEXT NOT NULL,
      description TEXT,
      status TEXT NOT NULL DEFAULT 'planning',
      owner_agent TEXT,
      created_at TEXT NOT NULL DEFAULT (datetime('now')),
      updated_at TEXT NOT NULL DEFAULT (datetime('now')),
      completed_at TEXT
    );

    CREATE TABLE IF NOT EXISTS mission_task_plans (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      version INTEGER NOT NULL DEFAULT 1,
      decomposition TEXT NOT NULL,
      rationale TEXT,
      created_at TEXT NOT NULL DEFAULT (datetime('now'))
    );

    CREATE INDEX IF NOT EXISTS idx_mission_task_plans_run ON mission_task_plans(run_id);

    CREATE TABLE IF NOT EXISTS mission_subtasks (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      plan_id TEXT NOT NULL REFERENCES mission_task_plans(id) ON DELETE CASCADE,
      sequence INTEGER NOT NULL,
      title TEXT NOT NULL,
      description TEXT,
      owner_agent TEXT,
      status TEXT NOT NULL DEFAULT 'pending',
      verifiability_tier TEXT NOT NULL DEFAULT 'tier_1_machine',
      judge_method TEXT NOT NULL DEFAULT 'deterministic',
      retry_budget INTEGER NOT NULL DEFAULT 3,
      retries_used INTEGER NOT NULL DEFAULT 0,
      requires_sniff_check INTEGER NOT NULL DEFAULT 0,
      input_package TEXT,
      output_contract TEXT,
      depends_on TEXT DEFAULT '[]',
      created_at TEXT NOT NULL DEFAULT (datetime('now')),
      updated_at TEXT NOT NULL DEFAULT (datetime('now')),
      completed_at TEXT
    );

    CREATE INDEX IF NOT EXISTS idx_mission_subtasks_run ON mission_subtasks(run_id);
    CREATE INDEX IF NOT EXISTS idx_mission_subtasks_plan ON mission_subtasks(plan_id);
    CREATE INDEX IF NOT EXISTS idx_mission_subtasks_status ON mission_subtasks(status);
    CREATE INDEX IF NOT EXISTS idx_mission_subtasks_owner ON mission_subtasks(owner_agent);

    CREATE TABLE IF NOT EXISTS mission_run_steps (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      subtask_id TEXT NOT NULL REFERENCES mission_subtasks(id) ON DELETE CASCADE,
      step_type TEXT NOT NULL,
      agent_id TEXT,
      input_summary TEXT,
      output_summary TEXT,
      status TEXT NOT NULL DEFAULT 'pending',
      duration_ms INTEGER,
      created_at TEXT NOT NULL DEFAULT (datetime('now'))
    );

    CREATE INDEX IF NOT EXISTS idx_mission_run_steps_subtask ON mission_run_steps(subtask_id);

    CREATE TABLE IF NOT EXISTS mission_artifacts (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      subtask_id TEXT NOT NULL REFERENCES mission_subtasks(id) ON DELETE CASCADE,
      artifact_type TEXT NOT NULL,
      title TEXT NOT NULL,
      content TEXT NOT NULL,
      content_hash TEXT,
      metadata TEXT,
      created_at TEXT NOT NULL DEFAULT (datetime('now'))
    );

    CREATE INDEX IF NOT EXISTS idx_mission_artifacts_subtask ON mission_artifacts(subtask_id);

    CREATE TABLE IF NOT EXISTS mission_evidence (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      subtask_id TEXT REFERENCES mission_subtasks(id) ON DELETE SET NULL,
      review_id TEXT,
      evidence_type TEXT NOT NULL,
      content TEXT NOT NULL,
      source_ref TEXT,
      created_at TEXT NOT NULL DEFAULT (datetime('now'))
    );

    CREATE INDEX IF NOT EXISTS idx_mission_evidence_subtask ON mission_evidence(subtask_id);
    CREATE INDEX IF NOT EXISTS idx_mission_evidence_review ON mission_evidence(review_id);

    CREATE TABLE IF NOT EXISTS mission_judge_reviews (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      subtask_id TEXT NOT NULL REFERENCES mission_subtasks(id) ON DELETE CASCADE,
      judge_agent TEXT,
      judge_method TEXT NOT NULL,
      verdict TEXT NOT NULL,
      reasoning TEXT,
      evidence_ids TEXT DEFAULT '[]',
      score REAL,
      action TEXT NOT NULL DEFAULT 'pass',
      created_at TEXT NOT NULL DEFAULT (datetime('now'))
    );

    CREATE INDEX IF NOT EXISTS idx_mission_judge_reviews_subtask ON mission_judge_reviews(subtask_id);

    CREATE TABLE IF NOT EXISTS mission_retry_attempts (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      subtask_id TEXT NOT NULL REFERENCES mission_subtasks(id) ON DELETE CASCADE,
      review_id TEXT NOT NULL REFERENCES mission_judge_reviews(id) ON DELETE CASCADE,
      action TEXT NOT NULL,
      reason TEXT,
      new_instructions TEXT,
      created_at TEXT NOT NULL DEFAULT (datetime('now'))
    );

    CREATE INDEX IF NOT EXISTS idx_mission_retry_attempts_subtask ON mission_retry_attempts(subtask_id);

    CREATE TABLE IF NOT EXISTS mission_merge_boundaries (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      subtask_ids TEXT NOT NULL,
      artifact_ids TEXT NOT NULL,
      merged_output TEXT,
      merge_agent TEXT,
      status TEXT NOT NULL DEFAULT 'pending',
      judge_review_id TEXT,
      created_at TEXT NOT NULL DEFAULT (datetime('now')),
      completed_at TEXT
    );

    CREATE INDEX IF NOT EXISTS idx_mission_merge_boundaries_run ON mission_merge_boundaries(run_id);

    CREATE TABLE IF NOT EXISTS mission_sniff_checks (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      subtask_id TEXT REFERENCES mission_subtasks(id) ON DELETE SET NULL,
      merge_id TEXT REFERENCES mission_merge_boundaries(id) ON DELETE SET NULL,
      reviewer TEXT,
      verdict TEXT NOT NULL,
      issue_tags TEXT DEFAULT '[]',
      notes TEXT,
      force_retry INTEGER NOT NULL DEFAULT 0,
      created_at TEXT NOT NULL DEFAULT (datetime('now'))
    );

    CREATE INDEX IF NOT EXISTS idx_mission_sniff_checks_run ON mission_sniff_checks(run_id);
    CREATE INDEX IF NOT EXISTS idx_mission_sniff_checks_subtask ON mission_sniff_checks(subtask_id);

    CREATE TABLE IF NOT EXISTS mission_approvals (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      subtask_id TEXT,
      merge_id TEXT,
      approver TEXT NOT NULL,
      decision TEXT NOT NULL,
      reason TEXT,
      created_at TEXT NOT NULL DEFAULT (datetime('now'))
    );

    CREATE INDEX IF NOT EXISTS idx_mission_approvals_run ON mission_approvals(run_id);
  `);
}
212
+ // ── Helpers ───────────────────────────────────────────────────────────────
213
/**
 * Deterministic 32-bit FNV-1a style hash of a string.
 *
 * The hash is folded over the string's UTF-16 code units (charCodeAt),
 * not over encoded bytes.
 *
 * @param {string} content - Text to hash.
 * @returns {string} The unsigned 32-bit hash as 8 lowercase hex digits, zero-padded.
 */
function hashContent(content) {
    const FNV_OFFSET_BASIS = 0x811c9dc5;
    const FNV_PRIME = 0x01000193;
    let acc = FNV_OFFSET_BASIS;
    let pos = 0;
    while (pos < content.length) {
        // XOR the code unit in first, then multiply — the FNV-1a ordering.
        acc = Math.imul(acc ^ content.charCodeAt(pos), FNV_PRIME);
        pos += 1;
    }
    // Math.imul yields a signed int32; reinterpret as unsigned before formatting.
    const unsigned = acc >>> 0;
    return unsigned.toString(16).padStart(8, "0");
}
222
/**
 * Current UTC time as "YYYY-MM-DD HH:MM:SS.sss": the ISO-8601 string with
 * the "T" separator swapped for a space and the trailing "Z" removed.
 *
 * @returns {string} Timestamp text used for the created_at / updated_at columns.
 */
function now() {
    const iso = new Date().toISOString();
    const [datePart, timePart] = iso.split("T");
    // timePart ends with the UTC designator "Z"; drop it.
    return `${datePart} ${timePart.slice(0, -1)}`;
}
225
+ // ── Tool Definitions ──────────────────────────────────────────────────────
226
+ export const missionHarnessTools = [
227
+ // ═══════════════════════════════════════════════════════════════════════
228
+ // 1. plan.decompose_mission
229
+ // ═══════════════════════════════════════════════════════════════════════
230
+ {
231
+ name: "plan_decompose_mission",
232
+ description: "Decompose a mission into subtasks with verifiability routing. " +
233
+ "Creates a run, task plan, and subtask assignments. Each subtask gets " +
234
+ "a verifiabilityTier (tier_1_machine | tier_2_expert), judgeMethod, " +
235
+ "retryBudget, and requiresHumanSniffCheck flag. Enforces: one owner " +
236
+ "per subtask, bounded input package, explicit output contract.",
237
+ inputSchema: {
238
+ type: "object",
239
+ properties: {
240
+ title: {
241
+ type: "string",
242
+ description: "Mission title — what is the top-level goal?",
243
+ },
244
+ description: {
245
+ type: "string",
246
+ description: "Full mission description with context and constraints",
247
+ },
248
+ subtasks: {
249
+ type: "array",
250
+ description: "Ordered list of subtask decompositions",
251
+ items: {
252
+ type: "object",
253
+ properties: {
254
+ title: { type: "string", description: "Subtask title" },
255
+ description: { type: "string", description: "What this subtask must accomplish" },
256
+ ownerAgent: { type: "string", description: "Assigned agent (one owner, no shared editing)" },
257
+ verifiabilityTier: {
258
+ type: "string",
259
+ enum: ["tier_1_machine", "tier_2_expert"],
260
+ description: "Tier 1 = machine-checkable, Tier 2 = expert-checkable",
261
+ },
262
+ judgeMethod: {
263
+ type: "string",
264
+ enum: ["deterministic", "llm_judge", "human_review", "composite"],
265
+ description: "How to verify this subtask's output",
266
+ },
267
+ retryBudget: {
268
+ type: "number",
269
+ description: "Max retry attempts before escalation (default: 3, max: 5)",
270
+ },
271
+ requiresSniffCheck: {
272
+ type: "boolean",
273
+ description: "Whether human sniff-check is required before merge",
274
+ },
275
+ inputPackage: {
276
+ type: "string",
277
+ description: "Bounded input — what data/context this subtask receives",
278
+ },
279
+ outputContract: {
280
+ type: "string",
281
+ description: "Explicit output contract — what this subtask must produce",
282
+ },
283
+ dependsOn: {
284
+ type: "array",
285
+ items: { type: "number" },
286
+ description: "Indices (0-based) of subtasks this depends on",
287
+ },
288
+ },
289
+ required: ["title", "verifiabilityTier", "judgeMethod", "outputContract"],
290
+ },
291
+ },
292
+ rationale: {
293
+ type: "string",
294
+ description: "Why this decomposition was chosen (for traceability)",
295
+ },
296
+ },
297
+ required: ["title", "subtasks"],
298
+ },
299
+ handler: async (args) => {
300
+ ensureMissionTables();
301
+ const db = getDb();
302
+ // Validate bounds
303
+ if (args.subtasks.length === 0) {
304
+ return { error: "At least one subtask is required" };
305
+ }
306
+ if (args.subtasks.length > MAX_SUBTASKS) {
307
+ return { error: `Max ${MAX_SUBTASKS} subtasks per mission` };
308
+ }
309
+ // Validate dependency indices
310
+ for (const [i, st] of args.subtasks.entries()) {
311
+ for (const dep of st.dependsOn ?? []) {
312
+ if (dep < 0 || dep >= args.subtasks.length || dep === i) {
313
+ return { error: `Subtask ${i} has invalid dependency index: ${dep}` };
314
+ }
315
+ if (dep >= i) {
316
+ return { error: `Subtask ${i} depends on later subtask ${dep} — forward deps not allowed` };
317
+ }
318
+ }
319
+ }
320
+ const runId = genId("mrun");
321
+ const planId = genId("mplan");
322
+ const timestamp = now();
323
+ // Create run
324
+ db.prepare(`INSERT INTO mission_runs (id, title, description, status, created_at, updated_at)
325
+ VALUES (?, ?, ?, 'planning', ?, ?)`).run(runId, args.title, args.description ?? null, timestamp, timestamp);
326
+ // Create task plan
327
+ db.prepare(`INSERT INTO mission_task_plans (id, run_id, version, decomposition, rationale, created_at)
328
+ VALUES (?, ?, 1, ?, ?, ?)`).run(planId, runId, JSON.stringify(args.subtasks.map((s) => s.title)), args.rationale ?? null, timestamp);
329
+ // Create subtasks
330
+ const subtaskIds = [];
331
+ const insertSubtask = db.prepare(`INSERT INTO mission_subtasks
332
+ (id, run_id, plan_id, sequence, title, description, owner_agent, status,
333
+ verifiability_tier, judge_method, retry_budget, requires_sniff_check,
334
+ input_package, output_contract, depends_on, created_at, updated_at)
335
+ VALUES (?, ?, ?, ?, ?, ?, ?, 'pending', ?, ?, ?, ?, ?, ?, ?, ?, ?)`);
336
+ const txn = db.transaction(() => {
337
+ for (const [i, st] of args.subtasks.entries()) {
338
+ const subtaskId = genId("msub");
339
+ subtaskIds.push(subtaskId);
340
+ const retryBudget = Math.min(st.retryBudget ?? 3, MAX_RETRY_BUDGET);
341
+ const depIds = (st.dependsOn ?? []).map((idx) => subtaskIds[idx]).filter(Boolean);
342
+ insertSubtask.run(subtaskId, runId, planId, i, st.title, st.description ?? null, st.ownerAgent ?? null, st.verifiabilityTier, st.judgeMethod, retryBudget, st.requiresSniffCheck ? 1 : 0, st.inputPackage ?? null, st.outputContract, JSON.stringify(depIds), timestamp, timestamp);
343
+ }
344
+ });
345
+ txn();
346
+ // Transition to executing
347
+ db.prepare(`UPDATE mission_runs SET status = 'executing', updated_at = ? WHERE id = ?`).run(now(), runId);
348
+ return {
349
+ runId,
350
+ planId,
351
+ subtaskCount: subtaskIds.length,
352
+ subtasks: subtaskIds.map((id, i) => ({
353
+ id,
354
+ sequence: i,
355
+ title: args.subtasks[i].title,
356
+ verifiabilityTier: args.subtasks[i].verifiabilityTier,
357
+ judgeMethod: args.subtasks[i].judgeMethod,
358
+ retryBudget: Math.min(args.subtasks[i].retryBudget ?? 3, MAX_RETRY_BUDGET),
359
+ requiresSniffCheck: args.subtasks[i].requiresSniffCheck ?? false,
360
+ dependsOn: (args.subtasks[i].dependsOn ?? []).map((idx) => subtaskIds[idx]),
361
+ })),
362
+ status: "executing",
363
+ traceability: {
364
+ receipt: `Mission ${runId} decomposed into ${subtaskIds.length} subtasks`,
365
+ planVersion: 1,
366
+ rationale: args.rationale ?? "not provided",
367
+ },
368
+ };
369
+ },
370
+ },
371
+ // ═══════════════════════════════════════════════════════════════════════
372
+ // 2. judge.verify_subtask
373
+ // ═══════════════════════════════════════════════════════════════════════
374
+ {
375
+ name: "judge_verify_subtask",
376
+ description: "Judge verifies a subtask's output against its output contract. " +
377
+ "Records verdict (pass/fail), reasoning, evidence references, and " +
378
+ "recommended action (pass/retry/replan/escalate/stop). " +
379
+ "Creates artifacts and evidence records for full traceability.",
380
+ inputSchema: {
381
+ type: "object",
382
+ properties: {
383
+ runId: { type: "string", description: "Mission run ID" },
384
+ subtaskId: { type: "string", description: "Subtask ID to verify" },
385
+ judgeAgent: { type: "string", description: "Judge agent identifier" },
386
+ verdict: {
387
+ type: "string",
388
+ enum: ["pass", "fail"],
389
+ description: "Did the subtask meet its output contract?",
390
+ },
391
+ reasoning: {
392
+ type: "string",
393
+ description: "Judge's reasoning for the verdict (full traceability, no hidden CoT)",
394
+ },
395
+ score: {
396
+ type: "number",
397
+ description: "Optional numeric score (0-1). No hardcoded floors — 0 means 0.",
398
+ },
399
+ evidence: {
400
+ type: "array",
401
+ description: "Evidence supporting the verdict",
402
+ items: {
403
+ type: "object",
404
+ properties: {
405
+ type: {
406
+ type: "string",
407
+ enum: ["test_result", "diff", "screenshot", "metric", "document", "citation", "log"],
408
+ description: "Evidence type",
409
+ },
410
+ content: { type: "string", description: "Evidence content or reference" },
411
+ sourceRef: { type: "string", description: "Source reference (URL, file path, etc.)" },
412
+ },
413
+ required: ["type", "content"],
414
+ },
415
+ },
416
+ artifacts: {
417
+ type: "array",
418
+ description: "Output artifacts from the subtask",
419
+ items: {
420
+ type: "object",
421
+ properties: {
422
+ type: {
423
+ type: "string",
424
+ enum: ["code", "document", "data", "config", "test", "report", "other"],
425
+ description: "Artifact type",
426
+ },
427
+ title: { type: "string", description: "Artifact title" },
428
+ content: { type: "string", description: "Artifact content" },
429
+ },
430
+ required: ["type", "title", "content"],
431
+ },
432
+ },
433
+ action: {
434
+ type: "string",
435
+ enum: ["pass", "retry", "replan", "escalate", "stop"],
436
+ description: "Recommended next action based on verdict",
437
+ },
438
+ },
439
+ required: ["runId", "subtaskId", "verdict", "reasoning", "action"],
440
+ },
441
+ handler: async (args) => {
442
+ ensureMissionTables();
443
+ const db = getDb();
444
+ // Validate subtask exists
445
+ const subtask = db.prepare("SELECT * FROM mission_subtasks WHERE id = ? AND run_id = ?").get(args.subtaskId, args.runId);
446
+ if (!subtask) {
447
+ return { error: `Subtask ${args.subtaskId} not found in run ${args.runId}` };
448
+ }
449
+ // Validate score bounds (HONEST_SCORES — no hardcoded floors)
450
+ if (args.score !== undefined && (args.score < 0 || args.score > 1)) {
451
+ return { error: "Score must be between 0 and 1. No hardcoded floors." };
452
+ }
453
+ const timestamp = now();
454
+ const reviewId = genId("mjrev");
455
+ // Store evidence
456
+ const evidenceIds = [];
457
+ if (args.evidence) {
458
+ const bounded = args.evidence.slice(0, MAX_EVIDENCE_PER_REVIEW);
459
+ const insertEvidence = db.prepare(`INSERT INTO mission_evidence (id, run_id, subtask_id, review_id, evidence_type, content, source_ref, created_at)
460
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)`);
461
+ for (const ev of bounded) {
462
+ const evId = genId("mev");
463
+ evidenceIds.push(evId);
464
+ insertEvidence.run(evId, args.runId, args.subtaskId, reviewId, ev.type, ev.content, ev.sourceRef ?? null, timestamp);
465
+ }
466
+ }
467
+ // Store artifacts
468
+ const artifactIds = [];
469
+ if (args.artifacts) {
470
+ const bounded = args.artifacts.slice(0, MAX_ARTIFACTS_PER_MERGE);
471
+ const insertArtifact = db.prepare(`INSERT INTO mission_artifacts (id, run_id, subtask_id, artifact_type, title, content, content_hash, created_at)
472
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)`);
473
+ for (const art of bounded) {
474
+ const artId = genId("mart");
475
+ artifactIds.push(artId);
476
+ insertArtifact.run(artId, args.runId, args.subtaskId, art.type, art.title, art.content, hashContent(art.content), timestamp);
477
+ }
478
+ }
479
+ // Store judge review
480
+ db.prepare(`INSERT INTO mission_judge_reviews
481
+ (id, run_id, subtask_id, judge_agent, judge_method, verdict, reasoning, evidence_ids, score, action, created_at)
482
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`).run(reviewId, args.runId, args.subtaskId, args.judgeAgent ?? "unknown", subtask.judge_method, args.verdict, args.reasoning, JSON.stringify(evidenceIds), args.score ?? null, args.action, timestamp);
483
+ // Update subtask status
484
+ const newStatus = args.action === "pass" ? "passed" :
485
+ args.action === "retry" ? "retrying" :
486
+ args.action === "escalate" ? "escalated" :
487
+ args.action === "stop" ? "failed" :
488
+ "review"; // replan
489
+ db.prepare(`UPDATE mission_subtasks SET status = ?, updated_at = ?${newStatus === "passed" ? ", completed_at = ?" : ""} WHERE id = ?`).run(...(newStatus === "passed" ? [newStatus, timestamp, timestamp, args.subtaskId] : [newStatus, timestamp, args.subtaskId]));
490
+ // Log the step
491
+ db.prepare(`INSERT INTO mission_run_steps (id, run_id, subtask_id, step_type, agent_id, input_summary, output_summary, status, created_at)
492
+ VALUES (?, ?, ?, 'judge_review', ?, ?, ?, ?, ?)`).run(genId("mstep"), args.runId, args.subtaskId, args.judgeAgent ?? "unknown", `Verifying subtask: ${subtask.title}`, `Verdict: ${args.verdict}, Action: ${args.action}`, args.verdict === "pass" ? "completed" : "pending", timestamp);
493
+ // Check if sniff-check required
494
+ const needsSniff = subtask.requires_sniff_check === 1 && args.action === "pass";
495
+ if (needsSniff) {
496
+ db.prepare(`UPDATE mission_subtasks SET status = 'review', updated_at = ? WHERE id = ?`).run(now(), args.subtaskId);
497
+ }
498
+ return {
499
+ reviewId,
500
+ verdict: args.verdict,
501
+ action: args.action,
502
+ score: args.score ?? null,
503
+ evidenceCount: evidenceIds.length,
504
+ artifactCount: artifactIds.length,
505
+ subtaskStatus: needsSniff ? "awaiting_sniff_check" : newStatus,
506
+ needsSniffCheck: needsSniff,
507
+ traceability: {
508
+ receipt: `Judge review ${reviewId} for subtask ${args.subtaskId}: ${args.verdict} → ${args.action}`,
509
+ evidenceRefs: evidenceIds,
510
+ artifactRefs: artifactIds,
511
+ reasoning: args.reasoning,
512
+ },
513
+ };
514
+ },
515
+ },
516
+ // ═══════════════════════════════════════════════════════════════════════
517
+ // 3. judge.request_retry
518
+ // ═══════════════════════════════════════════════════════════════════════
519
+ {
520
+ name: "judge_request_retry",
521
+ description: "Request a retry, re-plan, escalation, or stop for a failed subtask. " +
522
+ "Enforces retry budget — if exhausted, auto-escalates. " +
523
+ "Actions: pass | retry | replan | escalate | stop. " +
524
+ "If action is 'stop', marks subtask as unverifiable.",
525
+ inputSchema: {
526
+ type: "object",
527
+ properties: {
528
+ runId: { type: "string", description: "Mission run ID" },
529
+ subtaskId: { type: "string", description: "Subtask ID to retry" },
530
+ reviewId: { type: "string", description: "Judge review ID that triggered this" },
531
+ action: {
532
+ type: "string",
533
+ enum: ["pass", "retry", "replan", "escalate", "stop"],
534
+ description: "What to do next",
535
+ },
536
+ reason: { type: "string", description: "Why this action was chosen" },
537
+ newInstructions: {
538
+ type: "string",
539
+ description: "Updated instructions for retry/replan (what to do differently)",
540
+ },
541
+ },
542
+ required: ["runId", "subtaskId", "reviewId", "action", "reason"],
543
+ },
544
+ handler: async (args) => {
545
+ ensureMissionTables();
546
+ const db = getDb();
547
+ const subtask = db.prepare("SELECT * FROM mission_subtasks WHERE id = ? AND run_id = ?").get(args.subtaskId, args.runId);
548
+ if (!subtask) {
549
+ return { error: `Subtask ${args.subtaskId} not found in run ${args.runId}` };
550
+ }
551
+ const review = db.prepare("SELECT * FROM mission_judge_reviews WHERE id = ?").get(args.reviewId);
552
+ if (!review) {
553
+ return { error: `Review ${args.reviewId} not found` };
554
+ }
555
+ let effectiveAction = args.action;
556
+ let budgetExhausted = false;
557
+ // Enforce retry budget
558
+ if (args.action === "retry") {
559
+ if (subtask.retries_used >= subtask.retry_budget) {
560
+ effectiveAction = "escalate";
561
+ budgetExhausted = true;
562
+ }
563
+ else {
564
+ db.prepare(`UPDATE mission_subtasks SET retries_used = retries_used + 1, status = 'retrying', updated_at = ? WHERE id = ?`).run(now(), args.subtaskId);
565
+ }
566
+ }
567
+ // Record the attempt
568
+ const attemptId = genId("mretry");
569
+ db.prepare(`INSERT INTO mission_retry_attempts (id, run_id, subtask_id, review_id, action, reason, new_instructions, created_at)
570
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)`).run(attemptId, args.runId, args.subtaskId, args.reviewId, effectiveAction, args.reason, args.newInstructions ?? null, now());
571
+ // Update subtask status based on action
572
+ const statusMap = {
573
+ pass: "passed",
574
+ retry: "retrying",
575
+ replan: "pending",
576
+ escalate: "escalated",
577
+ stop: "failed",
578
+ };
579
+ const newStatus = statusMap[effectiveAction] ?? "pending";
580
+ db.prepare(`UPDATE mission_subtasks SET status = ?, updated_at = ? WHERE id = ?`).run(newStatus, now(), args.subtaskId);
581
+ // If stop, check if whole run should stop
582
+ if (effectiveAction === "stop") {
583
+ const remaining = db.prepare(`SELECT COUNT(*) as c FROM mission_subtasks WHERE run_id = ? AND status NOT IN ('passed', 'failed')`).get(args.runId);
584
+ if (remaining.c === 0) {
585
+ db.prepare(`UPDATE mission_runs SET status = 'failed', updated_at = ?, completed_at = ? WHERE id = ?`).run(now(), now(), args.runId);
586
+ }
587
+ }
588
+ return {
589
+ attemptId,
590
+ requestedAction: args.action,
591
+ effectiveAction,
592
+ budgetExhausted,
593
+ retriesUsed: subtask.retries_used + (effectiveAction === "retry" ? 1 : 0),
594
+ retryBudget: subtask.retry_budget,
595
+ subtaskStatus: newStatus,
596
+ traceability: {
597
+ receipt: `Retry attempt ${attemptId}: ${args.action}${budgetExhausted ? " → auto-escalated (budget exhausted)" : ""}`,
598
+ decision: effectiveAction,
599
+ reason: args.reason,
600
+ newInstructions: args.newInstructions ?? null,
601
+ },
602
+ };
603
+ },
604
+ },
605
+ // ═══════════════════════════════════════════════════════════════════════
606
+ // 4. merge.compose_output
607
+ // ═══════════════════════════════════════════════════════════════════════
608
+ {
609
+ name: "merge_compose_output",
610
+ description: "Judge-gated merge of subtask artifacts into a composed output. " +
611
+ "Only merges subtasks that have passed verification. " +
612
+ "Enforces: no shared free-for-all editing — merge boundary is explicit. " +
613
+ "Optionally requires judge review of the merged output.",
614
+ inputSchema: {
615
+ type: "object",
616
+ properties: {
617
+ runId: { type: "string", description: "Mission run ID" },
618
+ subtaskIds: {
619
+ type: "array",
620
+ items: { type: "string" },
621
+ description: "Subtask IDs to merge (must all be passed)",
622
+ },
623
+ mergeAgent: { type: "string", description: "Agent performing the merge" },
624
+ mergedOutput: {
625
+ type: "string",
626
+ description: "The composed output from merging subtask artifacts",
627
+ },
628
+ requiresJudgeReview: {
629
+ type: "boolean",
630
+ description: "Whether the merged output needs judge review before finalization",
631
+ },
632
+ },
633
+ required: ["runId", "subtaskIds", "mergedOutput"],
634
+ },
635
+ handler: async (args) => {
636
+ ensureMissionTables();
637
+ const db = getDb();
638
+ // Validate run exists
639
+ const run = db.prepare("SELECT * FROM mission_runs WHERE id = ?").get(args.runId);
640
+ if (!run) {
641
+ return { error: `Run ${args.runId} not found` };
642
+ }
643
+ // Validate all subtasks are passed
644
+ const notPassed = [];
645
+ for (const stId of args.subtaskIds) {
646
+ const st = db.prepare("SELECT id, status, title FROM mission_subtasks WHERE id = ? AND run_id = ?").get(stId, args.runId);
647
+ if (!st) {
648
+ return { error: `Subtask ${stId} not found in run ${args.runId}` };
649
+ }
650
+ if (st.status !== "passed") {
651
+ notPassed.push(`${stId} (${st.title}: ${st.status})`);
652
+ }
653
+ }
654
+ if (notPassed.length > 0) {
655
+ return {
656
+ error: "Judge-gated merge: all subtasks must be passed before merge",
657
+ notPassed,
658
+ hint: "Use judge_verify_subtask to pass remaining subtasks first",
659
+ };
660
+ }
661
+ // Collect artifact IDs from subtasks
662
+ const artifactIds = [];
663
+ for (const stId of args.subtaskIds) {
664
+ const arts = db.prepare("SELECT id FROM mission_artifacts WHERE subtask_id = ?").all(stId);
665
+ for (const art of arts) {
666
+ artifactIds.push(art.id);
667
+ }
668
+ }
669
+ // Create merge boundary
670
+ const mergeId = genId("mmerge");
671
+ const status = args.requiresJudgeReview ? "pending" : "completed";
672
+ const timestamp = now();
673
+ db.prepare(`INSERT INTO mission_merge_boundaries
674
+ (id, run_id, subtask_ids, artifact_ids, merged_output, merge_agent, status, created_at${status === "completed" ? ", completed_at" : ""})
675
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?${status === "completed" ? ", ?" : ""})`).run(...(status === "completed"
676
+ ? [mergeId, args.runId, JSON.stringify(args.subtaskIds), JSON.stringify(artifactIds), args.mergedOutput, args.mergeAgent ?? null, status, timestamp, timestamp]
677
+ : [mergeId, args.runId, JSON.stringify(args.subtaskIds), JSON.stringify(artifactIds), args.mergedOutput, args.mergeAgent ?? null, status, timestamp]));
678
+ // If all subtasks merged and no further review needed, complete the run
679
+ if (!args.requiresJudgeReview) {
680
+ const totalSubtasks = db.prepare("SELECT COUNT(*) as c FROM mission_subtasks WHERE run_id = ?").get(args.runId);
681
+ const passedSubtasks = db.prepare("SELECT COUNT(*) as c FROM mission_subtasks WHERE run_id = ? AND status = 'passed'").get(args.runId);
682
+ if (passedSubtasks.c === totalSubtasks.c) {
683
+ db.prepare(`UPDATE mission_runs SET status = 'completed', updated_at = ?, completed_at = ? WHERE id = ?`).run(now(), now(), args.runId);
684
+ }
685
+ }
686
+ else {
687
+ db.prepare(`UPDATE mission_runs SET status = 'merging', updated_at = ? WHERE id = ?`).run(now(), args.runId);
688
+ }
689
+ return {
690
+ mergeId,
691
+ subtasksMerged: args.subtaskIds.length,
692
+ artifactsMerged: artifactIds.length,
693
+ status,
694
+ requiresJudgeReview: args.requiresJudgeReview ?? false,
695
+ contentHash: hashContent(args.mergedOutput),
696
+ traceability: {
697
+ receipt: `Merge ${mergeId}: ${args.subtaskIds.length} subtasks → composed output`,
698
+ subtaskIds: args.subtaskIds,
699
+ artifactIds,
700
+ mergedContentHash: hashContent(args.mergedOutput),
701
+ },
702
+ };
703
+ },
704
+ },
705
+ // ═══════════════════════════════════════════════════════════════════════
706
+ // 5. sniff.record_human_review
707
+ // ═══════════════════════════════════════════════════════════════════════
708
+ {
709
+ name: "sniff_record_human_review",
710
+ description: "Record a human sniff-check for a subtask or merge output. " +
711
+ "Verdicts: pass | concern | block. " +
712
+ "Issue tags: unsupported_claim, weak_evidence, not_credible, " +
713
+ "too_risky, scope_drift, missing_source, contradictory, stale_data. " +
714
+ "If verdict is 'block', creates a force-retry path.",
715
+ inputSchema: {
716
+ type: "object",
717
+ properties: {
718
+ runId: { type: "string", description: "Mission run ID" },
719
+ subtaskId: {
720
+ type: "string",
721
+ description: "Subtask ID being reviewed (mutually exclusive with mergeId)",
722
+ },
723
+ mergeId: {
724
+ type: "string",
725
+ description: "Merge boundary ID being reviewed (mutually exclusive with subtaskId)",
726
+ },
727
+ reviewer: { type: "string", description: "Human reviewer identifier" },
728
+ verdict: {
729
+ type: "string",
730
+ enum: ["pass", "concern", "block"],
731
+ description: "pass = approved, concern = flagged but proceed, block = force retry",
732
+ },
733
+ issueTags: {
734
+ type: "array",
735
+ items: {
736
+ type: "string",
737
+ enum: [
738
+ "unsupported_claim", "weak_evidence", "not_credible",
739
+ "too_risky", "scope_drift", "missing_source",
740
+ "contradictory", "stale_data",
741
+ ],
742
+ },
743
+ description: "Issue tags categorizing the concern/block",
744
+ },
745
+ notes: {
746
+ type: "string",
747
+ description: "Free-text notes from the reviewer",
748
+ },
749
+ },
750
+ required: ["runId", "verdict"],
751
+ },
752
+ handler: async (args) => {
753
+ ensureMissionTables();
754
+ const db = getDb();
755
+ // Validate target
756
+ if (!args.subtaskId && !args.mergeId) {
757
+ return { error: "Either subtaskId or mergeId is required" };
758
+ }
759
+ // Validate run exists
760
+ const run = db.prepare("SELECT * FROM mission_runs WHERE id = ?").get(args.runId);
761
+ if (!run) {
762
+ return { error: `Run ${args.runId} not found` };
763
+ }
764
+ const forceRetry = args.verdict === "block" ? 1 : 0;
765
+ const sniffId = genId("msniff");
766
+ const timestamp = now();
767
+ db.prepare(`INSERT INTO mission_sniff_checks
768
+ (id, run_id, subtask_id, merge_id, reviewer, verdict, issue_tags, notes, force_retry, created_at)
769
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`).run(sniffId, args.runId, args.subtaskId ?? null, args.mergeId ?? null, args.reviewer ?? "human", args.verdict, JSON.stringify(args.issueTags ?? []), args.notes ?? null, forceRetry, timestamp);
770
+ // Record approval/block
771
+ db.prepare(`INSERT INTO mission_approvals (id, run_id, subtask_id, merge_id, approver, decision, reason, created_at)
772
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)`).run(genId("mappr"), args.runId, args.subtaskId ?? null, args.mergeId ?? null, args.reviewer ?? "human", args.verdict, args.notes ?? null, timestamp);
773
+ // Handle force-retry (block verdict)
774
+ if (forceRetry && args.subtaskId) {
775
+ db.prepare(`UPDATE mission_subtasks SET status = 'retrying', updated_at = ? WHERE id = ?`).run(now(), args.subtaskId);
776
+ }
777
+ if (forceRetry && args.mergeId) {
778
+ db.prepare(`UPDATE mission_merge_boundaries SET status = 'pending', completed_at = NULL WHERE id = ?`).run(args.mergeId);
779
+ }
780
+ // On pass, update run status
781
+ if (args.verdict === "pass") {
782
+ if (args.subtaskId) {
783
+ db.prepare(`UPDATE mission_subtasks SET status = 'passed', updated_at = ?, completed_at = ? WHERE id = ?`).run(now(), now(), args.subtaskId);
784
+ }
785
+ if (args.mergeId) {
786
+ db.prepare(`UPDATE mission_merge_boundaries SET status = 'completed', completed_at = ? WHERE id = ?`).run(now(), args.mergeId);
787
+ // Check if run is now complete
788
+ const allMerges = db.prepare("SELECT COUNT(*) as c FROM mission_merge_boundaries WHERE run_id = ? AND status != 'completed'").get(args.runId);
789
+ if (allMerges.c === 0) {
790
+ db.prepare(`UPDATE mission_runs SET status = 'completed', updated_at = ?, completed_at = ? WHERE id = ?`).run(now(), now(), args.runId);
791
+ }
792
+ }
793
+ }
794
+ return {
795
+ sniffCheckId: sniffId,
796
+ verdict: args.verdict,
797
+ issueTags: args.issueTags ?? [],
798
+ forceRetry: forceRetry === 1,
799
+ target: args.subtaskId ? `subtask:${args.subtaskId}` : `merge:${args.mergeId}`,
800
+ traceability: {
801
+ receipt: `Sniff-check ${sniffId}: ${args.verdict}${forceRetry ? " → force retry" : ""}`,
802
+ reviewer: args.reviewer ?? "human",
803
+ issueTags: args.issueTags ?? [],
804
+ notes: args.notes ?? null,
805
+ decision: args.verdict,
806
+ },
807
+ };
808
+ },
809
+ },
810
+ // ═══════════════════════════════════════════════════════════════════════
811
+ // 6. harness.get_mission_status (read-only query)
812
+ // ═══════════════════════════════════════════════════════════════════════
813
+ {
814
+ name: "harness_get_mission_status",
815
+ description: "Get full mission execution status: run info, subtask states, " +
816
+ "judge reviews, sniff-checks, merge boundaries, and traceability " +
817
+ "receipts. Read-only query for the Mission Graph / Live Execution Board.",
818
+ annotations: { readOnlyHint: true },
819
+ inputSchema: {
820
+ type: "object",
821
+ properties: {
822
+ runId: { type: "string", description: "Mission run ID" },
823
+ includeEvidence: {
824
+ type: "boolean",
825
+ description: "Include evidence records (default: false for performance)",
826
+ },
827
+ },
828
+ required: ["runId"],
829
+ },
830
+ handler: async (args) => {
831
+ ensureMissionTables();
832
+ const db = getDb();
833
+ const run = db.prepare("SELECT * FROM mission_runs WHERE id = ?").get(args.runId);
834
+ if (!run) {
835
+ return { error: `Run ${args.runId} not found` };
836
+ }
837
+ const subtasks = db.prepare("SELECT * FROM mission_subtasks WHERE run_id = ? ORDER BY sequence").all(args.runId);
838
+ const reviews = db.prepare("SELECT * FROM mission_judge_reviews WHERE run_id = ? ORDER BY created_at").all(args.runId);
839
+ const sniffChecks = db.prepare("SELECT * FROM mission_sniff_checks WHERE run_id = ? ORDER BY created_at").all(args.runId);
840
+ const merges = db.prepare("SELECT * FROM mission_merge_boundaries WHERE run_id = ? ORDER BY created_at").all(args.runId);
841
+ const retries = db.prepare("SELECT * FROM mission_retry_attempts WHERE run_id = ? ORDER BY created_at").all(args.runId);
842
+ const approvals = db.prepare("SELECT * FROM mission_approvals WHERE run_id = ? ORDER BY created_at").all(args.runId);
843
+ let evidence = [];
844
+ if (args.includeEvidence) {
845
+ evidence = db.prepare("SELECT * FROM mission_evidence WHERE run_id = ? ORDER BY created_at").all(args.runId);
846
+ }
847
+ // Compute summary stats
848
+ const statusCounts = {};
849
+ for (const st of subtasks) {
850
+ statusCounts[st.status] = (statusCounts[st.status] ?? 0) + 1;
851
+ }
852
+ const passRate = subtasks.length > 0
853
+ ? (statusCounts["passed"] ?? 0) / subtasks.length
854
+ : 0;
855
+ return {
856
+ run: {
857
+ id: run.id,
858
+ title: run.title,
859
+ description: run.description,
860
+ status: run.status,
861
+ createdAt: run.created_at,
862
+ completedAt: run.completed_at,
863
+ },
864
+ summary: {
865
+ totalSubtasks: subtasks.length,
866
+ statusCounts,
867
+ passRate: Math.round(passRate * 100) / 100,
868
+ totalReviews: reviews.length,
869
+ totalSniffChecks: sniffChecks.length,
870
+ totalRetries: retries.length,
871
+ totalMerges: merges.length,
872
+ totalApprovals: approvals.length,
873
+ },
874
+ subtasks: subtasks.map((st) => ({
875
+ id: st.id,
876
+ sequence: st.sequence,
877
+ title: st.title,
878
+ status: st.status,
879
+ ownerAgent: st.owner_agent,
880
+ verifiabilityTier: st.verifiability_tier,
881
+ judgeMethod: st.judge_method,
882
+ retryBudget: st.retry_budget,
883
+ retriesUsed: st.retries_used,
884
+ requiresSniffCheck: st.requires_sniff_check === 1,
885
+ outputContract: st.output_contract,
886
+ dependsOn: JSON.parse(st.depends_on || "[]"),
887
+ })),
888
+ reviews: reviews.map((r) => ({
889
+ id: r.id,
890
+ subtaskId: r.subtask_id,
891
+ verdict: r.verdict,
892
+ action: r.action,
893
+ score: r.score,
894
+ reasoning: r.reasoning,
895
+ createdAt: r.created_at,
896
+ })),
897
+ sniffChecks: sniffChecks.map((s) => ({
898
+ id: s.id,
899
+ subtaskId: s.subtask_id,
900
+ mergeId: s.merge_id,
901
+ verdict: s.verdict,
902
+ issueTags: JSON.parse(s.issue_tags || "[]"),
903
+ forceRetry: s.force_retry === 1,
904
+ notes: s.notes,
905
+ })),
906
+ merges: merges.map((m) => ({
907
+ id: m.id,
908
+ subtaskIds: JSON.parse(m.subtask_ids || "[]"),
909
+ status: m.status,
910
+ contentPreview: m.merged_output?.slice(0, 200) ?? null,
911
+ })),
912
+ retries,
913
+ approvals,
914
+ ...(args.includeEvidence ? { evidence } : {}),
915
+ };
916
+ },
917
+ },
918
+ // ═══════════════════════════════════════════════════════════════════════
919
+ // 7. harness.list_runs (discovery)
920
+ // ═══════════════════════════════════════════════════════════════════════
921
+ {
922
+ name: "harness_list_runs",
923
+ description: "List all mission runs with status summary. " +
924
+ "Supports filtering by status. For the Live Execution Board.",
925
+ annotations: { readOnlyHint: true },
926
+ inputSchema: {
927
+ type: "object",
928
+ properties: {
929
+ status: {
930
+ type: "string",
931
+ enum: ["planning", "executing", "reviewing", "merging", "sniff_check", "completed", "failed", "stopped"],
932
+ description: "Filter by run status (optional)",
933
+ },
934
+ limit: {
935
+ type: "number",
936
+ description: "Max results (default: 20, max: 100)",
937
+ },
938
+ },
939
+ },
940
+ handler: async (args) => {
941
+ ensureMissionTables();
942
+ const db = getDb();
943
+ const limit = Math.min(args.limit ?? 20, 100);
944
+ let runs;
945
+ if (args.status) {
946
+ runs = db.prepare("SELECT * FROM mission_runs WHERE status = ? ORDER BY created_at DESC LIMIT ?").all(args.status, limit);
947
+ }
948
+ else {
949
+ runs = db.prepare("SELECT * FROM mission_runs ORDER BY created_at DESC LIMIT ?").all(limit);
950
+ }
951
+ // Enrich with subtask counts
952
+ return {
953
+ runs: runs.map((r) => {
954
+ const counts = db.prepare(`SELECT status, COUNT(*) as c FROM mission_subtasks WHERE run_id = ? GROUP BY status`).all(r.id);
955
+ const statusMap = {};
956
+ for (const c of counts)
957
+ statusMap[c.status] = c.c;
958
+ return {
959
+ id: r.id,
960
+ title: r.title,
961
+ status: r.status,
962
+ createdAt: r.created_at,
963
+ completedAt: r.completed_at,
964
+ subtaskCounts: statusMap,
965
+ };
966
+ }),
967
+ total: runs.length,
968
+ };
969
+ },
970
+ },
971
+ ];
972
+ //# sourceMappingURL=missionHarnessTools.js.map