nodebench-mcp 2.31.2 → 2.33.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -6
- package/dist/engine/server.js +14 -4
- package/dist/engine/server.js.map +1 -1
- package/dist/index.js +1581 -670
- package/dist/index.js.map +1 -1
- package/dist/security/SecurityError.d.ts +18 -0
- package/dist/security/SecurityError.js +22 -0
- package/dist/security/SecurityError.js.map +1 -0
- package/dist/security/__tests__/security.test.d.ts +8 -0
- package/dist/security/__tests__/security.test.js +295 -0
- package/dist/security/__tests__/security.test.js.map +1 -0
- package/dist/security/auditLog.d.ts +36 -0
- package/dist/security/auditLog.js +178 -0
- package/dist/security/auditLog.js.map +1 -0
- package/dist/security/commandSandbox.d.ts +33 -0
- package/dist/security/commandSandbox.js +159 -0
- package/dist/security/commandSandbox.js.map +1 -0
- package/dist/security/config.d.ts +23 -0
- package/dist/security/config.js +43 -0
- package/dist/security/config.js.map +1 -0
- package/dist/security/credentialRedactor.d.ts +22 -0
- package/dist/security/credentialRedactor.js +118 -0
- package/dist/security/credentialRedactor.js.map +1 -0
- package/dist/security/index.d.ts +20 -0
- package/dist/security/index.js +21 -0
- package/dist/security/index.js.map +1 -0
- package/dist/security/pathSandbox.d.ts +23 -0
- package/dist/security/pathSandbox.js +160 -0
- package/dist/security/pathSandbox.js.map +1 -0
- package/dist/security/urlValidator.d.ts +23 -0
- package/dist/security/urlValidator.js +125 -0
- package/dist/security/urlValidator.js.map +1 -0
- package/dist/tools/agentBootstrapTools.js +22 -29
- package/dist/tools/agentBootstrapTools.js.map +1 -1
- package/dist/tools/contextSandboxTools.js +7 -9
- package/dist/tools/contextSandboxTools.js.map +1 -1
- package/dist/tools/deepSimTools.d.ts +2 -0
- package/dist/tools/deepSimTools.js +404 -0
- package/dist/tools/deepSimTools.js.map +1 -0
- package/dist/tools/dimensionTools.d.ts +2 -0
- package/dist/tools/dimensionTools.js +246 -0
- package/dist/tools/dimensionTools.js.map +1 -0
- package/dist/tools/executionTraceTools.d.ts +2 -0
- package/dist/tools/executionTraceTools.js +446 -0
- package/dist/tools/executionTraceTools.js.map +1 -0
- package/dist/tools/founderTools.d.ts +13 -0
- package/dist/tools/founderTools.js +595 -0
- package/dist/tools/founderTools.js.map +1 -0
- package/dist/tools/founderTrackingTools.d.ts +9 -0
- package/dist/tools/founderTrackingTools.js +644 -0
- package/dist/tools/founderTrackingTools.js.map +1 -0
- package/dist/tools/gitWorkflowTools.js +14 -10
- package/dist/tools/gitWorkflowTools.js.map +1 -1
- package/dist/tools/githubTools.js +19 -2
- package/dist/tools/githubTools.js.map +1 -1
- package/dist/tools/index.d.ts +87 -0
- package/dist/tools/index.js +102 -0
- package/dist/tools/index.js.map +1 -0
- package/dist/tools/localFileTools.js +24 -12
- package/dist/tools/localFileTools.js.map +1 -1
- package/dist/tools/memoryDecay.d.ts +70 -0
- package/dist/tools/memoryDecay.js +247 -0
- package/dist/tools/memoryDecay.js.map +1 -0
- package/dist/tools/missionHarnessTools.d.ts +32 -0
- package/dist/tools/missionHarnessTools.js +972 -0
- package/dist/tools/missionHarnessTools.js.map +1 -0
- package/dist/tools/observabilityTools.d.ts +15 -0
- package/dist/tools/observabilityTools.js +787 -0
- package/dist/tools/observabilityTools.js.map +1 -0
- package/dist/tools/openclawTools.js +151 -36
- package/dist/tools/openclawTools.js.map +1 -1
- package/dist/tools/progressiveDiscoveryTools.js +5 -4
- package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
- package/dist/tools/qualityGateTools.js +118 -2
- package/dist/tools/qualityGateTools.js.map +1 -1
- package/dist/tools/rssTools.js +3 -0
- package/dist/tools/rssTools.js.map +1 -1
- package/dist/tools/scraplingTools.js +15 -0
- package/dist/tools/scraplingTools.js.map +1 -1
- package/dist/tools/seoTools.js +66 -1
- package/dist/tools/seoTools.js.map +1 -1
- package/dist/tools/sessionMemoryTools.js +50 -11
- package/dist/tools/sessionMemoryTools.js.map +1 -1
- package/dist/tools/temporalIntelligenceTools.d.ts +12 -0
- package/dist/tools/temporalIntelligenceTools.js +1068 -0
- package/dist/tools/temporalIntelligenceTools.js.map +1 -0
- package/dist/tools/toolRegistry.d.ts +19 -0
- package/dist/tools/toolRegistry.js +956 -31
- package/dist/tools/toolRegistry.js.map +1 -1
- package/dist/tools/webTools.js +14 -1
- package/dist/tools/webTools.js.map +1 -1
- package/dist/tools/webmcpTools.js +13 -2
- package/dist/tools/webmcpTools.js.map +1 -1
- package/dist/toolsetRegistry.js +14 -0
- package/dist/toolsetRegistry.js.map +1 -1
- package/dist/types.d.ts +10 -0
- package/package.json +124 -124
|
@@ -0,0 +1,972 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Mission Harness Tools — Hierarchical mission execution for verifiable work
|
|
3
|
+
*
|
|
4
|
+
* NodeBench is not a single-agent assistant. It is a hierarchical mission
|
|
5
|
+
* execution harness for verifiable work.
|
|
6
|
+
*
|
|
7
|
+
* Architecture: Planner → Worker → Judge → Human Sniff-Check → Merge
|
|
8
|
+
*
|
|
9
|
+
* 5 first-class tools:
|
|
10
|
+
* plan.decompose_mission — Break mission into subtasks with verifiability routing
|
|
11
|
+
* judge.verify_subtask — Machine/expert verification with retry budget
|
|
12
|
+
* judge.request_retry — Retry, re-plan, escalate, or stop
|
|
13
|
+
* merge.compose_output — Judge-gated merge of subtask artifacts
|
|
14
|
+
* sniff.record_human_review — Human pass/concern/block with issue tags
|
|
15
|
+
*
|
|
16
|
+
* Persistence: SQLite-backed runs, taskPlans, subtaskAssignments, runSteps,
|
|
17
|
+
* artifacts, evidence, judgeReviews, retryAttempts, mergeBoundaries,
|
|
18
|
+
* sniffChecks, approvals.
|
|
19
|
+
*
|
|
20
|
+
* Verifiability tiers:
|
|
21
|
+
* Tier 1 — Machine-checkable (deterministic, automated judge)
|
|
22
|
+
* Tier 2 — Expert-checkable (requires human sniff-check)
|
|
23
|
+
*
|
|
24
|
+
* Anti-flat-coordination rules enforced:
|
|
25
|
+
* - One owner per subtask
|
|
26
|
+
* - Bounded input package
|
|
27
|
+
* - Explicit output contract
|
|
28
|
+
* - Judge-gated merge only
|
|
29
|
+
* - No shared free-for-all editing
|
|
30
|
+
*/
|
|
31
|
+
import { getDb, genId } from "../db.js";
|
|
32
|
+
// ── Constants ─────────────────────────────────────────────────────────────
|
|
33
|
+
/** Upper bound on the number of subtasks accepted for a single mission decomposition. */
const MAX_SUBTASKS = 50;

/** Ceiling applied to any per-subtask retry budget requested by the planner. */
const MAX_RETRY_BUDGET = 5;

/** Evidence items beyond this count are not stored for a single judge review. */
const MAX_EVIDENCE_PER_REVIEW = 20;

/**
 * Cap on the number of artifacts handled in one operation. Despite the name,
 * the judge review handler also uses it to bound the artifacts it stores.
 */
const MAX_ARTIFACTS_PER_MERGE = 100;
|
|
37
|
+
// ── DB Setup ──────────────────────────────────────────────────────────────
|
|
38
|
+
/**
 * Create the mission-harness tables and indexes if they do not exist yet.
 *
 * The whole schema is sent to the database as one `exec` batch. Every statement
 * uses `IF NOT EXISTS`, so calling this repeatedly is harmless, but it also means
 * an already-existing table is never altered to match a changed definition.
 *
 * NOTE(review): the `REFERENCES ... ON DELETE` clauses only take effect when the
 * connection returned by `getDb()` has foreign-key enforcement enabled — confirm
 * in `../db.js`.
 */
function ensureMissionTables() {
    const schemaSql = `
    -- ═══════════════════════════════════════════
    -- MISSION HARNESS — Hierarchical execution
    -- Planner → Worker → Judge → Sniff → Merge
    -- ═══════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS mission_runs (
      id TEXT PRIMARY KEY,
      title TEXT NOT NULL,
      description TEXT,
      status TEXT NOT NULL DEFAULT 'planning',
      owner_agent TEXT,
      created_at TEXT NOT NULL DEFAULT (datetime('now')),
      updated_at TEXT NOT NULL DEFAULT (datetime('now')),
      completed_at TEXT
    );

    CREATE TABLE IF NOT EXISTS mission_task_plans (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      version INTEGER NOT NULL DEFAULT 1,
      decomposition TEXT NOT NULL,
      rationale TEXT,
      created_at TEXT NOT NULL DEFAULT (datetime('now'))
    );

    CREATE INDEX IF NOT EXISTS idx_mission_task_plans_run ON mission_task_plans(run_id);

    CREATE TABLE IF NOT EXISTS mission_subtasks (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      plan_id TEXT NOT NULL REFERENCES mission_task_plans(id) ON DELETE CASCADE,
      sequence INTEGER NOT NULL,
      title TEXT NOT NULL,
      description TEXT,
      owner_agent TEXT,
      status TEXT NOT NULL DEFAULT 'pending',
      verifiability_tier TEXT NOT NULL DEFAULT 'tier_1_machine',
      judge_method TEXT NOT NULL DEFAULT 'deterministic',
      retry_budget INTEGER NOT NULL DEFAULT 3,
      retries_used INTEGER NOT NULL DEFAULT 0,
      requires_sniff_check INTEGER NOT NULL DEFAULT 0,
      input_package TEXT,
      output_contract TEXT,
      depends_on TEXT DEFAULT '[]',
      created_at TEXT NOT NULL DEFAULT (datetime('now')),
      updated_at TEXT NOT NULL DEFAULT (datetime('now')),
      completed_at TEXT
    );

    CREATE INDEX IF NOT EXISTS idx_mission_subtasks_run ON mission_subtasks(run_id);
    CREATE INDEX IF NOT EXISTS idx_mission_subtasks_plan ON mission_subtasks(plan_id);
    CREATE INDEX IF NOT EXISTS idx_mission_subtasks_status ON mission_subtasks(status);
    CREATE INDEX IF NOT EXISTS idx_mission_subtasks_owner ON mission_subtasks(owner_agent);

    CREATE TABLE IF NOT EXISTS mission_run_steps (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      subtask_id TEXT NOT NULL REFERENCES mission_subtasks(id) ON DELETE CASCADE,
      step_type TEXT NOT NULL,
      agent_id TEXT,
      input_summary TEXT,
      output_summary TEXT,
      status TEXT NOT NULL DEFAULT 'pending',
      duration_ms INTEGER,
      created_at TEXT NOT NULL DEFAULT (datetime('now'))
    );

    CREATE INDEX IF NOT EXISTS idx_mission_run_steps_subtask ON mission_run_steps(subtask_id);

    CREATE TABLE IF NOT EXISTS mission_artifacts (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      subtask_id TEXT NOT NULL REFERENCES mission_subtasks(id) ON DELETE CASCADE,
      artifact_type TEXT NOT NULL,
      title TEXT NOT NULL,
      content TEXT NOT NULL,
      content_hash TEXT,
      metadata TEXT,
      created_at TEXT NOT NULL DEFAULT (datetime('now'))
    );

    CREATE INDEX IF NOT EXISTS idx_mission_artifacts_subtask ON mission_artifacts(subtask_id);

    CREATE TABLE IF NOT EXISTS mission_evidence (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      subtask_id TEXT REFERENCES mission_subtasks(id) ON DELETE SET NULL,
      review_id TEXT,
      evidence_type TEXT NOT NULL,
      content TEXT NOT NULL,
      source_ref TEXT,
      created_at TEXT NOT NULL DEFAULT (datetime('now'))
    );

    CREATE INDEX IF NOT EXISTS idx_mission_evidence_subtask ON mission_evidence(subtask_id);
    CREATE INDEX IF NOT EXISTS idx_mission_evidence_review ON mission_evidence(review_id);

    CREATE TABLE IF NOT EXISTS mission_judge_reviews (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      subtask_id TEXT NOT NULL REFERENCES mission_subtasks(id) ON DELETE CASCADE,
      judge_agent TEXT,
      judge_method TEXT NOT NULL,
      verdict TEXT NOT NULL,
      reasoning TEXT,
      evidence_ids TEXT DEFAULT '[]',
      score REAL,
      action TEXT NOT NULL DEFAULT 'pass',
      created_at TEXT NOT NULL DEFAULT (datetime('now'))
    );

    CREATE INDEX IF NOT EXISTS idx_mission_judge_reviews_subtask ON mission_judge_reviews(subtask_id);

    CREATE TABLE IF NOT EXISTS mission_retry_attempts (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      subtask_id TEXT NOT NULL REFERENCES mission_subtasks(id) ON DELETE CASCADE,
      review_id TEXT NOT NULL REFERENCES mission_judge_reviews(id) ON DELETE CASCADE,
      action TEXT NOT NULL,
      reason TEXT,
      new_instructions TEXT,
      created_at TEXT NOT NULL DEFAULT (datetime('now'))
    );

    CREATE INDEX IF NOT EXISTS idx_mission_retry_attempts_subtask ON mission_retry_attempts(subtask_id);

    CREATE TABLE IF NOT EXISTS mission_merge_boundaries (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      subtask_ids TEXT NOT NULL,
      artifact_ids TEXT NOT NULL,
      merged_output TEXT,
      merge_agent TEXT,
      status TEXT NOT NULL DEFAULT 'pending',
      judge_review_id TEXT,
      created_at TEXT NOT NULL DEFAULT (datetime('now')),
      completed_at TEXT
    );

    CREATE INDEX IF NOT EXISTS idx_mission_merge_boundaries_run ON mission_merge_boundaries(run_id);

    CREATE TABLE IF NOT EXISTS mission_sniff_checks (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      subtask_id TEXT REFERENCES mission_subtasks(id) ON DELETE SET NULL,
      merge_id TEXT REFERENCES mission_merge_boundaries(id) ON DELETE SET NULL,
      reviewer TEXT,
      verdict TEXT NOT NULL,
      issue_tags TEXT DEFAULT '[]',
      notes TEXT,
      force_retry INTEGER NOT NULL DEFAULT 0,
      created_at TEXT NOT NULL DEFAULT (datetime('now'))
    );

    CREATE INDEX IF NOT EXISTS idx_mission_sniff_checks_run ON mission_sniff_checks(run_id);
    CREATE INDEX IF NOT EXISTS idx_mission_sniff_checks_subtask ON mission_sniff_checks(subtask_id);

    CREATE TABLE IF NOT EXISTS mission_approvals (
      id TEXT PRIMARY KEY,
      run_id TEXT NOT NULL REFERENCES mission_runs(id) ON DELETE CASCADE,
      subtask_id TEXT,
      merge_id TEXT,
      approver TEXT NOT NULL,
      decision TEXT NOT NULL,
      reason TEXT,
      created_at TEXT NOT NULL DEFAULT (datetime('now'))
    );

    CREATE INDEX IF NOT EXISTS idx_mission_approvals_run ON mission_approvals(run_id);
  `;
    getDb().exec(schemaSql);
}
|
|
212
|
+
// ── Helpers ───────────────────────────────────────────────────────────────
|
|
213
|
+
/**
 * Compute a deterministic 32-bit FNV-1a fingerprint of `content`.
 *
 * The hash is folded over the string's UTF-16 code units (`charCodeAt`), so it
 * matches byte-oriented FNV-1a implementations for ASCII text only. It is a
 * cheap change-detection fingerprint, not a cryptographic digest.
 *
 * Non-string values are converted with `String()` before hashing. Previously
 * any value without a `length` property skipped the loop entirely and returned
 * the bare offset basis (`811c9dc5`), making all such values collide.
 *
 * @param {string} content - Text to fingerprint; other non-nullish values are stringified.
 * @returns {string} Eight lowercase hexadecimal characters.
 * @throws {TypeError} If `content` is `null` or `undefined`.
 */
function hashContent(content) {
    if (content === null || content === undefined) {
        throw new TypeError("hashContent: content must not be null or undefined");
    }
    const text = typeof content === "string" ? content : String(content);
    // FNV-1a 32-bit: start from the offset basis, XOR each unit in, multiply by the prime.
    let hash = 0x811c9dc5;
    for (let i = 0; i < text.length; i++) {
        hash ^= text.charCodeAt(i);
        // Math.imul keeps the multiplication in 32-bit integer arithmetic.
        hash = Math.imul(hash, 0x01000193);
    }
    // `>>> 0` reinterprets the signed 32-bit result as unsigned before formatting.
    return (hash >>> 0).toString(16).padStart(8, "0");
}
|
|
222
|
+
/**
 * Current UTC time formatted as `YYYY-MM-DD HH:MM:SS.mmm`.
 *
 * This is the ISO-8601 string from `Date#toISOString` with the `T` separator
 * replaced by a space and the trailing `Z` removed.
 *
 * @returns {string} Timestamp string used for the `*_at` columns.
 */
function now() {
    const [datePart, timePart] = new Date().toISOString().split("T");
    // toISOString always ends in "Z"; drop that single trailing character.
    return `${datePart} ${timePart.slice(0, -1)}`;
}
|
|
225
|
+
// ── Tool Definitions ──────────────────────────────────────────────────────
|
|
226
|
+
export const missionHarnessTools = [
|
|
227
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
228
|
+
// 1. plan.decompose_mission
|
|
229
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
230
|
+
{
|
|
231
|
+
name: "plan_decompose_mission",
|
|
232
|
+
description: "Decompose a mission into subtasks with verifiability routing. " +
|
|
233
|
+
"Creates a run, task plan, and subtask assignments. Each subtask gets " +
|
|
234
|
+
"a verifiabilityTier (tier_1_machine | tier_2_expert), judgeMethod, " +
|
|
235
|
+
"retryBudget, and requiresHumanSniffCheck flag. Enforces: one owner " +
|
|
236
|
+
"per subtask, bounded input package, explicit output contract.",
|
|
237
|
+
inputSchema: {
|
|
238
|
+
type: "object",
|
|
239
|
+
properties: {
|
|
240
|
+
title: {
|
|
241
|
+
type: "string",
|
|
242
|
+
description: "Mission title — what is the top-level goal?",
|
|
243
|
+
},
|
|
244
|
+
description: {
|
|
245
|
+
type: "string",
|
|
246
|
+
description: "Full mission description with context and constraints",
|
|
247
|
+
},
|
|
248
|
+
subtasks: {
|
|
249
|
+
type: "array",
|
|
250
|
+
description: "Ordered list of subtask decompositions",
|
|
251
|
+
items: {
|
|
252
|
+
type: "object",
|
|
253
|
+
properties: {
|
|
254
|
+
title: { type: "string", description: "Subtask title" },
|
|
255
|
+
description: { type: "string", description: "What this subtask must accomplish" },
|
|
256
|
+
ownerAgent: { type: "string", description: "Assigned agent (one owner, no shared editing)" },
|
|
257
|
+
verifiabilityTier: {
|
|
258
|
+
type: "string",
|
|
259
|
+
enum: ["tier_1_machine", "tier_2_expert"],
|
|
260
|
+
description: "Tier 1 = machine-checkable, Tier 2 = expert-checkable",
|
|
261
|
+
},
|
|
262
|
+
judgeMethod: {
|
|
263
|
+
type: "string",
|
|
264
|
+
enum: ["deterministic", "llm_judge", "human_review", "composite"],
|
|
265
|
+
description: "How to verify this subtask's output",
|
|
266
|
+
},
|
|
267
|
+
retryBudget: {
|
|
268
|
+
type: "number",
|
|
269
|
+
description: "Max retry attempts before escalation (default: 3, max: 5)",
|
|
270
|
+
},
|
|
271
|
+
requiresSniffCheck: {
|
|
272
|
+
type: "boolean",
|
|
273
|
+
description: "Whether human sniff-check is required before merge",
|
|
274
|
+
},
|
|
275
|
+
inputPackage: {
|
|
276
|
+
type: "string",
|
|
277
|
+
description: "Bounded input — what data/context this subtask receives",
|
|
278
|
+
},
|
|
279
|
+
outputContract: {
|
|
280
|
+
type: "string",
|
|
281
|
+
description: "Explicit output contract — what this subtask must produce",
|
|
282
|
+
},
|
|
283
|
+
dependsOn: {
|
|
284
|
+
type: "array",
|
|
285
|
+
items: { type: "number" },
|
|
286
|
+
description: "Indices (0-based) of subtasks this depends on",
|
|
287
|
+
},
|
|
288
|
+
},
|
|
289
|
+
required: ["title", "verifiabilityTier", "judgeMethod", "outputContract"],
|
|
290
|
+
},
|
|
291
|
+
},
|
|
292
|
+
rationale: {
|
|
293
|
+
type: "string",
|
|
294
|
+
description: "Why this decomposition was chosen (for traceability)",
|
|
295
|
+
},
|
|
296
|
+
},
|
|
297
|
+
required: ["title", "subtasks"],
|
|
298
|
+
},
|
|
299
|
+
handler: async (args) => {
|
|
300
|
+
ensureMissionTables();
|
|
301
|
+
const db = getDb();
|
|
302
|
+
// Validate bounds
|
|
303
|
+
if (args.subtasks.length === 0) {
|
|
304
|
+
return { error: "At least one subtask is required" };
|
|
305
|
+
}
|
|
306
|
+
if (args.subtasks.length > MAX_SUBTASKS) {
|
|
307
|
+
return { error: `Max ${MAX_SUBTASKS} subtasks per mission` };
|
|
308
|
+
}
|
|
309
|
+
// Validate dependency indices
|
|
310
|
+
for (const [i, st] of args.subtasks.entries()) {
|
|
311
|
+
for (const dep of st.dependsOn ?? []) {
|
|
312
|
+
if (dep < 0 || dep >= args.subtasks.length || dep === i) {
|
|
313
|
+
return { error: `Subtask ${i} has invalid dependency index: ${dep}` };
|
|
314
|
+
}
|
|
315
|
+
if (dep >= i) {
|
|
316
|
+
return { error: `Subtask ${i} depends on later subtask ${dep} — forward deps not allowed` };
|
|
317
|
+
}
|
|
318
|
+
}
|
|
319
|
+
}
|
|
320
|
+
const runId = genId("mrun");
|
|
321
|
+
const planId = genId("mplan");
|
|
322
|
+
const timestamp = now();
|
|
323
|
+
// Create run
|
|
324
|
+
db.prepare(`INSERT INTO mission_runs (id, title, description, status, created_at, updated_at)
|
|
325
|
+
VALUES (?, ?, ?, 'planning', ?, ?)`).run(runId, args.title, args.description ?? null, timestamp, timestamp);
|
|
326
|
+
// Create task plan
|
|
327
|
+
db.prepare(`INSERT INTO mission_task_plans (id, run_id, version, decomposition, rationale, created_at)
|
|
328
|
+
VALUES (?, ?, 1, ?, ?, ?)`).run(planId, runId, JSON.stringify(args.subtasks.map((s) => s.title)), args.rationale ?? null, timestamp);
|
|
329
|
+
// Create subtasks
|
|
330
|
+
const subtaskIds = [];
|
|
331
|
+
const insertSubtask = db.prepare(`INSERT INTO mission_subtasks
|
|
332
|
+
(id, run_id, plan_id, sequence, title, description, owner_agent, status,
|
|
333
|
+
verifiability_tier, judge_method, retry_budget, requires_sniff_check,
|
|
334
|
+
input_package, output_contract, depends_on, created_at, updated_at)
|
|
335
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, 'pending', ?, ?, ?, ?, ?, ?, ?, ?, ?)`);
|
|
336
|
+
const txn = db.transaction(() => {
|
|
337
|
+
for (const [i, st] of args.subtasks.entries()) {
|
|
338
|
+
const subtaskId = genId("msub");
|
|
339
|
+
subtaskIds.push(subtaskId);
|
|
340
|
+
const retryBudget = Math.min(st.retryBudget ?? 3, MAX_RETRY_BUDGET);
|
|
341
|
+
const depIds = (st.dependsOn ?? []).map((idx) => subtaskIds[idx]).filter(Boolean);
|
|
342
|
+
insertSubtask.run(subtaskId, runId, planId, i, st.title, st.description ?? null, st.ownerAgent ?? null, st.verifiabilityTier, st.judgeMethod, retryBudget, st.requiresSniffCheck ? 1 : 0, st.inputPackage ?? null, st.outputContract, JSON.stringify(depIds), timestamp, timestamp);
|
|
343
|
+
}
|
|
344
|
+
});
|
|
345
|
+
txn();
|
|
346
|
+
// Transition to executing
|
|
347
|
+
db.prepare(`UPDATE mission_runs SET status = 'executing', updated_at = ? WHERE id = ?`).run(now(), runId);
|
|
348
|
+
return {
|
|
349
|
+
runId,
|
|
350
|
+
planId,
|
|
351
|
+
subtaskCount: subtaskIds.length,
|
|
352
|
+
subtasks: subtaskIds.map((id, i) => ({
|
|
353
|
+
id,
|
|
354
|
+
sequence: i,
|
|
355
|
+
title: args.subtasks[i].title,
|
|
356
|
+
verifiabilityTier: args.subtasks[i].verifiabilityTier,
|
|
357
|
+
judgeMethod: args.subtasks[i].judgeMethod,
|
|
358
|
+
retryBudget: Math.min(args.subtasks[i].retryBudget ?? 3, MAX_RETRY_BUDGET),
|
|
359
|
+
requiresSniffCheck: args.subtasks[i].requiresSniffCheck ?? false,
|
|
360
|
+
dependsOn: (args.subtasks[i].dependsOn ?? []).map((idx) => subtaskIds[idx]),
|
|
361
|
+
})),
|
|
362
|
+
status: "executing",
|
|
363
|
+
traceability: {
|
|
364
|
+
receipt: `Mission ${runId} decomposed into ${subtaskIds.length} subtasks`,
|
|
365
|
+
planVersion: 1,
|
|
366
|
+
rationale: args.rationale ?? "not provided",
|
|
367
|
+
},
|
|
368
|
+
};
|
|
369
|
+
},
|
|
370
|
+
},
|
|
371
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
372
|
+
// 2. judge.verify_subtask
|
|
373
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
374
|
+
{
|
|
375
|
+
name: "judge_verify_subtask",
|
|
376
|
+
description: "Judge verifies a subtask's output against its output contract. " +
|
|
377
|
+
"Records verdict (pass/fail), reasoning, evidence references, and " +
|
|
378
|
+
"recommended action (pass/retry/replan/escalate/stop). " +
|
|
379
|
+
"Creates artifacts and evidence records for full traceability.",
|
|
380
|
+
inputSchema: {
|
|
381
|
+
type: "object",
|
|
382
|
+
properties: {
|
|
383
|
+
runId: { type: "string", description: "Mission run ID" },
|
|
384
|
+
subtaskId: { type: "string", description: "Subtask ID to verify" },
|
|
385
|
+
judgeAgent: { type: "string", description: "Judge agent identifier" },
|
|
386
|
+
verdict: {
|
|
387
|
+
type: "string",
|
|
388
|
+
enum: ["pass", "fail"],
|
|
389
|
+
description: "Did the subtask meet its output contract?",
|
|
390
|
+
},
|
|
391
|
+
reasoning: {
|
|
392
|
+
type: "string",
|
|
393
|
+
description: "Judge's reasoning for the verdict (full traceability, no hidden CoT)",
|
|
394
|
+
},
|
|
395
|
+
score: {
|
|
396
|
+
type: "number",
|
|
397
|
+
description: "Optional numeric score (0-1). No hardcoded floors — 0 means 0.",
|
|
398
|
+
},
|
|
399
|
+
evidence: {
|
|
400
|
+
type: "array",
|
|
401
|
+
description: "Evidence supporting the verdict",
|
|
402
|
+
items: {
|
|
403
|
+
type: "object",
|
|
404
|
+
properties: {
|
|
405
|
+
type: {
|
|
406
|
+
type: "string",
|
|
407
|
+
enum: ["test_result", "diff", "screenshot", "metric", "document", "citation", "log"],
|
|
408
|
+
description: "Evidence type",
|
|
409
|
+
},
|
|
410
|
+
content: { type: "string", description: "Evidence content or reference" },
|
|
411
|
+
sourceRef: { type: "string", description: "Source reference (URL, file path, etc.)" },
|
|
412
|
+
},
|
|
413
|
+
required: ["type", "content"],
|
|
414
|
+
},
|
|
415
|
+
},
|
|
416
|
+
artifacts: {
|
|
417
|
+
type: "array",
|
|
418
|
+
description: "Output artifacts from the subtask",
|
|
419
|
+
items: {
|
|
420
|
+
type: "object",
|
|
421
|
+
properties: {
|
|
422
|
+
type: {
|
|
423
|
+
type: "string",
|
|
424
|
+
enum: ["code", "document", "data", "config", "test", "report", "other"],
|
|
425
|
+
description: "Artifact type",
|
|
426
|
+
},
|
|
427
|
+
title: { type: "string", description: "Artifact title" },
|
|
428
|
+
content: { type: "string", description: "Artifact content" },
|
|
429
|
+
},
|
|
430
|
+
required: ["type", "title", "content"],
|
|
431
|
+
},
|
|
432
|
+
},
|
|
433
|
+
action: {
|
|
434
|
+
type: "string",
|
|
435
|
+
enum: ["pass", "retry", "replan", "escalate", "stop"],
|
|
436
|
+
description: "Recommended next action based on verdict",
|
|
437
|
+
},
|
|
438
|
+
},
|
|
439
|
+
required: ["runId", "subtaskId", "verdict", "reasoning", "action"],
|
|
440
|
+
},
|
|
441
|
+
handler: async (args) => {
|
|
442
|
+
ensureMissionTables();
|
|
443
|
+
const db = getDb();
|
|
444
|
+
// Validate subtask exists
|
|
445
|
+
const subtask = db.prepare("SELECT * FROM mission_subtasks WHERE id = ? AND run_id = ?").get(args.subtaskId, args.runId);
|
|
446
|
+
if (!subtask) {
|
|
447
|
+
return { error: `Subtask ${args.subtaskId} not found in run ${args.runId}` };
|
|
448
|
+
}
|
|
449
|
+
// Validate score bounds (HONEST_SCORES — no hardcoded floors)
|
|
450
|
+
if (args.score !== undefined && (args.score < 0 || args.score > 1)) {
|
|
451
|
+
return { error: "Score must be between 0 and 1. No hardcoded floors." };
|
|
452
|
+
}
|
|
453
|
+
const timestamp = now();
|
|
454
|
+
const reviewId = genId("mjrev");
|
|
455
|
+
// Store evidence
|
|
456
|
+
const evidenceIds = [];
|
|
457
|
+
if (args.evidence) {
|
|
458
|
+
const bounded = args.evidence.slice(0, MAX_EVIDENCE_PER_REVIEW);
|
|
459
|
+
const insertEvidence = db.prepare(`INSERT INTO mission_evidence (id, run_id, subtask_id, review_id, evidence_type, content, source_ref, created_at)
|
|
460
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)`);
|
|
461
|
+
for (const ev of bounded) {
|
|
462
|
+
const evId = genId("mev");
|
|
463
|
+
evidenceIds.push(evId);
|
|
464
|
+
insertEvidence.run(evId, args.runId, args.subtaskId, reviewId, ev.type, ev.content, ev.sourceRef ?? null, timestamp);
|
|
465
|
+
}
|
|
466
|
+
}
|
|
467
|
+
// Store artifacts
|
|
468
|
+
const artifactIds = [];
|
|
469
|
+
if (args.artifacts) {
|
|
470
|
+
const bounded = args.artifacts.slice(0, MAX_ARTIFACTS_PER_MERGE);
|
|
471
|
+
const insertArtifact = db.prepare(`INSERT INTO mission_artifacts (id, run_id, subtask_id, artifact_type, title, content, content_hash, created_at)
|
|
472
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)`);
|
|
473
|
+
for (const art of bounded) {
|
|
474
|
+
const artId = genId("mart");
|
|
475
|
+
artifactIds.push(artId);
|
|
476
|
+
insertArtifact.run(artId, args.runId, args.subtaskId, art.type, art.title, art.content, hashContent(art.content), timestamp);
|
|
477
|
+
}
|
|
478
|
+
}
|
|
479
|
+
// Store judge review
|
|
480
|
+
db.prepare(`INSERT INTO mission_judge_reviews
|
|
481
|
+
(id, run_id, subtask_id, judge_agent, judge_method, verdict, reasoning, evidence_ids, score, action, created_at)
|
|
482
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`).run(reviewId, args.runId, args.subtaskId, args.judgeAgent ?? "unknown", subtask.judge_method, args.verdict, args.reasoning, JSON.stringify(evidenceIds), args.score ?? null, args.action, timestamp);
|
|
483
|
+
// Update subtask status
|
|
484
|
+
const newStatus = args.action === "pass" ? "passed" :
|
|
485
|
+
args.action === "retry" ? "retrying" :
|
|
486
|
+
args.action === "escalate" ? "escalated" :
|
|
487
|
+
args.action === "stop" ? "failed" :
|
|
488
|
+
"review"; // replan
|
|
489
|
+
db.prepare(`UPDATE mission_subtasks SET status = ?, updated_at = ?${newStatus === "passed" ? ", completed_at = ?" : ""} WHERE id = ?`).run(...(newStatus === "passed" ? [newStatus, timestamp, timestamp, args.subtaskId] : [newStatus, timestamp, args.subtaskId]));
|
|
490
|
+
// Log the step
|
|
491
|
+
db.prepare(`INSERT INTO mission_run_steps (id, run_id, subtask_id, step_type, agent_id, input_summary, output_summary, status, created_at)
|
|
492
|
+
VALUES (?, ?, ?, 'judge_review', ?, ?, ?, ?, ?)`).run(genId("mstep"), args.runId, args.subtaskId, args.judgeAgent ?? "unknown", `Verifying subtask: ${subtask.title}`, `Verdict: ${args.verdict}, Action: ${args.action}`, args.verdict === "pass" ? "completed" : "pending", timestamp);
|
|
493
|
+
// Check if sniff-check required
|
|
494
|
+
const needsSniff = subtask.requires_sniff_check === 1 && args.action === "pass";
|
|
495
|
+
if (needsSniff) {
|
|
496
|
+
db.prepare(`UPDATE mission_subtasks SET status = 'review', updated_at = ? WHERE id = ?`).run(now(), args.subtaskId);
|
|
497
|
+
}
|
|
498
|
+
return {
|
|
499
|
+
reviewId,
|
|
500
|
+
verdict: args.verdict,
|
|
501
|
+
action: args.action,
|
|
502
|
+
score: args.score ?? null,
|
|
503
|
+
evidenceCount: evidenceIds.length,
|
|
504
|
+
artifactCount: artifactIds.length,
|
|
505
|
+
subtaskStatus: needsSniff ? "awaiting_sniff_check" : newStatus,
|
|
506
|
+
needsSniffCheck: needsSniff,
|
|
507
|
+
traceability: {
|
|
508
|
+
receipt: `Judge review ${reviewId} for subtask ${args.subtaskId}: ${args.verdict} → ${args.action}`,
|
|
509
|
+
evidenceRefs: evidenceIds,
|
|
510
|
+
artifactRefs: artifactIds,
|
|
511
|
+
reasoning: args.reasoning,
|
|
512
|
+
},
|
|
513
|
+
};
|
|
514
|
+
},
|
|
515
|
+
},
|
|
516
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
517
|
+
// 3. judge.request_retry
|
|
518
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
519
|
+
{
|
|
520
|
+
name: "judge_request_retry",
|
|
521
|
+
description: "Request a retry, re-plan, escalation, or stop for a failed subtask. " +
|
|
522
|
+
"Enforces retry budget — if exhausted, auto-escalates. " +
|
|
523
|
+
"Actions: pass | retry | replan | escalate | stop. " +
|
|
524
|
+
"If action is 'stop', marks subtask as unverifiable.",
|
|
525
|
+
inputSchema: {
|
|
526
|
+
type: "object",
|
|
527
|
+
properties: {
|
|
528
|
+
runId: { type: "string", description: "Mission run ID" },
|
|
529
|
+
subtaskId: { type: "string", description: "Subtask ID to retry" },
|
|
530
|
+
reviewId: { type: "string", description: "Judge review ID that triggered this" },
|
|
531
|
+
action: {
|
|
532
|
+
type: "string",
|
|
533
|
+
enum: ["pass", "retry", "replan", "escalate", "stop"],
|
|
534
|
+
description: "What to do next",
|
|
535
|
+
},
|
|
536
|
+
reason: { type: "string", description: "Why this action was chosen" },
|
|
537
|
+
newInstructions: {
|
|
538
|
+
type: "string",
|
|
539
|
+
description: "Updated instructions for retry/replan (what to do differently)",
|
|
540
|
+
},
|
|
541
|
+
},
|
|
542
|
+
required: ["runId", "subtaskId", "reviewId", "action", "reason"],
|
|
543
|
+
},
|
|
544
|
+
handler: async (args) => {
|
|
545
|
+
ensureMissionTables();
|
|
546
|
+
const db = getDb();
|
|
547
|
+
const subtask = db.prepare("SELECT * FROM mission_subtasks WHERE id = ? AND run_id = ?").get(args.subtaskId, args.runId);
|
|
548
|
+
if (!subtask) {
|
|
549
|
+
return { error: `Subtask ${args.subtaskId} not found in run ${args.runId}` };
|
|
550
|
+
}
|
|
551
|
+
const review = db.prepare("SELECT * FROM mission_judge_reviews WHERE id = ?").get(args.reviewId);
|
|
552
|
+
if (!review) {
|
|
553
|
+
return { error: `Review ${args.reviewId} not found` };
|
|
554
|
+
}
|
|
555
|
+
let effectiveAction = args.action;
|
|
556
|
+
let budgetExhausted = false;
|
|
557
|
+
// Enforce retry budget
|
|
558
|
+
if (args.action === "retry") {
|
|
559
|
+
if (subtask.retries_used >= subtask.retry_budget) {
|
|
560
|
+
effectiveAction = "escalate";
|
|
561
|
+
budgetExhausted = true;
|
|
562
|
+
}
|
|
563
|
+
else {
|
|
564
|
+
db.prepare(`UPDATE mission_subtasks SET retries_used = retries_used + 1, status = 'retrying', updated_at = ? WHERE id = ?`).run(now(), args.subtaskId);
|
|
565
|
+
}
|
|
566
|
+
}
|
|
567
|
+
// Record the attempt
|
|
568
|
+
const attemptId = genId("mretry");
|
|
569
|
+
db.prepare(`INSERT INTO mission_retry_attempts (id, run_id, subtask_id, review_id, action, reason, new_instructions, created_at)
|
|
570
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)`).run(attemptId, args.runId, args.subtaskId, args.reviewId, effectiveAction, args.reason, args.newInstructions ?? null, now());
|
|
571
|
+
// Update subtask status based on action
|
|
572
|
+
const statusMap = {
|
|
573
|
+
pass: "passed",
|
|
574
|
+
retry: "retrying",
|
|
575
|
+
replan: "pending",
|
|
576
|
+
escalate: "escalated",
|
|
577
|
+
stop: "failed",
|
|
578
|
+
};
|
|
579
|
+
const newStatus = statusMap[effectiveAction] ?? "pending";
|
|
580
|
+
db.prepare(`UPDATE mission_subtasks SET status = ?, updated_at = ? WHERE id = ?`).run(newStatus, now(), args.subtaskId);
|
|
581
|
+
// If stop, check if whole run should stop
|
|
582
|
+
if (effectiveAction === "stop") {
|
|
583
|
+
const remaining = db.prepare(`SELECT COUNT(*) as c FROM mission_subtasks WHERE run_id = ? AND status NOT IN ('passed', 'failed')`).get(args.runId);
|
|
584
|
+
if (remaining.c === 0) {
|
|
585
|
+
db.prepare(`UPDATE mission_runs SET status = 'failed', updated_at = ?, completed_at = ? WHERE id = ?`).run(now(), now(), args.runId);
|
|
586
|
+
}
|
|
587
|
+
}
|
|
588
|
+
return {
|
|
589
|
+
attemptId,
|
|
590
|
+
requestedAction: args.action,
|
|
591
|
+
effectiveAction,
|
|
592
|
+
budgetExhausted,
|
|
593
|
+
retriesUsed: subtask.retries_used + (effectiveAction === "retry" ? 1 : 0),
|
|
594
|
+
retryBudget: subtask.retry_budget,
|
|
595
|
+
subtaskStatus: newStatus,
|
|
596
|
+
traceability: {
|
|
597
|
+
receipt: `Retry attempt ${attemptId}: ${args.action}${budgetExhausted ? " → auto-escalated (budget exhausted)" : ""}`,
|
|
598
|
+
decision: effectiveAction,
|
|
599
|
+
reason: args.reason,
|
|
600
|
+
newInstructions: args.newInstructions ?? null,
|
|
601
|
+
},
|
|
602
|
+
};
|
|
603
|
+
},
|
|
604
|
+
},
|
|
605
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
606
|
+
// 4. merge.compose_output
|
|
607
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
608
|
+
{
|
|
609
|
+
name: "merge_compose_output",
|
|
610
|
+
description: "Judge-gated merge of subtask artifacts into a composed output. " +
|
|
611
|
+
"Only merges subtasks that have passed verification. " +
|
|
612
|
+
"Enforces: no shared free-for-all editing — merge boundary is explicit. " +
|
|
613
|
+
"Optionally requires judge review of the merged output.",
|
|
614
|
+
inputSchema: {
|
|
615
|
+
type: "object",
|
|
616
|
+
properties: {
|
|
617
|
+
runId: { type: "string", description: "Mission run ID" },
|
|
618
|
+
subtaskIds: {
|
|
619
|
+
type: "array",
|
|
620
|
+
items: { type: "string" },
|
|
621
|
+
description: "Subtask IDs to merge (must all be passed)",
|
|
622
|
+
},
|
|
623
|
+
mergeAgent: { type: "string", description: "Agent performing the merge" },
|
|
624
|
+
mergedOutput: {
|
|
625
|
+
type: "string",
|
|
626
|
+
description: "The composed output from merging subtask artifacts",
|
|
627
|
+
},
|
|
628
|
+
requiresJudgeReview: {
|
|
629
|
+
type: "boolean",
|
|
630
|
+
description: "Whether the merged output needs judge review before finalization",
|
|
631
|
+
},
|
|
632
|
+
},
|
|
633
|
+
required: ["runId", "subtaskIds", "mergedOutput"],
|
|
634
|
+
},
|
|
635
|
+
handler: async (args) => {
|
|
636
|
+
ensureMissionTables();
|
|
637
|
+
const db = getDb();
|
|
638
|
+
// Validate run exists
|
|
639
|
+
const run = db.prepare("SELECT * FROM mission_runs WHERE id = ?").get(args.runId);
|
|
640
|
+
if (!run) {
|
|
641
|
+
return { error: `Run ${args.runId} not found` };
|
|
642
|
+
}
|
|
643
|
+
// Validate all subtasks are passed
|
|
644
|
+
const notPassed = [];
|
|
645
|
+
for (const stId of args.subtaskIds) {
|
|
646
|
+
const st = db.prepare("SELECT id, status, title FROM mission_subtasks WHERE id = ? AND run_id = ?").get(stId, args.runId);
|
|
647
|
+
if (!st) {
|
|
648
|
+
return { error: `Subtask ${stId} not found in run ${args.runId}` };
|
|
649
|
+
}
|
|
650
|
+
if (st.status !== "passed") {
|
|
651
|
+
notPassed.push(`${stId} (${st.title}: ${st.status})`);
|
|
652
|
+
}
|
|
653
|
+
}
|
|
654
|
+
if (notPassed.length > 0) {
|
|
655
|
+
return {
|
|
656
|
+
error: "Judge-gated merge: all subtasks must be passed before merge",
|
|
657
|
+
notPassed,
|
|
658
|
+
hint: "Use judge_verify_subtask to pass remaining subtasks first",
|
|
659
|
+
};
|
|
660
|
+
}
|
|
661
|
+
// Collect artifact IDs from subtasks
|
|
662
|
+
const artifactIds = [];
|
|
663
|
+
for (const stId of args.subtaskIds) {
|
|
664
|
+
const arts = db.prepare("SELECT id FROM mission_artifacts WHERE subtask_id = ?").all(stId);
|
|
665
|
+
for (const art of arts) {
|
|
666
|
+
artifactIds.push(art.id);
|
|
667
|
+
}
|
|
668
|
+
}
|
|
669
|
+
// Create merge boundary
|
|
670
|
+
const mergeId = genId("mmerge");
|
|
671
|
+
const status = args.requiresJudgeReview ? "pending" : "completed";
|
|
672
|
+
const timestamp = now();
|
|
673
|
+
db.prepare(`INSERT INTO mission_merge_boundaries
|
|
674
|
+
(id, run_id, subtask_ids, artifact_ids, merged_output, merge_agent, status, created_at${status === "completed" ? ", completed_at" : ""})
|
|
675
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?${status === "completed" ? ", ?" : ""})`).run(...(status === "completed"
|
|
676
|
+
? [mergeId, args.runId, JSON.stringify(args.subtaskIds), JSON.stringify(artifactIds), args.mergedOutput, args.mergeAgent ?? null, status, timestamp, timestamp]
|
|
677
|
+
: [mergeId, args.runId, JSON.stringify(args.subtaskIds), JSON.stringify(artifactIds), args.mergedOutput, args.mergeAgent ?? null, status, timestamp]));
|
|
678
|
+
// If all subtasks merged and no further review needed, complete the run
|
|
679
|
+
if (!args.requiresJudgeReview) {
|
|
680
|
+
const totalSubtasks = db.prepare("SELECT COUNT(*) as c FROM mission_subtasks WHERE run_id = ?").get(args.runId);
|
|
681
|
+
const passedSubtasks = db.prepare("SELECT COUNT(*) as c FROM mission_subtasks WHERE run_id = ? AND status = 'passed'").get(args.runId);
|
|
682
|
+
if (passedSubtasks.c === totalSubtasks.c) {
|
|
683
|
+
db.prepare(`UPDATE mission_runs SET status = 'completed', updated_at = ?, completed_at = ? WHERE id = ?`).run(now(), now(), args.runId);
|
|
684
|
+
}
|
|
685
|
+
}
|
|
686
|
+
else {
|
|
687
|
+
db.prepare(`UPDATE mission_runs SET status = 'merging', updated_at = ? WHERE id = ?`).run(now(), args.runId);
|
|
688
|
+
}
|
|
689
|
+
return {
|
|
690
|
+
mergeId,
|
|
691
|
+
subtasksMerged: args.subtaskIds.length,
|
|
692
|
+
artifactsMerged: artifactIds.length,
|
|
693
|
+
status,
|
|
694
|
+
requiresJudgeReview: args.requiresJudgeReview ?? false,
|
|
695
|
+
contentHash: hashContent(args.mergedOutput),
|
|
696
|
+
traceability: {
|
|
697
|
+
receipt: `Merge ${mergeId}: ${args.subtaskIds.length} subtasks → composed output`,
|
|
698
|
+
subtaskIds: args.subtaskIds,
|
|
699
|
+
artifactIds,
|
|
700
|
+
mergedContentHash: hashContent(args.mergedOutput),
|
|
701
|
+
},
|
|
702
|
+
};
|
|
703
|
+
},
|
|
704
|
+
},
|
|
705
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
706
|
+
// 5. sniff.record_human_review
|
|
707
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
708
|
+
{
|
|
709
|
+
name: "sniff_record_human_review",
|
|
710
|
+
description: "Record a human sniff-check for a subtask or merge output. " +
|
|
711
|
+
"Verdicts: pass | concern | block. " +
|
|
712
|
+
"Issue tags: unsupported_claim, weak_evidence, not_credible, " +
|
|
713
|
+
"too_risky, scope_drift, missing_source, contradictory, stale_data. " +
|
|
714
|
+
"If verdict is 'block', creates a force-retry path.",
|
|
715
|
+
inputSchema: {
|
|
716
|
+
type: "object",
|
|
717
|
+
properties: {
|
|
718
|
+
runId: { type: "string", description: "Mission run ID" },
|
|
719
|
+
subtaskId: {
|
|
720
|
+
type: "string",
|
|
721
|
+
description: "Subtask ID being reviewed (mutually exclusive with mergeId)",
|
|
722
|
+
},
|
|
723
|
+
mergeId: {
|
|
724
|
+
type: "string",
|
|
725
|
+
description: "Merge boundary ID being reviewed (mutually exclusive with subtaskId)",
|
|
726
|
+
},
|
|
727
|
+
reviewer: { type: "string", description: "Human reviewer identifier" },
|
|
728
|
+
verdict: {
|
|
729
|
+
type: "string",
|
|
730
|
+
enum: ["pass", "concern", "block"],
|
|
731
|
+
description: "pass = approved, concern = flagged but proceed, block = force retry",
|
|
732
|
+
},
|
|
733
|
+
issueTags: {
|
|
734
|
+
type: "array",
|
|
735
|
+
items: {
|
|
736
|
+
type: "string",
|
|
737
|
+
enum: [
|
|
738
|
+
"unsupported_claim", "weak_evidence", "not_credible",
|
|
739
|
+
"too_risky", "scope_drift", "missing_source",
|
|
740
|
+
"contradictory", "stale_data",
|
|
741
|
+
],
|
|
742
|
+
},
|
|
743
|
+
description: "Issue tags categorizing the concern/block",
|
|
744
|
+
},
|
|
745
|
+
notes: {
|
|
746
|
+
type: "string",
|
|
747
|
+
description: "Free-text notes from the reviewer",
|
|
748
|
+
},
|
|
749
|
+
},
|
|
750
|
+
required: ["runId", "verdict"],
|
|
751
|
+
},
|
|
752
|
+
handler: async (args) => {
|
|
753
|
+
ensureMissionTables();
|
|
754
|
+
const db = getDb();
|
|
755
|
+
// Validate target
|
|
756
|
+
if (!args.subtaskId && !args.mergeId) {
|
|
757
|
+
return { error: "Either subtaskId or mergeId is required" };
|
|
758
|
+
}
|
|
759
|
+
// Validate run exists
|
|
760
|
+
const run = db.prepare("SELECT * FROM mission_runs WHERE id = ?").get(args.runId);
|
|
761
|
+
if (!run) {
|
|
762
|
+
return { error: `Run ${args.runId} not found` };
|
|
763
|
+
}
|
|
764
|
+
const forceRetry = args.verdict === "block" ? 1 : 0;
|
|
765
|
+
const sniffId = genId("msniff");
|
|
766
|
+
const timestamp = now();
|
|
767
|
+
db.prepare(`INSERT INTO mission_sniff_checks
|
|
768
|
+
(id, run_id, subtask_id, merge_id, reviewer, verdict, issue_tags, notes, force_retry, created_at)
|
|
769
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`).run(sniffId, args.runId, args.subtaskId ?? null, args.mergeId ?? null, args.reviewer ?? "human", args.verdict, JSON.stringify(args.issueTags ?? []), args.notes ?? null, forceRetry, timestamp);
|
|
770
|
+
// Record approval/block
|
|
771
|
+
db.prepare(`INSERT INTO mission_approvals (id, run_id, subtask_id, merge_id, approver, decision, reason, created_at)
|
|
772
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)`).run(genId("mappr"), args.runId, args.subtaskId ?? null, args.mergeId ?? null, args.reviewer ?? "human", args.verdict, args.notes ?? null, timestamp);
|
|
773
|
+
// Handle force-retry (block verdict)
|
|
774
|
+
if (forceRetry && args.subtaskId) {
|
|
775
|
+
db.prepare(`UPDATE mission_subtasks SET status = 'retrying', updated_at = ? WHERE id = ?`).run(now(), args.subtaskId);
|
|
776
|
+
}
|
|
777
|
+
if (forceRetry && args.mergeId) {
|
|
778
|
+
db.prepare(`UPDATE mission_merge_boundaries SET status = 'pending', completed_at = NULL WHERE id = ?`).run(args.mergeId);
|
|
779
|
+
}
|
|
780
|
+
// On pass, update run status
|
|
781
|
+
if (args.verdict === "pass") {
|
|
782
|
+
if (args.subtaskId) {
|
|
783
|
+
db.prepare(`UPDATE mission_subtasks SET status = 'passed', updated_at = ?, completed_at = ? WHERE id = ?`).run(now(), now(), args.subtaskId);
|
|
784
|
+
}
|
|
785
|
+
if (args.mergeId) {
|
|
786
|
+
db.prepare(`UPDATE mission_merge_boundaries SET status = 'completed', completed_at = ? WHERE id = ?`).run(now(), args.mergeId);
|
|
787
|
+
// Check if run is now complete
|
|
788
|
+
const allMerges = db.prepare("SELECT COUNT(*) as c FROM mission_merge_boundaries WHERE run_id = ? AND status != 'completed'").get(args.runId);
|
|
789
|
+
if (allMerges.c === 0) {
|
|
790
|
+
db.prepare(`UPDATE mission_runs SET status = 'completed', updated_at = ?, completed_at = ? WHERE id = ?`).run(now(), now(), args.runId);
|
|
791
|
+
}
|
|
792
|
+
}
|
|
793
|
+
}
|
|
794
|
+
return {
|
|
795
|
+
sniffCheckId: sniffId,
|
|
796
|
+
verdict: args.verdict,
|
|
797
|
+
issueTags: args.issueTags ?? [],
|
|
798
|
+
forceRetry: forceRetry === 1,
|
|
799
|
+
target: args.subtaskId ? `subtask:${args.subtaskId}` : `merge:${args.mergeId}`,
|
|
800
|
+
traceability: {
|
|
801
|
+
receipt: `Sniff-check ${sniffId}: ${args.verdict}${forceRetry ? " → force retry" : ""}`,
|
|
802
|
+
reviewer: args.reviewer ?? "human",
|
|
803
|
+
issueTags: args.issueTags ?? [],
|
|
804
|
+
notes: args.notes ?? null,
|
|
805
|
+
decision: args.verdict,
|
|
806
|
+
},
|
|
807
|
+
};
|
|
808
|
+
},
|
|
809
|
+
},
|
|
810
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
811
|
+
// 6. harness.get_mission_status (read-only query)
|
|
812
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
813
|
+
{
|
|
814
|
+
name: "harness_get_mission_status",
|
|
815
|
+
description: "Get full mission execution status: run info, subtask states, " +
|
|
816
|
+
"judge reviews, sniff-checks, merge boundaries, and traceability " +
|
|
817
|
+
"receipts. Read-only query for the Mission Graph / Live Execution Board.",
|
|
818
|
+
annotations: { readOnlyHint: true },
|
|
819
|
+
inputSchema: {
|
|
820
|
+
type: "object",
|
|
821
|
+
properties: {
|
|
822
|
+
runId: { type: "string", description: "Mission run ID" },
|
|
823
|
+
includeEvidence: {
|
|
824
|
+
type: "boolean",
|
|
825
|
+
description: "Include evidence records (default: false for performance)",
|
|
826
|
+
},
|
|
827
|
+
},
|
|
828
|
+
required: ["runId"],
|
|
829
|
+
},
|
|
830
|
+
handler: async (args) => {
|
|
831
|
+
ensureMissionTables();
|
|
832
|
+
const db = getDb();
|
|
833
|
+
const run = db.prepare("SELECT * FROM mission_runs WHERE id = ?").get(args.runId);
|
|
834
|
+
if (!run) {
|
|
835
|
+
return { error: `Run ${args.runId} not found` };
|
|
836
|
+
}
|
|
837
|
+
const subtasks = db.prepare("SELECT * FROM mission_subtasks WHERE run_id = ? ORDER BY sequence").all(args.runId);
|
|
838
|
+
const reviews = db.prepare("SELECT * FROM mission_judge_reviews WHERE run_id = ? ORDER BY created_at").all(args.runId);
|
|
839
|
+
const sniffChecks = db.prepare("SELECT * FROM mission_sniff_checks WHERE run_id = ? ORDER BY created_at").all(args.runId);
|
|
840
|
+
const merges = db.prepare("SELECT * FROM mission_merge_boundaries WHERE run_id = ? ORDER BY created_at").all(args.runId);
|
|
841
|
+
const retries = db.prepare("SELECT * FROM mission_retry_attempts WHERE run_id = ? ORDER BY created_at").all(args.runId);
|
|
842
|
+
const approvals = db.prepare("SELECT * FROM mission_approvals WHERE run_id = ? ORDER BY created_at").all(args.runId);
|
|
843
|
+
let evidence = [];
|
|
844
|
+
if (args.includeEvidence) {
|
|
845
|
+
evidence = db.prepare("SELECT * FROM mission_evidence WHERE run_id = ? ORDER BY created_at").all(args.runId);
|
|
846
|
+
}
|
|
847
|
+
// Compute summary stats
|
|
848
|
+
const statusCounts = {};
|
|
849
|
+
for (const st of subtasks) {
|
|
850
|
+
statusCounts[st.status] = (statusCounts[st.status] ?? 0) + 1;
|
|
851
|
+
}
|
|
852
|
+
const passRate = subtasks.length > 0
|
|
853
|
+
? (statusCounts["passed"] ?? 0) / subtasks.length
|
|
854
|
+
: 0;
|
|
855
|
+
return {
|
|
856
|
+
run: {
|
|
857
|
+
id: run.id,
|
|
858
|
+
title: run.title,
|
|
859
|
+
description: run.description,
|
|
860
|
+
status: run.status,
|
|
861
|
+
createdAt: run.created_at,
|
|
862
|
+
completedAt: run.completed_at,
|
|
863
|
+
},
|
|
864
|
+
summary: {
|
|
865
|
+
totalSubtasks: subtasks.length,
|
|
866
|
+
statusCounts,
|
|
867
|
+
passRate: Math.round(passRate * 100) / 100,
|
|
868
|
+
totalReviews: reviews.length,
|
|
869
|
+
totalSniffChecks: sniffChecks.length,
|
|
870
|
+
totalRetries: retries.length,
|
|
871
|
+
totalMerges: merges.length,
|
|
872
|
+
totalApprovals: approvals.length,
|
|
873
|
+
},
|
|
874
|
+
subtasks: subtasks.map((st) => ({
|
|
875
|
+
id: st.id,
|
|
876
|
+
sequence: st.sequence,
|
|
877
|
+
title: st.title,
|
|
878
|
+
status: st.status,
|
|
879
|
+
ownerAgent: st.owner_agent,
|
|
880
|
+
verifiabilityTier: st.verifiability_tier,
|
|
881
|
+
judgeMethod: st.judge_method,
|
|
882
|
+
retryBudget: st.retry_budget,
|
|
883
|
+
retriesUsed: st.retries_used,
|
|
884
|
+
requiresSniffCheck: st.requires_sniff_check === 1,
|
|
885
|
+
outputContract: st.output_contract,
|
|
886
|
+
dependsOn: JSON.parse(st.depends_on || "[]"),
|
|
887
|
+
})),
|
|
888
|
+
reviews: reviews.map((r) => ({
|
|
889
|
+
id: r.id,
|
|
890
|
+
subtaskId: r.subtask_id,
|
|
891
|
+
verdict: r.verdict,
|
|
892
|
+
action: r.action,
|
|
893
|
+
score: r.score,
|
|
894
|
+
reasoning: r.reasoning,
|
|
895
|
+
createdAt: r.created_at,
|
|
896
|
+
})),
|
|
897
|
+
sniffChecks: sniffChecks.map((s) => ({
|
|
898
|
+
id: s.id,
|
|
899
|
+
subtaskId: s.subtask_id,
|
|
900
|
+
mergeId: s.merge_id,
|
|
901
|
+
verdict: s.verdict,
|
|
902
|
+
issueTags: JSON.parse(s.issue_tags || "[]"),
|
|
903
|
+
forceRetry: s.force_retry === 1,
|
|
904
|
+
notes: s.notes,
|
|
905
|
+
})),
|
|
906
|
+
merges: merges.map((m) => ({
|
|
907
|
+
id: m.id,
|
|
908
|
+
subtaskIds: JSON.parse(m.subtask_ids || "[]"),
|
|
909
|
+
status: m.status,
|
|
910
|
+
contentPreview: m.merged_output?.slice(0, 200) ?? null,
|
|
911
|
+
})),
|
|
912
|
+
retries,
|
|
913
|
+
approvals,
|
|
914
|
+
...(args.includeEvidence ? { evidence } : {}),
|
|
915
|
+
};
|
|
916
|
+
},
|
|
917
|
+
},
|
|
918
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
919
|
+
// 7. harness.list_runs (discovery)
|
|
920
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
921
|
+
{
|
|
922
|
+
name: "harness_list_runs",
|
|
923
|
+
description: "List all mission runs with status summary. " +
|
|
924
|
+
"Supports filtering by status. For the Live Execution Board.",
|
|
925
|
+
annotations: { readOnlyHint: true },
|
|
926
|
+
inputSchema: {
|
|
927
|
+
type: "object",
|
|
928
|
+
properties: {
|
|
929
|
+
status: {
|
|
930
|
+
type: "string",
|
|
931
|
+
enum: ["planning", "executing", "reviewing", "merging", "sniff_check", "completed", "failed", "stopped"],
|
|
932
|
+
description: "Filter by run status (optional)",
|
|
933
|
+
},
|
|
934
|
+
limit: {
|
|
935
|
+
type: "number",
|
|
936
|
+
description: "Max results (default: 20, max: 100)",
|
|
937
|
+
},
|
|
938
|
+
},
|
|
939
|
+
},
|
|
940
|
+
handler: async (args) => {
|
|
941
|
+
ensureMissionTables();
|
|
942
|
+
const db = getDb();
|
|
943
|
+
const limit = Math.min(args.limit ?? 20, 100);
|
|
944
|
+
let runs;
|
|
945
|
+
if (args.status) {
|
|
946
|
+
runs = db.prepare("SELECT * FROM mission_runs WHERE status = ? ORDER BY created_at DESC LIMIT ?").all(args.status, limit);
|
|
947
|
+
}
|
|
948
|
+
else {
|
|
949
|
+
runs = db.prepare("SELECT * FROM mission_runs ORDER BY created_at DESC LIMIT ?").all(limit);
|
|
950
|
+
}
|
|
951
|
+
// Enrich with subtask counts
|
|
952
|
+
return {
|
|
953
|
+
runs: runs.map((r) => {
|
|
954
|
+
const counts = db.prepare(`SELECT status, COUNT(*) as c FROM mission_subtasks WHERE run_id = ? GROUP BY status`).all(r.id);
|
|
955
|
+
const statusMap = {};
|
|
956
|
+
for (const c of counts)
|
|
957
|
+
statusMap[c.status] = c.c;
|
|
958
|
+
return {
|
|
959
|
+
id: r.id,
|
|
960
|
+
title: r.title,
|
|
961
|
+
status: r.status,
|
|
962
|
+
createdAt: r.created_at,
|
|
963
|
+
completedAt: r.completed_at,
|
|
964
|
+
subtaskCounts: statusMap,
|
|
965
|
+
};
|
|
966
|
+
}),
|
|
967
|
+
total: runs.length,
|
|
968
|
+
};
|
|
969
|
+
},
|
|
970
|
+
},
|
|
971
|
+
];
|
|
972
|
+
//# sourceMappingURL=missionHarnessTools.js.map
|