@yemi33/minions 0.1.2121 → 0.1.2123
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dashboard/js/settings.js +4 -0
- package/dashboard.js +3 -0
- package/docs/harness-mode.md +92 -0
- package/engine/ado.js +142 -21
- package/engine/github.js +4 -1
- package/engine/harness.js +592 -0
- package/engine/lifecycle.js +91 -0
- package/engine/scheduler.js +40 -3
- package/engine/shared.js +16 -0
- package/engine/timeout.js +286 -21
- package/engine.js +66 -15
- package/package.json +1 -1
|
@@ -0,0 +1,592 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* engine/harness.js — Tri-agent harness mode for long-running missions.
|
|
3
|
+
*
|
|
4
|
+
* Inspired by Anthropic's Harness Design (April 2026), this module implements
|
|
5
|
+
* the "tri_agent" mission mode: a schedule firing produces three coordinated
|
|
6
|
+
* work items — Planner → Generator → Evaluator — that operate against a
|
|
7
|
+
* shared artifact on disk. The Evaluator scores the artifact against a rubric
|
|
8
|
+
* declared in the schedule config; if it scores below the threshold (default
|
|
9
|
+
* 0.7), the engine spawns another Generator+Evaluator iteration with the
|
|
10
|
+
* Evaluator's feedback injected. The cycle terminates on pass or after
|
|
11
|
+
* `harness_max_iterations` (default 5) iterations — whichever comes first.
|
|
12
|
+
*
|
|
13
|
+
* Why a separate module:
|
|
14
|
+
* - Keeps scheduler.js focused on cron parsing + run-dedup.
|
|
15
|
+
* - The iteration loop lives in lifecycle.js so the engine's existing
|
|
16
|
+
* post-completion hook chain owns retry orchestration.
|
|
17
|
+
* - Pure helpers here (no side effects beyond reading shared.MINIONS_DIR)
|
|
18
|
+
* are easy to test in isolation.
|
|
19
|
+
*
|
|
20
|
+
* Schedule config shape (additive on top of the cron schedule schema):
|
|
21
|
+
* {
|
|
22
|
+
* id: 'daily-research',
|
|
23
|
+
* cron: '0 9 * *',
|
|
24
|
+
* title: 'Daily research pass',
|
|
25
|
+
* description: 'Summarize new arxiv papers',
|
|
26
|
+
* project: 'minions', // optional
|
|
27
|
+
* type: 'ask', // generator work-type (default 'ask')
|
|
28
|
+
* harness_mode: 'tri_agent', // REQUIRED to enable harness mode
|
|
29
|
+
* harness_rubric: 'Must cite ≥3 papers', // REQUIRED — passed to Evaluator
|
|
30
|
+
* harness_threshold: 0.7, // default 0.7, must be (0, 1]
|
|
31
|
+
* harness_max_iterations: 5, // default 5, integer in [1, 20]
|
|
32
|
+
* }
|
|
33
|
+
*
|
|
34
|
+
* Per-item _harness meta (carried through dispatch + lifecycle):
|
|
35
|
+
* {
|
|
36
|
+
* role: 'planner' | 'generator' | 'evaluator',
|
|
37
|
+
* iteration: 1, // generators/evaluators bump this on retry
|
|
38
|
+
* missionId: 'sched-id-1700-abc',
|
|
39
|
+
* artifactPath: '<MINIONS_DIR>/engine/harness/<missionId>/artifact.md',
|
|
40
|
+
* rubric: '...', // verbatim from schedule
|
|
41
|
+
* threshold: 0.7,
|
|
42
|
+
* maxIterations: 5,
|
|
43
|
+
* generatorType: 'ask', // remembered so iteration N+1 reuses it
|
|
44
|
+
* }
|
|
45
|
+
*
|
|
46
|
+
* Zero dependencies beyond Node built-ins + engine/shared.
|
|
47
|
+
*/
|
|
48
|
+
|
|
49
|
+
'use strict';
|
|
50
|
+
|
|
51
|
+
const path = require('path');
|
|
52
|
+
const shared = require('./shared');
|
|
53
|
+
const { MINIONS_DIR, WI_STATUS, ts } = shared;
|
|
54
|
+
|
|
55
|
+
/** Values accepted for a schedule's `harness_mode` field. */
const HARNESS_MODE = Object.freeze({ TRI_AGENT: 'tri_agent' });

/** Role tags stored in each harness work item's `_harness.role`. */
const HARNESS_ROLE = Object.freeze({
  PLANNER: 'planner',
  GENERATOR: 'generator',
  EVALUATOR: 'evaluator',
});

/** Fallbacks applied when a schedule omits the optional harness settings. */
const HARNESS_DEFAULTS = Object.freeze({
  threshold: 0.7,
  maxIterations: 5,
  // Generator work-type used when sched.type is absent. Ask is read-only and
  // produces no PR, which matches the "research / synthesis" use case the
  // harness was designed for.
  generatorType: 'ask',
});

/** Largest value validateHarnessConfig accepts for `harness_max_iterations`. */
const HARNESS_MAX_ITERATIONS_CAP = 20;
|
|
74
|
+
|
|
75
|
+
// Harness artifacts live under <MINIONS_DIR>/engine/harness, with one
// sub-directory per mission so concurrent missions don't stomp each other's
// artifacts.

/**
 * Directory that contains every mission's harness artifact directory.
 * @returns {string} `<MINIONS_DIR>/engine/harness`
 */
function harnessRootDir() {
  const segments = [MINIONS_DIR, 'engine', 'harness'];
  return path.join(...segments);
}
|
|
80
|
+
|
|
81
|
+
/**
 * Directory that holds the artifacts of a single mission.
 * @param {string} missionId - mission identifier, used verbatim as the directory name
 * @returns {string} `<harness root>/<missionId>`
 */
function harnessMissionDir(missionId) {
  const root = harnessRootDir();
  return path.join(root, missionId);
}
|
|
84
|
+
|
|
85
|
+
/**
 * Path of the shared markdown artifact for a mission.
 * @param {string} missionId - mission identifier
 * @returns {string} `<mission dir>/artifact.md`
 */
function harnessArtifactPath(missionId) {
  const missionDir = harnessMissionDir(missionId);
  return path.join(missionDir, 'artifact.md');
}
|
|
88
|
+
|
|
89
|
+
/**
 * Check a schedule's tri-agent harness settings and resolve their defaults.
 *
 * All problems are collected (not just the first) so a caller can report them
 * together. `resolved` is only present on a valid result.
 *
 * @param {object} sched - schedule config; reads harness_mode, harness_rubric,
 *   harness_threshold, harness_max_iterations and type
 * @returns {{ valid: boolean, errors: string[], resolved?: { threshold: number, maxIterations: number, generatorType: string } }}
 */
function validateHarnessConfig(sched) {
  if (sched === null || typeof sched !== 'object') {
    return { valid: false, errors: ['schedule must be an object'] };
  }

  const {
    harness_mode: mode,
    harness_rubric: rubric,
    harness_threshold: rawThreshold,
    harness_max_iterations: rawMaxIterations,
    type: rawType,
  } = sched;
  const problems = [];

  if (mode !== HARNESS_MODE.TRI_AGENT) {
    problems.push(`harness_mode must be "${HARNESS_MODE.TRI_AGENT}" (got ${JSON.stringify(mode)})`);
  }

  const hasRubric = typeof rubric === 'string' && rubric.trim().length > 0;
  if (!hasRubric) {
    problems.push('harness_rubric is required (non-empty string)');
  }

  // Optional fields: null/undefined mean "use the default"; anything else must validate.
  let threshold = HARNESS_DEFAULTS.threshold;
  if (rawThreshold != null) {
    const thresholdOk = typeof rawThreshold === 'number'
      && Number.isFinite(rawThreshold)
      && rawThreshold > 0
      && rawThreshold <= 1;
    if (thresholdOk) {
      threshold = rawThreshold;
    } else {
      problems.push(`harness_threshold must be a number in (0, 1] (got ${JSON.stringify(rawThreshold)})`);
    }
  }

  let maxIterations = HARNESS_DEFAULTS.maxIterations;
  if (rawMaxIterations != null) {
    const iterationsOk = Number.isInteger(rawMaxIterations)
      && rawMaxIterations >= 1
      && rawMaxIterations <= HARNESS_MAX_ITERATIONS_CAP;
    if (iterationsOk) {
      maxIterations = rawMaxIterations;
    } else {
      problems.push(`harness_max_iterations must be a positive integer ≤ ${HARNESS_MAX_ITERATIONS_CAP} (got ${JSON.stringify(rawMaxIterations)})`);
    }
  }

  if (problems.length > 0) {
    return { valid: false, errors: problems };
  }

  // A blank or non-string sched.type falls back to the default generator type.
  const trimmedType = typeof rawType === 'string' ? rawType.trim() : '';
  return {
    valid: true,
    errors: [],
    resolved: {
      threshold,
      maxIterations,
      generatorType: trimmedType || HARNESS_DEFAULTS.generatorType,
    },
  };
}
|
|
139
|
+
|
|
140
|
+
// ─── ID + path helpers ──────────────────────────────────────────────────────
|
|
141
|
+
|
|
142
|
+
/**
 * Short random suffix appended to mission and work-item ids.
 *
 * This is not a security token: it only disambiguates ids that the id
 * builders in this file already prefix with a millisecond timestamp.
 *
 * @returns {string} exactly 6 characters drawn from [0-9a-z]
 */
function _shortRand() {
  // Math.random().toString(36) looks like "0.<base-36 digits>", but the
  // fraction is not always 6+ digits long (0.5 -> "0.i", 0 -> "0"). Pad so the
  // suffix is always fixed-width and never empty.
  return Math.random().toString(36).slice(2, 8).padEnd(6, '0');
}
|
|
148
|
+
|
|
149
|
+
/**
 * Build a mission id of the form `<schedule id>-<nowMs>-<random suffix>`.
 * @param {object} sched - schedule; its string `id` is the prefix, else 'mission'
 * @param {number} nowMs - timestamp embedded in the id
 * @returns {string}
 */
function _buildMissionId(sched, nowMs) {
  let prefix = 'mission';
  if (sched && typeof sched.id === 'string') {
    prefix = sched.id;
  }
  return `${prefix}-${nowMs}-${_shortRand()}`;
}
|
|
153
|
+
|
|
154
|
+
/**
 * Build a work-item id of the form
 * `sched-<scheduleId>-<role>-i<iteration>-<nowMs>-<random suffix>`.
 * @param {string} scheduleId - id of the owning schedule
 * @param {string} role - harness role of the item
 * @param {number} iteration - iteration the item belongs to
 * @param {number} nowMs - timestamp embedded in the id
 * @returns {string}
 */
function _buildItemId(scheduleId, role, iteration, nowMs) {
  const stem = `sched-${scheduleId}-${role}-i${iteration}`;
  return `${stem}-${nowMs}-${_shortRand()}`;
}
|
|
157
|
+
|
|
158
|
+
// ─── Prompt builders ────────────────────────────────────────────────────────
|
|
159
|
+
|
|
160
|
+
// Section headings shared by the Planner, Generator and Evaluator prompts.
const RUBRIC_HEADING = '## Rubric';
const ARTIFACT_HEADING = '## Shared Artifact';

/**
 * Compose the markdown prompt for the Planner work item.
 *
 * @param {object} sched - schedule; reads title, id and description
 * @param {object} ctx - mission context: artifactPath, missionId, threshold,
 *   maxIterations, rubric (string) and iteration
 * @returns {string} newline-joined prompt text
 */
function _buildPlannerDescription(sched, ctx) {
  const missionName = sched.title || sched.id;
  const goal = (sched.description || sched.title || '').trim();
  // Render the rubric as a markdown blockquote, one "> " per rubric line.
  const quotedRubric = ctx.rubric.split('\n').map((line) => `> ${line}`).join('\n');

  const briefing = [
    `# Tri-Agent Mission — Planner (iteration ${ctx.iteration})`,
    '',
    `**Mission:** ${missionName}`,
    '',
    `**Goal:** ${goal}`,
    '',
    'You are the **Planner** in a three-agent harness loop. Your job is to',
    'decompose the mission goal above into a numbered list of concrete subtasks',
    'that the Generator will execute next, then write the plan to the shared',
    'artifact below. Keep subtasks small and verifiable.',
    '',
  ];

  const artifactSection = [
    ARTIFACT_HEADING,
    '',
    `Write your plan to: \`${ctx.artifactPath}\``,
    '',
    'Structure the file as:',
    '```',
    `# Mission ${ctx.missionId}`,
    '',
    '## Plan',
    '1. <subtask 1>',
    '2. <subtask 2>',
    '...',
    '```',
    '',
    'Do not execute the subtasks yourself — that is the Generator\'s job.',
    'Create the artifact directory if it does not exist.',
    '',
  ];

  const rubricSection = [
    RUBRIC_HEADING,
    '',
    'The Evaluator will eventually score the completed artifact against this',
    'rubric. Plan with the rubric in mind:',
    '',
    quotedRubric,
    '',
    `Threshold: ${ctx.threshold} · Max iterations: ${ctx.maxIterations}`,
    '',
    `Mission ID: \`${ctx.missionId}\``,
  ];

  return [...briefing, ...artifactSection, ...rubricSection].join('\n');
}
|
|
206
|
+
|
|
207
|
+
/**
 * Compose the markdown prompt for a Generator work item.
 *
 * When `opts.previousFeedback` is truthy, a "Previous Evaluator Feedback"
 * section (labelled with iteration - 1) is inserted before the rubric.
 *
 * @param {object} sched - schedule; reads title, id and description
 * @param {object} ctx - mission context: artifactPath, missionId, threshold,
 *   maxIterations, rubric (string) and iteration
 * @param {object} [opts] - optional `previousFeedback` from the last Evaluator
 * @returns {string} newline-joined prompt text
 */
function _buildGeneratorDescription(sched, ctx, opts) {
  const asBlockquote = (text) => text.split('\n').map((line) => `> ${line}`).join('\n');

  const missionName = sched.title || sched.id;
  const goal = (sched.description || sched.title || '').trim();

  let priorFeedback = '';
  if (opts && opts.previousFeedback) {
    priorFeedback = String(opts.previousFeedback).trim();
  }

  const feedbackSection = priorFeedback
    ? [
      '',
      `## Previous Evaluator Feedback (iteration ${ctx.iteration - 1})`,
      '',
      'The previous iteration failed the rubric. The Evaluator provided this feedback:',
      '',
      asBlockquote(priorFeedback),
      '',
      'Address this feedback explicitly in your new output.',
    ]
    : [];

  return [
    `# Tri-Agent Mission — Generator (iteration ${ctx.iteration})`,
    '',
    `**Mission:** ${missionName}`,
    '',
    `**Goal:** ${goal}`,
    '',
    'You are the **Generator** in a three-agent harness loop. Read the Planner\'s',
    'subtask list from the shared artifact, execute each subtask in order, and',
    'append your outputs to the artifact under a clearly-labelled section.',
    '',
    ARTIFACT_HEADING,
    '',
    `Shared artifact: \`${ctx.artifactPath}\``,
    '',
    `Append a section titled \`## Generator Output (iteration ${ctx.iteration})\` to the`,
    'artifact. Within it, address each numbered subtask from the plan.',
    '',
    'Do NOT delete or rewrite earlier sections — append only.',
    ...feedbackSection,
    '',
    RUBRIC_HEADING,
    '',
    'The Evaluator will score your output against this rubric:',
    '',
    asBlockquote(ctx.rubric),
    '',
    `Threshold: ${ctx.threshold} · Max iterations: ${ctx.maxIterations}`,
    '',
    `Mission ID: \`${ctx.missionId}\``,
  ].join('\n');
}
|
|
258
|
+
|
|
259
|
+
/**
 * Compose the markdown prompt for an Evaluator work item.
 *
 * The prompt asks for a `Score:` line, a PASS/FAIL verdict and feedback in the
 * artifact, plus `harness_score` / `harness_pass` / `harness_feedback` fields
 * in the JSON completion report.
 *
 * @param {object} sched - schedule; reads title and id
 * @param {object} ctx - mission context: artifactPath, missionId, threshold,
 *   maxIterations, rubric (string) and iteration
 * @returns {string} newline-joined prompt text
 */
function _buildEvaluatorDescription(sched, ctx) {
  const missionName = sched.title || sched.id;
  const quotedRubric = ctx.rubric.split('\n').map((line) => `> ${line}`).join('\n');

  const scoringInstructions = [
    `# Tri-Agent Mission — Evaluator (iteration ${ctx.iteration})`,
    '',
    `**Mission:** ${missionName}`,
    '',
    'You are the **Evaluator** in a three-agent harness loop. Read the shared',
    'artifact (including the Planner\'s plan and the Generator\'s output) and',
    'score it against the rubric below.',
    '',
    ARTIFACT_HEADING,
    '',
    `Shared artifact: \`${ctx.artifactPath}\``,
    '',
    `Append a section titled \`## Evaluation (iteration ${ctx.iteration})\` containing:`,
    '- A numeric score in `[0, 1]` formatted as `Score: 0.NN`',
    '- A `PASS` or `FAIL` verdict on its own line',
    '- Concrete feedback under `### Feedback` explaining strengths and gaps',
    '',
    RUBRIC_HEADING,
    '',
    quotedRubric,
    '',
    `**Threshold:** ${ctx.threshold} — a score < ${ctx.threshold} is a FAIL and triggers another Generator iteration`,
    `(up to ${ctx.maxIterations} total iterations).`,
    '',
  ];

  const completionReport = [
    '## Completion Report',
    '',
    'In your JSON completion report include these fields so the engine can route',
    'the next iteration deterministically (in addition to the standard schema):',
    '```json',
    '{',
    ' "status": "success",',
    ' "summary": "<one-line verdict>",',
    ' "harness_score": 0.NN,',
    ' "harness_pass": true | false,',
    ' "harness_feedback": "<machine-readable feedback the next Generator should address>"',
    '}',
    '```',
    '',
    'If you cannot evaluate (artifact missing, malformed), set `harness_pass: false`,',
    '`harness_score: 0`, and explain in `harness_feedback`.',
    '',
    `Mission ID: \`${ctx.missionId}\``,
  ];

  return scoringInstructions.concat(completionReport).join('\n');
}
|
|
306
|
+
|
|
307
|
+
// ─── Mission creation ───────────────────────────────────────────────────────
|
|
308
|
+
|
|
309
|
+
/**
 * Fields shared by every work item of a freshly created tri-agent mission.
 *
 * @param {object} sched - schedule; reads title, id, priority and project
 * @param {string} role - harness role, embedded in the title tag
 * @param {number} iteration - iteration number, embedded in the title tag
 * @returns {object} title, priority, status, created, createdBy, project, agent, _scheduleId
 */
function _commonItemFields(sched, role, iteration) {
  const missionLabel = sched.title || sched.id;
  const tag = `[harness:${role}:i${iteration}]`;
  return {
    title: `${tag} ${missionLabel}`,
    priority: sched.priority ? sched.priority : 'medium',
    status: WI_STATUS.PENDING,
    created: ts(),
    createdBy: 'scheduler:harness',
    project: sched.project ? sched.project : null,
    agent: null,
    _scheduleId: sched.id,
  };
}
|
|
321
|
+
|
|
322
|
+
/**
 * Assemble the `_harness` metadata object attached to a harness work item.
 *
 * @param {string} missionId - mission the item belongs to
 * @param {string} role - harness role of the item
 * @param {number} iteration - iteration the item belongs to
 * @param {object} resolved - provides threshold, maxIterations and generatorType
 * @param {object} sched - schedule; `harness_rubric` is copied as `rubric`
 * @param {string} artifactPath - path of the mission's shared artifact
 * @returns {object} role, iteration, missionId, artifactPath, rubric, threshold, maxIterations, generatorType
 */
function _buildHarnessMeta(missionId, role, iteration, resolved, sched, artifactPath) {
  const { harness_rubric: rubric } = sched;
  const { threshold, maxIterations, generatorType } = resolved;
  return {
    role,
    iteration,
    missionId,
    artifactPath,
    rubric,
    threshold,
    maxIterations,
    generatorType,
  };
}
|
|
334
|
+
|
|
335
|
+
/**
 * Create the first Planner → Generator → Evaluator trio for one firing of a
 * tri-agent schedule.
 *
 * The Planner and Evaluator are always 'ask' items; the Generator uses the
 * resolved generator type. The Generator depends on the Planner and the
 * Evaluator depends on the Generator.
 *
 * @param {object} sched - schedule config (validated with validateHarnessConfig)
 * @param {object} [opts] - optional `now` (finite ms timestamp) and
 *   `missionId` (non-empty string) overrides
 * @returns {{ items: object[], missionId: string, artifactPath: string }}
 *   items are ordered [planner, generator, evaluator]
 * @throws {Error} when the schedule's harness config is invalid
 */
function createTriAgentMission(sched, opts) {
  const validation = validateHarnessConfig(sched);
  if (!validation.valid) {
    const scheduleLabel = sched && sched.id;
    throw new Error(`tri_agent harness config invalid for schedule ${scheduleLabel}: ${validation.errors.join('; ')}`);
  }
  const { resolved } = validation;

  const hasClockOverride = Boolean(opts) && Number.isFinite(opts.now);
  const nowMs = hasClockOverride ? opts.now : Date.now();
  const presetMissionId = opts && typeof opts.missionId === 'string' ? opts.missionId : '';
  const missionId = presetMissionId || _buildMissionId(sched, nowMs);
  const artifactPath = harnessArtifactPath(missionId);

  const iteration = 1;
  const ctx = {
    artifactPath,
    missionId,
    iteration,
    threshold: resolved.threshold,
    maxIterations: resolved.maxIterations,
    rubric: sched.harness_rubric,
  };

  const plannerId = _buildItemId(sched.id, HARNESS_ROLE.PLANNER, iteration, nowMs);
  const generatorId = _buildItemId(sched.id, HARNESS_ROLE.GENERATOR, iteration, nowMs);
  const evaluatorId = _buildItemId(sched.id, HARNESS_ROLE.EVALUATOR, iteration, nowMs);

  // Every item has the same shape; only id, role, type, prompt and
  // dependency differ.
  const assemble = (id, role, type, description, dependsOn) => ({
    id,
    type,
    description,
    depends_on: dependsOn,
    ..._commonItemFields(sched, role, iteration),
    _missionId: missionId,
    _harness: _buildHarnessMeta(missionId, role, iteration, resolved, sched, artifactPath),
  });

  // Planner + Evaluator are read-only (ask) by design — they only read/write
  // the shared harness artifact. The generator inherits sched.type
  // (default 'ask').
  const planner = assemble(
    plannerId, HARNESS_ROLE.PLANNER, 'ask',
    _buildPlannerDescription(sched, ctx), [],
  );
  const generator = assemble(
    generatorId, HARNESS_ROLE.GENERATOR, resolved.generatorType,
    _buildGeneratorDescription(sched, ctx, {}), [plannerId],
  );
  const evaluator = assemble(
    evaluatorId, HARNESS_ROLE.EVALUATOR, 'ask',
    _buildEvaluatorDescription(sched, ctx), [generatorId],
  );

  return { items: [planner, generator, evaluator], missionId, artifactPath };
}
|
|
397
|
+
|
|
398
|
+
/**
 * Build the Generator+Evaluator pair for iteration N+1 after the Evaluator
 * fails the rubric. No Planner item is created here; the iteration reuses the
 * artifact path carried in the previous item's `_harness` meta.
 *
 * The new generator depends on `prevEvaluatorItem`, and the new evaluator
 * depends on the new generator.
 *
 * @param {object} prevEvaluatorItem - the evaluator work item that just
 *   completed; must carry `_harness` meta with role 'evaluator'
 * @param {object} [verdict] - parsed verdict; `verdict.feedback` is injected
 *   into the generator prompt ('(no feedback supplied)' when absent)
 * @param {object} [opts] - optional `now` (finite ms timestamp) override
 * @returns {object[]} `[generator, evaluator]` work items for the next iteration
 * @throws {Error} when `_harness` meta is missing or the role is not 'evaluator'
 */
function createIterationWorkItems(prevEvaluatorItem, verdict, opts) {
  if (!prevEvaluatorItem || !prevEvaluatorItem._harness) {
    throw new Error('createIterationWorkItems: prevEvaluatorItem missing _harness meta');
  }
  const prevMeta = prevEvaluatorItem._harness;
  if (prevMeta.role !== HARNESS_ROLE.EVALUATOR) {
    throw new Error(`createIterationWorkItems: prev item must be an evaluator (got role=${prevMeta.role})`);
  }
  const iteration = (Number(prevMeta.iteration) || 1) + 1;
  const nowMs = opts && Number.isFinite(opts.now) ? opts.now : Date.now();

  // The previous item's title already starts with a "[harness:<role>:iN] "
  // tag. Strip it (and any tags stacked by earlier iterations) so the new
  // titles and the "**Mission:**" prompt line show only the mission name
  // instead of accumulating one tag per iteration.
  const prevTitle = prevEvaluatorItem.title;
  const missionTitle = typeof prevTitle === 'string'
    ? prevTitle.replace(/^(?:\[harness:[^\]]*\]\s*)+/, '').trim()
    : prevTitle;

  // Minimal schedule stand-in rebuilt from the previous item, since the
  // original schedule object is not available here.
  const sched = {
    id: prevEvaluatorItem._scheduleId,
    title: missionTitle || prevMeta.missionId,
    description: '', // Carried via artifact + feedback, not re-rendered.
    harness_rubric: prevMeta.rubric,
    project: prevEvaluatorItem.project || null,
    priority: prevEvaluatorItem.priority || 'medium',
  };
  const ctx = {
    artifactPath: prevMeta.artifactPath,
    missionId: prevMeta.missionId,
    iteration,
    threshold: prevMeta.threshold,
    maxIterations: prevMeta.maxIterations,
    rubric: prevMeta.rubric,
  };
  const resolved = {
    threshold: prevMeta.threshold,
    maxIterations: prevMeta.maxIterations,
    generatorType: prevMeta.generatorType || HARNESS_DEFAULTS.generatorType,
  };

  const generatorId = _buildItemId(sched.id || 'mission', HARNESS_ROLE.GENERATOR, iteration, nowMs);
  const evaluatorId = _buildItemId(sched.id || 'mission', HARNESS_ROLE.EVALUATOR, iteration, nowMs);
  const feedback = verdict && verdict.feedback ? verdict.feedback : '(no feedback supplied)';

  const generator = {
    id: generatorId,
    type: resolved.generatorType,
    title: `[harness:generator:i${iteration}] ${sched.title}`,
    description: _buildGeneratorDescription(sched, ctx, { previousFeedback: feedback }),
    depends_on: [prevEvaluatorItem.id],
    priority: sched.priority,
    status: WI_STATUS.PENDING,
    created: ts(),
    createdBy: 'harness:iterate',
    project: sched.project,
    agent: null,
    _scheduleId: sched.id,
    _missionId: prevMeta.missionId,
    _harness: _buildHarnessMeta(prevMeta.missionId, HARNESS_ROLE.GENERATOR, iteration, resolved, sched, prevMeta.artifactPath),
  };
  const evaluator = {
    id: evaluatorId,
    type: 'ask',
    title: `[harness:evaluator:i${iteration}] ${sched.title}`,
    description: _buildEvaluatorDescription(sched, ctx),
    depends_on: [generatorId],
    priority: sched.priority,
    status: WI_STATUS.PENDING,
    created: ts(),
    createdBy: 'harness:iterate',
    project: sched.project,
    agent: null,
    _scheduleId: sched.id,
    _missionId: prevMeta.missionId,
    _harness: _buildHarnessMeta(prevMeta.missionId, HARNESS_ROLE.EVALUATOR, iteration, resolved, sched, prevMeta.artifactPath),
  };

  return [generator, evaluator];
}
|
|
479
|
+
|
|
480
|
+
// ─── Verdict parsing + iteration gate ───────────────────────────────────────
|
|
481
|
+
|
|
482
|
+
// "Score: 0.85" / "Score = .9", also tolerating markdown emphasis around the
// label ("**Score:** 0.85"). The trailing lookahead rejects numbers that merely
// START with 0 or 1 ("Score: 10", "Score: 15%"), which would otherwise be read
// as a perfect score of 1.
const SCORE_RE = /(?:^|\W)Score[\s*]*[:=][\s*]*([0-1](?:\.\d+)?|\.\d+)(?!\.?\d)/i;
// Loose "mentioned anywhere" verdict heuristics, used only as a last resort.
const PASS_RE = /(?:^|[^\w])(PASS|✅\s*PASS|verdict\s*[:=]\s*pass)\b/i;
const FAIL_RE = /(?:^|[^\w])(FAIL|❌\s*FAIL|verdict\s*[:=]\s*fail)\b/i;
// A line that consists solely of the verdict, after markdown decoration has
// been stripped: "PASS", "Verdict: FAIL", "✅ PASS".
const VERDICT_LINE_RE = /^(?:[✅❌]\uFE0F?\s*)?(?:verdict\s*[:=]\s*)?(PASS|FAIL)[.!]?$/i;

/**
 * Look for verdicts written on their own line, the format the Evaluator
 * prompt asks for.
 *
 * @param {string} stdout - raw evaluator output
 * @returns {boolean | null} false if any standalone FAIL line exists, true if
 *   only standalone PASS lines exist, null if there is no standalone verdict
 */
function _findStandaloneVerdict(stdout) {
  let sawPass = false;
  let sawFail = false;
  for (const rawLine of stdout.split(/\r?\n/)) {
    // Drop emphasis/backticks and leading quote, heading or bullet markers.
    const line = rawLine.replace(/[*_`]/g, '').replace(/^[\s>#-]+/, '').trim();
    const match = VERDICT_LINE_RE.exec(line);
    if (!match) continue;
    if (match[1].toUpperCase() === 'FAIL') sawFail = true;
    else sawPass = true;
  }
  // Contradictory standalone verdicts resolve to FAIL (the conservative choice).
  if (sawFail) return false;
  if (sawPass) return true;
  return null;
}

/**
 * Extract { score, pass, feedback } from the Evaluator's completion report and
 * stdout. Structured fields in the completion report win when present; stdout
 * is only used to fill in fields the report did not provide.
 *
 * @param {string} stdout - raw evaluator output (text fallback)
 * @param {object} [structuredCompletion] - parsed completion report; reads
 *   harness_score, harness_pass, harness_feedback and summary
 * @returns {{ score: number | null, pass: boolean | null, feedback: string }}
 *   `score` is clamped to [0, 1]. `score=null` and `pass=null` together mean
 *   "no signal" — the caller should treat this as inconclusive (do NOT retry
 *   blindly).
 */
function parseEvaluatorVerdict(stdout, structuredCompletion) {
  let score = null;
  let pass = null;
  let feedback = '';

  // Structured fields take precedence — they're the documented contract in
  // the evaluator prompt and not vulnerable to text-format drift.
  if (structuredCompletion && typeof structuredCompletion === 'object') {
    if (typeof structuredCompletion.harness_score === 'number' && Number.isFinite(structuredCompletion.harness_score)) {
      score = Math.max(0, Math.min(1, structuredCompletion.harness_score));
    }
    if (typeof structuredCompletion.harness_pass === 'boolean') {
      pass = structuredCompletion.harness_pass;
    }
    if (typeof structuredCompletion.harness_feedback === 'string' && structuredCompletion.harness_feedback.trim()) {
      feedback = structuredCompletion.harness_feedback.trim();
    } else if (typeof structuredCompletion.summary === 'string' && structuredCompletion.summary.trim()) {
      feedback = structuredCompletion.summary.trim();
    }
  }

  // Text fallback — only fill in fields the structured report did not provide.
  if ((score === null || pass === null || !feedback) && typeof stdout === 'string' && stdout.length > 0) {
    if (score === null) {
      const m = SCORE_RE.exec(stdout);
      if (m) {
        const n = parseFloat(m[1]);
        if (Number.isFinite(n)) score = Math.max(0, Math.min(1, n));
      }
    }
    if (pass === null) {
      // Prefer a verdict on its own line: an incidental word such as "no
      // checks fail" in the feedback must not override an explicit PASS line.
      pass = _findStandaloneVerdict(stdout);
      if (pass === null) {
        // No standalone verdict: fall back to loose matching, where FAIL
        // takes precedence over PASS when both appear (the evaluator's
        // explanation of failure may mention 'pass criteria' etc).
        if (FAIL_RE.test(stdout)) pass = false;
        else if (PASS_RE.test(stdout)) pass = true;
      }
    }
    if (!feedback) {
      // Best-effort: take the last non-empty line as the feedback summary.
      const lines = stdout.split(/\r?\n/).map((l) => l.trim()).filter(Boolean);
      if (lines.length > 0) feedback = lines[lines.length - 1].slice(0, 2000);
    }
  }

  // A score without a pass flag is returned with pass=null on purpose, so the
  // caller can compare the score with the per-mission threshold instead of a
  // default baked in here.
  return { score, pass, feedback };
}
|
|
545
|
+
|
|
546
|
+
/**
 * Decide whether another Generator+Evaluator iteration should be spawned.
 *
 * Decision order:
 *   1. Missing meta or verdict → stop.
 *   2. verdict.pass === true → stop (mission succeeded).
 *   3. iteration >= maxIterations → stop (cap reached).
 *   4. A finite numeric score decides: iterate only when score < threshold.
 *   5. No score: iterate only on an explicit verdict.pass === false.
 *   6. No score and no pass signal → stop, so silent agents cannot loop
 *      forever; an explicit failure is required to retry.
 *
 * @param {object} harnessMeta - item `_harness` meta; reads iteration,
 *   maxIterations and threshold (defaults apply when they are not usable numbers)
 * @param {object} verdict - `{ score, pass }` as produced by parseEvaluatorVerdict
 * @returns {boolean} true when another iteration should run
 */
function shouldIterateAgain(harnessMeta, verdict) {
  if (!harnessMeta || !verdict) {
    return false;
  }

  const currentIteration = Number(harnessMeta.iteration) || 1;
  const iterationCap = Number(harnessMeta.maxIterations) || HARNESS_DEFAULTS.maxIterations;
  const rawThreshold = Number(harnessMeta.threshold);
  const passMark = Number.isFinite(rawThreshold) ? rawThreshold : HARNESS_DEFAULTS.threshold;

  const { pass, score } = verdict;
  if (pass === true || currentIteration >= iterationCap) {
    return false;
  }

  const hasScore = typeof score === 'number' && Number.isFinite(score);
  return hasScore ? score < passMark : pass === false;
}
|
|
574
|
+
|
|
575
|
+
// Public surface of the harness module: constants, path helpers, config
// validation, work-item factories and the verdict/iteration helpers.
module.exports = {
  HARNESS_MODE,
  HARNESS_ROLE,
  HARNESS_DEFAULTS,
  HARNESS_MAX_ITERATIONS_CAP,
  harnessRootDir,
  harnessMissionDir,
  harnessArtifactPath,
  validateHarnessConfig,
  createTriAgentMission,
  createIterationWorkItems,
  parseEvaluatorVerdict,
  shouldIterateAgain,
  // Exported for direct unit tests (per docs/skills.md skill 'export-internal-helpers-for-direct-unit-tests').
  _buildPlannerDescription,
  _buildGeneratorDescription,
  _buildEvaluatorDescription,
};
|