@yemi33/minions 0.1.2122 → 0.1.2123
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/harness-mode.md +92 -0
- package/engine/ado.js +9 -0
- package/engine/github.js +4 -1
- package/engine/harness.js +592 -0
- package/engine/lifecycle.js +91 -0
- package/engine/scheduler.js +40 -3
- package/engine.js +52 -13
- package/package.json +1 -1
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# Tri-Agent Harness Mode
|
|
2
|
+
|
|
3
|
+
> Status: opt-in feature flag on scheduled tasks (`harness_mode: "tri_agent"`).
|
|
4
|
+
> Shipped: W-mq07a9gf000jbc2b. Module: [`engine/harness.js`](../engine/harness.js).
|
|
5
|
+
|
|
6
|
+
## What it is
|
|
7
|
+
|
|
8
|
+
A way to turn one schedule firing into a coordinated **Planner → Generator → Evaluator** trio that iterates on a shared on-disk artifact until the artifact meets a rubric or hits an iteration cap. Useful for "produce a piece of work, then improve it" loops where a single agent call would either underspecify the task or produce uneven quality.
|
|
9
|
+
|
|
10
|
+
The three roles in order:
|
|
11
|
+
|
|
12
|
+
1. **Planner** (`ask` type; read-only with respect to project code) — reads the rubric and writes a short numbered plan to the shared artifact in the mission directory.
|
|
13
|
+
2. **Generator** (defaults to `ask`, inherits `sched.type`) — produces the artifact at `<MINIONS_DIR>/engine/harness/<missionId>/artifact.md` per the plan.
|
|
14
|
+
3. **Evaluator** (`ask`, read-only) — scores the artifact against the rubric and reports a verdict.
|
|
15
|
+
|
|
16
|
+
If the evaluator's verdict score is below `harness_threshold` (and the iteration cap hasn't been hit), the engine appends a fresh `Generator → Evaluator` pair carrying the evaluator's feedback in the next generator's prompt. Loop continues until pass or cap.
|
|
17
|
+
|
|
18
|
+
## Config schema (add to a schedule in `config.json`)
|
|
19
|
+
|
|
20
|
+
```json
|
|
21
|
+
{
|
|
22
|
+
"id": "weekly-design-review",
|
|
23
|
+
"title": "Tri-agent design review",
|
|
24
|
+
"cron": "0 9 * * MON",
|
|
25
|
+
"type": "ask",
|
|
26
|
+
"harness_mode": "tri_agent",
|
|
27
|
+
"harness_rubric": "Score 0-1. 1.0 = all sections complete with code examples. 0 = missing sections.",
|
|
28
|
+
"harness_threshold": 0.7,
|
|
29
|
+
"harness_max_iterations": 5
|
|
30
|
+
}
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
| Field | Required | Default | Notes |
|
|
34
|
+
|--------------------------|----------|---------|-----------------------------------------------------------------------|
|
|
35
|
+
| `harness_mode` | yes | — | Must equal `"tri_agent"` to enable. Any other value falls back to plain scheduled work. |
|
|
36
|
+
| `harness_rubric` | yes | — | Non-empty string. Injected into every role's prompt. The evaluator scores against this. |
|
|
37
|
+
| `harness_threshold` | no | `0.7` | Number in `(0, 1]`. Verdict score `>= threshold` = pass; `<` = iterate. |
|
|
38
|
+
| `harness_max_iterations` | no | `5` | Positive integer, capped at `20`. Counts Generator → Evaluator rounds; the Planner runs only once, as part of iteration 1. |
|
|
39
|
+
|
|
40
|
+
Invalid harness config logs a warning and **skips the firing without recording a schedule run**, so fixing the config and waiting for the next cron tick is enough to recover — no manual reset needed.
|
|
41
|
+
|
|
42
|
+
## Lifecycle
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
cron fires
|
|
46
|
+
└─ scheduler.discoverScheduledWork detects harness_mode === 'tri_agent'
|
|
47
|
+
└─ validateHarnessConfig (skip+warn on failure)
|
|
48
|
+
└─ createTriAgentMission → 3 work items
|
|
49
|
+
├─ Planner (iteration 1)
|
|
50
|
+
├─ Generator (iteration 1, depends on Planner)
|
|
51
|
+
└─ Evaluator (iteration 1, depends on Generator)
|
|
52
|
+
│
|
|
53
|
+
▼ (on success)
|
|
54
|
+
lifecycle.runPostCompletionHooks
|
|
55
|
+
└─ handleHarnessIterationResult
|
|
56
|
+
└─ parseEvaluatorVerdict + shouldIterateAgain
|
|
57
|
+
└─ if iterate: append Generator + Evaluator (iteration N+1)
|
|
58
|
+
└─ next tick dispatches them
|
|
59
|
+
└─ if pass / cap / inconclusive: mission terminal
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Artifact layout
|
|
63
|
+
|
|
64
|
+
```
|
|
65
|
+
<MINIONS_DIR>/engine/harness/<missionId>/
|
|
66
|
+
└─ artifact.md ← Generator writes here, Evaluator reads here
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Mission ID format: `<scheduleId>-<unixMs>-<rand6>`. The mission directory is the contract — agents in all 3 roles get the same path injected into their prompts.
|
|
70
|
+
|
|
71
|
+
## Evaluator verdict protocol
|
|
72
|
+
|
|
73
|
+
The evaluator can report its pass/fail verdict and score in either of two ways:
|
|
74
|
+
|
|
75
|
+
- **Preferred (structured):** include the fields in the completion report sidecar:
|
|
76
|
+
```json
|
|
77
|
+
{ "harness_pass": true, "harness_score": 0.82, "harness_feedback": "all sections present" }
|
|
78
|
+
```
|
|
79
|
+
- **Fallback (text):** include `Score: 0.82` and `PASS` / `FAIL` in the summary. Structured fields win when both present. `FAIL` takes precedence when both `PASS` and `FAIL` appear in the text.
|
|
80
|
+
|
|
81
|
+
If neither signal is parseable, the harness treats the verdict as inconclusive and stops iterating (`shouldIterateAgain` returns false) to avoid an infinite loop driven by a silent agent.
|
|
82
|
+
|
|
83
|
+
## Dedup behavior (engine.js)
|
|
84
|
+
|
|
85
|
+
Within a single tick the standard scheduled-work dedup is keyed by `_scheduleId`, which would collapse the harness trio to one item. The harness trio share a `_missionId`; engine.js snapshots active mission IDs **before** the dedup loop so all 3 land together, while plain scheduled items keep the original `_scheduleId` dedup.
|
|
86
|
+
|
|
87
|
+
## Operational notes
|
|
88
|
+
|
|
89
|
+
- Tri-agent items are **schedule-driven** — there's no manual "fire a harness mission" entry point. Add a schedule with `harness_mode: "tri_agent"` to opt in.
|
|
90
|
+
- Iteration pairs always reuse the original mission's artifact path, threshold, max-iterations, and rubric. The evaluator's verdict feedback is appended to the next generator's prompt.
|
|
91
|
+
- Mission state lives entirely on disk: the work-items.json trio + the artifact file. No new DB tables.
|
|
92
|
+
- Each iteration's evaluator is a separate work item, so dispatch retries, cooldowns, and steering apply normally to every role.
|
package/engine/ado.js
CHANGED
|
@@ -975,6 +975,15 @@ async function forEachActivePr(config, token, callback) {
|
|
|
975
975
|
continue;
|
|
976
976
|
}
|
|
977
977
|
|
|
978
|
+
// Per-project throttle skip — emit one log line per skipped project, then continue.
|
|
979
|
+
// Sub-item W-mq03l6zh0006f0a1-b will replace the global isAdoThrottled() probe with
|
|
980
|
+
// a per-org `isOrgBaseThrottled(orgBase)` check so a 429 on one org no longer pauses
|
|
981
|
+
// polling for healthy orgs.
|
|
982
|
+
if (isAdoThrottled()) {
|
|
983
|
+
log('info', `[ado] PR poll skipped for ${project.name || project.repoName || 'unknown project'} — org ${orgBase} throttled`);
|
|
984
|
+
continue;
|
|
985
|
+
}
|
|
986
|
+
|
|
978
987
|
// Parallelize PR polling within each project (max 5 concurrent to avoid rate limits)
|
|
979
988
|
const CONCURRENCY = 5;
|
|
980
989
|
for (let i = 0; i < activePrs.length; i += CONCURRENCY) {
|
package/engine/github.js
CHANGED
|
@@ -295,7 +295,10 @@ function resetSlugBackoff(slug) {
|
|
|
295
295
|
// ─── GitHub Rate-Limit Throttle ────────────────────────────────────────────
|
|
296
296
|
// Tracks rate-limiting from GitHub API (gh CLI exits non-zero with rate-limit messages).
|
|
297
297
|
// GitHub rate limits reset hourly, so cap at 60 min.
|
|
298
|
-
|
|
298
|
+
// jitterRatio: 0.2 — apply ±20% random jitter to backoff to avoid thundering herd
|
|
299
|
+
// when many concurrent gh calls race the same 1-hr reset window. See sub-item
|
|
300
|
+
// W-mq03l6zh0006f0a1-a for the createThrottleTracker jitter math.
|
|
301
|
+
const _ghThrottle = createThrottleTracker({ label: 'gh', baseBackoffMs: 60000, maxBackoffMs: 60 * 60000, jitterRatio: 0.2 });
|
|
299
302
|
|
|
300
303
|
/** Returns true if GitHub is rate-limited and retryAfter hasn't elapsed. */
|
|
301
304
|
const isGhThrottled = () => _ghThrottle.isThrottled();
|
|
@@ -0,0 +1,592 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* engine/harness.js — Tri-agent harness mode for long-running missions.
|
|
3
|
+
*
|
|
4
|
+
* Inspired by Anthropic's Harness Design (April 2026), this module implements
|
|
5
|
+
* the "tri_agent" mission mode: a schedule firing produces three coordinated
|
|
6
|
+
* work items — Planner → Generator → Evaluator — that operate against a
|
|
7
|
+
* shared artifact on disk. The Evaluator scores the artifact against a rubric
|
|
8
|
+
* declared in the schedule config; if it scores below the threshold (default
|
|
9
|
+
* 0.7), the engine spawns another Generator+Evaluator iteration with the
|
|
10
|
+
* Evaluator's feedback injected. The cycle terminates on pass or after
|
|
11
|
+
* `harness_max_iterations` (default 5) iterations — whichever comes first.
|
|
12
|
+
*
|
|
13
|
+
* Why a separate module:
|
|
14
|
+
* - Keeps scheduler.js focused on cron parsing + run-dedup.
|
|
15
|
+
* - The iteration loop lives in lifecycle.js so the engine's existing
|
|
16
|
+
* post-completion hook chain owns retry orchestration.
|
|
17
|
+
* - Pure helpers here (no side effects beyond reading shared.MINIONS_DIR)
|
|
18
|
+
* are easy to test in isolation.
|
|
19
|
+
*
|
|
20
|
+
* Schedule config shape (additive on top of the cron schedule schema):
|
|
21
|
+
* {
|
|
22
|
+
* id: 'daily-research',
|
|
23
|
+
* cron: '0 9 * * *',
|
|
24
|
+
* title: 'Daily research pass',
|
|
25
|
+
* description: 'Summarize new arxiv papers',
|
|
26
|
+
* project: 'minions', // optional
|
|
27
|
+
* type: 'ask', // generator work-type (default 'ask')
|
|
28
|
+
* harness_mode: 'tri_agent', // REQUIRED to enable harness mode
|
|
29
|
+
* harness_rubric: 'Must cite ≥3 papers', // REQUIRED — passed to Evaluator
|
|
30
|
+
* harness_threshold: 0.7, // default 0.7, must be (0, 1]
|
|
31
|
+
* harness_max_iterations: 5, // default 5, integer in [1, 20]
|
|
32
|
+
* }
|
|
33
|
+
*
|
|
34
|
+
* Per-item _harness meta (carried through dispatch + lifecycle):
|
|
35
|
+
* {
|
|
36
|
+
* role: 'planner' | 'generator' | 'evaluator',
|
|
37
|
+
* iteration: 1, // generators/evaluators bump this on retry
|
|
38
|
+
* missionId: 'sched-id-1700-abc',
|
|
39
|
+
* artifactPath: '<MINIONS_DIR>/engine/harness/<missionId>/artifact.md',
|
|
40
|
+
* rubric: '...', // verbatim from schedule
|
|
41
|
+
* threshold: 0.7,
|
|
42
|
+
* maxIterations: 5,
|
|
43
|
+
* generatorType: 'ask', // remembered so iteration N+1 reuses it
|
|
44
|
+
* }
|
|
45
|
+
*
|
|
46
|
+
* Zero dependencies beyond Node built-ins + engine/shared.
|
|
47
|
+
*/
|
|
48
|
+
|
|
49
|
+
'use strict';
|
|
50
|
+
|
|
51
|
+
const path = require('path');
|
|
52
|
+
const shared = require('./shared');
|
|
53
|
+
const { MINIONS_DIR, WI_STATUS, ts } = shared;
|
|
54
|
+
|
|
55
|
+
// Accepted values for a schedule's `harness_mode`. Only the tri-agent mode exists.
const HARNESS_MODE = Object.freeze({ TRI_AGENT: 'tri_agent' });

// Role tags recorded in each work item's `_harness.role`.
const HARNESS_ROLE = Object.freeze({
  PLANNER: 'planner',
  GENERATOR: 'generator',
  EVALUATOR: 'evaluator',
});

// Values applied when a schedule leaves the optional harness fields unset.
const HARNESS_DEFAULTS = Object.freeze({
  threshold: 0.7,
  maxIterations: 5,
  // Generator work type when sched.type is absent. 'ask' is read-only and
  // opens no PR, which suits the research / synthesis use case the harness
  // was designed for.
  generatorType: 'ask',
});

// Largest value validateHarnessConfig accepts for `harness_max_iterations`.
const HARNESS_MAX_ITERATIONS_CAP = 20;
|
|
74
|
+
|
|
75
|
+
// Filesystem layout for harness artifacts. Each mission gets its own dir so
|
|
76
|
+
// concurrent missions don't stomp each other's artifacts.
|
|
77
|
+
/** Directory under MINIONS_DIR that contains one sub-directory per harness mission. */
function harnessRootDir() {
  const segments = [MINIONS_DIR, 'engine', 'harness'];
  return path.join(...segments);
}
|
|
80
|
+
|
|
81
|
+
/** Directory for a single mission: `<harness root>/<missionId>`. */
function harnessMissionDir(missionId) {
  const root = harnessRootDir();
  return path.join(root, missionId);
}
|
|
84
|
+
|
|
85
|
+
/** Path of the shared `artifact.md` file inside a mission's directory. */
function harnessArtifactPath(missionId) {
  const missionDir = harnessMissionDir(missionId);
  return path.join(missionDir, 'artifact.md');
}
|
|
88
|
+
|
|
89
|
+
/**
 * Check the tri-agent harness fields of a schedule entry and resolve defaults.
 *
 * @param {object} sched - Schedule entry to inspect.
 * @returns {{ valid: boolean, errors: string[], resolved?: { threshold: number, maxIterations: number, generatorType: string } }}
 *   `resolved` is present only when `valid` is true and holds the values that
 *   will be applied (explicit settings, otherwise HARNESS_DEFAULTS).
 */
function validateHarnessConfig(sched) {
  if (!sched || typeof sched !== 'object') {
    return { valid: false, errors: ['schedule must be an object'] };
  }

  const problems = [];
  const mode = sched.harness_mode;
  const rubric = sched.harness_rubric;
  const rawThreshold = sched.harness_threshold;
  const rawMaxIterations = sched.harness_max_iterations;

  if (mode !== HARNESS_MODE.TRI_AGENT) {
    problems.push(`harness_mode must be "${HARNESS_MODE.TRI_AGENT}" (got ${JSON.stringify(mode)})`);
  }

  const hasRubric = typeof rubric === 'string' && rubric.trim().length !== 0;
  if (!hasRubric) {
    problems.push('harness_rubric is required (non-empty string)');
  }

  // Optional fields: undefined / null mean "use the default"; anything else
  // must validate or it is reported.
  let threshold = HARNESS_DEFAULTS.threshold;
  if (rawThreshold !== undefined && rawThreshold !== null) {
    const thresholdOk = typeof rawThreshold === 'number'
      && Number.isFinite(rawThreshold)
      && rawThreshold > 0
      && rawThreshold <= 1;
    if (thresholdOk) {
      threshold = rawThreshold;
    } else {
      problems.push(`harness_threshold must be a number in (0, 1] (got ${JSON.stringify(rawThreshold)})`);
    }
  }

  let maxIterations = HARNESS_DEFAULTS.maxIterations;
  if (rawMaxIterations !== undefined && rawMaxIterations !== null) {
    const iterationsOk = Number.isInteger(rawMaxIterations)
      && rawMaxIterations >= 1
      && rawMaxIterations <= HARNESS_MAX_ITERATIONS_CAP;
    if (iterationsOk) {
      maxIterations = rawMaxIterations;
    } else {
      problems.push(`harness_max_iterations must be a positive integer ≤ ${HARNESS_MAX_ITERATIONS_CAP} (got ${JSON.stringify(rawMaxIterations)})`);
    }
  }

  if (problems.length > 0) {
    return { valid: false, errors: problems };
  }

  // The generator inherits the schedule's work type; blank or non-string
  // types fall back to the default.
  const trimmedType = typeof sched.type === 'string' ? sched.type.trim() : '';
  const generatorType = trimmedType || HARNESS_DEFAULTS.generatorType;

  return {
    valid: true,
    errors: [],
    resolved: { threshold, maxIterations, generatorType },
  };
}
|
|
139
|
+
|
|
140
|
+
// ─── ID + path helpers ──────────────────────────────────────────────────────
|
|
141
|
+
|
|
142
|
+
/**
 * Short random suffix that disambiguates mission and work-item IDs minted in
 * the same millisecond.
 *
 * Not cryptographically secure: it only needs to make collisions unlikely,
 * and the IDs it is appended to also carry a millisecond timestamp.
 *
 * @returns {string} Exactly 6 characters drawn from [0-9a-z].
 */
function _shortRand() {
  // Math.random().toString(36) looks like "0.<base-36 digits>", but the
  // fraction can be shorter than 6 digits (0.5 → "0.i", 0 → "0"). Pad so the
  // suffix always has the fixed `<rand6>` width the mission-ID format promises.
  return Math.random().toString(36).slice(2, 8).padEnd(6, '0');
}
|
|
148
|
+
|
|
149
|
+
/**
 * Compose a mission ID of the form `<scheduleId>-<nowMs>-<random suffix>`.
 * Uses the literal prefix 'mission' when the schedule has no string id.
 */
function _buildMissionId(sched, nowMs) {
  let prefix = 'mission';
  if (sched && typeof sched.id === 'string') {
    prefix = sched.id;
  }
  const suffix = _shortRand();
  return `${prefix}-${nowMs}-${suffix}`;
}
|
|
153
|
+
|
|
154
|
+
/**
 * Compose a work-item ID:
 * `sched-<scheduleId>-<role>-i<iteration>-<nowMs>-<random suffix>`.
 */
function _buildItemId(scheduleId, role, iteration, nowMs) {
  const suffix = _shortRand();
  const parts = ['sched', `${scheduleId}`, `${role}`, `i${iteration}`, `${nowMs}`, suffix];
  return parts.join('-');
}
|
|
157
|
+
|
|
158
|
+
// ─── Prompt builders ────────────────────────────────────────────────────────
|
|
159
|
+
|
|
160
|
+
// Markdown section headings reused by the planner, generator and evaluator prompts.
const ARTIFACT_HEADING = '## Shared Artifact';
const RUBRIC_HEADING = '## Rubric';
|
|
162
|
+
|
|
163
|
+
/**
 * Render the Planner's prompt (markdown) for a mission.
 *
 * @param {object} sched - Schedule entry; `title`, `id` and `description` are read.
 * @param {object} ctx - Mission context: artifactPath, missionId, threshold,
 *   maxIterations, rubric, iteration.
 * @returns {string} Prompt text with lines joined by '\n'.
 */
function _buildPlannerDescription(sched, ctx) {
  const { artifactPath, missionId, threshold, maxIterations, rubric, iteration } = ctx;
  const missionName = sched.title || sched.id;
  const goal = (sched.description || sched.title || '').trim();
  // Render the rubric as a markdown blockquote, one "> " per line.
  const quotedRubric = rubric.split('\n').map((line) => '> ' + line).join('\n');

  const intro = [
    `# Tri-Agent Mission — Planner (iteration ${iteration})`,
    '',
    `**Mission:** ${missionName}`,
    '',
    `**Goal:** ${goal}`,
    '',
    'You are the **Planner** in a three-agent harness loop. Your job is to',
    'decompose the mission goal above into a numbered list of concrete subtasks',
    'that the Generator will execute next, then write the plan to the shared',
    'artifact below. Keep subtasks small and verifiable.',
    '',
  ];

  const artifactSection = [
    ARTIFACT_HEADING,
    '',
    `Write your plan to: \`${artifactPath}\``,
    '',
    'Structure the file as:',
    '```',
    `# Mission ${missionId}`,
    '',
    '## Plan',
    '1. <subtask 1>',
    '2. <subtask 2>',
    '...',
    '```',
    '',
    "Do not execute the subtasks yourself — that is the Generator's job.",
    'Create the artifact directory if it does not exist.',
    '',
  ];

  const rubricSection = [
    RUBRIC_HEADING,
    '',
    'The Evaluator will eventually score the completed artifact against this',
    'rubric. Plan with the rubric in mind:',
    '',
    quotedRubric,
    '',
    `Threshold: ${threshold} · Max iterations: ${maxIterations}`,
    '',
    `Mission ID: \`${missionId}\``,
  ];

  return [...intro, ...artifactSection, ...rubricSection].join('\n');
}
|
|
206
|
+
|
|
207
|
+
/**
 * Render the Generator's prompt (markdown) for one iteration.
 *
 * @param {object} sched - Schedule entry; `title`, `id` and `description` are read.
 * @param {object} ctx - Mission context: artifactPath, missionId, threshold,
 *   maxIterations, rubric, iteration.
 * @param {{ previousFeedback?: * }} [opts] - When `previousFeedback` is truthy
 *   and non-blank after trimming, a "Previous Evaluator Feedback" section is
 *   inserted ahead of the rubric.
 * @returns {string} Prompt text with lines joined by '\n'.
 */
function _buildGeneratorDescription(sched, ctx, opts) {
  const { artifactPath, missionId, threshold, maxIterations, rubric, iteration } = ctx;

  let priorFeedback = '';
  if (opts && opts.previousFeedback) {
    priorFeedback = String(opts.previousFeedback).trim();
  }

  // Markdown blockquote helper: prefix every line with "> ".
  const asQuote = (text) => text.split('\n').map((line) => '> ' + line).join('\n');

  const missionName = sched.title || sched.id;
  const goal = (sched.description || sched.title || '').trim();

  const intro = [
    `# Tri-Agent Mission — Generator (iteration ${iteration})`,
    '',
    `**Mission:** ${missionName}`,
    '',
    `**Goal:** ${goal}`,
    '',
    "You are the **Generator** in a three-agent harness loop. Read the Planner's",
    'subtask list from the shared artifact, execute each subtask in order, and',
    'append your outputs to the artifact under a clearly-labelled section.',
    '',
  ];

  const artifactSection = [
    ARTIFACT_HEADING,
    '',
    `Shared artifact: \`${artifactPath}\``,
    '',
    `Append a section titled \`## Generator Output (iteration ${iteration})\` to the`,
    'artifact. Within it, address each numbered subtask from the plan.',
    '',
    'Do NOT delete or rewrite earlier sections — append only.',
  ];

  // Only present on retries, where the previous evaluator's feedback is known.
  const feedbackSection = priorFeedback
    ? [
      '',
      `## Previous Evaluator Feedback (iteration ${iteration - 1})`,
      '',
      'The previous iteration failed the rubric. The Evaluator provided this feedback:',
      '',
      asQuote(priorFeedback),
      '',
      'Address this feedback explicitly in your new output.',
    ]
    : [];

  const rubricSection = [
    '',
    RUBRIC_HEADING,
    '',
    'The Evaluator will score your output against this rubric:',
    '',
    asQuote(rubric),
    '',
    `Threshold: ${threshold} · Max iterations: ${maxIterations}`,
    '',
    `Mission ID: \`${missionId}\``,
  ];

  return [...intro, ...artifactSection, ...feedbackSection, ...rubricSection].join('\n');
}
|
|
258
|
+
|
|
259
|
+
/**
 * Render the Evaluator's prompt (markdown) for one iteration, including the
 * completion-report fields (`harness_score`, `harness_pass`,
 * `harness_feedback`) the evaluator is asked to emit.
 *
 * @param {object} sched - Schedule entry; `title` and `id` are read.
 * @param {object} ctx - Mission context: artifactPath, missionId, threshold,
 *   maxIterations, rubric, iteration.
 * @returns {string} Prompt text with lines joined by '\n'.
 */
function _buildEvaluatorDescription(sched, ctx) {
  const { artifactPath, missionId, threshold, maxIterations, rubric, iteration } = ctx;
  const missionName = sched.title || sched.id;
  // Render the rubric as a markdown blockquote, one "> " per line.
  const quotedRubric = rubric.split('\n').map((line) => '> ' + line).join('\n');

  const intro = [
    `# Tri-Agent Mission — Evaluator (iteration ${iteration})`,
    '',
    `**Mission:** ${missionName}`,
    '',
    'You are the **Evaluator** in a three-agent harness loop. Read the shared',
    "artifact (including the Planner's plan and the Generator's output) and",
    'score it against the rubric below.',
    '',
  ];

  const artifactSection = [
    ARTIFACT_HEADING,
    '',
    `Shared artifact: \`${artifactPath}\``,
    '',
    `Append a section titled \`## Evaluation (iteration ${iteration})\` containing:`,
    '- A numeric score in `[0, 1]` formatted as `Score: 0.NN`',
    '- A `PASS` or `FAIL` verdict on its own line',
    '- Concrete feedback under `### Feedback` explaining strengths and gaps',
    '',
  ];

  const rubricSection = [
    RUBRIC_HEADING,
    '',
    quotedRubric,
    '',
    `**Threshold:** ${threshold} — a score < ${threshold} is a FAIL and triggers another Generator iteration`,
    `(up to ${maxIterations} total iterations).`,
    '',
  ];

  const reportSection = [
    '## Completion Report',
    '',
    'In your JSON completion report include these fields so the engine can route',
    'the next iteration deterministically (in addition to the standard schema):',
    '```json',
    '{',
    ' "status": "success",',
    ' "summary": "<one-line verdict>",',
    ' "harness_score": 0.NN,',
    ' "harness_pass": true | false,',
    ' "harness_feedback": "<machine-readable feedback the next Generator should address>"',
    '}',
    '```',
    '',
    'If you cannot evaluate (artifact missing, malformed), set `harness_pass: false`,',
    '`harness_score: 0`, and explain in `harness_feedback`.',
    '',
    `Mission ID: \`${missionId}\``,
  ];

  return [...intro, ...artifactSection, ...rubricSection, ...reportSection].join('\n');
}
|
|
306
|
+
|
|
307
|
+
// ─── Mission creation ───────────────────────────────────────────────────────
|
|
308
|
+
|
|
309
|
+
/**
 * Fields shared by every work item of the initial trio: a role-tagged title,
 * priority (default 'medium'), pending status, creation stamp, project and
 * the originating schedule id.
 */
function _commonItemFields(sched, role, iteration) {
  const displayName = sched.title || sched.id;
  const title = `[harness:${role}:i${iteration}] ${displayName}`;
  const priority = sched.priority || 'medium';
  const project = sched.project || null;

  return {
    title,
    priority,
    status: WI_STATUS.PENDING,
    created: ts(),
    createdBy: 'scheduler:harness',
    project,
    agent: null,
    _scheduleId: sched.id,
  };
}
|
|
321
|
+
|
|
322
|
+
/**
 * Assemble the `_harness` metadata attached to a work item: the role and
 * iteration, mission identity, artifact location, the schedule's rubric and
 * the resolved threshold / iteration cap / generator type.
 */
function _buildHarnessMeta(missionId, role, iteration, resolved, sched, artifactPath) {
  const rubric = sched.harness_rubric;
  const { threshold, maxIterations, generatorType } = resolved;
  return {
    role,
    iteration,
    missionId,
    artifactPath,
    rubric,
    threshold,
    maxIterations,
    generatorType,
  };
}
|
|
334
|
+
|
|
335
|
+
/**
 * Build the initial Planner → Generator → Evaluator trio for one firing of a
 * tri-agent schedule.
 *
 * @param {object} sched - Schedule entry; must pass validateHarnessConfig.
 * @param {{ now?: number, missionId?: string }} [opts] - Optional overrides:
 *   `now` (finite epoch ms used in generated IDs; defaults to Date.now()) and
 *   `missionId` (non-empty string used instead of a generated mission ID).
 * @returns {{ items: object[], missionId: string, artifactPath: string }}
 *   `items` holds the planner, generator and evaluator in dispatch order; the
 *   generator depends on the planner and the evaluator on the generator.
 * @throws {Error} When the schedule's harness config is invalid. Callers that
 *   want to skip and log instead should validate upstream.
 */
function createTriAgentMission(sched, opts) {
  const { valid, errors, resolved } = validateHarnessConfig(sched);
  if (!valid) throw new Error(`tri_agent harness config invalid for schedule ${sched && sched.id}: ${errors.join('; ')}`);

  const nowMs = opts && Number.isFinite(opts.now) ? opts.now : Date.now();
  const missionId = (opts && typeof opts.missionId === 'string' && opts.missionId) || _buildMissionId(sched, nowMs);
  const artifactPath = harnessArtifactPath(missionId);
  const iteration = 1;
  const ctx = {
    artifactPath,
    missionId,
    iteration,
    threshold: resolved.threshold,
    maxIterations: resolved.maxIterations,
    rubric: sched.harness_rubric,
  };

  // Same fallback createIterationWorkItems uses for follow-up items, so a
  // schedule without an id never yields "sched-undefined-…" work-item IDs.
  const scheduleKey = sched.id || 'mission';
  const plannerId = _buildItemId(scheduleKey, HARNESS_ROLE.PLANNER, iteration, nowMs);
  const generatorId = _buildItemId(scheduleKey, HARNESS_ROLE.GENERATOR, iteration, nowMs);
  const evaluatorId = _buildItemId(scheduleKey, HARNESS_ROLE.EVALUATOR, iteration, nowMs);

  // All three items share one shape; only id, type, prompt and dependency differ.
  const makeItem = (role, id, type, description, dependsOn) => ({
    id,
    type,
    description,
    depends_on: dependsOn,
    ..._commonItemFields(sched, role, iteration),
    _missionId: missionId,
    _harness: _buildHarnessMeta(missionId, role, iteration, resolved, sched, artifactPath),
  });

  // Planner + Evaluator are read-only (ask) by design — they don't mutate
  // project code, they only read/write the shared harness artifact. The
  // generator inherits sched.type (default 'ask').
  const planner = makeItem(
    HARNESS_ROLE.PLANNER,
    plannerId,
    'ask',
    _buildPlannerDescription(sched, ctx),
    [],
  );
  const generator = makeItem(
    HARNESS_ROLE.GENERATOR,
    generatorId,
    resolved.generatorType,
    _buildGeneratorDescription(sched, ctx, {}),
    [plannerId],
  );
  const evaluator = makeItem(
    HARNESS_ROLE.EVALUATOR,
    evaluatorId,
    'ask',
    _buildEvaluatorDescription(sched, ctx),
    [generatorId],
  );

  return { items: [planner, generator, evaluator], missionId, artifactPath };
}
|
|
397
|
+
|
|
398
|
+
/**
 * Build the Generator + Evaluator pair for iteration N+1 after an Evaluator
 * fails the rubric. The Planner only runs once per mission — its plan is
 * already in the shared artifact.
 *
 * @param {object} prevEvaluatorItem - The evaluator work item that just
 *   completed. Must carry `_harness` meta with role 'evaluator'. The new
 *   generator lists this item's id in `depends_on`.
 * @param {{ feedback?: string }} [verdict] - Parsed evaluator verdict; its
 *   `feedback` is quoted in the next generator prompt (a placeholder is used
 *   when absent).
 * @param {{ now?: number }} [opts] - `now`: finite epoch ms used in generated
 *   IDs; defaults to Date.now().
 * @returns {object[]} `[generator, evaluator]`, the evaluator depending on the
 *   generator.
 * @throws {Error} When `prevEvaluatorItem` has no `_harness` meta or is not an
 *   evaluator item.
 */
function createIterationWorkItems(prevEvaluatorItem, verdict, opts) {
  if (!prevEvaluatorItem || !prevEvaluatorItem._harness) {
    throw new Error('createIterationWorkItems: prevEvaluatorItem missing _harness meta');
  }
  const prevMeta = prevEvaluatorItem._harness;
  if (prevMeta.role !== HARNESS_ROLE.EVALUATOR) {
    throw new Error(`createIterationWorkItems: prev item must be an evaluator (got role=${prevMeta.role})`);
  }
  const iteration = (Number(prevMeta.iteration) || 1) + 1;
  const nowMs = opts && Number.isFinite(opts.now) ? opts.now : Date.now();

  // The previous item's title already starts with "[harness:<role>:iN] ".
  // Strip every leading harness tag so new titles (and the Mission/Goal lines
  // in the prompts) carry the plain mission title instead of stacking one
  // more prefix per iteration.
  const harnessTagPrefix = /^(?:\[harness:[a-z]+:i\d+\]\s*)+/;
  const baseTitle = typeof prevEvaluatorItem.title === 'string'
    ? prevEvaluatorItem.title.replace(harnessTagPrefix, '').trim()
    : '';

  // Minimal stand-in for the original schedule entry, rebuilt from what the
  // previous item carries, so the prompt builders can be reused unchanged.
  const sched = {
    id: prevEvaluatorItem._scheduleId,
    title: baseTitle || prevMeta.missionId,
    description: '', // Carried via artifact + feedback, not re-rendered.
    harness_rubric: prevMeta.rubric,
    project: prevEvaluatorItem.project || null,
    priority: prevEvaluatorItem.priority || 'medium',
  };
  const ctx = {
    artifactPath: prevMeta.artifactPath,
    missionId: prevMeta.missionId,
    iteration,
    threshold: prevMeta.threshold,
    maxIterations: prevMeta.maxIterations,
    rubric: prevMeta.rubric,
  };
  const resolved = {
    threshold: prevMeta.threshold,
    maxIterations: prevMeta.maxIterations,
    generatorType: prevMeta.generatorType || HARNESS_DEFAULTS.generatorType,
  };

  const generatorId = _buildItemId(sched.id || 'mission', HARNESS_ROLE.GENERATOR, iteration, nowMs);
  const evaluatorId = _buildItemId(sched.id || 'mission', HARNESS_ROLE.EVALUATOR, iteration, nowMs);
  const feedback = verdict && verdict.feedback ? verdict.feedback : '(no feedback supplied)';

  const generator = {
    id: generatorId,
    type: resolved.generatorType,
    title: `[harness:generator:i${iteration}] ${sched.title}`,
    description: _buildGeneratorDescription(sched, ctx, { previousFeedback: feedback }),
    depends_on: [prevEvaluatorItem.id],
    priority: sched.priority,
    status: WI_STATUS.PENDING,
    created: ts(),
    createdBy: 'harness:iterate',
    project: sched.project,
    agent: null,
    _scheduleId: sched.id,
    _missionId: prevMeta.missionId,
    _harness: _buildHarnessMeta(prevMeta.missionId, HARNESS_ROLE.GENERATOR, iteration, resolved, sched, prevMeta.artifactPath),
  };
  const evaluator = {
    id: evaluatorId,
    type: 'ask',
    title: `[harness:evaluator:i${iteration}] ${sched.title}`,
    description: _buildEvaluatorDescription(sched, ctx),
    depends_on: [generatorId],
    priority: sched.priority,
    status: WI_STATUS.PENDING,
    created: ts(),
    createdBy: 'harness:iterate',
    project: sched.project,
    agent: null,
    _scheduleId: sched.id,
    _missionId: prevMeta.missionId,
    _harness: _buildHarnessMeta(prevMeta.missionId, HARNESS_ROLE.EVALUATOR, iteration, resolved, sched, prevMeta.artifactPath),
  };

  return [generator, evaluator];
}
|
|
479
|
+
|
|
480
|
+
// ─── Verdict parsing + iteration gate ───────────────────────────────────────
|
|
481
|
+
|
|
482
|
+
// Text-fallback patterns for reading a verdict out of free-form evaluator
// output. All three are case-insensitive and have no `g` flag, so `.exec`
// keeps no state between calls.

// "Score" followed by ":" or "=" and a number; capture group 1 is the numeric
// text (a leading 0 or 1 with an optional fraction, or a bare ".NN").
const SCORE_RE = /(?:^|\W)Score\s*[:=]\s*([0-1](?:\.\d+)?|\.\d+)/i;
// PASS, "✅ PASS" or "verdict: pass", preceded by the start of the text or a
// non-word character and followed by a word boundary.
const PASS_RE = /(?:^|[^\w])(PASS|✅\s*PASS|verdict\s*[:=]\s*pass)\b/i;
// FAIL, "❌ FAIL" or "verdict: fail", with the same boundaries as PASS_RE.
const FAIL_RE = /(?:^|[^\w])(FAIL|❌\s*FAIL|verdict\s*[:=]\s*fail)\b/i;
|
|
485
|
+
|
|
486
|
+
/**
|
|
487
|
+
* Extract { score, pass, feedback } from the Evaluator's completion report and
|
|
488
|
+
* stdout. Structured fields in the completion report win when present.
|
|
489
|
+
*
|
|
490
|
+
* Returns:
|
|
491
|
+
* { score: number | null, pass: boolean | null, feedback: string }
|
|
492
|
+
* `score=null` and `pass=null` together mean "no signal" — the caller should
|
|
493
|
+
* treat this as inconclusive (do NOT retry blindly).
|
|
494
|
+
*/
|
|
495
|
+
function parseEvaluatorVerdict(stdout, structuredCompletion) {
|
|
496
|
+
let score = null;
|
|
497
|
+
let pass = null;
|
|
498
|
+
let feedback = '';
|
|
499
|
+
|
|
500
|
+
// Structured fields take precedence — they're the documented contract in
|
|
501
|
+
// the evaluator prompt and not vulnerable to text-format drift.
|
|
502
|
+
if (structuredCompletion && typeof structuredCompletion === 'object') {
|
|
503
|
+
if (typeof structuredCompletion.harness_score === 'number' && Number.isFinite(structuredCompletion.harness_score)) {
|
|
504
|
+
score = Math.max(0, Math.min(1, structuredCompletion.harness_score));
|
|
505
|
+
}
|
|
506
|
+
if (typeof structuredCompletion.harness_pass === 'boolean') {
|
|
507
|
+
pass = structuredCompletion.harness_pass;
|
|
508
|
+
}
|
|
509
|
+
if (typeof structuredCompletion.harness_feedback === 'string' && structuredCompletion.harness_feedback.trim()) {
|
|
510
|
+
feedback = structuredCompletion.harness_feedback.trim();
|
|
511
|
+
} else if (typeof structuredCompletion.summary === 'string' && structuredCompletion.summary.trim()) {
|
|
512
|
+
feedback = structuredCompletion.summary.trim();
|
|
513
|
+
}
|
|
514
|
+
}
|
|
515
|
+
|
|
516
|
+
// Text fallback — only fill in fields the structured report did not provide.
|
|
517
|
+
if ((score === null || pass === null || !feedback) && typeof stdout === 'string' && stdout.length > 0) {
|
|
518
|
+
if (score === null) {
|
|
519
|
+
const m = SCORE_RE.exec(stdout);
|
|
520
|
+
if (m) {
|
|
521
|
+
const n = parseFloat(m[1]);
|
|
522
|
+
if (Number.isFinite(n)) score = Math.max(0, Math.min(1, n));
|
|
523
|
+
}
|
|
524
|
+
}
|
|
525
|
+
if (pass === null) {
|
|
526
|
+
const failMatch = FAIL_RE.exec(stdout);
|
|
527
|
+
const passMatch = PASS_RE.exec(stdout);
|
|
528
|
+
// FAIL takes precedence over PASS when both appear (the evaluator's
|
|
529
|
+
// explanation of failure may mention 'pass criteria' etc).
|
|
530
|
+
if (failMatch) pass = false;
|
|
531
|
+
else if (passMatch) pass = true;
|
|
532
|
+
}
|
|
533
|
+
if (!feedback) {
|
|
534
|
+
// Best-effort: take the last non-empty line as the feedback summary.
|
|
535
|
+
const lines = stdout.split(/\r?\n/).map(l => l.trim()).filter(Boolean);
|
|
536
|
+
if (lines.length > 0) feedback = lines[lines.length - 1].slice(0, 2000);
|
|
537
|
+
}
|
|
538
|
+
}
|
|
539
|
+
|
|
540
|
+
// If score is set but pass is not, infer pass from the threshold caller
|
|
541
|
+
// (lifecycle.shouldIterateAgain) — but leave pass=null here so the caller
|
|
542
|
+
// can apply the per-mission threshold rather than baking in a default.
|
|
543
|
+
return { score, pass, feedback };
|
|
544
|
+
}
|
|
545
|
+
|
|
546
|
+
/**
 * Decide whether to spawn another Generator+Evaluator iteration.
 *
 * Rules (in order):
 *   1. If verdict.pass === true, stop (mission succeeded).
 *   2. If iteration >= maxIterations, stop (cap reached).
 *   3. If we have a numeric score AND score >= threshold, treat as pass and stop.
 *   4. If we have a numeric score AND score < threshold, iterate.
 *   5. If verdict.pass === false explicitly (no score), iterate.
 *   6. Otherwise (no score and no pass signal), STOP — silent agents would
 *      loop forever; require explicit failure to retry.
 *
 * @param {object} harnessMeta - The work item's `_harness` metadata; the fields
 *   read are `iteration`, `maxIterations` and `threshold`. Missing or unusable
 *   values fall back to 1, HARNESS_DEFAULTS.maxIterations and
 *   HARNESS_DEFAULTS.threshold respectively.
 * @param {{ score: number | null, pass: boolean | null }} verdict - Parsed
 *   evaluator verdict.
 * @returns {boolean} true when another iteration should be queued.
 */
function shouldIterateAgain(harnessMeta, verdict) {
  if (!harnessMeta || !verdict) return false;

  const iteration = Number(harnessMeta.iteration) || 1;
  const maxIterations = Number(harnessMeta.maxIterations) || HARNESS_DEFAULTS.maxIterations;

  // Only a real number or a non-blank numeric string counts as a configured
  // threshold. Number(null) and Number('') are both 0, which would silently
  // turn "not configured" into "every score passes" instead of the default.
  const rawThreshold = harnessMeta.threshold;
  let configuredThreshold = NaN;
  if (typeof rawThreshold === 'number') {
    configuredThreshold = rawThreshold;
  } else if (typeof rawThreshold === 'string' && rawThreshold.trim() !== '') {
    configuredThreshold = Number(rawThreshold);
  }
  const threshold = Number.isFinite(configuredThreshold) ? configuredThreshold : HARNESS_DEFAULTS.threshold;

  if (verdict.pass === true) return false;
  if (iteration >= maxIterations) return false;

  if (typeof verdict.score === 'number' && Number.isFinite(verdict.score)) {
    return verdict.score < threshold;
  }
  if (verdict.pass === false) return true;
  return false;
}
|
|
574
|
+
|
|
575
|
+
module.exports = {
|
|
576
|
+
HARNESS_MODE,
|
|
577
|
+
HARNESS_ROLE,
|
|
578
|
+
HARNESS_DEFAULTS,
|
|
579
|
+
HARNESS_MAX_ITERATIONS_CAP,
|
|
580
|
+
harnessRootDir,
|
|
581
|
+
harnessMissionDir,
|
|
582
|
+
harnessArtifactPath,
|
|
583
|
+
validateHarnessConfig,
|
|
584
|
+
createTriAgentMission,
|
|
585
|
+
createIterationWorkItems,
|
|
586
|
+
parseEvaluatorVerdict,
|
|
587
|
+
shouldIterateAgain,
|
|
588
|
+
// Exported for direct unit tests (per docs/skills.md skill 'export-internal-helpers-for-direct-unit-tests').
|
|
589
|
+
_buildPlannerDescription,
|
|
590
|
+
_buildGeneratorDescription,
|
|
591
|
+
_buildEvaluatorDescription,
|
|
592
|
+
};
|
package/engine/lifecycle.js
CHANGED
|
@@ -14,6 +14,7 @@ const { trackEngineUsage } = require('./llm');
|
|
|
14
14
|
const { resolveRuntime } = require('./runtimes');
|
|
15
15
|
const adoGitAuth = require('./ado-git-auth');
|
|
16
16
|
const queries = require('./queries');
|
|
17
|
+
const harness = require('./harness');
|
|
17
18
|
const { isBranchActive } = require('./cooldown');
|
|
18
19
|
const { worktreeMatchesBranch, getWorktreeBranch, cleanupMergedPrLocalBranch } = require('./cleanup');
|
|
19
20
|
const { getConfig, getInboxFiles, getNotes, getPrs, getDispatch,
|
|
@@ -4040,6 +4041,82 @@ function handleDecompositionResult(stdout, meta, config, runtimeName) {
|
|
|
4040
4041
|
return 0;
|
|
4041
4042
|
}
|
|
4042
4043
|
|
|
4044
|
+
/**
 * Explain why a harness mission stopped iterating, for the terminal-stop log.
 * Uses the same iteration/maxIterations defaulting as harness.shouldIterateAgain
 * so the reported reason matches the decision that was actually taken.
 *
 * @param {object} harnessMeta - The evaluator item's `_harness` metadata.
 * @param {{ score: number | null, pass: boolean | null }} verdict
 * @returns {string} Human-readable stop reason.
 */
function _describeHarnessStop(harnessMeta, verdict) {
  if (verdict.pass === true) return 'passed';
  const iteration = Number(harnessMeta.iteration) || 1;
  const maxIterations = Number(harnessMeta.maxIterations) || harness.HARNESS_DEFAULTS.maxIterations;
  if (iteration >= maxIterations) return 'max iterations reached';
  // Below the cap with a numeric score, the only way to stop is score >= threshold.
  if (typeof verdict.score === 'number' && Number.isFinite(verdict.score)) return 'score met threshold';
  return 'inconclusive verdict';
}

/**
 * Tri-agent harness mode (W-mq07a9gf000jbc2b): when an evaluator completes,
 * parse its verdict against the configured rubric/threshold and — if the
 * artifact didn't pass and the iteration cap hasn't been hit — append a
 * fresh Generator+Evaluator pair so the harness can iterate on its own
 * artifact.
 *
 * Called from runPostCompletionHooks after a successful run when the
 * dispatched item carries _harness.role === 'evaluator'.
 *
 * @param {string} stdout - Evaluator output, used as the verdict text fallback.
 * @param {object | null} structuredCompletion - Parsed completion report, if any.
 * @param {object} meta - Dispatch metadata; `meta.item` is the evaluator work item.
 * @param {object} config - Engine config, used to enumerate project work-item files.
 * @returns {number} The number of work items actually appended (0 = terminal
 *   stop, the item is not an evaluator, a build/lookup failure, or the next
 *   pair was already queued).
 */
function handleHarnessIterationResult(stdout, structuredCompletion, meta, config) {
  const evaluatorItem = meta?.item;
  if (!evaluatorItem?._harness || evaluatorItem._harness.role !== harness.HARNESS_ROLE.EVALUATOR) return 0;
  const harnessMeta = evaluatorItem._harness;

  let verdict;
  try {
    verdict = harness.parseEvaluatorVerdict(stdout || '', structuredCompletion || null);
  } catch (err) {
    log('warn', `Harness ${harnessMeta.missionId}: verdict parse failed — ${err.message}; treating as terminal stop`);
    return 0;
  }

  if (!harness.shouldIterateAgain(harnessMeta, verdict)) {
    const reason = _describeHarnessStop(harnessMeta, verdict);
    log('info', `Harness mission ${harnessMeta.missionId} terminal stop (iteration ${harnessMeta.iteration}, ${reason}, score=${verdict.score ?? 'n/a'})`);
    return 0;
  }

  let nextItems;
  try {
    nextItems = harness.createIterationWorkItems(evaluatorItem, verdict, {});
  } catch (err) {
    log('warn', `Harness ${harnessMeta.missionId}: iteration build failed — ${err.message}`);
    return 0;
  }
  if (!Array.isArray(nextItems) || nextItems.length === 0) return 0;

  // Mirror handleDecompositionResult: scan central + per-project work-items.json
  // and append into the file that owns the evaluator (the trio always lands in
  // the central file in practice — scheduler.discoverScheduledWork writes
  // directly to engine/work-items.json via engine.js — but iterate defensively).
  const projects = shared.getProjects(config);
  const allPaths = [path.join(MINIONS_DIR, 'work-items.json')];
  for (const p of projects) allPaths.push(shared.projectWorkItemsPath(p));

  let appendedTo = null;
  let appendedCount = 0;
  for (const wiPath of allPaths) {
    let found = false;
    // NOTE(review): `found`/`appendedCount` are read right after this call, which
    // assumes mutateJsonFileLocked runs the callback synchronously — confirm.
    mutateJsonFileLocked(wiPath, data => {
      if (!Array.isArray(data)) return data;
      const evaluator = data.find(i => i.id === evaluatorItem.id);
      if (!evaluator) return data;
      found = true;
      // De-dupe by id in case a previous tick already appended the next pair.
      // The count is rebuilt from scratch on every invocation of the callback.
      const existingIds = new Set(data.map(i => i.id));
      let added = 0;
      for (const it of nextItems) {
        if (existingIds.has(it.id)) continue;
        data.push(it);
        existingIds.add(it.id);
        added++;
      }
      appendedCount = added;
      return data;
    }, { defaultValue: [] });
    if (found) { appendedTo = wiPath; break; }
  }

  if (!appendedTo) {
    log('warn', `Harness ${harnessMeta.missionId}: evaluator ${evaluatorItem.id} not found in any work-items.json — iteration skipped`);
    return 0;
  }

  if (appendedCount === 0) {
    // Every generated id was already present: report what really happened
    // instead of claiming a fresh append.
    log('info', `Harness mission ${harnessMeta.missionId}: next iteration already queued in ${appendedTo} — nothing appended`);
    return 0;
  }

  log('info', `Harness mission ${harnessMeta.missionId} iterating: appended ${appendedCount} work items (next iteration: ${nextItems[0]?._harness?.iteration ?? 'n/a'}, score=${verdict.score ?? 'n/a'})`);
  return appendedCount;
}
|
|
4119
|
+
|
|
4043
4120
|
/**
|
|
4044
4121
|
* W-mpg58wv3 — auto-dispatch a re-review WI when a fix-WI born from a minion
|
|
4045
4122
|
* REQUEST_CHANGES marks done. Closure-loop for the shared Yemi reviewer slot:
|
|
@@ -4386,6 +4463,19 @@ async function runPostCompletionHooks(dispatchItem, agentId, code, stdout, confi
|
|
|
4386
4463
|
}
|
|
4387
4464
|
}
|
|
4388
4465
|
|
|
4466
|
+
// Tri-agent harness iteration (W-mq07a9gf000jbc2b): if the evaluator just
|
|
4467
|
+
// completed successfully and verdict says retry, append the next Gen+Eval
|
|
4468
|
+
// pair into the same work-items.json. Engine will dispatch them on the
|
|
4469
|
+
// next tick. No interaction with skipDoneStatus — the evaluator itself
|
|
4470
|
+
// still marks DONE; iteration is a sibling write, not a parent decomp.
|
|
4471
|
+
if (effectiveSuccess && meta?.item?._harness?.role === harness.HARNESS_ROLE.EVALUATOR) {
|
|
4472
|
+
try {
|
|
4473
|
+
handleHarnessIterationResult(stdout, structuredCompletion, meta, config);
|
|
4474
|
+
} catch (err) {
|
|
4475
|
+
log('warn', `Harness iteration hook failed for ${meta.item.id}: ${err.message}`);
|
|
4476
|
+
}
|
|
4477
|
+
}
|
|
4478
|
+
|
|
4389
4479
|
// Verify review work items include a verdict — must run BEFORE updateWorkItemStatus(DONE),
|
|
4390
4480
|
// same pattern as plan-to-prd (#893): updateWorkItemStatus deletes _retryCount, so the check
|
|
4391
4481
|
// must read/increment it before that happens. Also sets skipDoneStatus so completedAt isn't
|
|
@@ -5204,6 +5294,7 @@ module.exports = {
|
|
|
5204
5294
|
isPrAttachmentRequired,
|
|
5205
5295
|
extractDecompositionJson,
|
|
5206
5296
|
handleDecompositionResult,
|
|
5297
|
+
handleHarnessIterationResult,
|
|
5207
5298
|
processCompletionFollowups,
|
|
5208
5299
|
// W-mpg58wv3 — closure-loop dispatch helpers (exported for testing).
|
|
5209
5300
|
dispatchReReviewForFix,
|
package/engine/scheduler.js
CHANGED
|
@@ -25,7 +25,8 @@ const fs = require('fs');
|
|
|
25
25
|
const path = require('path');
|
|
26
26
|
const shared = require('./shared');
|
|
27
27
|
const routing = require('./routing');
|
|
28
|
-
const
|
|
28
|
+
const harness = require('./harness');
|
|
29
|
+
const { safeJson, safeWrite, mutateJsonFileLocked, mutateScheduleRuns, ts, dateStamp, log, WI_STATUS, WORK_TYPE } = shared;
|
|
29
30
|
|
|
30
31
|
const SCHEDULE_RUNS_PATH = path.join(shared.MINIONS_DIR, 'engine', 'schedule-runs.json');
|
|
31
32
|
|
|
@@ -186,9 +187,9 @@ function createScheduledWorkItem(sched) {
|
|
|
186
187
|
};
|
|
187
188
|
}
|
|
188
189
|
|
|
189
|
-
function writeScheduleRunEntry(runs, scheduleId, workItemId) {
|
|
190
|
+
function writeScheduleRunEntry(runs, scheduleId, workItemId, extra) {
|
|
190
191
|
const existing = typeof runs[scheduleId] === 'object' && runs[scheduleId] ? runs[scheduleId] : {};
|
|
191
|
-
runs[scheduleId] = { ...existing, lastRun: ts(), lastWorkItemId: workItemId };
|
|
192
|
+
runs[scheduleId] = { ...existing, lastRun: ts(), lastWorkItemId: workItemId, ...(extra || {}) };
|
|
192
193
|
return runs[scheduleId];
|
|
193
194
|
}
|
|
194
195
|
|
|
@@ -222,6 +223,42 @@ function discoverScheduledWork(config) {
|
|
|
222
223
|
const lastRun = typeof runEntry === 'string' ? runEntry : (runEntry?.lastRun || null);
|
|
223
224
|
if (!shouldRunNow(sched, lastRun)) continue;
|
|
224
225
|
|
|
226
|
+
// Tri-agent harness mode (W-mq07a9gf000jbc2b): a single schedule firing
|
|
227
|
+
// produces a coordinated Planner → Generator → Evaluator trio rather than
|
|
228
|
+
// a single work item. Validate config first — on bad config, skip this
|
|
229
|
+
// tick WITHOUT recording a schedule run so the operator can fix the
|
|
230
|
+
// config and the next tick will pick it up.
|
|
231
|
+
if (sched.harness_mode === harness.HARNESS_MODE.TRI_AGENT) {
|
|
232
|
+
const validation = harness.validateHarnessConfig(sched);
|
|
233
|
+
if (!validation.valid) {
|
|
234
|
+
log('warn', `Scheduler: harness config invalid for ${sched.id} — skipping (errors: ${validation.errors.join('; ')})`);
|
|
235
|
+
continue;
|
|
236
|
+
}
|
|
237
|
+
try {
|
|
238
|
+
// Resolve schedule-time template variables on the title/description
|
|
239
|
+
// BEFORE handing the schedule to the harness builder so subtask
|
|
240
|
+
// prompts inherit the same substitutions as regular schedules.
|
|
241
|
+
const resolvedSched = {
|
|
242
|
+
...sched,
|
|
243
|
+
title: resolveScheduleTemplateVars(sched.title),
|
|
244
|
+
description: resolveScheduleTemplateVars(sched.description || sched.title),
|
|
245
|
+
harness_rubric: resolveScheduleTemplateVars(sched.harness_rubric),
|
|
246
|
+
};
|
|
247
|
+
const mission = harness.createTriAgentMission(resolvedSched);
|
|
248
|
+
for (const it of mission.items) work.push(it);
|
|
249
|
+
// Record the mission's planner id as lastWorkItemId for compatibility
|
|
250
|
+
// with the existing schedule-runs shape, plus lastMissionId so the
|
|
251
|
+
// dashboard and consolidation tooling can join across the trio.
|
|
252
|
+
writeScheduleRunEntry(runs, sched.id, mission.items[0].id, {
|
|
253
|
+
lastMissionId: mission.missionId,
|
|
254
|
+
harnessMode: harness.HARNESS_MODE.TRI_AGENT,
|
|
255
|
+
});
|
|
256
|
+
} catch (err) {
|
|
257
|
+
log('warn', `Scheduler: tri-agent mission build failed for ${sched.id}: ${err.message}`);
|
|
258
|
+
}
|
|
259
|
+
continue;
|
|
260
|
+
}
|
|
261
|
+
|
|
225
262
|
// Substitute schedule-time template vars (e.g. {{date}}) before the work
|
|
226
263
|
// item is written — single-pass playbook rendering can't reach placeholders
|
|
227
264
|
// embedded inside task_description, so they must be resolved up front.
|
package/engine.js
CHANGED
|
@@ -3944,7 +3944,7 @@ function reconcileItemsWithPrs(items, allPrs, { onlyIds } = {}) {
|
|
|
3944
3944
|
// ─── Inbox Consolidation (extracted to engine/consolidation.js) ──────────────
|
|
3945
3945
|
|
|
3946
3946
|
const { consolidateInbox } = require('./engine/consolidation');
|
|
3947
|
-
const { pollPrStatus, pollPrHumanComments, reconcilePrs, checkLiveReviewStatus: adoCheckLiveReview, checkLiveBuildAndConflict: adoCheckLiveBuildAndConflict, needsAdoPollRetry, getAdoToken, isAdoThrottled } = require('./engine/ado');
|
|
3947
|
+
const { pollPrStatus, pollPrHumanComments, reconcilePrs, checkLiveReviewStatus: adoCheckLiveReview, checkLiveBuildAndConflict: adoCheckLiveBuildAndConflict, needsAdoPollRetry, getAdoToken, isAdoThrottled, getAdoThrottleStateAll } = require('./engine/ado');
|
|
3948
3948
|
const { pollPrStatus: ghPollPrStatus, pollPrHumanComments: ghPollPrHumanComments, reconcilePrs: ghReconcilePrs, checkLiveReviewStatus: ghCheckLiveReview, checkLiveBuildAndConflict: ghCheckLiveBuildAndConflict, isGhThrottled } = require('./engine/github');
|
|
3949
3949
|
|
|
3950
3950
|
// ─── State Snapshot ─────────────────────────────────────────────────────────
|
|
@@ -6878,12 +6878,35 @@ async function discoverWork(config) {
|
|
|
6878
6878
|
mutateJsonFileLocked(centralPath, (items) => {
|
|
6879
6879
|
if (!Array.isArray(items)) items = [];
|
|
6880
6880
|
let added = 0;
|
|
6881
|
+
// Snapshot active dedup keys BEFORE the loop so multiple items in the
|
|
6882
|
+
// same harness mission (same _missionId) all land in one tick. Without
|
|
6883
|
+
// this snapshot, the first item's push would block subsequent items
|
|
6884
|
+
// in the same mission from joining (W-mq07a9gf000jbc2b — tri-agent
|
|
6885
|
+
// harness mode requires Planner+Generator+Evaluator to land together).
|
|
6886
|
+
const activeMissionIds = new Set();
|
|
6887
|
+
const activeScheduleIds = new Set();
|
|
6888
|
+
for (const existing of items) {
|
|
6889
|
+
if (existing.status === WI_STATUS.DONE || existing.status === WI_STATUS.FAILED) continue;
|
|
6890
|
+
if (existing._missionId) activeMissionIds.add(existing._missionId);
|
|
6891
|
+
if (existing._scheduleId) activeScheduleIds.add(existing._scheduleId);
|
|
6892
|
+
}
|
|
6893
|
+
const addedScheduleIdsThisTick = new Set();
|
|
6881
6894
|
for (const item of taskItems) {
|
|
6882
|
-
|
|
6883
|
-
|
|
6884
|
-
|
|
6885
|
-
|
|
6895
|
+
// Mission items dedup by _missionId against pre-existing rows only
|
|
6896
|
+
// (the trio's other items added later in this loop must not block
|
|
6897
|
+
// each other). Plain scheduled items keep the original scheduleId
|
|
6898
|
+
// dedup AND skip if a sibling item from the same tick already
|
|
6899
|
+
// claimed the schedule slot.
|
|
6900
|
+
if (item._missionId) {
|
|
6901
|
+
if (activeMissionIds.has(item._missionId)) continue;
|
|
6902
|
+
} else {
|
|
6903
|
+
if (activeScheduleIds.has(item._scheduleId)) continue;
|
|
6904
|
+
if (addedScheduleIdsThisTick.has(item._scheduleId)) continue;
|
|
6886
6905
|
}
|
|
6906
|
+
items.push(item);
|
|
6907
|
+
if (!item._missionId && item._scheduleId) addedScheduleIdsThisTick.add(item._scheduleId);
|
|
6908
|
+
added++;
|
|
6909
|
+
log('info', `Scheduled task fired: ${item._scheduleId} → ${item.title}`);
|
|
6887
6910
|
}
|
|
6888
6911
|
return items;
|
|
6889
6912
|
}, { defaultValue: [] });
|
|
@@ -7349,10 +7372,18 @@ async function tickInner() {
|
|
|
7349
7372
|
lastPrStatusPollAt = now;
|
|
7350
7373
|
// Build promise array — enabled+unthrottled polls run concurrently via Promise.allSettled
|
|
7351
7374
|
const statusPolls = [];
|
|
7352
|
-
if (adoPollEnabled
|
|
7353
|
-
|
|
7354
|
-
|
|
7355
|
-
log
|
|
7375
|
+
if (adoPollEnabled) {
|
|
7376
|
+
// Per-org throttle skip happens inside forEachActivePr (one log line per skipped project).
|
|
7377
|
+
// Top-level short-circuit: when every known ADO org is throttled, skip the whole phase
|
|
7378
|
+
// with one log line to avoid the per-project iteration cost.
|
|
7379
|
+
const adoThrottleStates = getAdoThrottleStateAll() || {};
|
|
7380
|
+
const adoOrgCount = Object.keys(adoThrottleStates).length;
|
|
7381
|
+
const allAdoThrottled = adoOrgCount > 0 && Object.values(adoThrottleStates).every(s => s && s.throttled);
|
|
7382
|
+
if (allAdoThrottled) {
|
|
7383
|
+
log('info', `[ado] PR status poll skipped — all ${adoOrgCount} known orgs throttled`);
|
|
7384
|
+
} else {
|
|
7385
|
+
statusPolls.push(pollPrStatus(config).catch(err => { log('warn', `ADO PR status poll error: ${err?.message || err}${err?.stack ? ' | ' + err.stack.split('\n')[1]?.trim() : ''}`); }));
|
|
7386
|
+
}
|
|
7356
7387
|
}
|
|
7357
7388
|
if (ghPollEnabled && !isGhThrottled()) {
|
|
7358
7389
|
statusPolls.push(ghPollPrStatus(config).catch(err => { log('warn', `GitHub PR status poll error: ${err?.message || err}${err?.stack ? ' | ' + err.stack.split('\n')[1]?.trim() : ''}`); }));
|
|
@@ -7395,10 +7426,18 @@ async function tickInner() {
|
|
|
7395
7426
|
lastPrCommentsPollAt = now;
|
|
7396
7427
|
// Build promise array — enabled+unthrottled comment polls run concurrently via Promise.allSettled
|
|
7397
7428
|
const commentPolls = [];
|
|
7398
|
-
if (adoPollEnabled
|
|
7399
|
-
|
|
7400
|
-
|
|
7401
|
-
log
|
|
7429
|
+
if (adoPollEnabled) {
|
|
7430
|
+
// Per-org throttle skip happens inside forEachActivePr (one log line per skipped project).
|
|
7431
|
+
// Top-level short-circuit: when every known ADO org is throttled, skip the whole phase
|
|
7432
|
+
// with one log line to avoid the per-project iteration cost.
|
|
7433
|
+
const adoThrottleStates = getAdoThrottleStateAll() || {};
|
|
7434
|
+
const adoOrgCount = Object.keys(adoThrottleStates).length;
|
|
7435
|
+
const allAdoThrottled = adoOrgCount > 0 && Object.values(adoThrottleStates).every(s => s && s.throttled);
|
|
7436
|
+
if (allAdoThrottled) {
|
|
7437
|
+
log('info', `[ado] PR comment poll skipped — all ${adoOrgCount} known orgs throttled`);
|
|
7438
|
+
} else {
|
|
7439
|
+
commentPolls.push(pollPrHumanComments(config).catch(err => { log('warn', `ADO PR comment poll error: ${err?.message || err}${err?.stack ? ' | ' + err.stack.split('\n')[1]?.trim() : ''}`); }));
|
|
7440
|
+
}
|
|
7402
7441
|
}
|
|
7403
7442
|
if (ghPollEnabled && !isGhThrottled()) {
|
|
7404
7443
|
commentPolls.push(ghPollPrHumanComments(config).catch(err => { log('warn', `GitHub PR comment poll error: ${err?.message || err}${err?.stack ? ' | ' + err.stack.split('\n')[1]?.trim() : ''}`); }));
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@yemi33/minions",
|
|
3
|
-
"version": "0.1.2122",
|
|
3
|
+
"version": "0.1.2123",
|
|
4
4
|
"description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
|
|
5
5
|
"bin": {
|
|
6
6
|
"minions": "bin/minions.js"
|