@gethmy/agent 1.0.9 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +67 -16
- package/dist/__tests__/budget.test.d.ts +1 -0
- package/dist/__tests__/budget.test.js +94 -0
- package/dist/__tests__/config-validation.test.d.ts +1 -0
- package/dist/__tests__/config-validation.test.js +65 -0
- package/dist/__tests__/dev-server-readiness.test.d.ts +1 -0
- package/dist/__tests__/dev-server-readiness.test.js +26 -0
- package/dist/__tests__/http-server.test.d.ts +1 -0
- package/dist/__tests__/http-server.test.js +115 -0
- package/dist/__tests__/log.test.d.ts +1 -0
- package/dist/__tests__/log.test.js +115 -0
- package/dist/__tests__/process-group.test.d.ts +1 -0
- package/dist/__tests__/process-group.test.js +68 -0
- package/dist/__tests__/reconcile-heartbeat.test.d.ts +1 -0
- package/dist/__tests__/reconcile-heartbeat.test.js +116 -0
- package/dist/__tests__/recovery.test.d.ts +1 -0
- package/dist/__tests__/recovery.test.js +126 -0
- package/dist/__tests__/review-parser.test.d.ts +1 -0
- package/dist/__tests__/review-parser.test.js +65 -0
- package/dist/__tests__/state-store.test.d.ts +1 -0
- package/dist/__tests__/state-store.test.js +132 -0
- package/dist/__tests__/transitions.test.d.ts +1 -0
- package/dist/__tests__/transitions.test.js +130 -0
- package/dist/__tests__/worktree-gc.test.d.ts +1 -0
- package/dist/__tests__/worktree-gc.test.js +137 -0
- package/dist/budget.d.ts +45 -0
- package/dist/budget.js +94 -0
- package/dist/cli.d.ts +15 -1
- package/dist/cli.js +239 -1
- package/dist/completion.d.ts +9 -0
- package/dist/completion.js +28 -2
- package/dist/config-validation.d.ts +18 -0
- package/dist/config-validation.js +66 -0
- package/dist/config.js +12 -0
- package/dist/http-server.d.ts +79 -0
- package/dist/http-server.js +115 -0
- package/dist/index.d.ts +4 -1
- package/dist/index.js +125 -10
- package/dist/log.d.ts +29 -5
- package/dist/log.js +80 -15
- package/dist/pool.d.ts +27 -2
- package/dist/pool.js +69 -4
- package/dist/process-group.d.ts +26 -0
- package/dist/process-group.js +72 -0
- package/dist/progress-tracker.js +2 -0
- package/dist/queue.d.ts +2 -0
- package/dist/queue.js +4 -0
- package/dist/reconcile.d.ts +15 -1
- package/dist/reconcile.js +63 -2
- package/dist/recovery.d.ts +30 -0
- package/dist/recovery.js +136 -0
- package/dist/review-completion.d.ts +12 -4
- package/dist/review-completion.js +158 -49
- package/dist/review-worker.d.ts +9 -2
- package/dist/review-worker.js +182 -78
- package/dist/run-log.d.ts +6 -0
- package/dist/run-log.js +19 -0
- package/dist/state-store.d.ts +72 -0
- package/dist/state-store.js +216 -0
- package/dist/transitions.d.ts +57 -0
- package/dist/transitions.js +131 -0
- package/dist/types.d.ts +23 -0
- package/dist/types.js +19 -1
- package/dist/verification.d.ts +17 -0
- package/dist/verification.js +71 -10
- package/dist/watcher.d.ts +2 -0
- package/dist/watcher.js +11 -0
- package/dist/worker.d.ts +9 -2
- package/dist/worker.js +168 -47
- package/dist/worktree-gc.d.ts +39 -0
- package/dist/worktree-gc.js +139 -0
- package/package.json +2 -2
package/dist/pool.d.ts
CHANGED
|
@@ -1,16 +1,24 @@
|
|
|
1
1
|
import type { HarmonyApiClient } from "@gethmy/mcp/src/api-client.js";
|
|
2
2
|
import type { Card, Column, Label, Subtask } from "@harmony/shared";
|
|
3
|
+
import { PriorityQueue } from "./queue.js";
|
|
4
|
+
import type { StateStore } from "./state-store.js";
|
|
3
5
|
import type { AgentConfig, WorkMode } from "./types.js";
|
|
4
6
|
export declare class Pool {
|
|
7
|
+
private client;
|
|
5
8
|
private implWorkers;
|
|
6
9
|
private reviewWorkers;
|
|
7
10
|
private implQueue;
|
|
8
11
|
private reviewQueue;
|
|
9
|
-
|
|
12
|
+
private budget;
|
|
13
|
+
constructor(config: AgentConfig, client: HarmonyApiClient, userEmail: string, workspaceId: string, projectId: string, stateStore: StateStore);
|
|
10
14
|
/**
|
|
11
15
|
* Enqueue a card for processing with the given mode.
|
|
16
|
+
*
|
|
17
|
+
* Returns async so callers can await the DLQ side-effects on skip.
|
|
18
|
+
* Budget/DLQ checks happen here so the reconciler, realtime watcher,
|
|
19
|
+
* and manual API calls all go through the same gate.
|
|
12
20
|
*/
|
|
13
|
-
enqueue(card: Card, column: Column, labels: Label[], subtasks: Subtask[], mode?: WorkMode): void
|
|
21
|
+
enqueue(card: Card, column: Column, labels: Label[], subtasks: Subtask[], mode?: WorkMode): Promise<void>;
|
|
14
22
|
/**
|
|
15
23
|
* Remove a card from any queue or cancel an active worker.
|
|
16
24
|
*/
|
|
@@ -31,6 +39,23 @@ export declare class Pool {
|
|
|
31
39
|
* Handle an agent command (pause/resume/stop) for a specific card.
|
|
32
40
|
*/
|
|
33
41
|
handleAgentCommand(cardId: string, command: "pause" | "resume" | "stop"): Promise<void>;
|
|
42
|
+
/**
|
|
43
|
+
* Point-in-time snapshot for the HTTP /status endpoint. Safe to call
|
|
44
|
+
* from anywhere — reads in-memory state only.
|
|
45
|
+
*/
|
|
46
|
+
snapshotWorkers(): Array<{
|
|
47
|
+
id: number;
|
|
48
|
+
pipeline: "implement" | "review";
|
|
49
|
+
state: string;
|
|
50
|
+
cardId: string | null;
|
|
51
|
+
cardShortId: number | null;
|
|
52
|
+
startedAt: number | null;
|
|
53
|
+
branchName: string | null;
|
|
54
|
+
}>;
|
|
55
|
+
snapshotQueues(): {
|
|
56
|
+
impl: ReturnType<PriorityQueue["snapshot"]>;
|
|
57
|
+
review: ReturnType<PriorityQueue["snapshot"]>;
|
|
58
|
+
};
|
|
34
59
|
/**
|
|
35
60
|
* Gracefully shutdown all workers.
|
|
36
61
|
*/
|
package/dist/pool.js
CHANGED
|
@@ -1,36 +1,45 @@
|
|
|
1
|
+
import { BudgetGuard } from "./budget.js";
|
|
1
2
|
import { log } from "./log.js";
|
|
2
3
|
import { PriorityQueue } from "./queue.js";
|
|
3
4
|
import { ReviewWorker } from "./review-worker.js";
|
|
4
5
|
import { Worker } from "./worker.js";
|
|
5
6
|
const TAG = "pool";
|
|
6
7
|
export class Pool {
|
|
8
|
+
client;
|
|
7
9
|
implWorkers = [];
|
|
8
10
|
reviewWorkers = [];
|
|
9
11
|
implQueue;
|
|
10
12
|
reviewQueue;
|
|
11
|
-
|
|
13
|
+
budget;
|
|
14
|
+
constructor(config, client, userEmail, workspaceId, projectId, stateStore) {
|
|
15
|
+
this.client = client;
|
|
12
16
|
this.implQueue = new PriorityQueue(config);
|
|
13
17
|
this.reviewQueue = new PriorityQueue(config);
|
|
18
|
+
this.budget = new BudgetGuard(config.budget, stateStore);
|
|
14
19
|
// Create implementation workers
|
|
15
20
|
for (let i = 0; i < config.poolSize; i++) {
|
|
16
21
|
this.implWorkers.push(new Worker(i, config, client, userEmail, () => {
|
|
17
22
|
this.tryDispatchFor(this.implWorkers, this.implQueue, "impl");
|
|
18
|
-
}, workspaceId, projectId));
|
|
23
|
+
}, workspaceId, projectId, stateStore));
|
|
19
24
|
}
|
|
20
25
|
// Create review worker(s) — 1 review worker per pool
|
|
21
26
|
if (config.review.enabled) {
|
|
22
27
|
const reviewWorkerId = config.poolSize; // offset to avoid ID collision
|
|
23
28
|
this.reviewWorkers.push(new ReviewWorker(reviewWorkerId, config, client, userEmail, () => {
|
|
24
29
|
this.tryDispatchFor(this.reviewWorkers, this.reviewQueue, "review");
|
|
25
|
-
}));
|
|
30
|
+
}, stateStore));
|
|
26
31
|
}
|
|
27
32
|
const reviewCount = this.reviewWorkers.length;
|
|
28
33
|
log.info(TAG, `Pool initialized: ${config.poolSize} impl worker(s), ${reviewCount} review worker(s)`);
|
|
29
34
|
}
|
|
30
35
|
/**
|
|
31
36
|
* Enqueue a card for processing with the given mode.
|
|
37
|
+
*
|
|
38
|
+
* Returns async so callers can await the DLQ side-effects on skip.
|
|
39
|
+
* Budget/DLQ checks happen here so the reconciler, realtime watcher,
|
|
40
|
+
* and manual API calls all go through the same gate.
|
|
32
41
|
*/
|
|
33
|
-
enqueue(card, column, labels, subtasks, mode = "implement") {
|
|
42
|
+
async enqueue(card, column, labels, subtasks, mode = "implement") {
|
|
34
43
|
// Don't enqueue if already in any queue or actively being worked on
|
|
35
44
|
if (this.implQueue.has(card.id) ||
|
|
36
45
|
this.reviewQueue.has(card.id) ||
|
|
@@ -38,6 +47,26 @@ export class Pool {
|
|
|
38
47
|
log.debug(TAG, `Card ${card.id} already queued or active, skipping`);
|
|
39
48
|
return;
|
|
40
49
|
}
|
|
50
|
+
// Review pickups bypass per-card attempt limits (reviews are cheap
|
|
51
|
+
// and orthogonal to implement attempts). Daily budget still applies.
|
|
52
|
+
if (mode === "implement") {
|
|
53
|
+
const decision = this.budget.check(card.id);
|
|
54
|
+
if (!decision.allow) {
|
|
55
|
+
// Already-DLQ cards are expected noise on every reconcile tick;
|
|
56
|
+
// only the terminal decision itself deserves a warn.
|
|
57
|
+
const wasAlreadyDlq = decision.reason === "dlq";
|
|
58
|
+
if (!wasAlreadyDlq) {
|
|
59
|
+
log.warn(TAG, `#${card.short_id} skipped (${decision.reason}): ${decision.detail}`);
|
|
60
|
+
if (this.budget.isTerminal(decision.reason)) {
|
|
61
|
+
await this.budget.markDlq(this.client, card, decision.reason, decision.detail);
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
else {
|
|
65
|
+
log.debug(TAG, `#${card.short_id} in DLQ: ${decision.detail}`);
|
|
66
|
+
}
|
|
67
|
+
return;
|
|
68
|
+
}
|
|
69
|
+
}
|
|
41
70
|
const queue = mode === "review" ? this.reviewQueue : this.implQueue;
|
|
42
71
|
queue.enqueue(card, column, labels, mode);
|
|
43
72
|
// Store card data for when it gets dispatched
|
|
@@ -122,6 +151,42 @@ export class Pool {
|
|
|
122
151
|
break;
|
|
123
152
|
}
|
|
124
153
|
}
|
|
154
|
+
/**
|
|
155
|
+
* Point-in-time snapshot for the HTTP /status endpoint. Safe to call
|
|
156
|
+
* from anywhere — reads in-memory state only.
|
|
157
|
+
*/
|
|
158
|
+
snapshotWorkers() {
|
|
159
|
+
const out = [];
|
|
160
|
+
for (const w of this.implWorkers) {
|
|
161
|
+
out.push({
|
|
162
|
+
id: w.id,
|
|
163
|
+
pipeline: "implement",
|
|
164
|
+
state: w.state,
|
|
165
|
+
cardId: w.cardId,
|
|
166
|
+
cardShortId: null,
|
|
167
|
+
startedAt: w.startedAt,
|
|
168
|
+
branchName: w.branchName,
|
|
169
|
+
});
|
|
170
|
+
}
|
|
171
|
+
for (const w of this.reviewWorkers) {
|
|
172
|
+
out.push({
|
|
173
|
+
id: w.id,
|
|
174
|
+
pipeline: "review",
|
|
175
|
+
state: w.state,
|
|
176
|
+
cardId: w.cardId,
|
|
177
|
+
cardShortId: null,
|
|
178
|
+
startedAt: w.startedAt,
|
|
179
|
+
branchName: w.branchName,
|
|
180
|
+
});
|
|
181
|
+
}
|
|
182
|
+
return out;
|
|
183
|
+
}
|
|
184
|
+
snapshotQueues() {
|
|
185
|
+
return {
|
|
186
|
+
impl: this.implQueue.snapshot(),
|
|
187
|
+
review: this.reviewQueue.snapshot(),
|
|
188
|
+
};
|
|
189
|
+
}
|
|
125
190
|
/**
|
|
126
191
|
* Gracefully shutdown all workers.
|
|
127
192
|
*/
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { type ChildProcess, type SpawnOptions } from "node:child_process";
|
|
2
|
+
/**
|
|
3
|
+
* Spawn a child in its own process group so we can reliably kill the
|
|
4
|
+
* whole subtree later. The Claude CLI shells out to git, build tools,
|
|
5
|
+
* dev servers, etc. — signalling only the direct child leaves orphans.
|
|
6
|
+
*
|
|
7
|
+
* - POSIX: `detached: true` puts the child in a new process group whose
|
|
8
|
+
* pgid equals its pid. Killing the negative pid signals every member.
|
|
9
|
+
* - Windows: `detached: true` creates a new process group that can be
|
|
10
|
+
* signalled via the child's pid (no negation).
|
|
11
|
+
*/
|
|
12
|
+
export declare function spawnInGroup(command: string, args: readonly string[], options?: SpawnOptions): ChildProcess;
|
|
13
|
+
/**
|
|
14
|
+
* Send a signal to every process in the group whose leader is `proc`.
|
|
15
|
+
* On POSIX, this is `process.kill(-pid, signal)`. If the group has
|
|
16
|
+
* already exited, returns silently.
|
|
17
|
+
*/
|
|
18
|
+
export declare function signalGroup(proc: ChildProcess, signal: NodeJS.Signals): void;
|
|
19
|
+
/**
|
|
20
|
+
* Escalating termination: SIGINT → wait → SIGTERM → wait → SIGKILL.
|
|
21
|
+
* Returns when the process has exited or all signals have been sent.
|
|
22
|
+
*/
|
|
23
|
+
export declare function terminateGroup(proc: ChildProcess, opts: {
|
|
24
|
+
sigintTimeoutMs: number;
|
|
25
|
+
sigtermTimeoutMs: number;
|
|
26
|
+
}): Promise<void>;
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import { spawn, } from "node:child_process";
|
|
2
|
+
import { log } from "./log.js";
|
|
3
|
+
const TAG = "pgroup";
|
|
4
|
+
/**
 * Start `command` as a detached child so that the child and everything it
 * launches can later be signalled together (see `signalGroup`).
 *
 * `detached: true` is always forced, regardless of what `options` says.
 * On POSIX that makes the child the leader of a new process group, which
 * is what allows signalling the negated pid. On Windows `signalGroup`
 * falls back to signalling the direct child only.
 *
 * @param {string} command executable to run.
 * @param {readonly string[]} args arguments passed to the executable.
 * @param {object} [options] extra `child_process.spawn` options. When
 *   `stdio` is absent (null or undefined), stdin is ignored and
 *   stdout/stderr are piped so output can still be streamed.
 * @returns {import("node:child_process").ChildProcess} the spawned child.
 */
export function spawnInGroup(command, args, options = {}) {
    // Default wiring: no stdin, piped stdout/stderr.
    const stdio = options.stdio ?? ["ignore", "pipe", "pipe"];
    const spawnOptions = { ...options, detached: true, stdio };
    return spawn(command, args, spawnOptions);
}
|
|
22
|
+
/**
 * Send `signal` to the process group led by `proc`.
 *
 * On POSIX this is `process.kill(-pid, signal)`, which reaches every member
 * of the group. On Windows it falls back to `proc.kill(signal)`, which only
 * targets the direct child.
 *
 * `proc.killed` is deliberately NOT consulted here: Node sets that flag as
 * soon as any signal has been delivered through `proc.kill()`, so it says
 * nothing about whether the process (or its group) is still running.
 * Signalling a group that is already gone raises ESRCH, which is ignored.
 *
 * @param {import("node:child_process").ChildProcess} proc group leader.
 * @param {NodeJS.Signals} signal signal name to deliver.
 * @returns {void} Errors from the kill call are caught; anything other than
 *   ESRCH is logged as a warning.
 */
export function signalGroup(proc, signal) {
    // A child that failed to spawn has no pid and therefore no group.
    if (!proc.pid)
        return;
    try {
        if (process.platform === "win32") {
            // No negative-pid group signalling on Windows; best effort via the child.
            proc.kill(signal);
            return;
        }
        process.kill(-proc.pid, signal);
    }
    catch (err) {
        // ESRCH means the group is already gone — that is the goal, not an error.
        const code = err?.code;
        if (code !== "ESRCH") {
            log.warn(TAG, `signal ${signal} to pgid ${proc.pid} failed: ${err instanceof Error ? err.message : err}`);
        }
    }
}
/**
 * Escalating termination: SIGINT → wait → SIGTERM → wait → SIGKILL.
 *
 * Resolves as soon as the child is observed to have exited, or after the
 * final SIGKILL has been sent (without waiting for it to take effect).
 * Does nothing when the child never obtained a pid.
 *
 * Exit is detected through `exitCode` / `signalCode` and the `exit` event,
 * not through `proc.killed`, which only records that a signal was sent.
 *
 * @param {import("node:child_process").ChildProcess} proc group leader.
 * @param {{sigintTimeoutMs: number, sigtermTimeoutMs: number}} opts how
 *   long to wait after SIGINT and after SIGTERM before escalating.
 * @returns {Promise<void>}
 */
export async function terminateGroup(proc, opts) {
    if (!proc.pid)
        return;
    // A child ended by a signal has exitCode === null and signalCode set,
    // so both fields must be checked.
    const hasExited = () => proc.exitCode !== null || proc.signalCode !== null;
    // Unpause first in case the process was suspended — otherwise it
    // can't react to signals. SIGCONT is not a supported signal on Windows.
    if (process.platform !== "win32") {
        signalGroup(proc, "SIGCONT");
    }
    const waitForExit = (timeout) => new Promise((resolve) => {
        if (hasExited())
            return resolve(true);
        const timer = setTimeout(() => {
            // Drop the listener so repeated waits do not accumulate handlers.
            proc.removeListener("exit", onExit);
            resolve(false);
        }, timeout);
        const onExit = () => {
            clearTimeout(timer);
            resolve(true);
        };
        proc.once("exit", onExit);
    });
    signalGroup(proc, "SIGINT");
    if (await waitForExit(opts.sigintTimeoutMs))
        return;
    signalGroup(proc, "SIGTERM");
    if (await waitForExit(opts.sigtermTimeoutMs))
        return;
    signalGroup(proc, "SIGKILL");
}
|
package/dist/progress-tracker.js
CHANGED
|
@@ -347,6 +347,8 @@ export class ProgressTracker {
|
|
|
347
347
|
phase: this.phase,
|
|
348
348
|
filesChanged: this.filesEdited.size,
|
|
349
349
|
costCents: Math.round((this.lastCost?.totalCostUsd ?? 0) * 100),
|
|
350
|
+
inputTokens: this.lastCost?.totalInputTokens ?? 0,
|
|
351
|
+
outputTokens: this.lastCost?.totalOutputTokens ?? 0,
|
|
350
352
|
})
|
|
351
353
|
.catch((err) => {
|
|
352
354
|
log.warn(TAG, `Failed to send progress update: ${err}`);
|
package/dist/queue.d.ts
CHANGED
package/dist/queue.js
CHANGED
package/dist/reconcile.d.ts
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import type { HarmonyApiClient } from "@gethmy/mcp/src/api-client.js";
|
|
2
2
|
import type { Pool } from "./pool.js";
|
|
3
|
+
import type { StateStore } from "./state-store.js";
|
|
4
|
+
import { type AgentConfig } from "./types.js";
|
|
3
5
|
/**
|
|
4
6
|
* Reconciliation heartbeat: polls the board every `intervalMs` to catch
|
|
5
7
|
* missed realtime events and sync state.
|
|
@@ -13,9 +15,21 @@ export declare class Reconciler {
|
|
|
13
15
|
private reviewColumns;
|
|
14
16
|
private approvedLabel;
|
|
15
17
|
private intervalMs;
|
|
18
|
+
private stateStore?;
|
|
19
|
+
private agentConfig?;
|
|
16
20
|
private timer;
|
|
17
|
-
|
|
21
|
+
private lastTickAt;
|
|
22
|
+
get lastTick(): number | null;
|
|
23
|
+
get isRunning(): boolean;
|
|
24
|
+
constructor(client: HarmonyApiClient, pool: Pool, projectId: string, agentUserId: string, pickupColumns: string[], reviewColumns: string[], approvedLabel: string, intervalMs?: number, stateStore?: StateStore | undefined, agentConfig?: AgentConfig | undefined);
|
|
18
25
|
start(): void;
|
|
19
26
|
stop(): void;
|
|
27
|
+
/**
|
|
28
|
+
* Walk the state store for runs marked active whose owning daemon is
|
|
29
|
+
* dead OR whose heartbeat is stale. Each such run gets the same
|
|
30
|
+
* recovery treatment as startup orphans: session ended, card returned
|
|
31
|
+
* to pickup column with agent-recovered label, worktree cleaned up.
|
|
32
|
+
*/
|
|
33
|
+
private recoverStaleRuns;
|
|
20
34
|
private tick;
|
|
21
35
|
}
|
package/dist/reconcile.js
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import { buildLabelMap, hasLabel, resolveCardLabels } from "./board-helpers.js";
|
|
2
2
|
import { log } from "./log.js";
|
|
3
|
+
import { isProcessAlive, recoverRun } from "./recovery.js";
|
|
4
|
+
import { extractBranchFromDescription } from "./review-worktree.js";
|
|
3
5
|
import { NEED_REVIEW_LABEL } from "./types.js";
|
|
4
6
|
const TAG = "reconcile";
|
|
5
7
|
/**
|
|
@@ -15,8 +17,17 @@ export class Reconciler {
|
|
|
15
17
|
reviewColumns;
|
|
16
18
|
approvedLabel;
|
|
17
19
|
intervalMs;
|
|
20
|
+
stateStore;
|
|
21
|
+
agentConfig;
|
|
18
22
|
timer = null;
|
|
19
|
-
|
|
23
|
+
lastTickAt = null;
|
|
24
|
+
get lastTick() {
|
|
25
|
+
return this.lastTickAt;
|
|
26
|
+
}
|
|
27
|
+
get isRunning() {
|
|
28
|
+
return this.timer !== null;
|
|
29
|
+
}
|
|
30
|
+
constructor(client, pool, projectId, agentUserId, pickupColumns, reviewColumns, approvedLabel, intervalMs = 60_000, stateStore, agentConfig) {
|
|
20
31
|
this.client = client;
|
|
21
32
|
this.pool = pool;
|
|
22
33
|
this.projectId = projectId;
|
|
@@ -25,6 +36,8 @@ export class Reconciler {
|
|
|
25
36
|
this.reviewColumns = reviewColumns;
|
|
26
37
|
this.approvedLabel = approvedLabel;
|
|
27
38
|
this.intervalMs = intervalMs;
|
|
39
|
+
this.stateStore = stateStore;
|
|
40
|
+
this.agentConfig = agentConfig;
|
|
28
41
|
}
|
|
29
42
|
start() {
|
|
30
43
|
log.info(TAG, `Heartbeat every ${this.intervalMs / 1000}s`);
|
|
@@ -39,7 +52,42 @@ export class Reconciler {
|
|
|
39
52
|
}
|
|
40
53
|
log.info(TAG, "Heartbeat stopped");
|
|
41
54
|
}
|
|
55
|
+
/**
|
|
56
|
+
* Walk the state store for runs marked active whose owning daemon is
|
|
57
|
+
* dead OR whose heartbeat is stale. Each such run gets the same
|
|
58
|
+
* recovery treatment as startup orphans: session ended, card returned
|
|
59
|
+
* to pickup column with agent-recovered label, worktree cleaned up.
|
|
60
|
+
*/
|
|
61
|
+
async recoverStaleRuns() {
|
|
62
|
+
if (!this.stateStore || !this.agentConfig)
|
|
63
|
+
return;
|
|
64
|
+
const now = Date.now();
|
|
65
|
+
const stale = this.agentConfig.timing.staleHeartbeatMs;
|
|
66
|
+
const active = this.stateStore.getActiveRuns();
|
|
67
|
+
const pool = this.pool;
|
|
68
|
+
for (const run of active) {
|
|
69
|
+
const foreignDaemon = run.daemonPid !== process.pid;
|
|
70
|
+
const daemonDead = foreignDaemon && !isProcessAlive(run.daemonPid, process.pid);
|
|
71
|
+
const heartbeatStale = now - run.lastHeartbeatAt > stale;
|
|
72
|
+
const ourZombie = !foreignDaemon && !pool.isCardActive(run.cardId);
|
|
73
|
+
if (!daemonDead && !(heartbeatStale && ourZombie))
|
|
74
|
+
continue;
|
|
75
|
+
const reason = daemonDead
|
|
76
|
+
? `foreign daemon ${run.daemonPid} is dead`
|
|
77
|
+
: `our worker lost card ${run.cardId} with ${Math.round((now - run.lastHeartbeatAt) / 1000)}s stale heartbeat`;
|
|
78
|
+
log.warn(TAG, `zombie run ${run.runId} (#${run.cardShortId}): ${reason} — recovering`);
|
|
79
|
+
await recoverRun(run, this.stateStore, this.client, this.agentConfig, {
|
|
80
|
+
runId: run.runId,
|
|
81
|
+
cardId: run.cardId,
|
|
82
|
+
cardShortId: run.cardShortId,
|
|
83
|
+
pipeline: run.pipeline,
|
|
84
|
+
actions: [],
|
|
85
|
+
errors: [],
|
|
86
|
+
});
|
|
87
|
+
}
|
|
88
|
+
}
|
|
42
89
|
async tick() {
|
|
90
|
+
this.lastTickAt = Date.now();
|
|
43
91
|
try {
|
|
44
92
|
const board = await this.client.getBoard(this.projectId);
|
|
45
93
|
const cards = (board.cards ?? []);
|
|
@@ -94,10 +142,23 @@ export class Reconciler {
|
|
|
94
142
|
log.debug(TAG, `Skipping #${card.short_id} — has "${NEED_REVIEW_LABEL}" label (needs human)`);
|
|
95
143
|
continue;
|
|
96
144
|
}
|
|
145
|
+
// Skip review for cards without a branch reference — not qualified for auto-review
|
|
146
|
+
if (mode === "review" &&
|
|
147
|
+
!extractBranchFromDescription(card.description)) {
|
|
148
|
+
log.debug(TAG, `Skipping #${card.short_id} — no branch reference (not qualified for auto-review)`);
|
|
149
|
+
continue;
|
|
150
|
+
}
|
|
97
151
|
log.info(TAG, `Missed assignment: #${card.short_id} "${card.title}" (${mode}) — enqueueing`);
|
|
98
|
-
this.pool.enqueue(card, column, cardLabels, subtasks, mode);
|
|
152
|
+
await this.pool.enqueue(card, column, cardLabels, subtasks, mode);
|
|
99
153
|
}
|
|
100
154
|
}
|
|
155
|
+
// Detect zombie runs: state-store says active, but either:
|
|
156
|
+
// (a) another daemon's PID is dead, or
|
|
157
|
+
// (b) our daemon holds the run but no worker is on the card, or
|
|
158
|
+
// (c) heartbeat is older than staleHeartbeatMs.
|
|
159
|
+
if (this.stateStore && this.agentConfig) {
|
|
160
|
+
await this.recoverStaleRuns();
|
|
161
|
+
}
|
|
101
162
|
// Cards in queue/active but no longer assigned to agent → cancel/remove
|
|
102
163
|
for (const knownId of knownCardIds) {
|
|
103
164
|
if (!allAgentCardIds.has(knownId)) {
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import type { HarmonyApiClient } from "@gethmy/mcp/src/api-client.js";
|
|
2
|
+
import type { RunRecord, StateStore } from "./state-store.js";
|
|
3
|
+
import type { AgentConfig } from "./types.js";
|
|
4
|
+
export interface RecoveryOutcome {
|
|
5
|
+
runId: string;
|
|
6
|
+
cardId: string;
|
|
7
|
+
cardShortId: number;
|
|
8
|
+
pipeline: "implement" | "review";
|
|
9
|
+
actions: string[];
|
|
10
|
+
errors: string[];
|
|
11
|
+
}
|
|
12
|
+
/**
|
|
13
|
+
* Check if a process is still alive. A crashed daemon's PID is unlikely
|
|
14
|
+
* to be reused within a reboot window; if it is, we still treat it as
|
|
15
|
+
* orphaned because our current process is the new daemon.
|
|
16
|
+
*/
|
|
17
|
+
export declare function isProcessAlive(pid: number, currentPid: number): boolean;
|
|
18
|
+
/**
|
|
19
|
+
* Reconcile orphaned runs from a previous daemon life.
|
|
20
|
+
*
|
|
21
|
+
* For each active run in the state store:
|
|
22
|
+
* - If the daemon PID is alive (should not happen for a fresh process),
|
|
23
|
+
* skip it — it's another instance.
|
|
24
|
+
* - Otherwise: end the Harmony session, return the card to its pickup
|
|
25
|
+
* column with the `agent-recovered` label, and cleanup the worktree.
|
|
26
|
+
*
|
|
27
|
+
* This runs once at daemon startup, before the pool accepts work.
|
|
28
|
+
*/
|
|
29
|
+
export declare function recoverOrphans(store: StateStore, client: HarmonyApiClient, config: AgentConfig): Promise<RecoveryOutcome[]>;
|
|
30
|
+
export declare function recoverRun(run: RunRecord, store: StateStore, client: HarmonyApiClient, config: AgentConfig, outcome: RecoveryOutcome): Promise<void>;
|
package/dist/recovery.js
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
import { addLabelByName, moveCardToColumn } from "./board-helpers.js";
|
|
2
|
+
import { log } from "./log.js";
|
|
3
|
+
import { cleanupWorktree } from "./worktree.js";
|
|
4
|
+
const TAG = "recovery";
|
|
5
|
+
const RECOVERED_LABEL = "agent-recovered";
|
|
6
|
+
const RECOVERED_LABEL_COLOR = "#f59e0b";
|
|
7
|
+
/**
 * Report whether a process with the given pid currently exists.
 *
 * The current process is always considered alive. For any other pid the
 * check is `process.kill(pid, 0)`, which delivers no signal and only
 * probes for existence:
 *  - success → the process exists;
 *  - EPERM   → the process exists but belongs to someone we may not
 *              signal, so it is still alive;
 *  - anything else (typically ESRCH) → treated as not alive.
 *
 * Values that are not positive integers are reported as not alive without
 * probing, because pid 0 and negative pids address process groups rather
 * than a single process.
 *
 * Note that a pid reused by an unrelated process is indistinguishable from
 * the original one here and is reported as alive.
 *
 * @param {number} pid pid to probe.
 * @param {number} currentPid pid of the calling daemon.
 * @returns {boolean} true when the process is believed to exist.
 */
export function isProcessAlive(pid, currentPid) {
    if (pid === currentPid)
        return true;
    // Guard against group-addressing values (0, negatives) and junk input.
    if (!Number.isInteger(pid) || pid <= 0)
        return false;
    try {
        process.kill(pid, 0);
        return true;
    }
    catch (err) {
        // EPERM: the target exists, we just lack permission to signal it.
        return err?.code === "EPERM";
    }
}
|
|
23
|
+
/**
 * Fetch a card without ever rejecting.
 *
 * @param {object} client API client exposing `getCard(cardId)`, whose
 *   result is expected to carry the card under a `card` property.
 * @param {string} cardId id of the card to load.
 * @returns {Promise<*>} the `card` property of the response, or null when
 *   the lookup (or reading its result) throws; the failure is logged as a
 *   warning.
 */
async function fetchCardSafely(client, cardId) {
    try {
        const response = await client.getCard(cardId);
        // Kept inside the try so a malformed response is also reported.
        return response.card;
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : err;
        log.warn(TAG, `cannot fetch card ${cardId}: ${reason}`);
        return null;
    }
}
|
|
33
|
+
/**
 * Recover runs that the state store still lists as active, typically left
 * behind by an earlier daemon process.
 *
 * For every active run an outcome record is created and returned. Then:
 *  - if `isProcessAlive` reports the run's daemon pid as alive (which
 *    includes the current process's own pid), the run is skipped and the
 *    outcome notes why;
 *  - otherwise the run is passed to `recoverRun`, which fills in the
 *    outcome's `actions` and `errors`.
 *
 * Runs are processed sequentially, in the order the store returns them.
 *
 * @param {object} store state store providing `getActiveRuns()`.
 * @param {object} client API client forwarded to `recoverRun`.
 * @param {object} config agent configuration forwarded to `recoverRun`.
 * @returns {Promise<Array<object>>} one outcome per active run; empty when
 *   there was nothing to recover.
 */
export async function recoverOrphans(store, client, config) {
    const pending = store.getActiveRuns();
    if (pending.length === 0) {
        log.info(TAG, "no orphan runs to recover");
        return [];
    }
    log.info(TAG, `recovering ${pending.length} orphan run(s) from prior daemon`);
    const results = [];
    for (const run of pending) {
        const { runId, cardId, cardShortId, pipeline } = run;
        const outcome = { runId, cardId, cardShortId, pipeline, actions: [], errors: [] };
        results.push(outcome);
        const ownerAlive = isProcessAlive(run.daemonPid, process.pid);
        if (!ownerAlive) {
            log.info(TAG, `recovering ${run.pipeline} run ${run.runId} for card #${run.cardShortId}`);
            await recoverRun(run, store, client, config, outcome);
            continue;
        }
        // Another live instance still owns this run; leave it untouched.
        log.warn(TAG, `run ${run.runId} claims live daemon pid ${run.daemonPid} — skipping`);
        outcome.actions.push("skipped: daemon pid still alive");
    }
    return results;
}
|
|
72
|
+
/**
 * Recover a single abandoned run. Each step is attempted independently:
 * a failure is recorded in `outcome.errors` and the next step still runs.
 *
 * Steps, in order:
 *  1. End the agent session for the card with status "paused"
 *     (progress 95 when the run was in its "completing" phase).
 *  2. If the card can be fetched: for implement runs, move it to the first
 *     configured pickup column (when one exists); for every pipeline, add
 *     the "agent-recovered" label. If it cannot be fetched, only local
 *     cleanup is performed.
 *  3. Remove the run's worktree when a path was recorded.
 *  4. Mark the run as "orphaned" in the state store.
 *
 * A one-line summary of actions and errors is logged at the end.
 *
 * @param {object} run run record from the state store.
 * @param {object} store state store providing `endRun`.
 * @param {object} client API client.
 * @param {object} config agent configuration (reads `pickupColumns`).
 * @param {object} outcome record whose `actions` and `errors` arrays are
 *   appended to in place.
 * @returns {Promise<void>}
 */
export async function recoverRun(run, store, client, config, outcome) {
    const messageOf = (err) => (err instanceof Error ? err.message : String(err));
    // 1. End the agent session so the card stops showing the progress ring.
    try {
        const progressPercent = run.phase === "completing" ? 95 : undefined;
        await client.endAgentSession(run.cardId, { status: "paused", progressPercent });
        outcome.actions.push("ended agent session (paused)");
    }
    catch (err) {
        const msg = messageOf(err);
        outcome.errors.push(`endAgentSession: ${msg}`);
        log.warn(TAG, `endAgentSession failed for ${run.cardId}: ${msg}`);
    }
    // 2. Put the card somewhere safe and flag it as recovered.
    //    Implement runs go back to the first pickup column; review runs
    //    stay where they are.
    const card = await fetchCardSafely(client, run.cardId);
    if (!card) {
        outcome.actions.push("card not reachable — local cleanup only");
    }
    else {
        const target = run.pipeline === "implement" ? config.pickupColumns[0] : undefined;
        if (target) {
            try {
                await moveCardToColumn(client, card, target);
                outcome.actions.push(`moved to "${target}"`);
            }
            catch (err) {
                outcome.errors.push(`moveCardToColumn: ${messageOf(err)}`);
            }
        }
        try {
            await addLabelByName(client, card, RECOVERED_LABEL, RECOVERED_LABEL_COLOR);
            outcome.actions.push(`labeled "${RECOVERED_LABEL}"`);
        }
        catch (err) {
            outcome.errors.push(`addLabel: ${messageOf(err)}`);
        }
    }
    // 3. Remove the local worktree so it cannot collide with future runs.
    if (run.worktreePath) {
        try {
            cleanupWorktree(run.worktreePath, run.branchName ?? undefined);
            outcome.actions.push("cleaned up worktree");
        }
        catch (err) {
            outcome.errors.push(`cleanupWorktree: ${messageOf(err)}`);
        }
    }
    // 4. Record the run as orphaned in the store.
    try {
        await store.endRun(run.runId, "orphaned", "recovered after daemon restart");
    }
    catch (err) {
        outcome.errors.push(`endRun: ${messageOf(err)}`);
    }
    const errorSuffix = outcome.errors.length ? ` | errors: ${outcome.errors.join("; ")}` : "";
    log.info(TAG, `recovered run ${run.runId} (card #${run.cardShortId}): ${outcome.actions.join(", ")}${errorSuffix}`);
}
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import type { HarmonyApiClient } from "@gethmy/mcp/src/api-client.js";
|
|
2
2
|
import type { Card } from "@harmony/shared";
|
|
3
|
-
import type
|
|
3
|
+
import { type SessionStats } from "./completion.js";
|
|
4
|
+
import { type AgentConfig } from "./types.js";
|
|
4
5
|
export interface ReviewFinding {
|
|
5
6
|
severity: "critical" | "major" | "minor";
|
|
6
7
|
title: string;
|
|
@@ -13,14 +14,21 @@ export interface ScopeCheck {
|
|
|
13
14
|
notes?: string;
|
|
14
15
|
}
|
|
15
16
|
export interface ReviewResult {
|
|
16
|
-
verdict: "approved" | "rejected";
|
|
17
|
+
verdict: "approved" | "rejected" | "error";
|
|
17
18
|
summary: string;
|
|
18
19
|
scopeCheck?: ScopeCheck;
|
|
19
20
|
findings: ReviewFinding[];
|
|
20
21
|
}
|
|
21
22
|
/**
|
|
22
23
|
* Parse Claude's review output into a structured ReviewResult.
|
|
23
|
-
*
|
|
24
|
+
*
|
|
25
|
+
* Tries multiple extraction strategies in order:
|
|
26
|
+
* 1. ```json ... ``` fenced block (what the prompt asks for)
|
|
27
|
+
* 2. Any top-level JSON object containing a "verdict" key (last-wins)
|
|
28
|
+
* 3. Regex for a bare `"verdict": "approved|rejected"` anywhere — lossy
|
|
29
|
+
* but keeps the pipeline moving
|
|
30
|
+
* 4. Falls back to verdict: "error" — keeps card in Review instead of
|
|
31
|
+
* bouncing it to To Do for a parse failure that isn't a code quality signal.
|
|
24
32
|
*/
|
|
25
33
|
export declare function parseReviewOutput(stdout: string): ReviewResult;
|
|
26
34
|
/**
|
|
@@ -28,4 +36,4 @@ export declare function parseReviewOutput(stdout: string): ReviewResult;
|
|
|
28
36
|
* Handles approved/rejected verdicts, creates subtasks for findings,
|
|
29
37
|
* and moves the card to the appropriate column.
|
|
30
38
|
*/
|
|
31
|
-
export declare function runReviewCompletion(client: HarmonyApiClient, card: Card, result: ReviewResult, config: AgentConfig, worktreePath: string, branchName: string | null): Promise<void>;
|
|
39
|
+
export declare function runReviewCompletion(client: HarmonyApiClient, card: Card, result: ReviewResult, config: AgentConfig, worktreePath: string, branchName: string | null, sessionStats?: SessionStats | null): Promise<void>;
|