freddie 0.0.117 → 0.0.119
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/plugins/gui-machines/plugin.js +49 -0
- package/src/acp/main.js +2 -1
- package/src/acp/server.js +30 -2
- package/src/agent/acptoapi-bridge.js +1 -1
- package/src/agent/machine.js +56 -16
- package/src/batch.js +84 -20
- package/src/cli/gateway_cli.js +2 -0
- package/src/cron/scheduler.js +46 -2
- package/src/gateway/run.js +39 -0
- package/src/host/host_helpers.js +7 -1
- package/src/machines/persistent-actor.js +57 -0
- package/src/machines/resume.js +42 -0
- package/src/machines/snapshot-store.js +111 -0
- package/src/web/server.js +3 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "freddie",
|
|
3
|
-
"version": "0.0.117",
|
|
3
|
+
"version": "0.0.119",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Open JS agent harness built on pi-mono, floosie, xstate, and anentrypoint-design",
|
|
6
6
|
"bin": {
|
|
@@ -27,7 +27,7 @@
|
|
|
27
27
|
"@mariozechner/pi-ai": "^0.70.6",
|
|
28
28
|
"@mariozechner/pi-coding-agent": "^0.70.6",
|
|
29
29
|
"@mariozechner/pi-tui": "^0.70.6",
|
|
30
|
-
"acptoapi": "^1.0.
|
|
30
|
+
"acptoapi": "^1.0.114",
|
|
31
31
|
"anentrypoint-design": "^0.0.140",
|
|
32
32
|
"commander": "^14.0.0",
|
|
33
33
|
"express": "^5.0.0",
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
// Exposes live persisted machine snapshots over HTTP for the dashboard.
|
|
2
|
+
// GET /api/machines          -> { count, kinds:{<kind>:n}, machines:[{kind,key,status,state,machine_id,updated}] }
|
|
3
|
+
// GET /api/machines/:kind    -> { machines:[...] } filtered to one kind (no count/kinds summary)
|
|
4
|
+
// POST /api/machines/resume -> { ok, summary } : drive resumeAll() on demand
|
|
5
|
+
import { list } from '../../src/machines/snapshot-store.js'
|
|
6
|
+
import { registerDebug } from '../../src/observability/debug.js'
|
|
7
|
+
|
|
8
|
+
// window.__debug.machines() / GET /debug/machines — static pointer to GET /api/machines plus the list of known machine kinds.
|
|
9
|
+
registerDebug('machines', () => ({ note: 'GET /api/machines for live snapshots', kinds: ['agent', 'cron', 'batch', 'gateway', 'gateway-msg', 'acp', 'acp-prompt'] }))
|
|
10
|
+
|
|
11
|
+
async function snapshotRows(kind = null) {
|
|
12
|
+
const rows = await list({ kind, status: null })
|
|
13
|
+
return rows.map(r => {
|
|
14
|
+
let state = null
|
|
15
|
+
try { const ps = JSON.parse(r.snapshot_json || 'null'); state = ps?.value ?? null } catch {}
|
|
16
|
+
return { kind: r.kind, key: r.key, status: r.status, state, machine_id: r.machine_id, updated: r.updated }
|
|
17
|
+
})
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
export default {
|
|
21
|
+
name: 'gui-machines', surfaces: 'gui',
|
|
22
|
+
register({ gui }) {
|
|
23
|
+
gui.route('GET', '/api/machines', async (_req, res) => {
|
|
24
|
+
try {
|
|
25
|
+
// list() does not return snapshot_json (truncated for size); re-read full per row.
|
|
26
|
+
const { db } = await import('../../src/db.js')
|
|
27
|
+
const d = await db()
|
|
28
|
+
await d.exec(`CREATE TABLE IF NOT EXISTS machine_snapshots (kind TEXT, key TEXT, schema_version INTEGER, machine_id TEXT, snapshot_json TEXT, status TEXT, updated INTEGER, PRIMARY KEY(kind,key))`)
|
|
29
|
+
const all = await d.prepare(`SELECT kind, key, machine_id, snapshot_json, status, updated FROM machine_snapshots ORDER BY updated DESC`).all()
|
|
30
|
+
const kinds = {}
|
|
31
|
+
const machines = all.map(r => {
|
|
32
|
+
kinds[r.kind] = (kinds[r.kind] || 0) + 1
|
|
33
|
+
let state = null
|
|
34
|
+
try { state = JSON.parse(r.snapshot_json || 'null')?.value ?? null } catch {}
|
|
35
|
+
return { kind: r.kind, key: r.key, status: r.status, state, machine_id: r.machine_id, updated: r.updated }
|
|
36
|
+
})
|
|
37
|
+
res.json({ count: machines.length, kinds, machines })
|
|
38
|
+
} catch (e) { res.status(500).json({ error: String(e.message || e) }) }
|
|
39
|
+
})
|
|
40
|
+
gui.route('GET', '/api/machines/:kind', async (req, res) => {
|
|
41
|
+
try { res.json({ machines: await snapshotRows(req.params.kind) }) }
|
|
42
|
+
catch (e) { res.status(500).json({ error: String(e.message || e) }) }
|
|
43
|
+
})
|
|
44
|
+
gui.route('POST', '/api/machines/resume', async (_req, res) => {
|
|
45
|
+
try { const { resumeAll } = await import('../../src/machines/resume.js'); res.json({ ok: true, summary: await resumeAll() }) }
|
|
46
|
+
catch (e) { res.status(500).json({ error: String(e.message || e) }) }
|
|
47
|
+
})
|
|
48
|
+
},
|
|
49
|
+
}
|
package/src/acp/main.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { AcpServer } from './server.js'
|
|
2
|
-
export function startAcpStdio() {
|
|
2
|
+
export async function startAcpStdio() {
|
|
3
|
+
try { const { resumeAll } = await import('../machines/resume.js'); await resumeAll() } catch (_) {}
|
|
3
4
|
const srv = new AcpServer()
|
|
4
5
|
srv.start()
|
|
5
6
|
process.on('SIGINT', () => { srv.stop(); process.exit(0) })
|
package/src/acp/server.js
CHANGED
|
@@ -6,9 +6,26 @@ import { logger } from '../observability/log.js'
|
|
|
6
6
|
import { Events } from './events.js'
|
|
7
7
|
import { checkPermission, rememberAllow, rememberDeny } from './permissions.js'
|
|
8
8
|
import { AcpSessionManager } from './session.js'
|
|
9
|
+
import { createMachine, createActor } from 'xstate'
|
|
10
|
+
import { persist, load, clear } from '../machines/snapshot-store.js'
|
|
9
11
|
|
|
10
12
|
const log = logger('acp')
|
|
11
13
|
|
|
14
|
+
// ACP server lifecycle machine: stopped -> running -> stopped. Persisted so an
|
|
15
|
+
// active snapshot on boot signals the server was serving; per-prompt processing
|
|
16
|
+
// is persisted separately under kind=acp-prompt so an interrupted prompt.submit
|
|
17
|
+
// is observable + resumable after a restart.
|
|
18
|
+
export function createAcpMachine() {
|
|
19
|
+
return createMachine({
|
|
20
|
+
id: 'freddie-acp',
|
|
21
|
+
initial: 'stopped',
|
|
22
|
+
states: {
|
|
23
|
+
stopped: { on: { START: 'running' } },
|
|
24
|
+
running: { on: { STOP: 'stopped' } },
|
|
25
|
+
},
|
|
26
|
+
})
|
|
27
|
+
}
|
|
28
|
+
|
|
12
29
|
const CAPABILITIES = {
|
|
13
30
|
name: 'freddie', version: '0.4.0',
|
|
14
31
|
methods: ['initialize', 'session.new', 'session.resume', 'session.list', 'session.end', 'prompt.submit', 'tool.list', 'permission.respond', 'shutdown'],
|
|
@@ -21,13 +38,19 @@ export class AcpServer extends EventEmitter {
|
|
|
21
38
|
this.in = stdin; this.out = stdout; this.callLLM = callLLM
|
|
22
39
|
this.sessions = new AcpSessionManager()
|
|
23
40
|
this._pendingPerm = new Map()
|
|
41
|
+
this.machine = createAcpMachine()
|
|
42
|
+
this.actor = createActor(this.machine)
|
|
43
|
+
this.actor.subscribe(() => { persist('acp', 'lifecycle', this.actor.getPersistedSnapshot()).catch(() => {}) })
|
|
44
|
+
this.actor.start()
|
|
24
45
|
}
|
|
46
|
+
get state() { return this.actor.getSnapshot().value }
|
|
25
47
|
start() {
|
|
26
48
|
const rl = readline.createInterface({ input: this.in, crlfDelay: Infinity })
|
|
27
49
|
rl.on('line', (l) => this.handle(l).catch(e => this.send({ jsonrpc: '2.0', error: { message: String(e) } })))
|
|
28
50
|
this.rl = rl
|
|
51
|
+
this.actor.send({ type: 'START' })
|
|
29
52
|
}
|
|
30
|
-
stop() { this.rl?.close() }
|
|
53
|
+
stop() { this.rl?.close(); try { this.actor.send({ type: 'STOP' }) } catch {} }
|
|
31
54
|
send(o) { this.out.write(JSON.stringify(o) + '\n') }
|
|
32
55
|
async handle(line) {
|
|
33
56
|
if (!line.trim()) return
|
|
@@ -75,7 +98,12 @@ const METHODS = {
|
|
|
75
98
|
if (!srv.sessions.isActive(sessionId)) throw new Error('session not active')
|
|
76
99
|
srv.sessions.appendUser(sessionId, prompt)
|
|
77
100
|
Events.messageDelta((o) => srv.send(o), { sessionId, role: 'user', content: prompt })
|
|
78
|
-
|
|
101
|
+
// Persist in-flight prompt under kind=acp-prompt keyed by sessionId so a
|
|
102
|
+
// refresh mid-turn is observable + resumable (the agent snapshot for the
|
|
103
|
+
// turn itself lives under kind=agent via runTurn sessionKey).
|
|
104
|
+
await persist('acp-prompt', sessionId, { status: 'active', value: 'running', context: { sessionId, prompt } })
|
|
105
|
+
const out = await runTurn({ prompt, callLLM: srv.callLLM, sessionKey: 'acp:' + sessionId })
|
|
106
|
+
await clear('acp-prompt', sessionId)
|
|
79
107
|
srv.sessions.appendAssistant(sessionId, out.result || '')
|
|
80
108
|
Events.messageComplete((o) => srv.send(o), { sessionId, role: 'assistant', content: out.result || '' })
|
|
81
109
|
return { result: out.result, error: out.error, iterations: out.iterations }
|
|
@@ -121,7 +121,7 @@ function adaptResponse(r) {
|
|
|
121
121
|
|
|
122
122
|
function tryParseJson(s) { try { return typeof s === 'string' ? JSON.parse(s) : (s || {}) } catch { return {} } }
|
|
123
123
|
|
|
124
|
-
export async function isReachable(timeoutMs =
|
|
124
|
+
export async function isReachable(timeoutMs = 10000) {
|
|
125
125
|
try {
|
|
126
126
|
const controller = new AbortController()
|
|
127
127
|
const timeoutId = setTimeout(() => controller.abort(), timeoutMs)
|
package/src/agent/machine.js
CHANGED
|
@@ -3,6 +3,8 @@ import { bootHost } from '../host/index.js'
|
|
|
3
3
|
import { getEnabledToolSchemas } from '../toolsets.js'
|
|
4
4
|
import { logger } from '../observability/log.js'
|
|
5
5
|
import { resolveCallLLM } from './llm_resolver.js'
|
|
6
|
+
import { createPersistentActor } from '../machines/persistent-actor.js'
|
|
7
|
+
import { randomUUID } from 'node:crypto'
|
|
6
8
|
|
|
7
9
|
const log = logger('agent')
|
|
8
10
|
|
|
@@ -55,7 +57,18 @@ export function createAgentMachine({ provider, model, maxIterations = 90, callLL
|
|
|
55
57
|
input: ({ context }) => ({ messages: context.messages, model: context.model, provider: context.provider, enabledToolsets: context.enabledToolsets, disabledToolsets: context.disabledToolsets }),
|
|
56
58
|
onDone: [
|
|
57
59
|
{ guard: ({ event }) => Array.isArray(event.output?.tool_calls) && event.output.tool_calls.length > 0, target: 'tool_calls', actions: assign({ messages: ({ context, event }) => [...context.messages, { role: 'assistant', content: event.output.content || '', tool_calls: event.output.tool_calls }] }) },
|
|
58
|
-
{ target: 'done', actions: assign({ messages: ({ context, event }) => [...context.messages, { role: 'assistant', content: event.output.content || '' }], lastResult: ({ event }) =>
|
|
60
|
+
{ target: 'done', actions: assign({ messages: ({ context, event }) => [...context.messages, { role: 'assistant', content: event.output.content || '' }], lastResult: ({ context, event }) => {
|
|
61
|
+
// Prefer this turn's content, but if the model ended with empty
|
|
62
|
+
// text (it may have put its answer in an earlier turn alongside a
|
|
63
|
+
// tool_call), fall back to the last non-empty assistant message so
|
|
64
|
+
// the caller never gets an empty result after a successful run.
|
|
65
|
+
if (event.output.content && event.output.content.trim()) return event.output.content;
|
|
66
|
+
for (let i = context.messages.length - 1; i >= 0; i--) {
|
|
67
|
+
const m = context.messages[i];
|
|
68
|
+
if (m.role === 'assistant' && typeof m.content === 'string' && m.content.trim()) return m.content;
|
|
69
|
+
}
|
|
70
|
+
return event.output.content || '';
|
|
71
|
+
} }) },
|
|
59
72
|
],
|
|
60
73
|
onError: { target: 'done', actions: assign({ error: ({ event }) => String(event.error?.message || event.error) }) },
|
|
61
74
|
},
|
|
@@ -159,21 +172,14 @@ function mergeHookExtras(messages, r, tag) {
|
|
|
159
172
|
return e.length ? [...messages, ...e] : messages
|
|
160
173
|
}
|
|
161
174
|
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
if (skill) { const sd = h.pi.skills.get(skill); if (sd?.content) sysParts.push('Skill context:\n' + sd.content) }
|
|
168
|
-
if (sysParts.length) initMessages.unshift({ role: 'user', content: sysParts.join('\n\n') })
|
|
169
|
-
const inbound = await h.hooks.invoke('onMessageInbound', { content: prompt })
|
|
170
|
-
if (inbound?.behavior === 'block') { await h.hooks.invoke('onSessionEnd', { reason: 'prompt_blocked' }); return { messages: initMessages, result: null, error: 'prompt blocked by plugsdk hook: ' + (inbound.reason || 'denied'), iterations: 0 } }
|
|
171
|
-
initMessages = mergeHookExtras(initMessages, inbound, 'onMessageInbound')
|
|
172
|
-
const machine = createAgentMachine({ model, provider, callLLM, enabledToolsets, disabledToolsets, maxIterations, events })
|
|
173
|
-
const actor = createActor(machine, { input: { messages: initMessages } }); actor.start(); actor.send({ type: 'SUBMIT', prompt })
|
|
175
|
+
// Drive a started persistent agent actor to its final state, wiring timeout +
|
|
176
|
+
// session-end hooks + trajectory. Shared by runTurn (fresh) and resumeTurn
|
|
177
|
+
// (rehydrated from a persisted snapshot after a refresh/restart).
|
|
178
|
+
async function driveAgentActor({ pa, h, events, prompt, provider, model, skill, cwd, witnessPath, timeoutMs }) {
|
|
179
|
+
const { actor } = pa
|
|
174
180
|
return await new Promise((resolve, reject) => {
|
|
175
181
|
let sub
|
|
176
|
-
const cleanup = () => { try { sub?.unsubscribe() } catch {} try { actor.stop() } catch {} }
|
|
182
|
+
const cleanup = () => { try { sub?.unsubscribe() } catch {} ; pa.flush().catch(() => {}).finally(() => { try { actor.stop() } catch {} }) }
|
|
177
183
|
const t = setTimeout(() => { cleanup(); reject(new Error('agent turn timeout')) }, timeoutMs)
|
|
178
184
|
sub = actor.subscribe(snap => { if (snap.status !== 'done') return; clearTimeout(t)
|
|
179
185
|
;(async () => {
|
|
@@ -183,8 +189,9 @@ export async function runTurn({ prompt, messages = [], model, provider, callLLM,
|
|
|
183
189
|
await h.hooks.invoke('onSessionEnd', { reason: out?.error ? 'error' : 'ok', iterations: out?.iterations })
|
|
184
190
|
const errorStack = out?.error ? (events.find(e => e.type === 'llm_call' && !e.ok)?.stack || null) : null
|
|
185
191
|
await writeTrajectory(out, { prompt, provider, model, skill, cwd, events, errorStack, witnessPath })
|
|
186
|
-
// Unsubscribe
|
|
187
|
-
// actor
|
|
192
|
+
// Unsubscribe, flush the final snapshot (persistent-actor clears it on
|
|
193
|
+
// the done state) + stop the actor — a finished actor should not be
|
|
194
|
+
// left running with live subscriptions/handles.
|
|
188
195
|
cleanup()
|
|
189
196
|
resolve(out)
|
|
190
197
|
})().catch(e => { cleanup(); reject(e) })
|
|
@@ -192,6 +199,39 @@ export async function runTurn({ prompt, messages = [], model, provider, callLLM,
|
|
|
192
199
|
})
|
|
193
200
|
}
|
|
194
201
|
|
|
202
|
+
export async function runTurn({ prompt, messages = [], model, provider, callLLM, enabledToolsets, disabledToolsets, maxIterations = 90, timeoutMs = 30000, cwd, skill, witnessPath, sessionKey } = {}) {
|
|
203
|
+
const events = []; const h = await bootHost()
|
|
204
|
+
await h.hooks.invoke('onSessionStart', { prompt, model, provider, skill, cwd })
|
|
205
|
+
let initMessages = [...messages]; const sysParts = []
|
|
206
|
+
if (cwd) sysParts.push(`Working directory: ${cwd}. Always pass cwd="${cwd}" to bash tool calls. When reading or writing files use paths relative to this directory or absolute paths under it.`)
|
|
207
|
+
if (skill) { const sd = h.pi.skills.get(skill); if (sd?.content) sysParts.push('Skill context:\n' + sd.content) }
|
|
208
|
+
if (sysParts.length) initMessages.unshift({ role: 'user', content: sysParts.join('\n\n') })
|
|
209
|
+
const inbound = await h.hooks.invoke('onMessageInbound', { content: prompt })
|
|
210
|
+
if (inbound?.behavior === 'block') { await h.hooks.invoke('onSessionEnd', { reason: 'prompt_blocked' }); return { messages: initMessages, result: null, error: 'prompt blocked by plugsdk hook: ' + (inbound.reason || 'denied'), iterations: 0 } }
|
|
211
|
+
initMessages = mergeHookExtras(initMessages, inbound, 'onMessageInbound')
|
|
212
|
+
const machine = createAgentMachine({ model, provider, callLLM, enabledToolsets, disabledToolsets, maxIterations, events })
|
|
213
|
+
// Persist the turn snapshot under kind=agent so an interrupted turn (process
|
|
214
|
+
// refresh mid-tool-call) resumes exactly where it stopped via resumeTurn.
|
|
215
|
+
const key = sessionKey || randomUUID()
|
|
216
|
+
const pa = await createPersistentActor(machine, { kind: 'agent', key, input: { messages: initMessages } })
|
|
217
|
+
pa.actor.send({ type: 'SUBMIT', prompt })
|
|
218
|
+
return await driveAgentActor({ pa, h, events, prompt, provider, model, skill, cwd, witnessPath, timeoutMs })
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
// Rehydrate an interrupted turn from its persisted snapshot and drive it to
|
|
222
|
+
// completion. Returns null if no live snapshot exists for the key (already
|
|
223
|
+
// completed or never persisted) — caller falls back to a fresh runTurn.
|
|
224
|
+
export async function resumeTurn({ sessionKey, model, provider, callLLM, enabledToolsets, disabledToolsets, maxIterations = 90, timeoutMs = 30000, cwd, skill, witnessPath } = {}) {
|
|
225
|
+
if (!sessionKey) throw new Error('resumeTurn requires sessionKey')
|
|
226
|
+
const { load } = await import('../machines/snapshot-store.js')
|
|
227
|
+
if (!(await load('agent', sessionKey))) return null
|
|
228
|
+
const events = []; const h = await bootHost()
|
|
229
|
+
const machine = createAgentMachine({ model, provider, callLLM, enabledToolsets, disabledToolsets, maxIterations, events })
|
|
230
|
+
const pa = await createPersistentActor(machine, { kind: 'agent', key: sessionKey, input: { messages: [] } })
|
|
231
|
+
if (!pa.resumed) { await pa.forget(); return null }
|
|
232
|
+
return await driveAgentActor({ pa, h, events, prompt: '', provider, model, skill, cwd, witnessPath, timeoutMs })
|
|
233
|
+
}
|
|
234
|
+
|
|
195
235
|
export async function invokeCompactHooks({ trigger = 'auto', messages = [] } = {}) {
|
|
196
236
|
const h = await bootHost()
|
|
197
237
|
const pre = await h.hooks.invoke('onPreCompact', { trigger, messages })
|
package/src/batch.js
CHANGED
|
@@ -3,30 +3,94 @@ import path from 'node:path'
|
|
|
3
3
|
import { runTurn } from './agent/machine.js'
|
|
4
4
|
import { getFreddieHome } from './home.js'
|
|
5
5
|
import { randomUUID } from 'node:crypto'
|
|
6
|
+
import { createMachine, assign, fromPromise } from 'xstate'
|
|
7
|
+
import { createPersistentActor } from './machines/persistent-actor.js'
|
|
8
|
+
import { load } from './machines/snapshot-store.js'
|
|
6
9
|
|
|
7
|
-
|
|
10
|
+
// Run one prompt and append its result to the batch jsonl file.
|
|
11
|
+
async function runOne({ job, model, callLLM, file }) {
|
|
12
|
+
let rec
|
|
13
|
+
try {
|
|
14
|
+
const out = await runTurn({ prompt: job.p, model, callLLM, timeoutMs: 60000 })
|
|
15
|
+
rec = { i: job.i, prompt: job.p, result: out.result, error: out.error }
|
|
16
|
+
} catch (e) {
|
|
17
|
+
rec = { i: job.i, prompt: job.p, error: String(e?.message || e) }
|
|
18
|
+
}
|
|
19
|
+
fs.appendFileSync(file, JSON.stringify(rec) + '\n')
|
|
20
|
+
return rec
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
// xstate batch machine. Context tracks done[] (indices completed) + results so a
|
|
24
|
+
// refreshed batch resumes only the unfinished prompts. running -> running until
|
|
25
|
+
// every index is done, then -> complete (final). The persisted snapshot is keyed
|
|
26
|
+
// by kind=batch key=<batchId>.
|
|
27
|
+
export function createBatchMachine({ prompts, concurrency, model, callLLM, file } = {}) {
|
|
28
|
+
return createMachine({
|
|
29
|
+
id: 'freddie-batch',
|
|
30
|
+
initial: 'running',
|
|
31
|
+
output: ({ context }) => ({ id: context.id, file: context.file, results: context.results }),
|
|
32
|
+
context: ({ input }) => ({
|
|
33
|
+
id: input.id, file: input.file, model: input.model, concurrency: input.concurrency,
|
|
34
|
+
prompts: input.prompts, done: input.done || [], results: input.results || new Array(input.prompts.length).fill(null),
|
|
35
|
+
}),
|
|
36
|
+
states: {
|
|
37
|
+
running: {
|
|
38
|
+
always: { guard: ({ context }) => context.done.length >= context.prompts.length, target: 'complete' },
|
|
39
|
+
invoke: {
|
|
40
|
+
src: fromPromise(async ({ input }) => {
|
|
41
|
+
const { context } = input
|
|
42
|
+
const pending = context.prompts
|
|
43
|
+
.map((p, i) => ({ i, p }))
|
|
44
|
+
.filter(({ i }) => !context.done.includes(i))
|
|
45
|
+
.slice(0, context.concurrency)
|
|
46
|
+
return await Promise.all(pending.map(job => runOne({ job, model: context.model, callLLM, file: context.file })))
|
|
47
|
+
}),
|
|
48
|
+
input: ({ context }) => ({ context }),
|
|
49
|
+
onDone: {
|
|
50
|
+
target: 'running',
|
|
51
|
+
reenter: true,
|
|
52
|
+
actions: assign({
|
|
53
|
+
done: ({ context, event }) => [...context.done, ...event.output.map(r => r.i)],
|
|
54
|
+
results: ({ context, event }) => { const r = [...context.results]; for (const rec of event.output) r[rec.i] = rec; return r },
|
|
55
|
+
}),
|
|
56
|
+
},
|
|
57
|
+
},
|
|
58
|
+
},
|
|
59
|
+
complete: { type: 'final', output: ({ context }) => ({ id: context.id, file: context.file, results: context.results }) },
|
|
60
|
+
},
|
|
61
|
+
})
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
export async function runBatch({ prompts = [], concurrency = 4, model, callLLM, batchId } = {}) {
|
|
8
65
|
if (!Array.isArray(prompts) || prompts.length === 0) throw new Error('prompts required')
|
|
9
|
-
const id = randomUUID()
|
|
66
|
+
const id = batchId || randomUUID()
|
|
10
67
|
const dir = path.join(getFreddieHome(), 'batches')
|
|
11
68
|
fs.mkdirSync(dir, { recursive: true })
|
|
12
69
|
const file = path.join(dir, id + '.jsonl')
|
|
13
|
-
const
|
|
14
|
-
const
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
70
|
+
const machine = createBatchMachine({ model, callLLM })
|
|
71
|
+
const pa = await createPersistentActor(machine, { kind: 'batch', key: id, input: { id, file, model, concurrency, prompts } })
|
|
72
|
+
return await driveBatch(pa)
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
// Resume an interrupted batch from its persisted snapshot — only the prompts not
|
|
76
|
+
// yet in context.done get re-run. Returns null if no live snapshot for the id.
|
|
77
|
+
export async function resumeBatch({ batchId, model, callLLM } = {}) {
|
|
78
|
+
if (!batchId) throw new Error('resumeBatch requires batchId')
|
|
79
|
+
if (!(await load('batch', batchId))) return null
|
|
80
|
+
const machine = createBatchMachine({ model, callLLM })
|
|
81
|
+
const pa = await createPersistentActor(machine, { kind: 'batch', key: batchId, input: { id: batchId, file: '', model, concurrency: 4, prompts: [] } })
|
|
82
|
+
if (!pa.resumed) { await pa.forget(); return null }
|
|
83
|
+
return await driveBatch(pa)
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
function driveBatch(pa) {
|
|
87
|
+
const { actor } = pa
|
|
88
|
+
return new Promise((resolve, reject) => {
|
|
89
|
+
const sub = actor.subscribe(snap => {
|
|
90
|
+
if (snap.status !== 'done') return
|
|
91
|
+
const out = snap.output
|
|
92
|
+
pa.flush().catch(() => {}).finally(() => { try { sub.unsubscribe() } catch {}; try { actor.stop() } catch {}; resolve(out) })
|
|
93
|
+
})
|
|
94
|
+
actor.subscribe({ error: (e) => { try { sub.unsubscribe() } catch {}; reject(e) } })
|
|
28
95
|
})
|
|
29
|
-
await Promise.all(workers)
|
|
30
|
-
await new Promise(r => stream.end(r))
|
|
31
|
-
return { id, file, results }
|
|
32
96
|
}
|
package/src/cli/gateway_cli.js
CHANGED
|
@@ -4,6 +4,8 @@ import { registerBuiltinHooks } from '../gateway/builtin_hooks/index.js'
|
|
|
4
4
|
let _gateway = null
|
|
5
5
|
export async function startGateway({ port = 0, hooks = true } = {}) {
|
|
6
6
|
if (_gateway) return _gateway
|
|
7
|
+
// Rehydrate interrupted agent turns / batches before the gateway starts taking traffic.
|
|
8
|
+
try { const { resumeAll } = await import('../machines/resume.js'); await resumeAll() } catch (_) {}
|
|
7
9
|
const wh = await makePlatform('webhook', { port })
|
|
8
10
|
const api = await makePlatform('api_server', { port: 0 })
|
|
9
11
|
const gw = new Gateway({ platforms: { webhook: wh, api_server: api } })
|
package/src/cron/scheduler.js
CHANGED
|
@@ -2,11 +2,11 @@ import { db } from '../db.js'
|
|
|
2
2
|
import { parseCron, matches } from './cron-parse.js'
|
|
3
3
|
import { runTurn } from '../agent/machine.js'
|
|
4
4
|
import { logger } from '../observability/log.js'
|
|
5
|
+
import { createMachine, assign, fromPromise } from 'xstate'
|
|
6
|
+
import { createPersistentActor } from '../machines/persistent-actor.js'
|
|
5
7
|
|
|
6
8
|
const log = logger('cron')
|
|
7
9
|
|
|
8
|
-
let _interval = null
|
|
9
|
-
|
|
10
10
|
async function init() {
|
|
11
11
|
const d = await db()
|
|
12
12
|
await d.exec(`CREATE TABLE IF NOT EXISTS cron_jobs (id INTEGER PRIMARY KEY AUTOINCREMENT, cron TEXT NOT NULL, prompt TEXT NOT NULL, model TEXT, last_run INTEGER, created INTEGER NOT NULL, enabled INTEGER NOT NULL DEFAULT 1)`)
|
|
@@ -52,6 +52,31 @@ export async function tick(now = new Date(), { callLLM = null } = {}) {
|
|
|
52
52
|
return fired
|
|
53
53
|
}
|
|
54
54
|
|
|
55
|
+
// xstate scheduler machine: idle -> ticking -> idle. The persisted snapshot
|
|
56
|
+
// carries tick_count + last_tick_ms so a refreshed scheduler resumes its cadence;
|
|
57
|
+
// per-job fire-state (last_run minute) lives durably in cron_jobs, so a restart
|
|
58
|
+
// never double-fires a job that already ran this minute.
|
|
59
|
+
export function createCronMachine({ callLLM = null, intervalMs = 30000 } = {}) {
|
|
60
|
+
return createMachine({
|
|
61
|
+
id: 'freddie-cron',
|
|
62
|
+
initial: 'idle',
|
|
63
|
+
context: ({ input }) => ({ tickCount: input?.tickCount || 0, lastTickMs: input?.lastTickMs || 0, intervalMs, lastFired: [] }),
|
|
64
|
+
states: {
|
|
65
|
+
idle: { after: { [intervalMs]: 'ticking' }, on: { TICK_NOW: 'ticking', STOP: 'stopped' } },
|
|
66
|
+
ticking: {
|
|
67
|
+
invoke: {
|
|
68
|
+
src: fromPromise(async () => tick(new Date(), { callLLM })),
|
|
69
|
+
onDone: { target: 'idle', actions: assign({ tickCount: ({ context }) => context.tickCount + 1, lastTickMs: () => Date.now(), lastFired: ({ event }) => (event.output || []).map(j => j.id) }) },
|
|
70
|
+
onError: { target: 'idle', actions: ({ event }) => log.error('cron tick errored', { err: String(event.error) }) },
|
|
71
|
+
},
|
|
72
|
+
},
|
|
73
|
+
stopped: { type: 'final' },
|
|
74
|
+
},
|
|
75
|
+
})
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
// Legacy in-memory interval scheduler (kept for tests + non-resumable callers).
|
|
79
|
+
let _interval = null
|
|
55
80
|
export function startScheduler({ callLLM = null, intervalMs = 30000 } = {}) {
|
|
56
81
|
stopScheduler()
|
|
57
82
|
_interval = setInterval(() => { tick(new Date(), { callLLM }) }, intervalMs)
|
|
@@ -61,3 +86,22 @@ export function startScheduler({ callLLM = null, intervalMs = 30000 } = {}) {
|
|
|
61
86
|
export function stopScheduler() {
|
|
62
87
|
if (_interval) { clearInterval(_interval); _interval = null }
|
|
63
88
|
}
|
|
89
|
+
|
|
90
|
+
// Resumable scheduler: persists its machine snapshot every transition under
|
|
91
|
+
// kind=cron key=scheduler. On boot resume.js rehydrates it and it continues
|
|
92
|
+
// ticking. Returns the persistent-actor handle.
|
|
93
|
+
let _persistentCron = null
|
|
94
|
+
export async function startPersistentScheduler({ callLLM = null, intervalMs = 30000 } = {}) {
|
|
95
|
+
if (_persistentCron) return _persistentCron
|
|
96
|
+
const machine = createCronMachine({ callLLM, intervalMs })
|
|
97
|
+
_persistentCron = await createPersistentActor(machine, { kind: 'cron', key: 'scheduler', input: {} })
|
|
98
|
+
return _persistentCron
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
export async function stopPersistentScheduler() {
|
|
102
|
+
if (!_persistentCron) return
|
|
103
|
+
try { _persistentCron.actor.send({ type: 'STOP' }) } catch {}
|
|
104
|
+
await _persistentCron.flush()
|
|
105
|
+
try { _persistentCron.actor.stop() } catch {}
|
|
106
|
+
_persistentCron = null
|
|
107
|
+
}
|
package/src/gateway/run.js
CHANGED
|
@@ -1,30 +1,68 @@
|
|
|
1
1
|
import { logger } from '../observability/log.js'
|
|
2
2
|
import { runTurn } from '../agent/machine.js'
|
|
3
|
+
import { createMachine, assign, fromPromise, createActor } from 'xstate'
|
|
4
|
+
import { persist, load, clear } from '../machines/snapshot-store.js'
|
|
5
|
+
import { randomUUID } from 'node:crypto'
|
|
3
6
|
|
|
4
7
|
const log = logger('gateway')
|
|
5
8
|
|
|
9
|
+
// Gateway lifecycle machine: stopped -> starting -> running -> stopping -> stopped.
|
|
10
|
+
// The running state's context tracks platform names; lifecycle is the resumable
|
|
11
|
+
// shape. In-flight inbound message processing is persisted separately (per-message
|
|
12
|
+
// snapshot under kind=gateway-msg) so a refresh re-drives messages whose reply was
|
|
13
|
+
// never sent.
|
|
14
|
+
export function createGatewayMachine({ platformNames = [] } = {}) {
|
|
15
|
+
return createMachine({
|
|
16
|
+
id: 'freddie-gateway',
|
|
17
|
+
initial: 'stopped',
|
|
18
|
+
context: ({ input }) => ({ platformNames: input?.platformNames || platformNames }),
|
|
19
|
+
states: {
|
|
20
|
+
stopped: { on: { START: 'starting' } },
|
|
21
|
+
starting: { on: { STARTED: 'running', FAIL: 'stopped' } },
|
|
22
|
+
running: { on: { STOP: 'stopping' } },
|
|
23
|
+
stopping: { on: { STOPPED: 'stopped' } },
|
|
24
|
+
},
|
|
25
|
+
})
|
|
26
|
+
}
|
|
27
|
+
|
|
6
28
|
export class Gateway {
|
|
7
29
|
constructor({ platforms = {}, callLLM = null } = {}) {
|
|
8
30
|
this.platforms = new Map()
|
|
9
31
|
this.callLLM = callLLM
|
|
10
32
|
this.hooks = { inbound: [], outbound: [] }
|
|
11
33
|
for (const [name, adapter] of Object.entries(platforms)) this.register(name, adapter)
|
|
34
|
+
this.machine = createGatewayMachine({ platformNames: [...this.platforms.keys()] })
|
|
35
|
+
this.actor = createActor(this.machine, { input: { platformNames: [...this.platforms.keys()] } })
|
|
36
|
+
// Persist lifecycle transitions so the gateway's state is observable +
|
|
37
|
+
// resumable; an active snapshot on boot means the gateway was running.
|
|
38
|
+
this.actor.subscribe((snap) => { persist('gateway', 'lifecycle', this.actor.getPersistedSnapshot()).catch(() => {}) })
|
|
39
|
+
this.actor.start()
|
|
12
40
|
}
|
|
41
|
+
get state() { return this.actor.getSnapshot().value }
|
|
13
42
|
register(name, adapter) {
|
|
14
43
|
this.platforms.set(name, adapter)
|
|
15
44
|
adapter.on?.('message', (m) => this.handleInbound(name, m))
|
|
16
45
|
}
|
|
17
46
|
addHook(stage, fn) { this.hooks[stage].push(fn) }
|
|
18
47
|
async start() {
|
|
48
|
+
this.actor.send({ type: 'START' })
|
|
19
49
|
for (const a of this.platforms.values()) await a.start?.()
|
|
50
|
+
this.actor.send({ type: 'STARTED' })
|
|
20
51
|
log.info('gateway started', { platforms: [...this.platforms.keys()] })
|
|
21
52
|
}
|
|
22
53
|
async stop() {
|
|
54
|
+
this.actor.send({ type: 'STOP' })
|
|
23
55
|
for (const a of this.platforms.values()) await a.stop?.()
|
|
56
|
+
this.actor.send({ type: 'STOPPED' })
|
|
24
57
|
log.info('gateway stopped')
|
|
25
58
|
}
|
|
26
59
|
async handleInbound(platform, msg) {
|
|
27
60
|
log.info('inbound', { platform, from: msg.from, len: (msg.text || '').length })
|
|
61
|
+
// Persist the in-flight message under a stable key derived from platform +
|
|
62
|
+
// sender + content so a refresh mid-processing re-drives it instead of
|
|
63
|
+
// dropping it. The snapshot is cleared once the reply is sent.
|
|
64
|
+
const msgKey = msg.id || `${platform}:${msg.from}:${randomUUID()}`
|
|
65
|
+
await persist('gateway-msg', msgKey, { status: 'active', value: 'processing', context: { platform, from: msg.from, text: msg.text } })
|
|
28
66
|
let cur = { ...msg, platform }
|
|
29
67
|
for (const h of this.hooks.inbound) cur = (await h(cur)) || cur
|
|
30
68
|
const result = await runTurn({ prompt: cur.text || '', callLLM: this.callLLM })
|
|
@@ -32,6 +70,7 @@ export class Gateway {
|
|
|
32
70
|
for (const h of this.hooks.outbound) reply = (await h(reply)) || reply
|
|
33
71
|
const adapter = this.platforms.get(platform)
|
|
34
72
|
await adapter.send?.(reply)
|
|
73
|
+
await clear('gateway-msg', msgKey)
|
|
35
74
|
return reply
|
|
36
75
|
}
|
|
37
76
|
}
|
package/src/host/host_helpers.js
CHANGED
|
@@ -134,7 +134,13 @@ export function makeCcLoaders(ccHost, env) {
|
|
|
134
134
|
}
|
|
135
135
|
return ccHost.plugins().length
|
|
136
136
|
}
|
|
137
|
+
// gm-cc must never be auto-discovered as a cc-plugin: it ships the 24
|
|
138
|
+
// deprecated gm-* platform-variant skills under a manifest named 'gm', and the
|
|
139
|
+
// single canonical gm-skill is registered by plugins/gm-skill instead. The
|
|
140
|
+
// package extracts into node_modules under both 'gm-cc' and pnpm/bun temp dirs
|
|
141
|
+
// like '.gm-cc-<hash>', so exclude by basename prefix, not exact match.
|
|
137
142
|
// Exact directory names that must never be loaded as a cc-plugin.
const CC_EXCLUDE = new Set(['gm-cc'])
// 'gm-cc' itself, optionally dot-prefixed, and any '-suffix' variant such as
// a package-manager temp dir named '.gm-cc-<hash>'.
const GM_CC_BASENAME = /^\.?gm-cc(-|$)/
// True when a node_modules directory basename is excluded from auto-discovery.
const isExcludedCc = (base) => {
  if (CC_EXCLUDE.has(base)) return true
  return GM_CC_BASENAME.test(base)
}
|
|
138
144
|
async function loadCcFromNodeModules(startDir) {
|
|
139
145
|
const seen = new Set(ccHost.plugins().map(p => p.root))
|
|
140
146
|
let cur = path.resolve(startDir)
|
|
@@ -146,7 +152,7 @@ export function makeCcLoaders(ccHost, env) {
|
|
|
146
152
|
? fs.readdirSync(path.join(nm, entry.name), { withFileTypes: true }).filter(e => e.isDirectory()).map(e => path.join(nm, entry.name, e.name))
|
|
147
153
|
: [path.join(nm, entry.name)]
|
|
148
154
|
for (const d of dirs) {
|
|
149
|
-
if (seen.has(d) || !isCcPluginDir(d) ||
|
|
155
|
+
if (seen.has(d) || !isCcPluginDir(d) || isExcludedCc(path.basename(d))) continue
|
|
150
156
|
seen.add(d); await useCcDir(d)
|
|
151
157
|
}
|
|
152
158
|
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
// Persistent xstate actor wrapper.
|
|
2
|
+
//
|
|
3
|
+
// createPersistentActor rehydrates an actor from its last persisted snapshot
|
|
4
|
+
// (if any), auto-persists on every transition, and clears its snapshot the
|
|
5
|
+
// moment it reaches a final/stopped state. This is the single primitive every
|
|
6
|
+
// long-lived freddie subsystem uses to become resumable across a process refresh.
|
|
7
|
+
import { createActor } from 'xstate'
|
|
8
|
+
import { persist, load, clear } from './snapshot-store.js'
|
|
9
|
+
import { logger } from '../observability/log.js'
|
|
10
|
+
|
|
11
|
+
const log = logger('persistent-actor')
|
|
12
|
+
|
|
13
|
+
/**
 * Create and start an xstate actor whose snapshot is persisted under
 * (kind, key).
 *
 * When load() returns a snapshot for (kind, key) the actor is created from it;
 * otherwise it is created fresh with `input`. Every snapshot the actor emits
 * is queued for persistence: while the status is 'active' the persisted
 * snapshot is upserted, for any other status the stored row is cleared.
 *
 * @param machine an xstate machine.
 * @param options.kind snapshot identity, required.
 * @param options.key snapshot identity, required.
 * @param options.input actor input, used only when no snapshot was loaded.
 * @param options.onTransition optional callback invoked with each emitted
 *   snapshot after its persistence attempt, whether or not that attempt
 *   succeeded.
 * @returns {Promise<{actor, resumed: boolean, flush: Function, forget: Function}>}
 * @throws {Error} when kind or key is missing.
 */
export async function createPersistentActor(machine, { kind, key, input, onTransition } = {}) {
  if (!kind || !key) throw new Error('createPersistentActor requires kind and key')
  const machineId = machine?.id || machine?.config?.id || null
  const snapshot = await load(kind, key, { machineId })
  const resumed = !!snapshot
  const actor = snapshot
    ? createActor(machine, { snapshot })
    : createActor(machine, { input })

  let persisting = Promise.resolve()
  const sub = actor.subscribe((snap) => {
    // Chain every write onto the previous one so rapid consecutive transitions
    // land in order; the store upsert is keyed by (kind, key).
    persisting = persisting.then(async () => {
      try {
        if (snap.status === 'active') {
          await persist(kind, key, actor.getPersistedSnapshot(), { machineId })
        } else {
          // Final/stopped: clear so a completed actor never resurrects on boot.
          await clear(kind, key)
        }
      } catch (e) {
        log.error('persist failed', { kind, key, err: String(e) })
      }
      // Kept out of the persistence try-block: a storage failure must not
      // swallow the callback, and a throwing callback must not be reported as
      // a storage failure (nor break the persist chain).
      try {
        onTransition?.(snap)
      } catch (e) {
        log.error('onTransition failed', { kind, key, err: String(e) })
      }
    })
  })

  if (resumed) log.info('actor resumed from snapshot', { kind, key, machineId })

  actor.start()
  return {
    actor,
    resumed,
    // Await all in-flight persists then unsubscribe — call before stopping so
    // the final snapshot state is durable.
    async flush() { await persisting; try { sub.unsubscribe() } catch {} },
    // Clear this actor's snapshot explicitly (e.g. on external cancel).
    async forget() { await persisting; try { sub.unsubscribe() } catch {}; await clear(kind, key) },
  }
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
// Boot-time resumability driver.
|
|
2
|
+
//
|
|
3
|
+
// resumeAll() is invoked on process boot (CLI + dashboard server start). It walks
|
|
4
|
+
// every non-final persisted machine snapshot and rehydrates the ones that can be
|
|
5
|
+
// driven to completion headlessly — interrupted agent turns and unfinished
|
|
6
|
+
// batches. Lifecycle snapshots (gateway/acp) and in-flight message markers are
|
|
7
|
+
// reported but not auto-driven, since they require their live host process (open
|
|
8
|
+
// sockets / stdio) to resume meaningfully; resumeAll() surfaces them so the host
|
|
9
|
+
// can decide.
|
|
10
|
+
import { list, sweepDone } from './snapshot-store.js'
|
|
11
|
+
import { logger } from '../observability/log.js'
|
|
12
|
+
|
|
13
|
+
const log = logger('resume')
|
|
14
|
+
|
|
15
|
+
/**
 * Walk every 'active' persisted snapshot and dispatch it.
 *
 * Non-active rows are swept first. Agent rows (when driveAgents) and batch
 * rows (when driveBatches) are handed to resumeTurn / resumeBatch without
 * awaiting their completion; every other row is recorded under `surfaced`.
 *
 * @param options.driveAgents resume 'agent' rows (default true).
 * @param options.driveBatches resume 'batch' rows (default true).
 * @returns per-kind counts plus `resumed` and `surfaced` lists.
 */
export async function resumeAll({ driveAgents = true, driveBatches = true } = {}) {
  // Drop any final snapshots first so they never resurrect.
  await sweepDone()
  const active = await list({ status: 'active' })
  const summary = { agent: 0, batch: 0, cron: 0, gateway: 0, acp: 0, 'gateway-msg': 0, 'acp-prompt': 0, resumed: [], surfaced: [] }

  for (const { kind, key, status } of active) {
    summary[kind] = (summary[kind] || 0) + 1
    const asAgent = kind === 'agent' && driveAgents
    const asBatch = !asAgent && kind === 'batch' && driveBatches
    try {
      if (asAgent) {
        const { resumeTurn } = await import('../agent/machine.js')
        // Fire and forget: completion and failure are only logged.
        resumeTurn({ sessionKey: key })
          .then(() => log.info('agent turn resumed to completion', { key }))
          .catch(e => log.error('agent resume failed', { key, err: String(e) }))
        summary.resumed.push({ kind: 'agent', key })
        continue
      }
      if (asBatch) {
        const { resumeBatch } = await import('../batch.js')
        resumeBatch({ batchId: key })
          .then(() => log.info('batch resumed to completion', { key }))
          .catch(e => log.error('batch resume failed', { key, err: String(e) }))
        summary.resumed.push({ kind: 'batch', key })
        continue
      }
      // Lifecycle + in-flight markers: surfaced for the host to act on.
      summary.surfaced.push({ kind, key, status })
    } catch (e) {
      log.error('resume dispatch failed', { kind, key, err: String(e) })
    }
  }

  log.info('resumeAll complete', { active: active.length, resumed: summary.resumed.length, surfaced: summary.surfaced.length })
  return summary
}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
// Durable xstate snapshot store — the resumability backbone.
|
|
2
|
+
//
|
|
3
|
+
// Every long-lived freddie machine (agent turn, cron scheduler, batch runner,
|
|
4
|
+
// gateway, acp) persists its xstate snapshot here on every transition. After a
|
|
5
|
+
// refresh/restart, resume.js rehydrates each non-final snapshot into a fresh
|
|
6
|
+
// actor via createActor(machine, { snapshot }), so the process picks up exactly
|
|
7
|
+
// where it left off.
|
|
8
|
+
//
|
|
9
|
+
// Storage is libsql (shared sessions.db) keyed by (kind, key). Last-write-wins
|
|
10
|
+
// upsert: rapid consecutive transitions only keep the latest snapshot.
|
|
11
|
+
import { db } from '../db.js'
|
|
12
|
+
import { logger } from '../observability/log.js'
|
|
13
|
+
|
|
14
|
+
const log = logger('snapshot-store')
|
|
15
|
+
|
|
16
|
+
// Bump when the persisted-snapshot encoding or any machine definition changes
|
|
17
|
+
// shape incompatibly. load() discards rows whose schema_version mismatches so a
|
|
18
|
+
// stale snapshot from older code never crashes resume.
|
|
19
|
+
export const SNAPSHOT_SCHEMA_VERSION = 1
|
|
20
|
+
|
|
21
|
+
// Flipped to true after a CREATE TABLE has completed in this process.
let _inited = false

/**
 * Resolve the handle returned by db(), ensuring the machine_snapshots table
 * exists. Once a CREATE has completed, later calls return the handle directly.
 */
async function init() {
  const conn = await db()
  if (_inited) return conn
  await conn.exec(`CREATE TABLE IF NOT EXISTS machine_snapshots (
    kind TEXT NOT NULL,
    key TEXT NOT NULL,
    schema_version INTEGER NOT NULL,
    machine_id TEXT,
    snapshot_json TEXT NOT NULL,
    status TEXT NOT NULL,
    updated INTEGER NOT NULL,
    PRIMARY KEY (kind, key)
  )`)
  _inited = true
  return conn
}
|
|
39
|
+
|
|
40
|
+
/**
 * Persist (upsert) a snapshot under (kind, key).
 *
 * The row's status column is taken from snapshot.status and falls back to
 * 'active' when that is missing. machineId is stored alongside the snapshot so
 * load() can refuse to rehydrate it into a different machine.
 *
 * @param kind snapshot identity, required.
 * @param key snapshot identity, required.
 * @param snapshot JSON-serialisable snapshot; must not be null or undefined.
 * @param options.machineId id of the machine that produced the snapshot.
 * @returns {Promise<{kind, key, status}>}
 * @throws {Error} when kind, key or snapshot is missing.
 */
export async function persist(kind, key, snapshot, { machineId = null } = {}) {
  if (!kind || !key) throw new Error('persist requires kind and key')
  // JSON.stringify(undefined) is undefined, which cannot satisfy the NOT NULL
  // snapshot_json column. null would be stored as the text 'null' with status
  // 'active', yet load() would parse it back to null, i.e. "no snapshot".
  // Reject both up front instead of writing an unusable row.
  if (snapshot == null) throw new Error('persist requires a snapshot')
  const d = await init()
  const status = snapshot.status || 'active'
  const json = JSON.stringify(snapshot)
  await d.prepare(`INSERT INTO machine_snapshots (kind, key, schema_version, machine_id, snapshot_json, status, updated)
    VALUES (?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(kind, key) DO UPDATE SET
      schema_version = excluded.schema_version,
      machine_id = excluded.machine_id,
      snapshot_json = excluded.snapshot_json,
      status = excluded.status,
      updated = excluded.updated`)
    .run(kind, key, SNAPSHOT_SCHEMA_VERSION, machineId, json, status, Date.now())
  return { kind, key, status }
}
|
|
59
|
+
|
|
60
|
+
/**
 * Fetch the stored snapshot for (kind, key), or null when none is usable.
 *
 * Returns null for a missing row. A row is deleted via clear() and null is
 * returned when its schema_version differs from SNAPSHOT_SCHEMA_VERSION, when
 * both the caller and the row carry a machine id and they differ, or when
 * snapshot_json does not parse.
 */
export async function load(kind, key, { machineId = null } = {}) {
  const d = await init()
  const row = await d.prepare(`SELECT * FROM machine_snapshots WHERE kind = ? AND key = ?`).get(kind, key)
  if (!row) return null

  // Common tail of every stale/corrupt outcome: delete the row, yield null.
  const discard = async () => {
    await clear(kind, key)
    return null
  }

  const storedVersion = row.schema_version
  if (Number(storedVersion) !== SNAPSHOT_SCHEMA_VERSION) {
    log.info('discarding stale snapshot (schema mismatch)', { kind, key, had: storedVersion, want: SNAPSHOT_SCHEMA_VERSION })
    return discard()
  }

  // Only compared when both sides actually recorded a machine id.
  const storedMachineId = row.machine_id
  if (machineId && storedMachineId && storedMachineId !== machineId) {
    log.info('discarding stale snapshot (machine id mismatch)', { kind, key, had: storedMachineId, want: machineId })
    return discard()
  }

  let parsed
  try {
    parsed = JSON.parse(row.snapshot_json)
  } catch (e) {
    log.error('unparseable snapshot, discarding', { kind, key, err: String(e) })
    return discard()
  }
  return parsed
}
|
|
85
|
+
|
|
86
|
+
/** Delete the stored snapshot row for (kind, key), if one exists. */
export async function clear(kind, key) {
  const d = await init()
  const removal = d.prepare(`DELETE FROM machine_snapshots WHERE kind = ? AND key = ?`)
  await removal.run(kind, key)
}
|
|
90
|
+
|
|
91
|
+
/**
 * List snapshot rows (metadata columns only, no snapshot_json), newest first.
 *
 * @param options.kind restrict to one kind; falsy means every kind.
 * @param options.status restrict to one status; defaults to 'active', pass
 *   null for all statuses.
 */
export async function list({ kind = null, status = 'active' } = {}) {
  const d = await init()
  // Keep only the filters the caller actually supplied a truthy value for.
  const filters = [['kind = ?', kind], ['status = ?', status]].filter(([, value]) => value)
  const clauses = filters.map(([clause]) => clause)
  const args = filters.map(([, value]) => value)

  let sql = `SELECT kind, key, schema_version, machine_id, status, updated FROM machine_snapshots`
  if (clauses.length) sql += ' WHERE ' + clauses.join(' AND ')
  sql += ' ORDER BY updated DESC'
  return await d.prepare(sql).all(...args)
}
|
|
103
|
+
|
|
104
|
+
/**
 * Delete every snapshot row whose status is not 'active', so a completed
 * actor cannot resurrect on the next boot.
 *
 * @returns {Promise<{removed}>} the `changes` value reported by the DELETE.
 */
export async function sweepDone() {
  const d = await init()
  const { changes } = await d.prepare(`DELETE FROM machine_snapshots WHERE status != 'active'`).run()
  return { removed: changes }
}
|
package/src/web/server.js
CHANGED
|
@@ -7,6 +7,9 @@ const __dirname = path.dirname(fileURLToPath(import.meta.url))
|
|
|
7
7
|
|
|
8
8
|
export async function createDashboard({ port = 0 } = {}) {
|
|
9
9
|
const host = await bootHost()
|
|
10
|
+
// Rehydrate any interrupted machines (agent turns, batches) from their
|
|
11
|
+
// persisted snapshots; surface lifecycle markers. Non-blocking on failure.
|
|
12
|
+
try { const { resumeAll } = await import('../machines/resume.js'); await resumeAll() } catch (_) {}
|
|
10
13
|
const app = express()
|
|
11
14
|
app.use(express.json())
|
|
12
15
|
app.use(express.static(__dirname))
|