@geekbeer/minion 4.4.0 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,14 +8,37 @@
8
8
  *
9
9
  * Each call spins up a fresh chromium instance. Pooling can come later
10
10
  * once the API stabilizes — for the experimental MVP, simple is better.
11
+ *
12
+ * Wait strategy (SPA-aware, v4.7.0):
13
+ * `page.goto` resolves on `domcontentloaded`, which for a client-rendered
14
+ * SPA fires *before* the framework has mounted and fetched its data. So
15
+ * after navigation we additionally wait for the content to actually appear:
16
+ * 1. If a `readySelector` is known (from the recipe), wait for it.
17
+ * 2. Otherwise wait for the DOM to *settle* — i.e. no MutationObserver
18
+ * events for `settleMs` — which is self-calibrating and works whether
19
+ * the page renders 50 chars or 50,000.
20
+ * `scroll` (optional) then drives infinite-scroll / lazy-load pages up to
21
+ * caller-declared limits, with hard server-side caps.
11
22
  */
12
23
 
13
24
  const DEFAULT_NAV_TIMEOUT_MS = 20_000
14
25
  const DEFAULT_EVAL_TIMEOUT_MS = 5_000
26
+ const DEFAULT_READY_TIMEOUT_MS = 8_000
27
+ const DEFAULT_SETTLE_MS = 500
28
+ const DEFAULT_SETTLE_MAX_MS = 8_000
15
29
  const DEFAULT_USER_AGENT =
16
30
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' +
17
31
  'Chrome/124.0.0.0 Safari/537.36 MinionWebExtract/0.1'
18
32
 
33
+ // Scroll defaults (caller may override) and hard caps (server enforces).
34
+ const SCROLL_DEFAULT_MAX_SCROLLS = 10
35
+ const SCROLL_HARD_MAX_SCROLLS = 50
36
+ const SCROLL_DEFAULT_MAX_MS = 15_000
37
+ const SCROLL_HARD_MAX_MS = 45_000
38
+ const SCROLL_DEFAULT_SETTLE_MS = 600
39
+ const SCROLL_STABLE_ROUNDS = 2
40
+ const SCROLL_STRATEGIES = ['count', 'untilStable', 'fixed']
41
+
19
42
  function loadChromium() {
20
43
  let playwright
21
44
  try {
@@ -49,12 +72,172 @@ async function withPage(fn, opts = {}) {
49
72
  }
50
73
  }
51
74
 
/**
 * Wait for the DOM to stop mutating.
 *
 * Installs a MutationObserver inside the page and resolves once no
 * childList/characterData mutation has fired for `settleMs`, or once
 * `settleMaxMs` has elapsed — whichever comes first. The observer is scoped to
 * the first `main, article, [role="main"]` element when one exists (falling
 * back to `document.body`) so ever-churning headers/ads/beacons don't keep the
 * wait alive.
 *
 * The hard cap is enforced twice: by a timer inside the page, and by a timer
 * on the Node side. The in-page timer alone cannot bound the wait when the
 * page's own event loop is stalled, because `page.evaluate` then never
 * settles; the Node-side timer guarantees the caller always gets control back.
 *
 * Best-effort: this function never rejects. Any failure (e.g. the execution
 * context is destroyed mid-navigation) simply ends the wait.
 *
 * @param {object} page Playwright-style page exposing `evaluate(fn, arg)`.
 * @param {{ settleMs?: number, settleMaxMs?: number }} [opts]
 *   `settleMs` is the required quiet period, `settleMaxMs` the hard cap.
 * @returns {Promise<void>}
 */
async function waitForSettle(page, opts = {}) {
  const settleMs = opts.settleMs ?? DEFAULT_SETTLE_MS
  const settleMaxMs = opts.settleMaxMs ?? DEFAULT_SETTLE_MAX_MS
  // Slack granted to the evaluate round trip before the Node side gives up.
  const nodeCapGraceMs = 250
  let nodeCapTimer
  try {
    const inPage = Promise.resolve(
      page.evaluate(({ quiet, max }) => new Promise(resolve => {
        const target = document.querySelector('main, article, [role="main"]') || document.body
        if (!target) { resolve(); return }
        let quietTimer = setTimeout(finish, quiet)
        const hardCap = setTimeout(finish, max)
        const obs = new MutationObserver(() => {
          // Any mutation restarts the quiet window.
          clearTimeout(quietTimer)
          quietTimer = setTimeout(finish, quiet)
        })
        obs.observe(target, { childList: true, subtree: true, characterData: true })
        function finish() {
          clearTimeout(quietTimer)
          clearTimeout(hardCap)
          obs.disconnect()
          resolve()
        }
      }), { quiet: settleMs, max: settleMaxMs })
    )
    // If the Node-side cap wins the race, the abandoned evaluate may still
    // reject later (page closed); keep that from becoming an unhandled rejection.
    inPage.catch(() => {})

    const nodeCap = new Promise(resolve => {
      nodeCapTimer = setTimeout(resolve, Math.max(0, Number(settleMaxMs) || 0) + nodeCapGraceMs)
    })
    await Promise.race([inPage, nodeCap])
  } catch {
    // navigation/teardown raced us — caller proceeds with whatever rendered.
  } finally {
    // Don't leave a pending timer holding the event loop open.
    clearTimeout(nodeCapTimer)
  }
}
106
+
/**
 * Block until the page's meaningful content is present.
 *
 * When `opts.readySelector` is set, wait for that element to become visible
 * (bounded by `opts.readyTimeoutMs`, default `DEFAULT_READY_TIMEOUT_MS`). If
 * it shows up we are done. If no selector was given, or waiting for it failed
 * for any reason, fall back to DOM-settle detection so the caller still gets
 * whatever did render rather than the bare shell.
 *
 * @param {object} page Playwright-style page.
 * @param {object} [opts] Also forwarded unchanged to `waitForSettle`.
 * @returns {Promise<void>}
 */
async function waitForReady(page, opts = {}) {
  const selector = opts.readySelector
  let selectorAppeared = false

  if (selector) {
    try {
      const waitOptions = {
        state: 'visible',
        timeout: opts.readyTimeoutMs ?? DEFAULT_READY_TIMEOUT_MS,
      }
      await page.waitForSelector(selector, waitOptions)
      selectorAppeared = true
    } catch {
      // Selector never became visible — use the settle fallback below.
      selectorAppeared = false
    }
  }

  if (selectorAppeared) return
  await waitForSettle(page, opts)
}
126
+
/**
 * Normalize a caller-supplied scroll request into a clamped config, or return
 * null when no (valid) scroll was requested.
 *
 * `itemSelector` resolution order: the explicit `scroll.itemSelector` wins;
 * otherwise the first recipe selector flagged `multiple` is used; otherwise it
 * is null (the scroll loop then measures page height instead of item count).
 *
 * Every numeric field is floored to an integer and clamped to its range;
 * anything that is not a finite number falls back to the default.
 *
 * For `strategy: 'fixed'` the default for `maxScrolls` follows `times`, so
 * asking for e.g. 30 fixed scrolls is not cut down to the generic default of
 * `SCROLL_DEFAULT_MAX_SCROLLS` just because `maxScrolls` was left unset. An
 * explicit `maxScrolls` and the hard cap still apply.
 *
 * @param {object|null|undefined} scroll Raw request:
 *   `{ strategy, itemSelector?, targetItems?, maxScrolls?, maxMs?, settleMs?, times? }`.
 * @param {object} [selectors] Recipe selector map `{ field: { selector, multiple? } }`.
 * @returns {{ strategy: string, itemSelector: string|null, targetItems: number,
 *   maxScrolls: number, maxMs: number, settleMs: number, times: number }|null}
 */
function normalizeScroll(scroll, selectors) {
  if (!scroll || typeof scroll !== 'object') return null
  if (!SCROLL_STRATEGIES.includes(scroll.strategy)) return null

  let itemSelector =
    typeof scroll.itemSelector === 'string' && scroll.itemSelector.trim()
      ? scroll.itemSelector.trim()
      : null
  if (!itemSelector && selectors && typeof selectors === 'object') {
    // Fall back to the recipe's first list-valued selector.
    for (const spec of Object.values(selectors)) {
      if (spec && spec.multiple && typeof spec.selector === 'string' && spec.selector.trim()) {
        itemSelector = spec.selector.trim()
        break
      }
    }
  }

  const clampInt = (val, def, min, max) => {
    const n = Number.isFinite(val) ? Math.floor(val) : def
    return Math.max(min, Math.min(max, n))
  }

  const times = clampInt(scroll.times, SCROLL_DEFAULT_MAX_SCROLLS, 1, SCROLL_HARD_MAX_SCROLLS)
  // 'fixed' means "scroll exactly `times` times": don't let the generic
  // maxScrolls default undercut an explicit, already-capped `times`.
  const defaultMaxScrolls =
    scroll.strategy === 'fixed'
      ? Math.max(SCROLL_DEFAULT_MAX_SCROLLS, times)
      : SCROLL_DEFAULT_MAX_SCROLLS

  return {
    strategy: scroll.strategy,
    itemSelector,
    targetItems: clampInt(scroll.targetItems, 0, 0, 100_000),
    maxScrolls: clampInt(scroll.maxScrolls, defaultMaxScrolls, 1, SCROLL_HARD_MAX_SCROLLS),
    maxMs: clampInt(scroll.maxMs, SCROLL_DEFAULT_MAX_MS, 500, SCROLL_HARD_MAX_MS),
    settleMs: clampInt(scroll.settleMs, SCROLL_DEFAULT_SETTLE_MS, 100, 5_000),
    times,
  }
}
165
+
/**
 * Scroll an infinite-scroll / lazy-load page until a stop condition is met.
 *
 * Progress is measured as the number of elements matching `cfg.itemSelector`,
 * or as `document.body.scrollHeight` when no item selector is configured; a
 * failed measurement counts as 0. Before every scroll the stop conditions are
 * checked in this order:
 *   1. 'reachedTarget' — strategy 'count' with an item selector and a positive
 *      `targetItems`, and the last measurement met the target;
 *   2. 'fixedDone'     — strategy 'fixed' and `cfg.times` scrolls performed;
 *   3. 'maxScrolls'    — `cfg.maxScrolls` scrolls performed;
 *   4. 'maxMs'         — `cfg.maxMs` elapsed.
 * After each scroll the DOM is given time to settle; for non-'fixed'
 * strategies, `SCROLL_STABLE_ROUNDS` consecutive rounds without growth stop
 * the loop with 'stable'.
 *
 * @param {object} page Playwright-style page exposing `evaluate`.
 * @param {object} cfg Config in the shape produced by `normalizeScroll`.
 * @returns {Promise<{ scrolls: number, items: number|null,
 *   reachedTarget: boolean|null, stoppedReason: string }>}
 *   `items` is null without an item selector; `reachedTarget` is null unless a
 *   count target was in effect.
 */
async function scrollToLoad(page, cfg) {
  const startedAt = Date.now()
  const elapsed = () => Date.now() - startedAt
  const tracksTarget = cfg.strategy === 'count' && cfg.itemSelector && cfg.targetItems > 0

  const measure = async () => {
    try {
      return cfg.itemSelector
        ? await page.evaluate(sel => document.querySelectorAll(sel).length, cfg.itemSelector)
        : await page.evaluate(() => document.body.scrollHeight)
    } catch {
      return 0
    }
  }

  // Conditions evaluated before each scroll; null means "keep going".
  const preScrollStop = (seen, done) => {
    if (tracksTarget && seen >= cfg.targetItems) return 'reachedTarget'
    if (cfg.strategy === 'fixed' && done >= cfg.times) return 'fixedDone'
    if (done >= cfg.maxScrolls) return 'maxScrolls'
    if (elapsed() >= cfg.maxMs) return 'maxMs'
    return null
  }

  let previous = await measure()
  let scrolls = 0
  let flatRounds = 0
  let stoppedReason = preScrollStop(previous, scrolls)

  while (stoppedReason === null) {
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)).catch(() => {})
    scrolls += 1
    await waitForSettle(page, {
      settleMs: cfg.settleMs,
      settleMaxMs: Math.min(cfg.settleMs * 4, Math.max(cfg.maxMs - elapsed(), cfg.settleMs)),
    })

    const current = await measure()
    let settled = false
    if (cfg.strategy !== 'fixed') {
      // A round without growth counts toward "settled"; growth resets the count.
      flatRounds = current <= previous ? flatRounds + 1 : 0
      settled = flatRounds >= SCROLL_STABLE_ROUNDS
    }
    previous = current
    stoppedReason = settled ? 'stable' : preScrollStop(previous, scrolls)
  }

  const items = cfg.itemSelector ? await measure() : null
  const reachedTarget = tracksTarget ? items >= cfg.targetItems : null
  return {
    scrolls,
    items,
    reachedTarget,
    stoppedReason: stoppedReason || 'stable',
  }
}
233
+
52
234
  async function renderPage(url, opts = {}) {
53
235
  return withPage(async page => {
54
236
  const response = await page.goto(url, {
55
237
  waitUntil: 'domcontentloaded',
56
238
  timeout: opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS,
57
239
  })
240
+ await waitForReady(page, opts)
58
241
  const html = await page.content()
59
242
  return {
60
243
  html,
@@ -77,6 +260,11 @@ async function renderPage(url, opts = {}) {
77
260
  *
78
261
  * `attr` defaults to 'text' (innerText). Special value 'html' returns
79
262
  * innerHTML. Any other string is read as an HTML attribute.
263
+ *
264
+ * `opts.readySelector` waits for that element before extracting; `opts.scroll`
265
+ * (a config from `normalizeScroll`) drives lazy-load pages first.
266
+ *
267
+ * Returns `{ data, scrollInfo }` — scrollInfo is null when no scroll ran.
80
268
  */
81
269
  async function extractWithSelectors(url, selectors, opts = {}) {
82
270
  return withPage(async page => {
@@ -84,7 +272,14 @@ async function extractWithSelectors(url, selectors, opts = {}) {
84
272
  waitUntil: 'domcontentloaded',
85
273
  timeout: opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS,
86
274
  })
87
- return await page.evaluate(
275
+ await waitForReady(page, opts)
276
+
277
+ let scrollInfo = null
278
+ if (opts.scroll) {
279
+ scrollInfo = await scrollToLoad(page, opts.scroll)
280
+ }
281
+
282
+ const data = await page.evaluate(
88
283
  ({ selectorMap, evalTimeoutMs }) => {
89
284
  const start = Date.now()
90
285
  const result = {}
@@ -120,10 +315,13 @@ async function extractWithSelectors(url, selectors, opts = {}) {
120
315
  },
121
316
  { selectorMap: selectors, evalTimeoutMs: opts.evalTimeoutMs ?? DEFAULT_EVAL_TIMEOUT_MS },
122
317
  )
318
+
319
+ return { data, scrollInfo }
123
320
  }, opts)
124
321
  }
125
322
 
126
323
  module.exports = {
127
324
  renderPage,
128
325
  extractWithSelectors,
326
+ normalizeScroll,
129
327
  }
@@ -17,6 +17,7 @@
17
17
  * {
18
18
  * pageType: 'article' | 'listing' | 'product' | 'profile' | 'form' | 'other',
19
19
  * selectors: { fieldName: { selector, attr?, multiple? }, ... },
20
+ * readySelector: <css selector whose presence signals content is rendered>,
20
21
  * extracted: { fieldName: <value already pulled from this page> }
21
22
  * }
22
23
  */
@@ -59,6 +60,15 @@ const ANTHROPIC_TOOLS = [{
59
60
  },
60
61
  },
61
62
  },
63
+ ready_selector: {
64
+ type: 'string',
65
+ description:
66
+ 'A single CSS selector for an element that exists ONLY once the primary ' +
67
+ 'content has rendered (e.g. the article body, the first list item, a price). ' +
68
+ 'On later visits the extractor waits for this element before reading the page, ' +
69
+ 'so client-rendered (SPA) pages are captured after hydration rather than as an ' +
70
+ 'empty shell. Pick a stable, semantic element; avoid spinners/skeletons.',
71
+ },
62
72
  extracted: {
63
73
  type: 'object',
64
74
  description:
@@ -77,7 +87,10 @@ Given a cleaned Markdown rendering of one page, you must:
77
87
  - Prefer semantic selectors (article, h1, time[datetime], a[rel="author"]) over class names where possible.
78
88
  - Use class-based selectors only when semantic ones are unavailable.
79
89
  - Avoid fragile attribute selectors like data-react-* or auto-generated hashes.
80
- 3. Fill the "extracted" object with the values pulled from this exact page so the caller can verify your recipe works.
90
+ 3. Pick a "ready_selector": one CSS selector for an element that only exists once the
91
+ primary content has rendered (the article body, the first list item, the price, etc.).
92
+ Prefer a stable semantic element; never pick a loading spinner or skeleton placeholder.
93
+ 4. Fill the "extracted" object with the values pulled from this exact page so the caller can verify your recipe works.
81
94
 
82
95
  The same recipe will be reused for structurally similar pages, so think about what generalizes.`
83
96
 
@@ -132,6 +145,7 @@ async function generateViaPlugin(plugin, { url, cleanedMarkdown, hint }) {
132
145
  return {
133
146
  pageType: json.page_type || 'other',
134
147
  selectors: json.selectors || {},
148
+ readySelector: typeof json.ready_selector === 'string' ? json.ready_selector : null,
135
149
  extracted: json.extracted || {},
136
150
  source: `primary:${plugin.name}`,
137
151
  }
@@ -175,10 +189,11 @@ async function generateViaAnthropicDirect({ url, cleanedMarkdown, hint }) {
175
189
  throw new Error('Anthropic API returned no tool_use block for page_extraction')
176
190
  }
177
191
 
178
- const { page_type, selectors, extracted } = toolUse.input
192
+ const { page_type, selectors, ready_selector, extracted } = toolUse.input
179
193
  return {
180
194
  pageType: page_type || 'other',
181
195
  selectors: selectors || {},
196
+ readySelector: typeof ready_selector === 'string' ? ready_selector : null,
182
197
  extracted: extracted || {},
183
198
  source: 'anthropic-direct',
184
199
  }
@@ -199,12 +214,14 @@ function buildTextPrompt({ url, cleanedMarkdown, hint }) {
199
214
  ' "selectors": {',
200
215
  ' "<fieldName>": { "selector": "<css>", "attr"?: "<text|html|attribute-name>", "multiple"?: <boolean> }',
201
216
  ' },',
217
+ ' "ready_selector": "<css selector that only exists once the main content has rendered>",',
202
218
  ' "extracted": { "<fieldName>": "<string or array of strings>" }',
203
219
  '}',
204
220
  '',
205
221
  'Notes:',
206
222
  '- attr defaults to "text" (innerText). Use "html" or an HTML attribute name to override.',
207
223
  '- Set multiple=true for list fields (returns array).',
224
+ '- "ready_selector" should target a stable content element (article body, first list item, price); never a spinner/skeleton.',
208
225
  '- "extracted" must contain the values you actually read from THIS page using those selectors.',
209
226
  '',
210
227
  '--- Cleaned Markdown ---',
@@ -108,6 +108,29 @@ function variableRoutes(fastify, _opts, done) {
108
108
  return { success: true, scopes: variableStore.listVariableScopes() }
109
109
  })
110
110
 
111
+ // Move a variable from one workspace scope to another. Body:
112
+ // { from_workspace_id, to_workspace_id } — omit/empty string targets the
113
+ // minion-wide bucket. The destination is never overwritten on conflict.
114
+ fastify.post('/api/variables/:key/move', async (request, reply) => {
115
+ if (!verifyToken(request)) {
116
+ return reply.code(401).send({ error: 'Unauthorized' })
117
+ }
118
+ const { key } = request.params
119
+ const from = typeof request.body?.from_workspace_id === 'string' ? request.body.from_workspace_id : ''
120
+ const to = typeof request.body?.to_workspace_id === 'string' ? request.body.to_workspace_id : ''
121
+ const result = variableStore.moveVariable(from, to, key)
122
+ if (result.status === 'same_scope') {
123
+ return reply.code(400).send({ error: 'Source and destination scopes are the same.' })
124
+ }
125
+ if (result.status === 'not_found') {
126
+ return reply.code(404).send({ error: `Variable not found: ${key}` })
127
+ }
128
+ if (result.status === 'conflict') {
129
+ return reply.code(409).send({ error: `A variable with the same key already exists in the destination scope: ${key}` })
130
+ }
131
+ return { success: true, key, from_workspace_id: result.from, to_workspace_id: result.to }
132
+ })
133
+
111
134
  // ─── Secrets (sensitive, workspace-scoped) ────────────────────────────
112
135
  //
113
136
  // Secrets are scoped per workspace. Pass ?workspace_id=<uuid> to target a
@@ -118,11 +141,6 @@ function variableRoutes(fastify, _opts, done) {
118
141
  // Values are never returned via the API by design — only key names. Secrets
119
142
  // never leave the minion: the HQ proxy is a pure pass-through.
120
143
 
121
- function readWorkspaceId(request) {
122
- const rawWs = request.query?.workspace_id
123
- return (typeof rawWs === 'string') ? rawWs : ''
124
- }
125
-
126
144
  fastify.get('/api/secrets', async (request, reply) => {
127
145
  if (!verifyToken(request)) {
128
146
  return reply.code(401).send({ error: 'Unauthorized' })
@@ -175,6 +193,30 @@ function variableRoutes(fastify, _opts, done) {
175
193
  return { success: true, scopes: variableStore.listSecretScopes() }
176
194
  })
177
195
 
196
+ // Move a secret from one workspace scope to another. Body:
197
+ // { from_workspace_id, to_workspace_id } — omit/empty string targets the
198
+ // minion-wide bucket. The value is moved within the minion (it never leaves);
199
+ // the destination is never overwritten on conflict.
200
+ fastify.post('/api/secrets/:key/move', async (request, reply) => {
201
+ if (!verifyToken(request)) {
202
+ return reply.code(401).send({ error: 'Unauthorized' })
203
+ }
204
+ const { key } = request.params
205
+ const from = typeof request.body?.from_workspace_id === 'string' ? request.body.from_workspace_id : ''
206
+ const to = typeof request.body?.to_workspace_id === 'string' ? request.body.to_workspace_id : ''
207
+ const result = variableStore.moveSecret(from, to, key)
208
+ if (result.status === 'same_scope') {
209
+ return reply.code(400).send({ error: 'Source and destination scopes are the same.' })
210
+ }
211
+ if (result.status === 'not_found') {
212
+ return reply.code(404).send({ error: `Secret not found: ${key}` })
213
+ }
214
+ if (result.status === 'conflict') {
215
+ return reply.code(409).send({ error: `A secret with the same key already exists in the destination scope: ${key}` })
216
+ }
217
+ return { success: true, key, from_workspace_id: result.from, to_workspace_id: result.to }
218
+ })
219
+
178
220
  done()
179
221
  }
180
222
 
@@ -19,6 +19,9 @@ const { extract } = require('../lib/web-extract')
19
19
  const pageRecipeStore = require('../stores/page-recipe-store')
20
20
 
21
21
  const REQUEST_TIMEOUT_MS = 60_000
22
+ // Scrolling adds an extra navigation plus an in-page scroll loop, so give
23
+ // scroll-enabled requests a wider ceiling than the plain extract path.
24
+ const SCROLL_REQUEST_TIMEOUT_MS = 120_000
22
25
 
23
26
  async function webRoutes(fastify) {
24
27
  fastify.post('/api/web/extract', async (request, reply) => {
@@ -28,7 +31,7 @@ async function webRoutes(fastify) {
28
31
  }
29
32
 
30
33
  const body = request.body || {}
31
- const { url, hint } = body
34
+ const { url, hint, scroll } = body
32
35
 
33
36
  if (!url || typeof url !== 'string') {
34
37
  reply.code(400)
@@ -40,11 +43,17 @@ async function webRoutes(fastify) {
40
43
  reply.code(400)
41
44
  return { success: false, error: 'url is not a valid URL' }
42
45
  }
46
+ if (scroll != null && typeof scroll !== 'object') {
47
+ reply.code(400)
48
+ return { success: false, error: 'scroll must be an object when provided' }
49
+ }
50
+
51
+ const requestTimeoutMs = scroll ? SCROLL_REQUEST_TIMEOUT_MS : REQUEST_TIMEOUT_MS
43
52
 
44
53
  try {
45
54
  const result = await Promise.race([
46
- extract({ url, hint: typeof hint === 'string' ? hint : null }),
47
- new Promise((_, rej) => setTimeout(() => rej(new Error('extract timeout')), REQUEST_TIMEOUT_MS)),
55
+ extract({ url, hint: typeof hint === 'string' ? hint : null, scroll: scroll || null }),
56
+ new Promise((_, rej) => setTimeout(() => rej(new Error('extract timeout')), requestTimeoutMs)),
48
57
  ])
49
58
  return { success: true, ...result }
50
59
  } catch (err) {
@@ -200,8 +200,18 @@ function rekeySession(oldSessionId, newSessionId) {
200
200
  .run(Date.now(), oldSession.turn_count || 0, newSessionId)
201
201
  db.prepare('DELETE FROM chat_sessions WHERE session_id = ?').run(oldSessionId)
202
202
  } else {
203
- db.prepare('UPDATE chat_sessions SET session_id = ? WHERE session_id = ?').run(newSessionId, oldSessionId)
203
+ // Insert the new parent row first, repoint the children, then drop the
204
+ // old parent. Updating chat_sessions.session_id in place would orphan the
205
+ // existing chat_messages mid-statement — the FK is ON UPDATE NO ACTION, so
206
+ // SQLite raises "FOREIGN KEY constraint failed" before the follow-up
207
+ // UPDATE can repoint them. (The final DELETE cascades to nothing because
208
+ // the messages were already moved.)
209
+ db.prepare(
210
+ 'INSERT INTO chat_sessions (session_id, workspace_id, turn_count, created_at, updated_at) ' +
211
+ 'SELECT ?, workspace_id, turn_count, created_at, updated_at FROM chat_sessions WHERE session_id = ?'
212
+ ).run(newSessionId, oldSessionId)
204
213
  db.prepare('UPDATE chat_messages SET session_id = ? WHERE session_id = ?').run(newSessionId, oldSessionId)
214
+ db.prepare('DELETE FROM chat_sessions WHERE session_id = ?').run(oldSessionId)
205
215
  }
206
216
  return true
207
217
  })
@@ -240,4 +250,111 @@ function deleteSession(sessionId) {
240
250
  return result.changes > 0
241
251
  }
242
252
 
243
- module.exports = { load, loadById, listSessions, save, addMessage, rekeySession, clear, deleteSession }
253
+ // ---------------------------------------------------------------------------
254
+ // Chat runs — durable index for detached chat execution (see chat-run-manager).
255
+ // A run is owned by the run manager, not the HTTP request. These rows let a
256
+ // reconnecting client find the in-flight run for a workspace and resume tailing.
257
+ // ---------------------------------------------------------------------------
258
+
/**
 * Insert the row that marks a detached chat run as started.
 *
 * The row is written with status 'running', `last_seq` 0 and `started_at` set
 * to the current time. Falsy `sessionId`, `pendingSessionId` and `workspaceId`
 * are stored as NULL.
 *
 * @param {{ runId: string, sessionId?: string|null, pendingSessionId?: string|null, workspaceId?: string|null }} run
 * @returns {void}
 */
function createRun({ runId, sessionId, pendingSessionId, workspaceId }) {
  const insertRun = getDb().prepare(
    `INSERT INTO chat_runs (run_id, session_id, pending_session_id, workspace_id, status, started_at, last_seq)
     VALUES (?, ?, ?, ?, 'running', ?, 0)`
  )
  const params = [
    runId,
    sessionId || null,
    pendingSessionId || null,
    workspaceId || null,
    Date.now(),
  ]
  insertRun.run(...params)
}
270
+
/**
 * Patch a run row; only the fields present in the patch are written.
 *
 * - `status`: written when not undefined; any status other than 'running'
 *   also stamps `ended_at` with the current time.
 * - `sessionId`: written when not undefined (null is written as NULL).
 * - `lastSeq`: written only when it is a number.
 * An empty patch issues no statement.
 *
 * @param {string} runId
 * @param {{ status?: string, sessionId?: string|null, lastSeq?: number }} patch
 * @returns {void}
 */
function updateRun(runId, { status, sessionId, lastSeq } = {}) {
  const db = getDb()
  // Ordered [column assignment, bound value] pairs.
  const assignments = []

  if (status !== undefined) {
    assignments.push(['status = ?', status])
    if (status !== 'running') {
      assignments.push(['ended_at = ?', Date.now()])
    }
  }
  if (sessionId !== undefined) {
    assignments.push(['session_id = ?', sessionId])
  }
  if (typeof lastSeq === 'number') {
    assignments.push(['last_seq = ?', lastSeq])
  }
  if (assignments.length === 0) return

  const setClause = assignments.map(([clause]) => clause).join(', ')
  const params = assignments.map(([, value]) => value)
  db.prepare(`UPDATE chat_runs SET ${setClause} WHERE run_id = ?`).run(...params, runId)
}
300
+
/**
 * Find the most recently started run that is still 'running' for a workspace.
 *
 * A truthy `workspaceId` matches rows with that workspace id; a falsy one
 * (null, undefined, empty string) matches rows whose `workspace_id` is NULL.
 *
 * @param {string|null} workspaceId
 * @returns {object|null} The run row, or null when nothing is running.
 */
function getActiveRun(workspaceId) {
  const db = getDb()
  const row = workspaceId
    ? db.prepare(
      `SELECT * FROM chat_runs WHERE workspace_id = ? AND status = 'running' ORDER BY started_at DESC LIMIT 1`
    ).get(workspaceId)
    : db.prepare(
      `SELECT * FROM chat_runs WHERE workspace_id IS NULL AND status = 'running' ORDER BY started_at DESC LIMIT 1`
    ).get()
  return row || null
}
317
+
/**
 * Look up a single run row by its id.
 *
 * @param {string} runId
 * @returns {object|null} The row, or null when no row has that id.
 */
function getRun(runId) {
  const selectById = getDb().prepare('SELECT * FROM chat_runs WHERE run_id = ?')
  const row = selectById.get(runId)
  return row || null
}
327
+
/**
 * Boot-time sweep of stale runs.
 *
 * Flips every row still marked 'running' to 'interrupted' and stamps
 * `ended_at` with the current time. Intended for startup, when any run left
 * 'running' was owned by a previous server process and can no longer resume.
 *
 * @returns {number} Number of rows swept (0 when none).
 */
function markRunningInterrupted() {
  const sweep = getDb().prepare(
    `UPDATE chat_runs SET status = 'interrupted', ended_at = ? WHERE status = 'running'`
  )
  const { changes } = sweep.run(Date.now())
  return changes || 0
}
341
+
/**
 * Delete finished run rows older than `maxAgeMs`.
 *
 * Only rows whose status is not 'running' are eligible. A row's age is taken
 * from `ended_at`, falling back to `started_at` when `ended_at` is NULL, and
 * rows older than `Date.now() - maxAgeMs` are removed.
 *
 * @param {number} maxAgeMs Maximum age to keep, in milliseconds.
 * @returns {number} Number of rows removed (0 when none).
 */
function pruneRuns(maxAgeMs) {
  const db = getDb()
  const cutoff = Date.now() - maxAgeMs
  const deleteExpired = db.prepare(
    `DELETE FROM chat_runs WHERE status != 'running' AND COALESCE(ended_at, started_at) < ?`
  )
  const { changes } = deleteExpired.run(cutoff)
  return changes || 0
}
356
+
357
+ module.exports = {
358
+ load, loadById, listSessions, save, addMessage, rekeySession, clear, deleteSession,
359
+ createRun, updateRun, getActiveRun, getRun, markRunningInterrupted, pruneRuns,
360
+ }
@@ -12,7 +12,7 @@ const MAX_FAIL_COUNT = 3
12
12
  function find({ urlTemplate, domFingerprint }) {
13
13
  const db = getDb()
14
14
  const row = db.prepare(`
15
- SELECT url_template, dom_fingerprint, selectors_json, page_type,
15
+ SELECT url_template, dom_fingerprint, selectors_json, page_type, ready_selector,
16
16
  hit_count, fail_count, last_verified_at, created_at
17
17
  FROM page_recipes
18
18
  WHERE url_template = ? AND dom_fingerprint = ?
@@ -24,7 +24,7 @@ function find({ urlTemplate, domFingerprint }) {
24
24
  function findByTemplate(urlTemplate) {
25
25
  const db = getDb()
26
26
  const rows = db.prepare(`
27
- SELECT url_template, dom_fingerprint, selectors_json, page_type,
27
+ SELECT url_template, dom_fingerprint, selectors_json, page_type, ready_selector,
28
28
  hit_count, fail_count, last_verified_at, created_at
29
29
  FROM page_recipes
30
30
  WHERE url_template = ?
@@ -33,19 +33,20 @@ function findByTemplate(urlTemplate) {
33
33
  return rows.map(parseRow)
34
34
  }
35
35
 
36
- function upsert({ urlTemplate, domFingerprint, selectors, pageType }) {
36
+ function upsert({ urlTemplate, domFingerprint, selectors, pageType, readySelector }) {
37
37
  const db = getDb()
38
38
  const json = JSON.stringify(selectors || {})
39
39
  const now = new Date().toISOString()
40
40
  db.prepare(`
41
- INSERT INTO page_recipes (url_template, dom_fingerprint, selectors_json, page_type, hit_count, fail_count, last_verified_at, created_at)
42
- VALUES (?, ?, ?, ?, 0, 0, ?, ?)
41
+ INSERT INTO page_recipes (url_template, dom_fingerprint, selectors_json, page_type, ready_selector, hit_count, fail_count, last_verified_at, created_at)
42
+ VALUES (?, ?, ?, ?, ?, 0, 0, ?, ?)
43
43
  ON CONFLICT(url_template, dom_fingerprint) DO UPDATE SET
44
44
  selectors_json = excluded.selectors_json,
45
45
  page_type = excluded.page_type,
46
+ ready_selector = excluded.ready_selector,
46
47
  fail_count = 0,
47
48
  last_verified_at = excluded.last_verified_at
48
- `).run(urlTemplate, domFingerprint, json, pageType || null, now, now)
49
+ `).run(urlTemplate, domFingerprint, json, pageType || null, readySelector || null, now, now)
49
50
  return find({ urlTemplate, domFingerprint })
50
51
  }
51
52
 
@@ -102,7 +103,7 @@ function remove({ urlTemplate, domFingerprint }) {
102
103
  function listAll({ limit = 100 } = {}) {
103
104
  const db = getDb()
104
105
  const rows = db.prepare(`
105
- SELECT url_template, dom_fingerprint, selectors_json, page_type,
106
+ SELECT url_template, dom_fingerprint, selectors_json, page_type, ready_selector,
106
107
  hit_count, fail_count, last_verified_at, created_at
107
108
  FROM page_recipes
108
109
  ORDER BY last_verified_at DESC NULLS LAST, created_at DESC
@@ -123,6 +124,7 @@ function parseRow(row) {
123
124
  dom_fingerprint: row.dom_fingerprint,
124
125
  selectors,
125
126
  page_type: row.page_type,
127
+ ready_selector: row.ready_selector || null,
126
128
  hit_count: row.hit_count,
127
129
  fail_count: row.fail_count,
128
130
  last_verified_at: row.last_verified_at,