@geekbeer/minion 4.5.1 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,14 +8,37 @@
8
8
  *
9
9
  * Each call spins up a fresh chromium instance. Pooling can come later
10
10
  * once the API stabilizes — for the experimental MVP, simple is better.
11
+ *
12
+ * Wait strategy (SPA-aware, v4.7.0):
13
+ * `page.goto` resolves on `domcontentloaded`, which for a client-rendered
14
+ * SPA fires *before* the framework has mounted and fetched its data. So
15
+ * after navigation we additionally wait for the content to actually appear:
16
+ * 1. If a `readySelector` is known (from the recipe), wait for it.
17
+ * 2. Otherwise wait for the DOM to *settle* — i.e. no MutationObserver
18
+ * events for `settleMs` — which is self-calibrating and works whether
19
+ * the page renders 50 chars or 50,000.
20
+ * `scroll` (optional) then drives infinite-scroll / lazy-load pages up to
21
+ * caller-declared limits, with hard server-side caps.
11
22
  */
12
23
 
13
24
  const DEFAULT_NAV_TIMEOUT_MS = 20_000
14
25
  const DEFAULT_EVAL_TIMEOUT_MS = 5_000
26
+ const DEFAULT_READY_TIMEOUT_MS = 8_000
27
+ const DEFAULT_SETTLE_MS = 500
28
+ const DEFAULT_SETTLE_MAX_MS = 8_000
15
29
  const DEFAULT_USER_AGENT =
16
30
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' +
17
31
  'Chrome/124.0.0.0 Safari/537.36 MinionWebExtract/0.1'
18
32
 
33
+ // Scroll defaults (caller may override) and hard caps (server enforces).
34
+ const SCROLL_DEFAULT_MAX_SCROLLS = 10
35
+ const SCROLL_HARD_MAX_SCROLLS = 50
36
+ const SCROLL_DEFAULT_MAX_MS = 15_000
37
+ const SCROLL_HARD_MAX_MS = 45_000
38
+ const SCROLL_DEFAULT_SETTLE_MS = 600
39
+ const SCROLL_STABLE_ROUNDS = 2
40
+ const SCROLL_STRATEGIES = ['count', 'untilStable', 'fixed']
41
+
19
42
  function loadChromium() {
20
43
  let playwright
21
44
  try {
@@ -49,12 +72,172 @@ async function withPage(fn, opts = {}) {
49
72
  }
50
73
  }
51
74
 
75
+ /**
76
+ * Wait for the DOM to stop mutating. Resolves once no childList/characterData
77
+ * mutation has fired for `settleMs`, or after `settleMaxMs` regardless. Scoped
78
+ * to the main content landmark when present so ever-churning headers/ads/beacons
79
+ * don't keep it awake. Best-effort: any failure (e.g. mid-navigation) resolves.
80
+ */
81
+ async function waitForSettle(page, opts = {}) {
82
+ const settleMs = opts.settleMs ?? DEFAULT_SETTLE_MS
83
+ const settleMaxMs = opts.settleMaxMs ?? DEFAULT_SETTLE_MAX_MS
84
+ try {
85
+ await page.evaluate(({ quiet, max }) => new Promise(resolve => {
86
+ const target = document.querySelector('main, article, [role="main"]') || document.body
87
+ if (!target) { resolve(); return }
88
+ let quietTimer = setTimeout(finish, quiet)
89
+ const hardCap = setTimeout(finish, max)
90
+ const obs = new MutationObserver(() => {
91
+ clearTimeout(quietTimer)
92
+ quietTimer = setTimeout(finish, quiet)
93
+ })
94
+ obs.observe(target, { childList: true, subtree: true, characterData: true })
95
+ function finish() {
96
+ clearTimeout(quietTimer)
97
+ clearTimeout(hardCap)
98
+ obs.disconnect()
99
+ resolve()
100
+ }
101
+ }), { quiet: settleMs, max: settleMaxMs })
102
+ } catch {
103
+ // navigation/teardown raced us — caller proceeds with whatever rendered.
104
+ }
105
+ }
106
+
107
+ /**
108
+ * After navigation, wait until the meaningful content is present:
109
+ * prefer the recipe's `readySelector`; fall back to DOM-settle detection.
110
+ */
111
+ async function waitForReady(page, opts = {}) {
112
+ if (opts.readySelector) {
113
+ try {
114
+ await page.waitForSelector(opts.readySelector, {
115
+ state: 'visible',
116
+ timeout: opts.readyTimeoutMs ?? DEFAULT_READY_TIMEOUT_MS,
117
+ })
118
+ return
119
+ } catch {
120
+ // readySelector never showed — fall through to settle so we still
121
+ // capture whatever did render rather than returning the bare shell.
122
+ }
123
+ }
124
+ await waitForSettle(page, opts)
125
+ }
126
+
127
+ /**
128
+ * Normalize a caller-supplied scroll request into a clamped config, or null
129
+ * when no (valid) scroll was requested. `itemSelector` resolution order:
130
+ * explicit > the recipe's first `multiple: true` selector. With no item
131
+ * selector, 'count' is meaningless so we measure scrollHeight stability.
132
+ */
133
+ function normalizeScroll(scroll, selectors) {
134
+ if (!scroll || typeof scroll !== 'object') return null
135
+ if (!SCROLL_STRATEGIES.includes(scroll.strategy)) return null
136
+
137
+ let itemSelector =
138
+ typeof scroll.itemSelector === 'string' && scroll.itemSelector.trim()
139
+ ? scroll.itemSelector.trim()
140
+ : null
141
+ if (!itemSelector && selectors && typeof selectors === 'object') {
142
+ for (const spec of Object.values(selectors)) {
143
+ if (spec && spec.multiple && typeof spec.selector === 'string' && spec.selector.trim()) {
144
+ itemSelector = spec.selector.trim()
145
+ break
146
+ }
147
+ }
148
+ }
149
+
150
+ const clampInt = (val, def, min, max) => {
151
+ const n = Number.isFinite(val) ? Math.floor(val) : def
152
+ return Math.max(min, Math.min(max, n))
153
+ }
154
+
155
+ return {
156
+ strategy: scroll.strategy,
157
+ itemSelector,
158
+ targetItems: clampInt(scroll.targetItems, 0, 0, 100_000),
159
+ maxScrolls: clampInt(scroll.maxScrolls, SCROLL_DEFAULT_MAX_SCROLLS, 1, SCROLL_HARD_MAX_SCROLLS),
160
+ maxMs: clampInt(scroll.maxMs, SCROLL_DEFAULT_MAX_MS, 500, SCROLL_HARD_MAX_MS),
161
+ settleMs: clampInt(scroll.settleMs, SCROLL_DEFAULT_SETTLE_MS, 100, 5_000),
162
+ times: clampInt(scroll.times, SCROLL_DEFAULT_MAX_SCROLLS, 1, SCROLL_HARD_MAX_SCROLLS),
163
+ }
164
+ }
165
+
166
+ /**
167
+ * Drive an infinite-scroll / lazy-load page within the caller's limits.
168
+ * Returns { scrolls, items, reachedTarget, stoppedReason } so the caller can
169
+ * tell whether it hit the target or was capped (never silently truncated).
170
+ */
171
+ async function scrollToLoad(page, cfg) {
172
+ const start = Date.now()
173
+ const elapsed = () => Date.now() - start
174
+ const measure = async () => {
175
+ try {
176
+ if (cfg.itemSelector) {
177
+ return await page.evaluate(sel => document.querySelectorAll(sel).length, cfg.itemSelector)
178
+ }
179
+ return await page.evaluate(() => document.body.scrollHeight)
180
+ } catch {
181
+ return 0
182
+ }
183
+ }
184
+
185
+ let last = await measure()
186
+ let scrolls = 0
187
+ let stableRounds = 0
188
+ let stoppedReason = null
189
+
190
+ while (true) {
191
+ if (cfg.strategy === 'count' && cfg.itemSelector && cfg.targetItems > 0 && last >= cfg.targetItems) {
192
+ stoppedReason = 'reachedTarget'
193
+ break
194
+ }
195
+ if (cfg.strategy === 'fixed' && scrolls >= cfg.times) {
196
+ stoppedReason = 'fixedDone'
197
+ break
198
+ }
199
+ if (scrolls >= cfg.maxScrolls) { stoppedReason = 'maxScrolls'; break }
200
+ if (elapsed() >= cfg.maxMs) { stoppedReason = 'maxMs'; break }
201
+
202
+ await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)).catch(() => {})
203
+ scrolls++
204
+ await waitForSettle(page, {
205
+ settleMs: cfg.settleMs,
206
+ settleMaxMs: Math.min(cfg.settleMs * 4, Math.max(cfg.maxMs - elapsed(), cfg.settleMs)),
207
+ })
208
+
209
+ const cur = await measure()
210
+ if (cfg.strategy !== 'fixed') {
211
+ // No growth this round counts toward "settled"; two flat rounds = done.
212
+ stableRounds = cur <= last ? stableRounds + 1 : 0
213
+ if (stableRounds >= SCROLL_STABLE_ROUNDS) {
214
+ last = cur
215
+ stoppedReason = 'stable'
216
+ break
217
+ }
218
+ }
219
+ last = cur
220
+ }
221
+
222
+ const items = cfg.itemSelector ? await measure() : null
223
+ return {
224
+ scrolls,
225
+ items,
226
+ reachedTarget:
227
+ cfg.strategy === 'count' && cfg.itemSelector && cfg.targetItems > 0
228
+ ? items >= cfg.targetItems
229
+ : null,
230
+ stoppedReason: stoppedReason || 'stable',
231
+ }
232
+ }
233
+
52
234
  async function renderPage(url, opts = {}) {
53
235
  return withPage(async page => {
54
236
  const response = await page.goto(url, {
55
237
  waitUntil: 'domcontentloaded',
56
238
  timeout: opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS,
57
239
  })
240
+ await waitForReady(page, opts)
58
241
  const html = await page.content()
59
242
  return {
60
243
  html,
@@ -77,6 +260,11 @@ async function renderPage(url, opts = {}) {
77
260
  *
78
261
  * `attr` defaults to 'text' (innerText). Special value 'html' returns
79
262
  * innerHTML. Any other string is read as an HTML attribute.
263
+ *
264
+ * `opts.readySelector` waits for that element before extracting; `opts.scroll`
265
+ * (a config from `normalizeScroll`) drives lazy-load pages first.
266
+ *
267
+ * Returns `{ data, scrollInfo }` — scrollInfo is null when no scroll ran.
80
268
  */
81
269
  async function extractWithSelectors(url, selectors, opts = {}) {
82
270
  return withPage(async page => {
@@ -84,7 +272,14 @@ async function extractWithSelectors(url, selectors, opts = {}) {
84
272
  waitUntil: 'domcontentloaded',
85
273
  timeout: opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS,
86
274
  })
87
- return await page.evaluate(
275
+ await waitForReady(page, opts)
276
+
277
+ let scrollInfo = null
278
+ if (opts.scroll) {
279
+ scrollInfo = await scrollToLoad(page, opts.scroll)
280
+ }
281
+
282
+ const data = await page.evaluate(
88
283
  ({ selectorMap, evalTimeoutMs }) => {
89
284
  const start = Date.now()
90
285
  const result = {}
@@ -120,10 +315,13 @@ async function extractWithSelectors(url, selectors, opts = {}) {
120
315
  },
121
316
  { selectorMap: selectors, evalTimeoutMs: opts.evalTimeoutMs ?? DEFAULT_EVAL_TIMEOUT_MS },
122
317
  )
318
+
319
+ return { data, scrollInfo }
123
320
  }, opts)
124
321
  }
125
322
 
126
323
  module.exports = {
127
324
  renderPage,
128
325
  extractWithSelectors,
326
+ normalizeScroll,
129
327
  }
@@ -17,6 +17,7 @@
17
17
  * {
18
18
  * pageType: 'article' | 'listing' | 'product' | 'profile' | 'form' | 'other',
19
19
  * selectors: { fieldName: { selector, attr?, multiple? }, ... },
20
+ * readySelector: <css selector whose presence signals content is rendered>,
20
21
  * extracted: { fieldName: <value already pulled from this page> }
21
22
  * }
22
23
  */
@@ -59,6 +60,15 @@ const ANTHROPIC_TOOLS = [{
59
60
  },
60
61
  },
61
62
  },
63
+ ready_selector: {
64
+ type: 'string',
65
+ description:
66
+ 'A single CSS selector for an element that exists ONLY once the primary ' +
67
+ 'content has rendered (e.g. the article body, the first list item, a price). ' +
68
+ 'On later visits the extractor waits for this element before reading the page, ' +
69
+ 'so client-rendered (SPA) pages are captured after hydration rather than as an ' +
70
+ 'empty shell. Pick a stable, semantic element; avoid spinners/skeletons.',
71
+ },
62
72
  extracted: {
63
73
  type: 'object',
64
74
  description:
@@ -77,7 +87,10 @@ Given a cleaned Markdown rendering of one page, you must:
77
87
  - Prefer semantic selectors (article, h1, time[datetime], a[rel="author"]) over class names where possible.
78
88
  - Use class-based selectors only when semantic ones are unavailable.
79
89
  - Avoid fragile attribute selectors like data-react-* or auto-generated hashes.
80
- 3. Fill the "extracted" object with the values pulled from this exact page so the caller can verify your recipe works.
90
+ 3. Pick a "ready_selector": one CSS selector for an element that only exists once the
91
+ primary content has rendered (the article body, the first list item, the price, etc.).
92
+ Prefer a stable semantic element; never pick a loading spinner or skeleton placeholder.
93
+ 4. Fill the "extracted" object with the values pulled from this exact page so the caller can verify your recipe works.
81
94
 
82
95
  The same recipe will be reused for structurally similar pages, so think about what generalizes.`
83
96
 
@@ -132,6 +145,7 @@ async function generateViaPlugin(plugin, { url, cleanedMarkdown, hint }) {
132
145
  return {
133
146
  pageType: json.page_type || 'other',
134
147
  selectors: json.selectors || {},
148
+ readySelector: typeof json.ready_selector === 'string' ? json.ready_selector : null,
135
149
  extracted: json.extracted || {},
136
150
  source: `primary:${plugin.name}`,
137
151
  }
@@ -175,10 +189,11 @@ async function generateViaAnthropicDirect({ url, cleanedMarkdown, hint }) {
175
189
  throw new Error('Anthropic API returned no tool_use block for page_extraction')
176
190
  }
177
191
 
178
- const { page_type, selectors, extracted } = toolUse.input
192
+ const { page_type, selectors, ready_selector, extracted } = toolUse.input
179
193
  return {
180
194
  pageType: page_type || 'other',
181
195
  selectors: selectors || {},
196
+ readySelector: typeof ready_selector === 'string' ? ready_selector : null,
182
197
  extracted: extracted || {},
183
198
  source: 'anthropic-direct',
184
199
  }
@@ -199,12 +214,14 @@ function buildTextPrompt({ url, cleanedMarkdown, hint }) {
199
214
  ' "selectors": {',
200
215
  ' "<fieldName>": { "selector": "<css>", "attr"?: "<text|html|attribute-name>", "multiple"?: <boolean> }',
201
216
  ' },',
217
+ ' "ready_selector": "<css selector that only exists once the main content has rendered>",',
202
218
  ' "extracted": { "<fieldName>": "<string or array of strings>" }',
203
219
  '}',
204
220
  '',
205
221
  'Notes:',
206
222
  '- attr defaults to "text" (innerText). Use "html" or an HTML attribute name to override.',
207
223
  '- Set multiple=true for list fields (returns array).',
224
+ '- "ready_selector" should target a stable content element (article body, first list item, price); never a spinner/skeleton.',
208
225
  '- "extracted" must contain the values you actually read from THIS page using those selectors.',
209
226
  '',
210
227
  '--- Cleaned Markdown ---',
@@ -19,6 +19,9 @@ const { extract } = require('../lib/web-extract')
19
19
  const pageRecipeStore = require('../stores/page-recipe-store')
20
20
 
21
21
  const REQUEST_TIMEOUT_MS = 60_000
22
+ // Scrolling adds an extra navigation plus an in-page scroll loop, so give
23
+ // scroll-enabled requests a wider ceiling than the plain extract path.
24
+ const SCROLL_REQUEST_TIMEOUT_MS = 120_000
22
25
 
23
26
  async function webRoutes(fastify) {
24
27
  fastify.post('/api/web/extract', async (request, reply) => {
@@ -28,7 +31,7 @@ async function webRoutes(fastify) {
28
31
  }
29
32
 
30
33
  const body = request.body || {}
31
- const { url, hint } = body
34
+ const { url, hint, scroll } = body
32
35
 
33
36
  if (!url || typeof url !== 'string') {
34
37
  reply.code(400)
@@ -40,11 +43,17 @@ async function webRoutes(fastify) {
40
43
  reply.code(400)
41
44
  return { success: false, error: 'url is not a valid URL' }
42
45
  }
46
+ if (scroll != null && typeof scroll !== 'object') {
47
+ reply.code(400)
48
+ return { success: false, error: 'scroll must be an object when provided' }
49
+ }
50
+
51
+ const requestTimeoutMs = scroll ? SCROLL_REQUEST_TIMEOUT_MS : REQUEST_TIMEOUT_MS
43
52
 
44
53
  try {
45
54
  const result = await Promise.race([
46
- extract({ url, hint: typeof hint === 'string' ? hint : null }),
47
- new Promise((_, rej) => setTimeout(() => rej(new Error('extract timeout')), REQUEST_TIMEOUT_MS)),
55
+ extract({ url, hint: typeof hint === 'string' ? hint : null, scroll: scroll || null }),
56
+ new Promise((_, rej) => setTimeout(() => rej(new Error('extract timeout')), requestTimeoutMs)),
48
57
  ])
49
58
  return { success: true, ...result }
50
59
  } catch (err) {
@@ -200,8 +200,18 @@ function rekeySession(oldSessionId, newSessionId) {
200
200
  .run(Date.now(), oldSession.turn_count || 0, newSessionId)
201
201
  db.prepare('DELETE FROM chat_sessions WHERE session_id = ?').run(oldSessionId)
202
202
  } else {
203
- db.prepare('UPDATE chat_sessions SET session_id = ? WHERE session_id = ?').run(newSessionId, oldSessionId)
203
+ // Insert the new parent row first, repoint the children, then drop the
204
+ // old parent. Updating chat_sessions.session_id in place would orphan the
205
+ // existing chat_messages mid-statement — the FK is ON UPDATE NO ACTION, so
206
+ // SQLite raises "FOREIGN KEY constraint failed" before the follow-up
207
+ // UPDATE can repoint them. (The final DELETE cascades to nothing because
208
+ // the messages were already moved.)
209
+ db.prepare(
210
+ 'INSERT INTO chat_sessions (session_id, workspace_id, turn_count, created_at, updated_at) ' +
211
+ 'SELECT ?, workspace_id, turn_count, created_at, updated_at FROM chat_sessions WHERE session_id = ?'
212
+ ).run(newSessionId, oldSessionId)
204
213
  db.prepare('UPDATE chat_messages SET session_id = ? WHERE session_id = ?').run(newSessionId, oldSessionId)
214
+ db.prepare('DELETE FROM chat_sessions WHERE session_id = ?').run(oldSessionId)
205
215
  }
206
216
  return true
207
217
  })
@@ -240,4 +250,111 @@ function deleteSession(sessionId) {
240
250
  return result.changes > 0
241
251
  }
242
252
 
243
- module.exports = { load, loadById, listSessions, save, addMessage, rekeySession, clear, deleteSession }
253
+ // ---------------------------------------------------------------------------
254
+ // Chat runs — durable index for detached chat execution (see chat-run-manager).
255
+ // A run is owned by the run manager, not the HTTP request. These rows let a
256
+ // reconnecting client find the in-flight run for a workspace and resume tailing.
257
+ // ---------------------------------------------------------------------------
258
+
259
+ /**
260
+ * Record the start of a detached run.
261
+ * @param {{ runId: string, sessionId?: string|null, pendingSessionId?: string|null, workspaceId?: string|null }} run
262
+ */
263
+ function createRun({ runId, sessionId, pendingSessionId, workspaceId }) {
264
+ const db = getDb()
265
+ db.prepare(
266
+ `INSERT INTO chat_runs (run_id, session_id, pending_session_id, workspace_id, status, started_at, last_seq)
267
+ VALUES (?, ?, ?, ?, 'running', ?, 0)`
268
+ ).run(runId, sessionId || null, pendingSessionId || null, workspaceId || null, Date.now())
269
+ }
270
+
271
+ /**
272
+ * Update a run's progress/terminal state. Only provided fields are written.
273
+ * @param {string} runId
274
+ * @param {{ status?: string, sessionId?: string|null, lastSeq?: number }} patch
275
+ */
276
+ function updateRun(runId, { status, sessionId, lastSeq } = {}) {
277
+ const db = getDb()
278
+ const sets = []
279
+ const vals = []
280
+ if (status !== undefined) {
281
+ sets.push('status = ?')
282
+ vals.push(status)
283
+ if (status !== 'running') {
284
+ sets.push('ended_at = ?')
285
+ vals.push(Date.now())
286
+ }
287
+ }
288
+ if (sessionId !== undefined) {
289
+ sets.push('session_id = ?')
290
+ vals.push(sessionId)
291
+ }
292
+ if (typeof lastSeq === 'number') {
293
+ sets.push('last_seq = ?')
294
+ vals.push(lastSeq)
295
+ }
296
+ if (!sets.length) return
297
+ vals.push(runId)
298
+ db.prepare(`UPDATE chat_runs SET ${sets.join(', ')} WHERE run_id = ?`).run(...vals)
299
+ }
300
+
301
+ /**
302
+ * Most recent still-running run for a workspace (reconnect target).
303
+ * @param {string|null} workspaceId
304
+ * @returns {object|null}
305
+ */
306
+ function getActiveRun(workspaceId) {
307
+ const db = getDb()
308
+ if (workspaceId) {
309
+ return db.prepare(
310
+ `SELECT * FROM chat_runs WHERE workspace_id = ? AND status = 'running' ORDER BY started_at DESC LIMIT 1`
311
+ ).get(workspaceId) || null
312
+ }
313
+ return db.prepare(
314
+ `SELECT * FROM chat_runs WHERE workspace_id IS NULL AND status = 'running' ORDER BY started_at DESC LIMIT 1`
315
+ ).get() || null
316
+ }
317
+
318
+ /**
319
+ * Load a run row by id.
320
+ * @param {string} runId
321
+ * @returns {object|null}
322
+ */
323
+ function getRun(runId) {
324
+ const db = getDb()
325
+ return db.prepare('SELECT * FROM chat_runs WHERE run_id = ?').get(runId) || null
326
+ }
327
+
328
+ /**
329
+ * Boot-time sweep: any run still flagged `running` belongs to a previous server
330
+ * process whose in-memory owner is gone. Mark them interrupted so clients stop
331
+ * waiting on a stream that will never resume.
332
+ * @returns {number} rows swept
333
+ */
334
+ function markRunningInterrupted() {
335
+ const db = getDb()
336
+ const res = db.prepare(
337
+ `UPDATE chat_runs SET status = 'interrupted', ended_at = ? WHERE status = 'running'`
338
+ ).run(Date.now())
339
+ return res.changes || 0
340
+ }
341
+
342
+ /**
343
+ * Prune run rows older than maxAgeMs (terminal states only). Keeps the table
344
+ * from growing unbounded; the NDJSON event logs are pruned separately.
345
+ * @param {number} maxAgeMs
346
+ * @returns {number} rows removed
347
+ */
348
+ function pruneRuns(maxAgeMs) {
349
+ const db = getDb()
350
+ const cutoff = Date.now() - maxAgeMs
351
+ const res = db.prepare(
352
+ `DELETE FROM chat_runs WHERE status != 'running' AND COALESCE(ended_at, started_at) < ?`
353
+ ).run(cutoff)
354
+ return res.changes || 0
355
+ }
356
+
357
+ module.exports = {
358
+ load, loadById, listSessions, save, addMessage, rekeySession, clear, deleteSession,
359
+ createRun, updateRun, getActiveRun, getRun, markRunningInterrupted, pruneRuns,
360
+ }
@@ -12,7 +12,7 @@ const MAX_FAIL_COUNT = 3
12
12
  function find({ urlTemplate, domFingerprint }) {
13
13
  const db = getDb()
14
14
  const row = db.prepare(`
15
- SELECT url_template, dom_fingerprint, selectors_json, page_type,
15
+ SELECT url_template, dom_fingerprint, selectors_json, page_type, ready_selector,
16
16
  hit_count, fail_count, last_verified_at, created_at
17
17
  FROM page_recipes
18
18
  WHERE url_template = ? AND dom_fingerprint = ?
@@ -24,7 +24,7 @@ function find({ urlTemplate, domFingerprint }) {
24
24
  function findByTemplate(urlTemplate) {
25
25
  const db = getDb()
26
26
  const rows = db.prepare(`
27
- SELECT url_template, dom_fingerprint, selectors_json, page_type,
27
+ SELECT url_template, dom_fingerprint, selectors_json, page_type, ready_selector,
28
28
  hit_count, fail_count, last_verified_at, created_at
29
29
  FROM page_recipes
30
30
  WHERE url_template = ?
@@ -33,19 +33,20 @@ function findByTemplate(urlTemplate) {
33
33
  return rows.map(parseRow)
34
34
  }
35
35
 
36
- function upsert({ urlTemplate, domFingerprint, selectors, pageType }) {
36
+ function upsert({ urlTemplate, domFingerprint, selectors, pageType, readySelector }) {
37
37
  const db = getDb()
38
38
  const json = JSON.stringify(selectors || {})
39
39
  const now = new Date().toISOString()
40
40
  db.prepare(`
41
- INSERT INTO page_recipes (url_template, dom_fingerprint, selectors_json, page_type, hit_count, fail_count, last_verified_at, created_at)
42
- VALUES (?, ?, ?, ?, 0, 0, ?, ?)
41
+ INSERT INTO page_recipes (url_template, dom_fingerprint, selectors_json, page_type, ready_selector, hit_count, fail_count, last_verified_at, created_at)
42
+ VALUES (?, ?, ?, ?, ?, 0, 0, ?, ?)
43
43
  ON CONFLICT(url_template, dom_fingerprint) DO UPDATE SET
44
44
  selectors_json = excluded.selectors_json,
45
45
  page_type = excluded.page_type,
46
+ ready_selector = excluded.ready_selector,
46
47
  fail_count = 0,
47
48
  last_verified_at = excluded.last_verified_at
48
- `).run(urlTemplate, domFingerprint, json, pageType || null, now, now)
49
+ `).run(urlTemplate, domFingerprint, json, pageType || null, readySelector || null, now, now)
49
50
  return find({ urlTemplate, domFingerprint })
50
51
  }
51
52
 
@@ -102,7 +103,7 @@ function remove({ urlTemplate, domFingerprint }) {
102
103
  function listAll({ limit = 100 } = {}) {
103
104
  const db = getDb()
104
105
  const rows = db.prepare(`
105
- SELECT url_template, dom_fingerprint, selectors_json, page_type,
106
+ SELECT url_template, dom_fingerprint, selectors_json, page_type, ready_selector,
106
107
  hit_count, fail_count, last_verified_at, created_at
107
108
  FROM page_recipes
108
109
  ORDER BY last_verified_at DESC NULLS LAST, created_at DESC
@@ -123,6 +124,7 @@ function parseRow(row) {
123
124
  dom_fingerprint: row.dom_fingerprint,
124
125
  selectors,
125
126
  page_type: row.page_type,
127
+ ready_selector: row.ready_selector || null,
126
128
  hit_count: row.hit_count,
127
129
  fail_count: row.fail_count,
128
130
  last_verified_at: row.last_verified_at,
@@ -251,6 +251,37 @@ Response:
251
251
  }
252
252
  ```
253
253
 
254
+ ### Chat (Detached Execution)
255
+
256
+ チャットメッセージは **デタッチ実行** される (v4.6.0〜)。`POST /api/chat` で起動した LLM プロセスは
257
+ HTTP リクエストではなく内部の **run manager** が所有するため、SSE 接続が切れても (タブを閉じる・
258
+ リロード・プロキシのアイドルタイムアウト・回線断) 処理は中断されない。接続は単なる「覗き窓」であり、
259
+ 切断時はサブスクライブを解除するだけでプロセスは kill されない。LLM を実際に停止するのは
260
+ `POST /api/chat/abort` のみ。
261
+
262
+ | Method | Endpoint | Description |
263
+ |--------|----------|-------------|
264
+ | POST | `/api/chat` | メッセージ送信 → 内部で run を起動し、その run のイベントを SSE で tail。SSE の最初のイベントは `{ "type": "run", "run_id": "..." }`(再接続用) |
265
+ | GET | `/api/chat/stream` | 進行中(または直近完了)の run に **再接続**し、`cursor` 以降のイベントから tail を再開。`run_id` または `workspace_id`(アクティブ run 自動解決) と任意の `cursor`(=最後に受け取った `seq`) を指定 |
266
+ | POST | `/api/chat/abort` | アクティブ run を明示的に kill (SIGTERM → 2秒後 SIGKILL)。Body に `workspace_id` / `run_id`(任意) |
267
+ | GET | `/api/chat/session` | アクティブセッションに加え、進行中 run があれば `active_run: { run_id, status, last_seq }` を返す(クライアントはこれを見て `/api/chat/stream` にアタッチする) |
268
+
269
+ 各 SSE イベントには単調増加の `seq` が付与され、これが再接続時の `cursor` になる。終端は必ず
270
+ `{ "type": "done", "session_id": "...", "turn_count": ... }`(失敗時は直前に `{ "type": "error" }`)。run の
271
+ イベントログは `$DATA_DIR/chat-runs/{run_id}.ndjson` に永続化され、完了から一定時間(既定10分)後に
272
+ メモリから evict、24時間でログ/索引を prune する。
273
+
274
+ > **再起動の挙動 (Phase 1):** run manager はミニオンプロセス内で動作するため、進行中の run はミニオン自体の再起動を跨いで
275
+ > 生存しない。起動時に `running` のまま残った run は `interrupted` に掃き出される (クライアントは待ち続けない)。
276
+ > 接続断への耐性が主目的であり、再起動生存は将来の拡張 (tmux バックエンド) で対応予定。
277
+
278
+ `GET /api/chat/stream` 例:
279
+ ```bash
280
+ # 直近のアクティブ run に seq 12 以降から再接続
281
+ curl -N -H "Authorization: Bearer $API_TOKEN" \
282
+ "http://localhost:8080/api/chat/stream?workspace_id=ws_abc123&cursor=12"
283
+ ```
284
+
254
285
  ### Self-Reflection Schedule (自己反省時間)
255
286
 
256
287
  The minion has a built-in daily scheduler that automatically runs end-of-day processing
@@ -714,10 +745,32 @@ Web ページの読み取り・要約・情報抽出をミニオン内のサブ
714
745
  ```json
715
746
  {
716
747
  "url": "https://example.com/article/123",
717
- "hint": "本文と著者を抽出してほしい (任意, 抽出フィールドのヒント)"
748
+ "hint": "本文と著者を抽出してほしい (任意, 抽出フィールドのヒント)",
749
+ "scroll": {
750
+ "strategy": "count",
751
+ "targetItems": 50,
752
+ "itemSelector": ".feed-item",
753
+ "maxScrolls": 20,
754
+ "maxMs": 15000,
755
+ "settleMs": 600
756
+ }
718
757
  }
719
758
  ```
720
759
 
760
+ **`scroll` (任意, v4.7.0〜):** 無限スクロール / 遅延ロードのページで「どこまでコンテンツを読み込むか」を**呼び出し側が宣言**するためのオプション。省略時はスクロールしない (従来動作)。
761
+
762
+ | フィールド | 説明 |
763
+ |-----------|------|
764
+ | `strategy` | `"count"` (件数到達まで) / `"untilStable"` (件数の増加が止まるまで。`itemSelector` を解決できない場合はページ高さの増加で判定) / `"fixed"` (回数固定)。未指定/不正ならスクロールしない |
765
+ | `targetItems` | `count` の目標件数。`itemSelector` が解決できる場合のみ有効 |
766
+ | `itemSelector` | 件数を数える CSS セレクタ。省略時はレシピ内の最初の `multiple: true` セレクタを流用 |
767
+ | `maxScrolls` | スクロール回数の上限 (default 10、サーバー上限 50) |
768
+ | `maxMs` | スクロールに使う最大時間 (default 15000、サーバー上限 45000) |
769
+ | `settleMs` | 1スクロールごとの描画待ち静止時間 (default 600) |
770
+ | `times` | `fixed` のスクロール回数 (default 10、サーバー上限 50)。`maxScrolls` の上限も同時に適用されるため、10 回を超えて指定する場合は `maxScrolls` も合わせて引き上げること |
771
+
772
+ > 値はサーバー側で上限にクランプされる。スクロール有効時はリクエスト全体のタイムアウトが 60s→120s に拡張される。
773
+
721
774
  **レスポンス (success):**
722
775
  ```json
723
776
  {
@@ -732,15 +785,24 @@ Web ページの読み取り・要約・情報抽出をミニオン内のサブ
732
785
  "title": "...",
733
786
  "content": "Markdown 本文...",
734
787
  "structured": { "title": "...", "author": "...", "publishedAt": "..." },
735
- "selectors": { "title": { "selector": "h1" }, "author": { "selector": "a[rel=author]" } }
788
+ "selectors": { "title": { "selector": "h1" }, "author": { "selector": "a[rel=author]" } },
789
+ "scrollInfo": { "scrolls": 12, "items": 50, "reachedTarget": true, "stoppedReason": "reachedTarget" }
736
790
  }
737
791
  ```
738
792
 
793
+ - `scrollInfo` は `scroll` 指定時のみ含まれる。目標未達で上限打ち切りの場合は `reachedTarget: false` と `warning` が返るので、`maxScrolls` / `maxMs` を上げて再試行できる (サイレントに打ち切らない)。
794
+
739
795
  **動作:**
740
- - 初回アクセス (cold): Playwright でレンダリング → Readability で本文抽出 → Anthropic Haiku でセレクタ生成 → SQLite (`page_recipes`) に保存 → セレクタで再抽出して返却
741
- - 2回目以降 (hot): URL 正規化・テンプレート化 → DOM フィンガープリントで保存済みレシピを照合 → セレクタで抽出のみ (LLM 呼び出しなし)
796
+ - 初回アクセス (cold): Playwright でレンダリング (**DOM が静止するまで待機**) → Readability で本文抽出 → Anthropic Haiku でセレクタ + `ready_selector` (描画完了の合図となる要素) を生成 → SQLite (`page_recipes`) に保存 → セレクタで再抽出して返却
797
+ - 2回目以降 (hot): URL 正規化・テンプレート化 → DOM フィンガープリントで保存済みレシピを照合 → **`ready_selector` の出現を待機**してからセレクタで抽出 (LLM 呼び出しなし)
742
798
  - セルフヒール: hot 実行で空結果が返ったら `fail_count++`、3回失敗で破棄して次回 cold 再生成
743
799
 
800
+ **SPA (クライアントレンダリング) への対応 (v4.7.0〜):**
801
+ - `page.goto` は `domcontentloaded` で解決するが、SPA はその時点では中身が空のシェルなので、ナビゲーション後に追加で描画完了を待つ:
802
+ 1. レシピに `ready_selector` があればその要素の出現を待つ
803
+ 2. 無ければ **DOM が `settleMs` の間ミューテーションしなくなるまで待つ** (MutationObserver、コンテンツ量に依存せず自己校正)
804
+ - これにより「SPA の描画が始まる前に空 DOM を掴んでタイムアウト/空結果になる」問題を回避する
805
+
744
806
  **URL 正規化ルール:**
745
807
  - `utm_*` `fbclid` `gclid` `ref` 等のトラッキングクエリは除去
746
808
  - `page` `p` `offset` 等のページネーション値は `:n` プレースホルダ化