npm - @geekbeer/minion - Versions diffs - 4.5.1 → 4.7.0 - Mend

@geekbeer/minion 4.5.1 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/core/db/migrations/20260607000000_chat_runs.js +48 -0
package/core/db/migrations/20260607120000_page_recipes_ready_selector.js +22 -0
package/core/lib/chat-run-manager.js +406 -0
package/core/lib/web-extract/extractor.js +27 -7
package/core/lib/web-extract/playwright-runner.js +199 -1
package/core/lib/web-extract/recipe-generator.js +19 -2
package/core/routes/web.js +12 -3
package/core/stores/chat-store.js +119 -2
package/core/stores/page-recipe-store.js +9 -7
package/docs/api-reference.md +66 -4
package/docs/task-guides.md +20 -2
package/linux/routes/chat.js +158 -193
package/package.json +1 -1
package/rules/core.md +9 -1
package/win/routes/chat.js +154 -157

package/core/lib/web-extract/playwright-runner.js CHANGED Viewed

@@ -8,14 +8,37 @@
  *
  * Each call spins up a fresh chromium instance. Pooling can come later
  * once the API stabilizes — for the experimental MVP, simple is better.
+ *
+ * Wait strategy (SPA-aware, v4.7.0):
+ *   `page.goto` resolves on `domcontentloaded`, which for a client-rendered
+ *   SPA fires *before* the framework has mounted and fetched its data. So
+ *   after navigation we additionally wait for the content to actually appear:
+ *     1. If a `readySelector` is known (from the recipe), wait for it.
+ *     2. Otherwise wait for the DOM to *settle* — i.e. no MutationObserver
+ *        events for `settleMs` — which is self-calibrating and works whether
+ *        the page renders 50 chars or 50,000.
+ *   `scroll` (optional) then drives infinite-scroll / lazy-load pages up to
+ *   caller-declared limits, with hard server-side caps.
  */
+// Default limits (ms) for `page.goto` navigation and for the in-page selector evaluation.
 const DEFAULT_NAV_TIMEOUT_MS = 20_000
 const DEFAULT_EVAL_TIMEOUT_MS = 5_000
+// How long `waitForReady` waits for a recipe's `readySelector` to become visible (ms).
+const DEFAULT_READY_TIMEOUT_MS = 8_000
+// `waitForSettle`: the mutation-free window it requires, and the cap after which it resolves anyway (ms).
+const DEFAULT_SETTLE_MS = 500
+const DEFAULT_SETTLE_MAX_MS = 8_000
 const DEFAULT_USER_AGENT =
   'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' +
   'Chrome/124.0.0.0 Safari/537.36 MinionWebExtract/0.1'
+// Scroll defaults (caller may override) and hard caps (server enforces).
+const SCROLL_DEFAULT_MAX_SCROLLS = 10
+const SCROLL_HARD_MAX_SCROLLS = 50
+const SCROLL_DEFAULT_MAX_MS = 15_000
+const SCROLL_HARD_MAX_MS = 45_000
+const SCROLL_DEFAULT_SETTLE_MS = 600
+// Consecutive no-growth rounds after which `scrollToLoad` stops with reason 'stable'.
+const SCROLL_STABLE_ROUNDS = 2
+// Accepted `scroll.strategy` values; anything else makes `normalizeScroll` return null.
+const SCROLL_STRATEGIES = ['count', 'untilStable', 'fixed']
 function loadChromium() {
   let playwright
   try {
@@ -49,12 +72,172 @@ async function withPage(fn, opts = {}) {
   }
 }
+/**
+ * Block until the page's DOM goes quiet.
+ *
+ * Installs a MutationObserver inside the page (childList + characterData over
+ * the whole subtree) and resolves as soon as `settleMs` passes without a
+ * mutation, or unconditionally once `settleMaxMs` has elapsed. The observer is
+ * attached to the first `main` / `article` / `[role="main"]` element, falling
+ * back to `document.body`, so churn outside the main content does not restart
+ * the quiet window.
+ *
+ * Never rejects: if the in-page evaluation throws (e.g. the page navigated
+ * away), the error is discarded and the caller continues with what rendered.
+ *
+ * @param {object} page - page handle exposing `evaluate(fn, arg)`.
+ * @param {{ settleMs?: number, settleMaxMs?: number }} [opts] - quiet window
+ *   and overall cap in ms; default DEFAULT_SETTLE_MS / DEFAULT_SETTLE_MAX_MS.
+ * @returns {Promise<void>}
+ */
+async function waitForSettle(page, opts = {}) {
+  const windows = {
+    quiet: opts.settleMs ?? DEFAULT_SETTLE_MS,
+    max: opts.settleMaxMs ?? DEFAULT_SETTLE_MAX_MS,
+  }
+  // Runs inside the page, so it may only rely on the argument it is handed.
+  const untilQuiet = ({ quiet, max }) => new Promise(done => {
+    const root = document.querySelector('main, article, [role="main"]') || document.body
+    if (!root) {
+      done()
+      return
+    }
+    let idleTimer = null
+    let capTimer = null
+    let observer = null
+    const wrapUp = () => {
+      clearTimeout(idleTimer)
+      clearTimeout(capTimer)
+      observer.disconnect()
+      done()
+    }
+    idleTimer = setTimeout(wrapUp, quiet)
+    capTimer = setTimeout(wrapUp, max)
+    // Each mutation restarts the quiet window; the cap timer keeps running.
+    observer = new MutationObserver(() => {
+      clearTimeout(idleTimer)
+      idleTimer = setTimeout(wrapUp, quiet)
+    })
+    observer.observe(root, { childList: true, subtree: true, characterData: true })
+  })
+  try {
+    await page.evaluate(untilQuiet, windows)
+  } catch {
+    // navigation/teardown raced us — caller proceeds with whatever rendered.
+  }
+}
+/**
+ * Post-navigation gate: hold until the page's real content is there.
+ *
+ * When `opts.readySelector` is set, wait for it to become visible (up to
+ * `opts.readyTimeoutMs`, default DEFAULT_READY_TIMEOUT_MS). If no selector was
+ * given, or the selector never appeared, fall back to `waitForSettle` so the
+ * caller still gets whatever did render instead of the bare shell.
+ *
+ * @param {object} page - page handle exposing `waitForSelector`.
+ * @param {object} [opts] - `readySelector`, `readyTimeoutMs`, plus the
+ *   `settleMs` / `settleMaxMs` options forwarded to `waitForSettle`.
+ * @returns {Promise<void>}
+ */
+async function waitForReady(page, opts = {}) {
+  let appeared = false
+  if (opts.readySelector) {
+    try {
+      const timeout = opts.readyTimeoutMs ?? DEFAULT_READY_TIMEOUT_MS
+      await page.waitForSelector(opts.readySelector, { state: 'visible', timeout })
+      appeared = true
+    } catch {
+      // Selector never showed up — the settle fallback below takes over.
+    }
+  }
+  if (!appeared) {
+    await waitForSettle(page, opts)
+  }
+}
+/**
+ * Turn a caller-supplied `scroll` request into a clamped config object.
+ *
+ * Returns null when `scroll` is not an object or its `strategy` is not one of
+ * SCROLL_STRATEGIES. The item selector is taken from `scroll.itemSelector`
+ * when it is a non-blank string; otherwise from the first entry in
+ * `selectors` that has `multiple` set and a non-blank `selector`; otherwise
+ * it stays null (the scroll loop then tracks page height instead of a count).
+ *
+ * Numeric fields are floored and clamped to their [min, max] range; values
+ * that are not finite numbers are replaced by the field's default.
+ *
+ * @param {object|null|undefined} scroll - raw request from the caller.
+ * @param {object|null|undefined} selectors - recipe selector map, used only
+ *   as a fallback source for the item selector.
+ * @returns {{ strategy: string, itemSelector: string|null, targetItems: number,
+ *   maxScrolls: number, maxMs: number, settleMs: number, times: number }|null}
+ */
+function normalizeScroll(scroll, selectors) {
+  const isObject = Boolean(scroll) && typeof scroll === 'object'
+  if (!isObject || !SCROLL_STRATEGIES.includes(scroll.strategy)) return null
+  const trimmed = value => (typeof value === 'string' ? value.trim() : '')
+  let itemSelector = trimmed(scroll.itemSelector) || null
+  if (itemSelector === null && selectors && typeof selectors === 'object') {
+    // Borrow the first list-type selector from the recipe.
+    const listSpec = Object.values(selectors).find(
+      spec => spec && spec.multiple && trimmed(spec.selector),
+    )
+    if (listSpec) itemSelector = listSpec.selector.trim()
+  }
+  const bounded = (raw, fallback, lo, hi) => {
+    const whole = Number.isFinite(raw) ? Math.floor(raw) : fallback
+    return Math.min(hi, Math.max(lo, whole))
+  }
+  const targetItems = bounded(scroll.targetItems, 0, 0, 100_000)
+  const maxScrolls = bounded(scroll.maxScrolls, SCROLL_DEFAULT_MAX_SCROLLS, 1, SCROLL_HARD_MAX_SCROLLS)
+  const maxMs = bounded(scroll.maxMs, SCROLL_DEFAULT_MAX_MS, 500, SCROLL_HARD_MAX_MS)
+  const settleMs = bounded(scroll.settleMs, SCROLL_DEFAULT_SETTLE_MS, 100, 5_000)
+  const times = bounded(scroll.times, SCROLL_DEFAULT_MAX_SCROLLS, 1, SCROLL_HARD_MAX_SCROLLS)
+  return { strategy: scroll.strategy, itemSelector, targetItems, maxScrolls, maxMs, settleMs, times }
+}
+/**
+ * Scroll an infinite-scroll / lazy-load page until a stop condition is met.
+ *
+ * Progress is measured as the number of elements matching `cfg.itemSelector`,
+ * or as `document.body.scrollHeight` when no item selector is set; a failed
+ * measurement counts as 0. Before each scroll the stop conditions are checked
+ * in this order: target reached ('reachedTarget'), fixed scroll count done
+ * ('fixedDone'), `maxScrolls` hit ('maxScrolls'), `maxMs` elapsed ('maxMs').
+ * After each scroll (non-'fixed' strategies only) the loop also stops with
+ * 'stable' once SCROLL_STABLE_ROUNDS consecutive rounds showed no growth.
+ *
+ * @param {object} page - page handle exposing `evaluate`.
+ * @param {object} cfg - config as produced by `normalizeScroll`.
+ * @returns {Promise<{ scrolls: number, items: number|null,
+ *   reachedTarget: boolean|null, stoppedReason: string }>} `items` is null
+ *   without an item selector; `reachedTarget` is null unless the strategy is
+ *   'count' with an item selector and a positive `targetItems`.
+ */
+async function scrollToLoad(page, cfg) {
+  const startedAt = Date.now()
+  const elapsed = () => Date.now() - startedAt
+  const hasTarget = Boolean(cfg.strategy === 'count' && cfg.itemSelector && cfg.targetItems > 0)
+  const isFixed = cfg.strategy === 'fixed'
+  const measure = async () => {
+    try {
+      return cfg.itemSelector
+        ? await page.evaluate(sel => document.querySelectorAll(sel).length, cfg.itemSelector)
+        : await page.evaluate(() => document.body.scrollHeight)
+    } catch {
+      return 0
+    }
+  }
+  // Conditions checked before every scroll, in priority order.
+  const stopBeforeScroll = (seen, done) => {
+    if (hasTarget && seen >= cfg.targetItems) return 'reachedTarget'
+    if (isFixed && done >= cfg.times) return 'fixedDone'
+    if (done >= cfg.maxScrolls) return 'maxScrolls'
+    if (elapsed() >= cfg.maxMs) return 'maxMs'
+    return null
+  }
+  let last = await measure()
+  let scrolls = 0
+  let flatRounds = 0
+  let stoppedReason = stopBeforeScroll(last, scrolls)
+  while (stoppedReason === null) {
+    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)).catch(() => {})
+    scrolls += 1
+    // Give the page time to render, but never much past the overall budget.
+    const remaining = Math.max(cfg.maxMs - elapsed(), cfg.settleMs)
+    await waitForSettle(page, {
+      settleMs: cfg.settleMs,
+      settleMaxMs: Math.min(cfg.settleMs * 4, remaining),
+    })
+    const current = await measure()
+    if (!isFixed) {
+      // A round without growth counts toward "settled"; growth resets the streak.
+      flatRounds = current <= last ? flatRounds + 1 : 0
+    }
+    last = current
+    stoppedReason = !isFixed && flatRounds >= SCROLL_STABLE_ROUNDS
+      ? 'stable'
+      : stopBeforeScroll(last, scrolls)
+  }
+  const items = cfg.itemSelector ? await measure() : null
+  const reachedTarget = hasTarget ? items >= cfg.targetItems : null
+  return { scrolls, items, reachedTarget, stoppedReason }
+}
 async function renderPage(url, opts = {}) {
   return withPage(async page => {
     const response = await page.goto(url, {
       waitUntil: 'domcontentloaded',
       timeout: opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS,
     })
+    await waitForReady(page, opts)
     const html = await page.content()
     return {
       html,
@@ -77,6 +260,11 @@ async function renderPage(url, opts = {}) {
  *
  * `attr` defaults to 'text' (innerText). Special value 'html' returns
  * innerHTML. Any other string is read as an HTML attribute.
+ *
+ * `opts.readySelector` waits for that element before extracting; `opts.scroll`
+ * (a config from `normalizeScroll`) drives lazy-load pages first.
+ *
+ * Returns `{ data, scrollInfo }` — scrollInfo is null when no scroll ran.
  */
 async function extractWithSelectors(url, selectors, opts = {}) {
   return withPage(async page => {
@@ -84,7 +272,14 @@ async function extractWithSelectors(url, selectors, opts = {}) {
       waitUntil: 'domcontentloaded',
       timeout: opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS,
     })
-    return await page.evaluate(
+    await waitForReady(page, opts)
+    let scrollInfo = null
+    if (opts.scroll) {
+      scrollInfo = await scrollToLoad(page, opts.scroll)
+    }
+    const data = await page.evaluate(
       ({ selectorMap, evalTimeoutMs }) => {
         const start = Date.now()
         const result = {}
@@ -120,10 +315,13 @@ async function extractWithSelectors(url, selectors, opts = {}) {
       },
       { selectorMap: selectors, evalTimeoutMs: opts.evalTimeoutMs ?? DEFAULT_EVAL_TIMEOUT_MS },
     )
+    return { data, scrollInfo }
   }, opts)
 }
+// Public surface: the two page-level entry points plus the scroll-config normalizer.
 module.exports = {
   renderPage,
   extractWithSelectors,
+  normalizeScroll,
 }

package/core/lib/web-extract/recipe-generator.js CHANGED Viewed

@@ -17,6 +17,7 @@
  *   {
  *     pageType: 'article' | 'listing' | 'product' | 'profile' | 'form' | 'other',
  *     selectors: { fieldName: { selector, attr?, multiple? }, ... },
+ *     readySelector: <css selector whose presence signals content is rendered>,
  *     extracted: { fieldName: <value already pulled from this page> }
  *   }
  */
@@ -59,6 +60,15 @@ const ANTHROPIC_TOOLS = [{
           },
         },
       },
+      ready_selector: {
+        type: 'string',
+        description:
+          'A single CSS selector for an element that exists ONLY once the primary ' +
+          'content has rendered (e.g. the article body, the first list item, a price). ' +
+          'On later visits the extractor waits for this element before reading the page, ' +
+          'so client-rendered (SPA) pages are captured after hydration rather than as an ' +
+          'empty shell. Pick a stable, semantic element; avoid spinners/skeletons.',
+      },
       extracted: {
         type: 'object',
         description:
@@ -77,7 +87,10 @@ Given a cleaned Markdown rendering of one page, you must:
    - Prefer semantic selectors (article, h1, time[datetime], a[rel="author"]) over class names where possible.
    - Use class-based selectors only when semantic ones are unavailable.
    - Avoid fragile attribute selectors like data-react-* or auto-generated hashes.
-3. Fill the "extracted" object with the values pulled from this exact page so the caller can verify your recipe works.
+3. Pick a "ready_selector": one CSS selector for an element that only exists once the
+   primary content has rendered (the article body, the first list item, the price, etc.).
+   Prefer a stable semantic element; never pick a loading spinner or skeleton placeholder.
+4. Fill the "extracted" object with the values pulled from this exact page so the caller can verify your recipe works.
 The same recipe will be reused for structurally similar pages, so think about what generalizes.`
@@ -132,6 +145,7 @@ async function generateViaPlugin(plugin, { url, cleanedMarkdown, hint }) {
   return {
     pageType: json.page_type || 'other',
     selectors: json.selectors || {},
+    readySelector: typeof json.ready_selector === 'string' ? json.ready_selector : null,
     extracted: json.extracted || {},
     source: `primary:${plugin.name}`,
   }
@@ -175,10 +189,11 @@ async function generateViaAnthropicDirect({ url, cleanedMarkdown, hint }) {
     throw new Error('Anthropic API returned no tool_use block for page_extraction')
   }
-  const { page_type, selectors, extracted } = toolUse.input
+  const { page_type, selectors, ready_selector, extracted } = toolUse.input
   return {
     pageType: page_type || 'other',
     selectors: selectors || {},
+    readySelector: typeof ready_selector === 'string' ? ready_selector : null,
     extracted: extracted || {},
     source: 'anthropic-direct',
   }
@@ -199,12 +214,14 @@ function buildTextPrompt({ url, cleanedMarkdown, hint }) {
     '  "selectors": {',
     '    "<fieldName>": { "selector": "<css>", "attr"?: "<text|html|attribute-name>", "multiple"?: <boolean> }',
     '  },',
+    '  "ready_selector": "<css selector that only exists once the main content has rendered>",',
     '  "extracted": { "<fieldName>": "<string or array of strings>" }',
     '}',
     '',
     'Notes:',
     '- attr defaults to "text" (innerText). Use "html" or an HTML attribute name to override.',
     '- Set multiple=true for list fields (returns array).',
+    '- "ready_selector" should target a stable content element (article body, first list item, price); never a spinner/skeleton.',
     '- "extracted" must contain the values you actually read from THIS page using those selectors.',
     '',
     '--- Cleaned Markdown ---',

package/core/routes/web.js CHANGED Viewed

@@ -19,6 +19,9 @@ const { extract } = require('../lib/web-extract')
 const pageRecipeStore = require('../stores/page-recipe-store')
 const REQUEST_TIMEOUT_MS = 60_000
+// Scrolling adds an extra navigation plus an in-page scroll loop, so give
+// scroll-enabled requests a wider ceiling than the plain extract path.
+const SCROLL_REQUEST_TIMEOUT_MS = 120_000
 async function webRoutes(fastify) {
   fastify.post('/api/web/extract', async (request, reply) => {
@@ -28,7 +31,7 @@ async function webRoutes(fastify) {
     }
     const body = request.body || {}
-    const { url, hint } = body
+    const { url, hint, scroll } = body
     if (!url || typeof url !== 'string') {
       reply.code(400)
@@ -40,11 +43,17 @@ async function webRoutes(fastify) {
       reply.code(400)
       return { success: false, error: 'url is not a valid URL' }
     }
+    if (scroll != null && typeof scroll !== 'object') {
+      reply.code(400)
+      return { success: false, error: 'scroll must be an object when provided' }
+    }
+    const requestTimeoutMs = scroll ? SCROLL_REQUEST_TIMEOUT_MS : REQUEST_TIMEOUT_MS
     try {
       const result = await Promise.race([
-        extract({ url, hint: typeof hint === 'string' ? hint : null }),
-        new Promise((_, rej) => setTimeout(() => rej(new Error('extract timeout')), REQUEST_TIMEOUT_MS)),
+        extract({ url, hint: typeof hint === 'string' ? hint : null, scroll: scroll || null }),
+        new Promise((_, rej) => setTimeout(() => rej(new Error('extract timeout')), requestTimeoutMs)),
       ])
       return { success: true, ...result }
     } catch (err) {

package/core/stores/chat-store.js CHANGED Viewed

@@ -200,8 +200,18 @@ function rekeySession(oldSessionId, newSessionId) {
         .run(Date.now(), oldSession.turn_count || 0, newSessionId)
       db.prepare('DELETE FROM chat_sessions WHERE session_id = ?').run(oldSessionId)
     } else {
-      db.prepare('UPDATE chat_sessions SET session_id = ? WHERE session_id = ?').run(newSessionId, oldSessionId)
+      // Insert the new parent row first, repoint the children, then drop the
+      // old parent. Updating chat_sessions.session_id in place would orphan the
+      // existing chat_messages mid-statement — the FK is ON UPDATE NO ACTION, so
+      // SQLite raises "FOREIGN KEY constraint failed" before the follow-up
+      // UPDATE can repoint them. (The final DELETE cascades to nothing because
+      // the messages were already moved.)
+      db.prepare(
+        'INSERT INTO chat_sessions (session_id, workspace_id, turn_count, created_at, updated_at) ' +
+        'SELECT ?, workspace_id, turn_count, created_at, updated_at FROM chat_sessions WHERE session_id = ?'
+      ).run(newSessionId, oldSessionId)
       db.prepare('UPDATE chat_messages SET session_id = ? WHERE session_id = ?').run(newSessionId, oldSessionId)
+      db.prepare('DELETE FROM chat_sessions WHERE session_id = ?').run(oldSessionId)
     }
     return true
   })
@@ -240,4 +250,111 @@ function deleteSession(sessionId) {
   return result.changes > 0
 }
-module.exports = { load, loadById, listSessions, save, addMessage, rekeySession, clear, deleteSession }
+// ---------------------------------------------------------------------------
+// Chat runs — durable index for detached chat execution (see chat-run-manager).
+// A run is owned by the run manager, not the HTTP request. These rows let a
+// reconnecting client find the in-flight run for a workspace and resume tailing.
+// ---------------------------------------------------------------------------
+/**
+ * Insert the row that marks a detached run as started.
+ * The row is written with status 'running', `started_at` = now and
+ * `last_seq` = 0; missing/falsy ids are stored as NULL.
+ * @param {{ runId: string, sessionId?: string|null, pendingSessionId?: string|null, workspaceId?: string|null }} run
+ */
+function createRun({ runId, sessionId, pendingSessionId, workspaceId }) {
+  const insert = getDb().prepare(
+    `INSERT INTO chat_runs (run_id, session_id, pending_session_id, workspace_id, status, started_at, last_seq)
+     VALUES (?, ?, ?, ?, 'running', ?, 0)`
+  )
+  const orNull = value => value || null
+  insert.run(runId, orNull(sessionId), orNull(pendingSessionId), orNull(workspaceId), Date.now())
+}
+/**
+ * Patch a run row. Only the fields present in `patch` are written:
+ * `status` (any value other than 'running' also stamps `ended_at` with the
+ * current time), `sessionId`, and `lastSeq` (numbers only). With nothing to
+ * write, no statement is executed.
+ * @param {string} runId
+ * @param {{ status?: string, sessionId?: string|null, lastSeq?: number }} patch
+ */
+function updateRun(runId, { status, sessionId, lastSeq } = {}) {
+  const db = getDb()
+  // Each entry pairs a SET fragment with the value bound to its placeholder.
+  const assignments = []
+  if (status !== undefined) {
+    assignments.push(['status = ?', status])
+    if (status !== 'running') assignments.push(['ended_at = ?', Date.now()])
+  }
+  if (sessionId !== undefined) assignments.push(['session_id = ?', sessionId])
+  if (typeof lastSeq === 'number') assignments.push(['last_seq = ?', lastSeq])
+  if (assignments.length === 0) return
+  const setClause = assignments.map(([fragment]) => fragment).join(', ')
+  const bound = assignments.map(([, value]) => value)
+  db.prepare(`UPDATE chat_runs SET ${setClause} WHERE run_id = ?`).run(...bound, runId)
+}
+/**
+ * Find the newest run still marked 'running' for a workspace — the run a
+ * reconnecting client should attach to. A falsy `workspaceId` matches rows
+ * whose `workspace_id` is NULL.
+ * @param {string|null} workspaceId
+ * @returns {object|null} the run row, or null when none is running
+ */
+function getActiveRun(workspaceId) {
+  const db = getDb()
+  let row
+  if (workspaceId) {
+    const byWorkspace = db.prepare(
+      `SELECT * FROM chat_runs WHERE workspace_id = ? AND status = 'running' ORDER BY started_at DESC LIMIT 1`
+    )
+    row = byWorkspace.get(workspaceId)
+  } else {
+    const unscoped = db.prepare(
+      `SELECT * FROM chat_runs WHERE workspace_id IS NULL AND status = 'running' ORDER BY started_at DESC LIMIT 1`
+    )
+    row = unscoped.get()
+  }
+  return row || null
+}
+/**
+ * Fetch a single run row by its id.
+ * @param {string} runId
+ * @returns {object|null} the row, or null when no run has that id
+ */
+function getRun(runId) {
+  const lookup = getDb().prepare('SELECT * FROM chat_runs WHERE run_id = ?')
+  const row = lookup.get(runId)
+  return row ? row : null
+}
+/**
+ * Boot-time sweep. A run still flagged `running` at startup was owned by a
+ * previous server process, so its stream can never resume; flip every such
+ * row to 'interrupted' (stamping `ended_at`) so clients stop waiting on it.
+ * @returns {number} rows swept
+ */
+function markRunningInterrupted() {
+  const sweep = getDb().prepare(
+    `UPDATE chat_runs SET status = 'interrupted', ended_at = ? WHERE status = 'running'`
+  )
+  const outcome = sweep.run(Date.now())
+  return outcome.changes || 0
+}
+/**
+ * Delete finished run rows older than `maxAgeMs` so the table stays bounded.
+ * Rows still 'running' are never removed. Age is taken from `ended_at`,
+ * falling back to `started_at` when `ended_at` is NULL. The NDJSON event
+ * logs are pruned separately.
+ * @param {number} maxAgeMs
+ * @returns {number} rows removed
+ */
+function pruneRuns(maxAgeMs) {
+  const db = getDb()
+  const olderThan = Date.now() - maxAgeMs
+  const purge = db.prepare(
+    `DELETE FROM chat_runs WHERE status != 'running' AND COALESCE(ended_at, started_at) < ?`
+  )
+  const { changes } = purge.run(olderThan)
+  return changes || 0
+}
+// Session/message helpers first, then the chat-run index helpers.
+module.exports = {
+  load, loadById, listSessions, save, addMessage, rekeySession, clear, deleteSession,
+  createRun, updateRun, getActiveRun, getRun, markRunningInterrupted, pruneRuns,
+}

package/core/stores/page-recipe-store.js CHANGED Viewed

@@ -12,7 +12,7 @@ const MAX_FAIL_COUNT = 3
 function find({ urlTemplate, domFingerprint }) {
   const db = getDb()
   const row = db.prepare(`
-    SELECT url_template, dom_fingerprint, selectors_json, page_type,
+    SELECT url_template, dom_fingerprint, selectors_json, page_type, ready_selector,
            hit_count, fail_count, last_verified_at, created_at
     FROM page_recipes
     WHERE url_template = ? AND dom_fingerprint = ?
@@ -24,7 +24,7 @@ function find({ urlTemplate, domFingerprint }) {
 function findByTemplate(urlTemplate) {
   const db = getDb()
   const rows = db.prepare(`
-    SELECT url_template, dom_fingerprint, selectors_json, page_type,
+    SELECT url_template, dom_fingerprint, selectors_json, page_type, ready_selector,
            hit_count, fail_count, last_verified_at, created_at
     FROM page_recipes
     WHERE url_template = ?
@@ -33,19 +33,20 @@ function findByTemplate(urlTemplate) {
   return rows.map(parseRow)
 }
-function upsert({ urlTemplate, domFingerprint, selectors, pageType }) {
+function upsert({ urlTemplate, domFingerprint, selectors, pageType, readySelector }) {
   const db = getDb()
   const json = JSON.stringify(selectors || {})
   const now = new Date().toISOString()
   db.prepare(`
-    INSERT INTO page_recipes (url_template, dom_fingerprint, selectors_json, page_type, hit_count, fail_count, last_verified_at, created_at)
-    VALUES (?, ?, ?, ?, 0, 0, ?, ?)
+    INSERT INTO page_recipes (url_template, dom_fingerprint, selectors_json, page_type, ready_selector, hit_count, fail_count, last_verified_at, created_at)
+    VALUES (?, ?, ?, ?, ?, 0, 0, ?, ?)
     ON CONFLICT(url_template, dom_fingerprint) DO UPDATE SET
       selectors_json = excluded.selectors_json,
       page_type = excluded.page_type,
+      ready_selector = excluded.ready_selector,
       fail_count = 0,
       last_verified_at = excluded.last_verified_at
-  `).run(urlTemplate, domFingerprint, json, pageType || null, now, now)
+  `).run(urlTemplate, domFingerprint, json, pageType || null, readySelector || null, now, now)
   return find({ urlTemplate, domFingerprint })
 }
@@ -102,7 +103,7 @@ function remove({ urlTemplate, domFingerprint }) {
 function listAll({ limit = 100 } = {}) {
   const db = getDb()
   const rows = db.prepare(`
-    SELECT url_template, dom_fingerprint, selectors_json, page_type,
+    SELECT url_template, dom_fingerprint, selectors_json, page_type, ready_selector,
            hit_count, fail_count, last_verified_at, created_at
     FROM page_recipes
     ORDER BY last_verified_at DESC NULLS LAST, created_at DESC
@@ -123,6 +124,7 @@ function parseRow(row) {
     dom_fingerprint: row.dom_fingerprint,
     selectors,
     page_type: row.page_type,
+    ready_selector: row.ready_selector || null,
     hit_count: row.hit_count,
     fail_count: row.fail_count,
     last_verified_at: row.last_verified_at,

package/docs/api-reference.md CHANGED Viewed

@@ -251,6 +251,37 @@ Response:
 }
 ```
+### Chat (Detached Execution)
+チャットメッセージは **デタッチ実行** される (v4.6.0〜)。`POST /api/chat` で起動した LLM プロセスは
+HTTP リクエストではなく内部の **run manager** が所有するため、SSE 接続が切れても (タブを閉じる・
+リロード・プロキシのアイドルタイムアウト・回線断) 処理は中断されない。接続は単なる「覗き窓」であり、
+切断時はサブスクライブを解除するだけでプロセスは kill されない。LLM を実際に停止するのは
+`POST /api/chat/abort` のみ。
+| Method | Endpoint | Description |
+|--------|----------|-------------|
+| POST | `/api/chat` | メッセージ送信 → 内部で run を起動し、その run のイベントを SSE で tail。SSE の最初のイベントは `{ "type": "run", "run_id": "..." }`（再接続用） |
+| GET  | `/api/chat/stream` | 進行中(または直近完了)の run に **再接続**し、`cursor` 以降のイベントから tail を再開。`run_id` または `workspace_id`(アクティブ run 自動解決) と任意の `cursor`(=最後に受け取った `seq`) を指定 |
+| POST | `/api/chat/abort` | アクティブ run を明示的に kill (SIGTERM → 2秒後 SIGKILL)。Body に `workspace_id` / `run_id`(任意) |
+| GET  | `/api/chat/session` | アクティブセッションに加え、進行中 run があれば `active_run: { run_id, status, last_seq }` を返す(クライアントはこれを見て `/api/chat/stream` にアタッチする) |
+各 SSE イベントには単調増加の `seq` が付与され、これが再接続時の `cursor` になる。終端は必ず
+`{ "type": "done", "session_id": "...", "turn_count": N }`(失敗時は直前に `{ "type": "error" }`)。run の
+イベントログは `$DATA_DIR/chat-runs/{run_id}.ndjson` に永続化され、完了から一定時間(既定10分)後に
+メモリから evict、24時間でログ/索引を prune する。
+> **再起動の挙動 (Phase 1):** run manager はミニオンプロセス内で動作するため、ミニオン自体の再起動は
+> 生存しない。起動時に `running` のまま残った run は `interrupted` に掃き出される (クライアントは待ち続けない)。
+> 接続断への耐性が主目的であり、再起動生存は将来の拡張 (tmux バックエンド) で対応予定。
+`GET /api/chat/stream` 例:
+```bash
+# 直近のアクティブ run に seq 12 以降から再接続
+curl -N -H "Authorization: Bearer $API_TOKEN" \
+  "http://localhost:8080/api/chat/stream?workspace_id=ws_abc123&cursor=12"
+```
 ### Self-Reflection Schedule (自己反省時間)
 The minion has a built-in daily scheduler that automatically runs end-of-day processing
@@ -714,10 +745,32 @@ Web ページの読み取り・要約・情報抽出をミニオン内のサブ
 ```json
 {
   "url": "https://example.com/article/123",
-  "hint": "本文と著者を抽出してほしい (任意, 抽出フィールドのヒント)"
+  "hint": "本文と著者を抽出してほしい (任意, 抽出フィールドのヒント)",
+  "scroll": {
+    "strategy": "count",
+    "targetItems": 50,
+    "itemSelector": ".feed-item",
+    "maxScrolls": 20,
+    "maxMs": 15000,
+    "settleMs": 600
+  }
 }
 ```
+**`scroll` (任意, v4.7.0〜):** 無限スクロール / 遅延ロードのページで「どこまでコンテンツを読み込むか」を**呼び出し側が宣言**するためのオプション。省略時はスクロールしない (従来動作)。
+| フィールド | 説明 |
+|-----------|------|
+| `strategy` | `"count"` (件数到達まで) / `"untilStable"` (件数の増加が止まるまで。`itemSelector` が解決できない場合はページの高さで判定) / `"fixed"` (回数固定)。未指定/不正ならスクロールしない |
+| `targetItems` | `count` の目標件数。`itemSelector` が解決できる場合のみ有効 |
+| `itemSelector` | 件数を数える CSS セレクタ。省略時はレシピ内の最初の `multiple: true` セレクタを流用 |
+| `maxScrolls` | スクロール回数の上限 (default 10、サーバー上限 50) |
+| `maxMs` | スクロールに使う最大時間 (default 15000、サーバー上限 45000) |
+| `settleMs` | 1スクロールごとの描画待ち静止時間 (default 600) |
+| `times` | `fixed` のスクロール回数 (default 10) |
+> 値はサーバー側で上限にクランプされる。スクロール有効時はリクエスト全体のタイムアウトが 60s→120s に拡張される。
 **レスポンス (success):**
 ```json
 {
@@ -732,15 +785,24 @@ Web ページの読み取り・要約・情報抽出をミニオン内のサブ
   "title": "...",
   "content": "Markdown 本文...",
   "structured": { "title": "...", "author": "...", "publishedAt": "..." },
-  "selectors": { "title": { "selector": "h1" }, "author": { "selector": "a[rel=author]" } }
+  "selectors": { "title": { "selector": "h1" }, "author": { "selector": "a[rel=author]" } },
+  "scrollInfo": { "scrolls": 12, "items": 50, "reachedTarget": true, "stoppedReason": "reachedTarget" }
 }
 ```
+- `scrollInfo` は `scroll` 指定時のみ含まれる。目標未達で上限打ち切りの場合は `reachedTarget: false` と `warning` が返るので、`maxScrolls` / `maxMs` を上げて再試行できる (サイレントに打ち切らない)。
 **動作:**
-- 初回アクセス (cold): Playwright でレンダリング → Readability で本文抽出 → Anthropic Haiku でセレクタ生成 → SQLite (`page_recipes`) に保存 → セレクタで再抽出して返却
-- 2回目以降 (hot): URL 正規化・テンプレート化 → DOM フィンガープリントで保存済みレシピを照合 → セレクタで抽出のみ (LLM 呼び出しなし)
+- 初回アクセス (cold): Playwright でレンダリング (**DOM が静止するまで待機**) → Readability で本文抽出 → Anthropic Haiku でセレクタ + `ready_selector` (描画完了の合図となる要素) を生成 → SQLite (`page_recipes`) に保存 → セレクタで再抽出して返却
+- 2回目以降 (hot): URL 正規化・テンプレート化 → DOM フィンガープリントで保存済みレシピを照合 → **`ready_selector` の出現を待機**してからセレクタで抽出 (LLM 呼び出しなし)
 - セルフヒール: hot 実行で空結果が返ったら `fail_count++`、3回失敗で破棄して次回 cold 再生成
+**SPA (クライアントレンダリング) への対応 (v4.7.0〜):**
+- `page.goto` は `domcontentloaded` で解決するが、SPA はその時点では中身が空のシェルなので、ナビゲーション後に追加で描画完了を待つ:
+  1. レシピに `ready_selector` があればその要素の出現を待つ
+  2. 無ければ **DOM が `settleMs` の間ミューテーションしなくなるまで待つ** (MutationObserver、コンテンツ量に依存せず自己校正)
+- これにより「SPA の描画が始まる前に空 DOM を掴んでタイムアウト/空結果になる」問題を回避する
 **URL 正規化ルール:**
 - `utm_*` `fbclid` `gclid` `ref` 等のトラッキングクエリは除去
 - `page` `p` `offset` 等のページネーション値は `:n` プレースホルダ化