@geekbeer/minion 4.5.1 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/db/migrations/20260607000000_chat_runs.js +48 -0
- package/core/db/migrations/20260607120000_page_recipes_ready_selector.js +22 -0
- package/core/lib/chat-run-manager.js +406 -0
- package/core/lib/web-extract/extractor.js +27 -7
- package/core/lib/web-extract/playwright-runner.js +199 -1
- package/core/lib/web-extract/recipe-generator.js +19 -2
- package/core/routes/web.js +12 -3
- package/core/stores/chat-store.js +119 -2
- package/core/stores/page-recipe-store.js +9 -7
- package/docs/api-reference.md +66 -4
- package/docs/task-guides.md +20 -2
- package/linux/routes/chat.js +158 -193
- package/package.json +1 -1
- package/rules/core.md +9 -1
- package/win/routes/chat.js +154 -157
|
@@ -8,14 +8,37 @@
|
|
|
8
8
|
*
|
|
9
9
|
* Each call spins up a fresh chromium instance. Pooling can come later
|
|
10
10
|
* once the API stabilizes — for the experimental MVP, simple is better.
|
|
11
|
+
*
|
|
12
|
+
* Wait strategy (SPA-aware, v4.7.0):
|
|
13
|
+
* `page.goto` resolves on `domcontentloaded`, which for a client-rendered
|
|
14
|
+
* SPA fires *before* the framework has mounted and fetched its data. So
|
|
15
|
+
* after navigation we additionally wait for the content to actually appear:
|
|
16
|
+
* 1. If a `readySelector` is known (from the recipe), wait for it.
|
|
17
|
+
* 2. Otherwise wait for the DOM to *settle* — i.e. no MutationObserver
|
|
18
|
+
* events for `settleMs` — which is self-calibrating and works whether
|
|
19
|
+
* the page renders 50 chars or 50,000.
|
|
20
|
+
* `scroll` (optional) then drives infinite-scroll / lazy-load pages up to
|
|
21
|
+
* caller-declared limits, with hard server-side caps.
|
|
11
22
|
*/
|
|
12
23
|
|
|
13
24
|
const DEFAULT_NAV_TIMEOUT_MS = 20_000
|
|
14
25
|
const DEFAULT_EVAL_TIMEOUT_MS = 5_000
|
|
26
|
+
const DEFAULT_READY_TIMEOUT_MS = 8_000
|
|
27
|
+
const DEFAULT_SETTLE_MS = 500
|
|
28
|
+
const DEFAULT_SETTLE_MAX_MS = 8_000
|
|
15
29
|
const DEFAULT_USER_AGENT =
|
|
16
30
|
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' +
|
|
17
31
|
'Chrome/124.0.0.0 Safari/537.36 MinionWebExtract/0.1'
|
|
18
32
|
|
|
33
|
+
// Scroll defaults (caller may override) and hard caps (server enforces).
|
|
34
|
+
const SCROLL_DEFAULT_MAX_SCROLLS = 10
|
|
35
|
+
const SCROLL_HARD_MAX_SCROLLS = 50
|
|
36
|
+
const SCROLL_DEFAULT_MAX_MS = 15_000
|
|
37
|
+
const SCROLL_HARD_MAX_MS = 45_000
|
|
38
|
+
const SCROLL_DEFAULT_SETTLE_MS = 600
|
|
39
|
+
const SCROLL_STABLE_ROUNDS = 2
|
|
40
|
+
const SCROLL_STRATEGIES = ['count', 'untilStable', 'fixed']
|
|
41
|
+
|
|
19
42
|
function loadChromium() {
|
|
20
43
|
let playwright
|
|
21
44
|
try {
|
|
@@ -49,12 +72,172 @@ async function withPage(fn, opts = {}) {
|
|
|
49
72
|
}
|
|
50
73
|
}
|
|
51
74
|
|
|
75
|
+
/**
 * Block until the page's DOM has gone quiet.
 *
 * Inside the page, a MutationObserver watches the main content landmark
 * (`main`, `article` or `[role="main"]`, falling back to `document.body`) for
 * childList / characterData changes in its subtree. The wait ends as soon as
 * `settleMs` passes without a mutation, or after `settleMaxMs` in total,
 * whichever comes first. Attribute changes are not observed.
 *
 * Best-effort by design: if `page.evaluate` rejects (for example because a
 * navigation or teardown raced the wait) the error is swallowed and the
 * function resolves normally.
 *
 * @param {object} page - object exposing `evaluate(fn, arg)` (a Playwright page)
 * @param {{ settleMs?: number, settleMaxMs?: number }} [opts]
 * @returns {Promise<void>}
 */
async function waitForSettle(page, opts = {}) {
  const timing = {
    quiet: opts.settleMs ?? DEFAULT_SETTLE_MS,
    max: opts.settleMaxMs ?? DEFAULT_SETTLE_MAX_MS,
  }

  // Executed in the browser context: it must not close over anything that
  // lives on the Node side, only over its own argument.
  const settleInPage = ({ quiet, max }) =>
    new Promise(done => {
      const root = document.querySelector('main, article, [role="main"]') || document.body
      if (!root) {
        done()
        return
      }

      let idleTimer = null
      let ceilingTimer = null
      let watcher = null

      const settle = () => {
        clearTimeout(idleTimer)
        clearTimeout(ceilingTimer)
        watcher.disconnect()
        done()
      }

      // Every mutation pushes the "quiet" deadline back by `quiet` ms.
      const restartIdleTimer = () => {
        clearTimeout(idleTimer)
        idleTimer = setTimeout(settle, quiet)
      }

      restartIdleTimer()
      ceilingTimer = setTimeout(settle, max)
      watcher = new MutationObserver(restartIdleTimer)
      watcher.observe(root, { childList: true, subtree: true, characterData: true })
    })

  try {
    await page.evaluate(settleInPage, timing)
  } catch {
    // navigation/teardown raced us — caller proceeds with whatever rendered.
  }
}
|
|
106
|
+
|
|
107
|
+
/**
 * Post-navigation wait for the meaningful content to be on the page.
 *
 * When `opts.readySelector` is set, wait for that element to become visible
 * (bounded by `opts.readyTimeoutMs`, default DEFAULT_READY_TIMEOUT_MS). If it
 * shows up, we are done. If there is no selector, or waiting for it fails,
 * fall back to DOM-settle detection so whatever did render is still captured
 * instead of the bare shell.
 *
 * @param {object} page - object exposing `waitForSelector` (a Playwright page)
 * @param {{ readySelector?: string, readyTimeoutMs?: number, settleMs?: number, settleMaxMs?: number }} [opts]
 * @returns {Promise<void>}
 */
async function waitForReady(page, opts = {}) {
  // Resolves to true when the selector became visible, false on any failure
  // (timeout, invalid selector, page torn down, ...).
  const selectorBecameVisible = async selector => {
    try {
      await page.waitForSelector(selector, {
        state: 'visible',
        timeout: opts.readyTimeoutMs ?? DEFAULT_READY_TIMEOUT_MS,
      })
      return true
    } catch {
      return false
    }
  }

  const contentIsReady = opts.readySelector
    ? await selectorBecameVisible(opts.readySelector)
    : false

  if (!contentIsReady) {
    await waitForSettle(page, opts)
  }
}
|
|
126
|
+
|
|
127
|
+
/**
 * Normalize a caller-supplied scroll request into a clamped config, or null
 * when no (valid) scroll was requested.
 *
 * `itemSelector` resolution order: the explicit `scroll.itemSelector`, then the
 * first entry of `selectors` flagged `multiple: true`. With no item selector,
 * 'count' is meaningless, so the scroll loop measures scrollHeight stability
 * instead.
 *
 * Numeric fields are floored and clamped to server-side limits; anything that
 * is not a finite number (including numeric strings) falls back to the default.
 *
 * For `strategy: 'fixed'` the default `maxScrolls` ceiling is raised to `times`
 * when the caller did not supply `maxScrolls`. The scroll loop stops at
 * `maxScrolls`, so otherwise a request for more scrolls than the default
 * ceiling would be cut short by a limit the caller never set. An explicit
 * `maxScrolls` always wins, and both values stay under the hard cap.
 *
 * @param {object|null|undefined} scroll - raw request:
 *   `{ strategy, itemSelector?, targetItems?, maxScrolls?, maxMs?, settleMs?, times? }`
 * @param {object} [selectors] - recipe selector map `{ field: { selector, multiple? } }`
 * @returns {{ strategy: string, itemSelector: string|null, targetItems: number,
 *   maxScrolls: number, maxMs: number, settleMs: number, times: number }|null}
 */
function normalizeScroll(scroll, selectors) {
  if (!scroll || typeof scroll !== 'object') return null
  if (!SCROLL_STRATEGIES.includes(scroll.strategy)) return null

  let itemSelector =
    typeof scroll.itemSelector === 'string' && scroll.itemSelector.trim()
      ? scroll.itemSelector.trim()
      : null
  if (!itemSelector && selectors && typeof selectors === 'object') {
    for (const spec of Object.values(selectors)) {
      if (spec && spec.multiple && typeof spec.selector === 'string' && spec.selector.trim()) {
        itemSelector = spec.selector.trim()
        break
      }
    }
  }

  // Floor to an integer, defaulting when the input is not a finite number,
  // then clamp into [min, max].
  const clampInt = (val, def, min, max) => {
    const n = Number.isFinite(val) ? Math.floor(val) : def
    return Math.max(min, Math.min(max, n))
  }

  const times = clampInt(scroll.times, SCROLL_DEFAULT_MAX_SCROLLS, 1, SCROLL_HARD_MAX_SCROLLS)

  // A fixed run must be allowed to reach `times`; never lower the usual default.
  const defaultMaxScrolls =
    scroll.strategy === 'fixed'
      ? Math.max(times, SCROLL_DEFAULT_MAX_SCROLLS)
      : SCROLL_DEFAULT_MAX_SCROLLS

  return {
    strategy: scroll.strategy,
    itemSelector,
    targetItems: clampInt(scroll.targetItems, 0, 0, 100_000),
    maxScrolls: clampInt(scroll.maxScrolls, defaultMaxScrolls, 1, SCROLL_HARD_MAX_SCROLLS),
    maxMs: clampInt(scroll.maxMs, SCROLL_DEFAULT_MAX_MS, 500, SCROLL_HARD_MAX_MS),
    settleMs: clampInt(scroll.settleMs, SCROLL_DEFAULT_SETTLE_MS, 100, 5_000),
    times,
  }
}
|
|
165
|
+
|
|
166
|
+
/**
 * Drive an infinite-scroll / lazy-load page within the caller's limits.
 *
 * Each round scrolls to `document.body.scrollHeight`, waits for the DOM to
 * settle, then re-measures progress: the number of elements matching
 * `cfg.itemSelector`, or `document.body.scrollHeight` when there is no item
 * selector. A failed measurement counts as 0.
 *
 * Stop conditions, checked in this order before every scroll:
 *   'reachedTarget' - strategy 'count' with an item selector and a positive
 *                     target, and the last measurement met the target
 *   'fixedDone'     - strategy 'fixed' and `cfg.times` scrolls were performed
 *   'maxScrolls'    - `cfg.maxScrolls` scrolls were performed
 *   'maxMs'         - `cfg.maxMs` milliseconds have elapsed
 * and, after a scroll for non-fixed strategies:
 *   'stable'        - SCROLL_STABLE_ROUNDS consecutive rounds without growth
 *
 * @param {object} page - object exposing `evaluate` (a Playwright page)
 * @param {object} cfg - config in the shape produced by `normalizeScroll`
 * @returns {Promise<{ scrolls: number, items: number|null,
 *   reachedTarget: boolean|null, stoppedReason: string }>} `items` is null
 *   without an item selector; `reachedTarget` is null unless a count target
 *   was actually being tracked.
 */
async function scrollToLoad(page, cfg) {
  const startedAt = Date.now()
  const { strategy, itemSelector, targetItems, maxScrolls, maxMs, settleMs, times } = cfg
  const tracksTarget = strategy === 'count' && itemSelector && targetItems > 0

  // Progress metric for this page; measurement errors degrade to 0.
  const sample = async () => {
    try {
      return itemSelector
        ? await page.evaluate(sel => document.querySelectorAll(sel).length, itemSelector)
        : await page.evaluate(() => document.body.scrollHeight)
    } catch {
      return 0
    }
  }

  // Reason to stop before performing another scroll, or null to keep going.
  const stopBeforeScroll = (done, seen) => {
    if (tracksTarget && seen >= targetItems) return 'reachedTarget'
    if (strategy === 'fixed' && done >= times) return 'fixedDone'
    if (done >= maxScrolls) return 'maxScrolls'
    if (Date.now() - startedAt >= maxMs) return 'maxMs'
    return null
  }

  let previous = await sample()
  let scrolls = 0
  let flatRounds = 0
  let stoppedReason = stopBeforeScroll(scrolls, previous)

  while (stoppedReason === null) {
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)).catch(() => {})
    scrolls += 1

    // Per-round settle cap: at most 4x the quiet window, shrunk to the time
    // budget that is left, but never below one quiet window.
    const budgetLeft = maxMs - (Date.now() - startedAt)
    await waitForSettle(page, {
      settleMs,
      settleMaxMs: Math.min(settleMs * 4, Math.max(budgetLeft, settleMs)),
    })

    const current = await sample()
    if (strategy !== 'fixed') {
      // No growth this round counts toward "settled"; growth resets the streak.
      if (current <= previous) {
        flatRounds += 1
      } else {
        flatRounds = 0
      }
      if (flatRounds >= SCROLL_STABLE_ROUNDS) {
        stoppedReason = 'stable'
        break
      }
    }

    previous = current
    stoppedReason = stopBeforeScroll(scrolls, previous)
  }

  const items = itemSelector ? await sample() : null
  return {
    scrolls,
    items,
    reachedTarget: tracksTarget ? items >= targetItems : null,
    stoppedReason: stoppedReason || 'stable',
  }
}
|
|
233
|
+
|
|
52
234
|
async function renderPage(url, opts = {}) {
|
|
53
235
|
return withPage(async page => {
|
|
54
236
|
const response = await page.goto(url, {
|
|
55
237
|
waitUntil: 'domcontentloaded',
|
|
56
238
|
timeout: opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS,
|
|
57
239
|
})
|
|
240
|
+
await waitForReady(page, opts)
|
|
58
241
|
const html = await page.content()
|
|
59
242
|
return {
|
|
60
243
|
html,
|
|
@@ -77,6 +260,11 @@ async function renderPage(url, opts = {}) {
|
|
|
77
260
|
*
|
|
78
261
|
* `attr` defaults to 'text' (innerText). Special value 'html' returns
|
|
79
262
|
* innerHTML. Any other string is read as an HTML attribute.
|
|
263
|
+
*
|
|
264
|
+
* `opts.readySelector` waits for that element before extracting; `opts.scroll`
|
|
265
|
+
* (a config from `normalizeScroll`) drives lazy-load pages first.
|
|
266
|
+
*
|
|
267
|
+
* Returns `{ data, scrollInfo }` — scrollInfo is null when no scroll ran.
|
|
80
268
|
*/
|
|
81
269
|
async function extractWithSelectors(url, selectors, opts = {}) {
|
|
82
270
|
return withPage(async page => {
|
|
@@ -84,7 +272,14 @@ async function extractWithSelectors(url, selectors, opts = {}) {
|
|
|
84
272
|
waitUntil: 'domcontentloaded',
|
|
85
273
|
timeout: opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS,
|
|
86
274
|
})
|
|
87
|
-
|
|
275
|
+
await waitForReady(page, opts)
|
|
276
|
+
|
|
277
|
+
let scrollInfo = null
|
|
278
|
+
if (opts.scroll) {
|
|
279
|
+
scrollInfo = await scrollToLoad(page, opts.scroll)
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
const data = await page.evaluate(
|
|
88
283
|
({ selectorMap, evalTimeoutMs }) => {
|
|
89
284
|
const start = Date.now()
|
|
90
285
|
const result = {}
|
|
@@ -120,10 +315,13 @@ async function extractWithSelectors(url, selectors, opts = {}) {
|
|
|
120
315
|
},
|
|
121
316
|
{ selectorMap: selectors, evalTimeoutMs: opts.evalTimeoutMs ?? DEFAULT_EVAL_TIMEOUT_MS },
|
|
122
317
|
)
|
|
318
|
+
|
|
319
|
+
return { data, scrollInfo }
|
|
123
320
|
}, opts)
|
|
124
321
|
}
|
|
125
322
|
|
|
126
323
|
module.exports = {
|
|
127
324
|
renderPage,
|
|
128
325
|
extractWithSelectors,
|
|
326
|
+
normalizeScroll,
|
|
129
327
|
}
|
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
* {
|
|
18
18
|
* pageType: 'article' | 'listing' | 'product' | 'profile' | 'form' | 'other',
|
|
19
19
|
* selectors: { fieldName: { selector, attr?, multiple? }, ... },
|
|
20
|
+
* readySelector: <css selector whose presence signals content is rendered>,
|
|
20
21
|
* extracted: { fieldName: <value already pulled from this page> }
|
|
21
22
|
* }
|
|
22
23
|
*/
|
|
@@ -59,6 +60,15 @@ const ANTHROPIC_TOOLS = [{
|
|
|
59
60
|
},
|
|
60
61
|
},
|
|
61
62
|
},
|
|
63
|
+
ready_selector: {
|
|
64
|
+
type: 'string',
|
|
65
|
+
description:
|
|
66
|
+
'A single CSS selector for an element that exists ONLY once the primary ' +
|
|
67
|
+
'content has rendered (e.g. the article body, the first list item, a price). ' +
|
|
68
|
+
'On later visits the extractor waits for this element before reading the page, ' +
|
|
69
|
+
'so client-rendered (SPA) pages are captured after hydration rather than as an ' +
|
|
70
|
+
'empty shell. Pick a stable, semantic element; avoid spinners/skeletons.',
|
|
71
|
+
},
|
|
62
72
|
extracted: {
|
|
63
73
|
type: 'object',
|
|
64
74
|
description:
|
|
@@ -77,7 +87,10 @@ Given a cleaned Markdown rendering of one page, you must:
|
|
|
77
87
|
- Prefer semantic selectors (article, h1, time[datetime], a[rel="author"]) over class names where possible.
|
|
78
88
|
- Use class-based selectors only when semantic ones are unavailable.
|
|
79
89
|
- Avoid fragile attribute selectors like data-react-* or auto-generated hashes.
|
|
80
|
-
3.
|
|
90
|
+
3. Pick a "ready_selector": one CSS selector for an element that only exists once the
|
|
91
|
+
primary content has rendered (the article body, the first list item, the price, etc.).
|
|
92
|
+
Prefer a stable semantic element; never pick a loading spinner or skeleton placeholder.
|
|
93
|
+
4. Fill the "extracted" object with the values pulled from this exact page so the caller can verify your recipe works.
|
|
81
94
|
|
|
82
95
|
The same recipe will be reused for structurally similar pages, so think about what generalizes.`
|
|
83
96
|
|
|
@@ -132,6 +145,7 @@ async function generateViaPlugin(plugin, { url, cleanedMarkdown, hint }) {
|
|
|
132
145
|
return {
|
|
133
146
|
pageType: json.page_type || 'other',
|
|
134
147
|
selectors: json.selectors || {},
|
|
148
|
+
readySelector: typeof json.ready_selector === 'string' ? json.ready_selector : null,
|
|
135
149
|
extracted: json.extracted || {},
|
|
136
150
|
source: `primary:${plugin.name}`,
|
|
137
151
|
}
|
|
@@ -175,10 +189,11 @@ async function generateViaAnthropicDirect({ url, cleanedMarkdown, hint }) {
|
|
|
175
189
|
throw new Error('Anthropic API returned no tool_use block for page_extraction')
|
|
176
190
|
}
|
|
177
191
|
|
|
178
|
-
const { page_type, selectors, extracted } = toolUse.input
|
|
192
|
+
const { page_type, selectors, ready_selector, extracted } = toolUse.input
|
|
179
193
|
return {
|
|
180
194
|
pageType: page_type || 'other',
|
|
181
195
|
selectors: selectors || {},
|
|
196
|
+
readySelector: typeof ready_selector === 'string' ? ready_selector : null,
|
|
182
197
|
extracted: extracted || {},
|
|
183
198
|
source: 'anthropic-direct',
|
|
184
199
|
}
|
|
@@ -199,12 +214,14 @@ function buildTextPrompt({ url, cleanedMarkdown, hint }) {
|
|
|
199
214
|
' "selectors": {',
|
|
200
215
|
' "<fieldName>": { "selector": "<css>", "attr"?: "<text|html|attribute-name>", "multiple"?: <boolean> }',
|
|
201
216
|
' },',
|
|
217
|
+
' "ready_selector": "<css selector that only exists once the main content has rendered>",',
|
|
202
218
|
' "extracted": { "<fieldName>": "<string or array of strings>" }',
|
|
203
219
|
'}',
|
|
204
220
|
'',
|
|
205
221
|
'Notes:',
|
|
206
222
|
'- attr defaults to "text" (innerText). Use "html" or an HTML attribute name to override.',
|
|
207
223
|
'- Set multiple=true for list fields (returns array).',
|
|
224
|
+
'- "ready_selector" should target a stable content element (article body, first list item, price); never a spinner/skeleton.',
|
|
208
225
|
'- "extracted" must contain the values you actually read from THIS page using those selectors.',
|
|
209
226
|
'',
|
|
210
227
|
'--- Cleaned Markdown ---',
|
package/core/routes/web.js
CHANGED
|
@@ -19,6 +19,9 @@ const { extract } = require('../lib/web-extract')
|
|
|
19
19
|
const pageRecipeStore = require('../stores/page-recipe-store')
|
|
20
20
|
|
|
21
21
|
const REQUEST_TIMEOUT_MS = 60_000
|
|
22
|
+
// Scrolling adds an extra navigation plus an in-page scroll loop, so give
|
|
23
|
+
// scroll-enabled requests a wider ceiling than the plain extract path.
|
|
24
|
+
const SCROLL_REQUEST_TIMEOUT_MS = 120_000
|
|
22
25
|
|
|
23
26
|
async function webRoutes(fastify) {
|
|
24
27
|
fastify.post('/api/web/extract', async (request, reply) => {
|
|
@@ -28,7 +31,7 @@ async function webRoutes(fastify) {
|
|
|
28
31
|
}
|
|
29
32
|
|
|
30
33
|
const body = request.body || {}
|
|
31
|
-
const { url, hint } = body
|
|
34
|
+
const { url, hint, scroll } = body
|
|
32
35
|
|
|
33
36
|
if (!url || typeof url !== 'string') {
|
|
34
37
|
reply.code(400)
|
|
@@ -40,11 +43,17 @@ async function webRoutes(fastify) {
|
|
|
40
43
|
reply.code(400)
|
|
41
44
|
return { success: false, error: 'url is not a valid URL' }
|
|
42
45
|
}
|
|
46
|
+
if (scroll != null && typeof scroll !== 'object') {
|
|
47
|
+
reply.code(400)
|
|
48
|
+
return { success: false, error: 'scroll must be an object when provided' }
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
const requestTimeoutMs = scroll ? SCROLL_REQUEST_TIMEOUT_MS : REQUEST_TIMEOUT_MS
|
|
43
52
|
|
|
44
53
|
try {
|
|
45
54
|
const result = await Promise.race([
|
|
46
|
-
extract({ url, hint: typeof hint === 'string' ? hint : null }),
|
|
47
|
-
new Promise((_, rej) => setTimeout(() => rej(new Error('extract timeout')),
|
|
55
|
+
extract({ url, hint: typeof hint === 'string' ? hint : null, scroll: scroll || null }),
|
|
56
|
+
new Promise((_, rej) => setTimeout(() => rej(new Error('extract timeout')), requestTimeoutMs)),
|
|
48
57
|
])
|
|
49
58
|
return { success: true, ...result }
|
|
50
59
|
} catch (err) {
|
|
@@ -200,8 +200,18 @@ function rekeySession(oldSessionId, newSessionId) {
|
|
|
200
200
|
.run(Date.now(), oldSession.turn_count || 0, newSessionId)
|
|
201
201
|
db.prepare('DELETE FROM chat_sessions WHERE session_id = ?').run(oldSessionId)
|
|
202
202
|
} else {
|
|
203
|
-
|
|
203
|
+
// Insert the new parent row first, repoint the children, then drop the
|
|
204
|
+
// old parent. Updating chat_sessions.session_id in place would orphan the
|
|
205
|
+
// existing chat_messages mid-statement — the FK is ON UPDATE NO ACTION, so
|
|
206
|
+
// SQLite raises "FOREIGN KEY constraint failed" before the follow-up
|
|
207
|
+
// UPDATE can repoint them. (The final DELETE cascades to nothing because
|
|
208
|
+
// the messages were already moved.)
|
|
209
|
+
db.prepare(
|
|
210
|
+
'INSERT INTO chat_sessions (session_id, workspace_id, turn_count, created_at, updated_at) ' +
|
|
211
|
+
'SELECT ?, workspace_id, turn_count, created_at, updated_at FROM chat_sessions WHERE session_id = ?'
|
|
212
|
+
).run(newSessionId, oldSessionId)
|
|
204
213
|
db.prepare('UPDATE chat_messages SET session_id = ? WHERE session_id = ?').run(newSessionId, oldSessionId)
|
|
214
|
+
db.prepare('DELETE FROM chat_sessions WHERE session_id = ?').run(oldSessionId)
|
|
205
215
|
}
|
|
206
216
|
return true
|
|
207
217
|
})
|
|
@@ -240,4 +250,111 @@ function deleteSession(sessionId) {
|
|
|
240
250
|
return result.changes > 0
|
|
241
251
|
}
|
|
242
252
|
|
|
243
|
-
|
|
253
|
+
// ---------------------------------------------------------------------------
|
|
254
|
+
// Chat runs — durable index for detached chat execution (see chat-run-manager).
|
|
255
|
+
// A run is owned by the run manager, not the HTTP request. These rows let a
|
|
256
|
+
// reconnecting client find the in-flight run for a workspace and resume tailing.
|
|
257
|
+
// ---------------------------------------------------------------------------
|
|
258
|
+
|
|
259
|
+
/**
 * Insert the row that marks the start of a detached run.
 *
 * The row is created with status 'running', `started_at` set to the current
 * time and `last_seq` 0. Falsy session / pending-session / workspace ids are
 * stored as NULL.
 *
 * @param {{ runId: string, sessionId?: string|null, pendingSessionId?: string|null, workspaceId?: string|null }} run
 */
function createRun({ runId, sessionId, pendingSessionId, workspaceId }) {
  const insertRun = getDb().prepare(
    `INSERT INTO chat_runs (run_id, session_id, pending_session_id, workspace_id, status, started_at, last_seq)
     VALUES (?, ?, ?, ?, 'running', ?, 0)`
  )
  const startedAt = Date.now()
  insertRun.run(
    runId,
    sessionId || null,
    pendingSessionId || null,
    workspaceId || null,
    startedAt,
  )
}
|
|
270
|
+
|
|
271
|
+
/**
 * Patch a run row. Only the fields present in `patch` are written.
 *
 * - `status`: always written when not undefined; any status other than
 *   'running' also stamps `ended_at` with the current time.
 * - `sessionId`: written when not undefined (so null clears the column).
 * - `lastSeq`: written only when it is a number.
 *
 * With nothing to write, no statement is issued.
 *
 * @param {string} runId
 * @param {{ status?: string, sessionId?: string|null, lastSeq?: number }} patch
 */
function updateRun(runId, { status, sessionId, lastSeq } = {}) {
  const db = getDb()

  // [column assignment, bound value] pairs, in the order they appear in SET.
  const assignments = []
  if (status !== undefined) {
    assignments.push(['status = ?', status])
    if (status !== 'running') {
      assignments.push(['ended_at = ?', Date.now()])
    }
  }
  if (sessionId !== undefined) {
    assignments.push(['session_id = ?', sessionId])
  }
  if (typeof lastSeq === 'number') {
    assignments.push(['last_seq = ?', lastSeq])
  }
  if (assignments.length === 0) return

  const setClause = assignments.map(([column]) => column).join(', ')
  const params = assignments.map(([, value]) => value)
  db.prepare(`UPDATE chat_runs SET ${setClause} WHERE run_id = ?`).run(...params, runId)
}
|
|
300
|
+
|
|
301
|
+
/**
 * Look up the most recently started run that is still 'running' for a
 * workspace (the run a reconnecting client should attach to).
 *
 * A falsy `workspaceId` selects runs whose `workspace_id` is NULL.
 *
 * @param {string|null} workspaceId
 * @returns {object|null} the run row, or null when nothing is running
 */
function getActiveRun(workspaceId) {
  const db = getDb()
  const scopedToWorkspace = Boolean(workspaceId)

  const statement = db.prepare(
    scopedToWorkspace
      ? `SELECT * FROM chat_runs WHERE workspace_id = ? AND status = 'running' ORDER BY started_at DESC LIMIT 1`
      : `SELECT * FROM chat_runs WHERE workspace_id IS NULL AND status = 'running' ORDER BY started_at DESC LIMIT 1`
  )
  const row = scopedToWorkspace ? statement.get(workspaceId) : statement.get()
  return row || null
}
|
|
317
|
+
|
|
318
|
+
/**
 * Fetch a single run row by its id.
 *
 * @param {string} runId
 * @returns {object|null} the row, or null when no run has that id
 */
function getRun(runId) {
  const row = getDb()
    .prepare('SELECT * FROM chat_runs WHERE run_id = ?')
    .get(runId)
  return row || null
}
|
|
327
|
+
|
|
328
|
+
/**
 * Boot-time sweep. A run still flagged `running` at startup was owned by a
 * previous server process whose in-memory owner no longer exists, so its
 * stream will never resume. Flip every such row to `interrupted` and stamp
 * `ended_at`, so clients stop waiting on it.
 *
 * @returns {number} rows swept
 */
function markRunningInterrupted() {
  const sweep = getDb().prepare(
    `UPDATE chat_runs SET status = 'interrupted', ended_at = ? WHERE status = 'running'`
  )
  const { changes } = sweep.run(Date.now())
  return changes || 0
}
|
|
341
|
+
|
|
342
|
+
/**
 * Delete finished run rows older than `maxAgeMs` so the table does not grow
 * without bound. Rows still `running` are never removed. A row's age is taken
 * from `ended_at`, falling back to `started_at` when `ended_at` is NULL. The
 * NDJSON event logs are pruned separately.
 *
 * @param {number} maxAgeMs
 * @returns {number} rows removed
 */
function pruneRuns(maxAgeMs) {
  const olderThan = Date.now() - maxAgeMs
  const { changes } = getDb()
    .prepare(`DELETE FROM chat_runs WHERE status != 'running' AND COALESCE(ended_at, started_at) < ?`)
    .run(olderThan)
  return changes || 0
}
|
|
356
|
+
|
|
357
|
+
module.exports = {
|
|
358
|
+
load, loadById, listSessions, save, addMessage, rekeySession, clear, deleteSession,
|
|
359
|
+
createRun, updateRun, getActiveRun, getRun, markRunningInterrupted, pruneRuns,
|
|
360
|
+
}
|
|
@@ -12,7 +12,7 @@ const MAX_FAIL_COUNT = 3
|
|
|
12
12
|
function find({ urlTemplate, domFingerprint }) {
|
|
13
13
|
const db = getDb()
|
|
14
14
|
const row = db.prepare(`
|
|
15
|
-
SELECT url_template, dom_fingerprint, selectors_json, page_type,
|
|
15
|
+
SELECT url_template, dom_fingerprint, selectors_json, page_type, ready_selector,
|
|
16
16
|
hit_count, fail_count, last_verified_at, created_at
|
|
17
17
|
FROM page_recipes
|
|
18
18
|
WHERE url_template = ? AND dom_fingerprint = ?
|
|
@@ -24,7 +24,7 @@ function find({ urlTemplate, domFingerprint }) {
|
|
|
24
24
|
function findByTemplate(urlTemplate) {
|
|
25
25
|
const db = getDb()
|
|
26
26
|
const rows = db.prepare(`
|
|
27
|
-
SELECT url_template, dom_fingerprint, selectors_json, page_type,
|
|
27
|
+
SELECT url_template, dom_fingerprint, selectors_json, page_type, ready_selector,
|
|
28
28
|
hit_count, fail_count, last_verified_at, created_at
|
|
29
29
|
FROM page_recipes
|
|
30
30
|
WHERE url_template = ?
|
|
@@ -33,19 +33,20 @@ function findByTemplate(urlTemplate) {
|
|
|
33
33
|
return rows.map(parseRow)
|
|
34
34
|
}
|
|
35
35
|
|
|
36
|
-
function upsert({ urlTemplate, domFingerprint, selectors, pageType }) {
|
|
36
|
+
function upsert({ urlTemplate, domFingerprint, selectors, pageType, readySelector }) {
|
|
37
37
|
const db = getDb()
|
|
38
38
|
const json = JSON.stringify(selectors || {})
|
|
39
39
|
const now = new Date().toISOString()
|
|
40
40
|
db.prepare(`
|
|
41
|
-
INSERT INTO page_recipes (url_template, dom_fingerprint, selectors_json, page_type, hit_count, fail_count, last_verified_at, created_at)
|
|
42
|
-
VALUES (?, ?, ?, ?, 0, 0, ?, ?)
|
|
41
|
+
INSERT INTO page_recipes (url_template, dom_fingerprint, selectors_json, page_type, ready_selector, hit_count, fail_count, last_verified_at, created_at)
|
|
42
|
+
VALUES (?, ?, ?, ?, ?, 0, 0, ?, ?)
|
|
43
43
|
ON CONFLICT(url_template, dom_fingerprint) DO UPDATE SET
|
|
44
44
|
selectors_json = excluded.selectors_json,
|
|
45
45
|
page_type = excluded.page_type,
|
|
46
|
+
ready_selector = excluded.ready_selector,
|
|
46
47
|
fail_count = 0,
|
|
47
48
|
last_verified_at = excluded.last_verified_at
|
|
48
|
-
`).run(urlTemplate, domFingerprint, json, pageType || null, now, now)
|
|
49
|
+
`).run(urlTemplate, domFingerprint, json, pageType || null, readySelector || null, now, now)
|
|
49
50
|
return find({ urlTemplate, domFingerprint })
|
|
50
51
|
}
|
|
51
52
|
|
|
@@ -102,7 +103,7 @@ function remove({ urlTemplate, domFingerprint }) {
|
|
|
102
103
|
function listAll({ limit = 100 } = {}) {
|
|
103
104
|
const db = getDb()
|
|
104
105
|
const rows = db.prepare(`
|
|
105
|
-
SELECT url_template, dom_fingerprint, selectors_json, page_type,
|
|
106
|
+
SELECT url_template, dom_fingerprint, selectors_json, page_type, ready_selector,
|
|
106
107
|
hit_count, fail_count, last_verified_at, created_at
|
|
107
108
|
FROM page_recipes
|
|
108
109
|
ORDER BY last_verified_at DESC NULLS LAST, created_at DESC
|
|
@@ -123,6 +124,7 @@ function parseRow(row) {
|
|
|
123
124
|
dom_fingerprint: row.dom_fingerprint,
|
|
124
125
|
selectors,
|
|
125
126
|
page_type: row.page_type,
|
|
127
|
+
ready_selector: row.ready_selector || null,
|
|
126
128
|
hit_count: row.hit_count,
|
|
127
129
|
fail_count: row.fail_count,
|
|
128
130
|
last_verified_at: row.last_verified_at,
|
package/docs/api-reference.md
CHANGED
|
@@ -251,6 +251,37 @@ Response:
|
|
|
251
251
|
}
|
|
252
252
|
```
|
|
253
253
|
|
|
254
|
+
### Chat (Detached Execution)
|
|
255
|
+
|
|
256
|
+
チャットメッセージは **デタッチ実行** される (v4.6.0〜)。`POST /api/chat` で起動した LLM プロセスは
|
|
257
|
+
HTTP リクエストではなく内部の **run manager** が所有するため、SSE 接続が切れても (タブを閉じる・
|
|
258
|
+
リロード・プロキシのアイドルタイムアウト・回線断) 処理は中断されない。接続は単なる「覗き窓」であり、
|
|
259
|
+
切断時はサブスクライブを解除するだけでプロセスは kill されない。LLM を実際に停止するのは
|
|
260
|
+
`POST /api/chat/abort` のみ。
|
|
261
|
+
|
|
262
|
+
| Method | Endpoint | Description |
|
|
263
|
+
|--------|----------|-------------|
|
|
264
|
+
| POST | `/api/chat` | メッセージ送信 → 内部で run を起動し、その run のイベントを SSE で tail。SSE の最初のイベントは `{ "type": "run", "run_id": "..." }`(再接続用) |
|
|
265
|
+
| GET | `/api/chat/stream` | 進行中(または直近完了)の run に **再接続**し、`cursor` 以降のイベントから tail を再開。`run_id` または `workspace_id`(アクティブ run 自動解決) と任意の `cursor`(=最後に受け取った `seq`) を指定 |
|
|
266
|
+
| POST | `/api/chat/abort` | アクティブ run を明示的に kill (SIGTERM → 2秒後 SIGKILL)。Body に `workspace_id` / `run_id`(任意) |
|
|
267
|
+
| GET | `/api/chat/session` | アクティブセッションに加え、進行中 run があれば `active_run: { run_id, status, last_seq }` を返す(クライアントはこれを見て `/api/chat/stream` にアタッチする) |
|
|
268
|
+
|
|
269
|
+
各 SSE イベントには単調増加の `seq` が付与され、これが再接続時の `cursor` になる。終端は必ず
|
|
270
|
+
`{ "type": "done", "session_id": "...", "turn_count": 1 }`(失敗時は直前に `{ "type": "error" }`)。run の
|
|
271
|
+
イベントログは `$DATA_DIR/chat-runs/{run_id}.ndjson` に永続化され、完了から一定時間(既定10分)後に
|
|
272
|
+
メモリから evict、24時間でログ/索引を prune する。
|
|
273
|
+
|
|
274
|
+
> **再起動の挙動 (Phase 1):** run manager はミニオンプロセス内で動作するため、ミニオン自体の再起動は
|
|
275
|
+
> 生存しない。起動時に `running` のまま残った run は `interrupted` に掃き出される (クライアントは待ち続けない)。
|
|
276
|
+
> 接続断への耐性が主目的であり、再起動生存は将来の拡張 (tmux バックエンド) で対応予定。
|
|
277
|
+
|
|
278
|
+
`GET /api/chat/stream` 例:
|
|
279
|
+
```bash
|
|
280
|
+
# 直近のアクティブ run に seq 12 以降から再接続
|
|
281
|
+
curl -N -H "Authorization: Bearer $API_TOKEN" \
|
|
282
|
+
"http://localhost:8080/api/chat/stream?workspace_id=ws_abc123&cursor=12"
|
|
283
|
+
```
|
|
284
|
+
|
|
254
285
|
### Self-Reflection Schedule (自己反省時間)
|
|
255
286
|
|
|
256
287
|
The minion has a built-in daily scheduler that automatically runs end-of-day processing
|
|
@@ -714,10 +745,32 @@ Web ページの読み取り・要約・情報抽出をミニオン内のサブ
|
|
|
714
745
|
```json
|
|
715
746
|
{
|
|
716
747
|
"url": "https://example.com/article/123",
|
|
717
|
-
"hint": "本文と著者を抽出してほしい (任意, 抽出フィールドのヒント)"
|
|
748
|
+
"hint": "本文と著者を抽出してほしい (任意, 抽出フィールドのヒント)",
|
|
749
|
+
"scroll": {
|
|
750
|
+
"strategy": "count",
|
|
751
|
+
"targetItems": 50,
|
|
752
|
+
"itemSelector": ".feed-item",
|
|
753
|
+
"maxScrolls": 20,
|
|
754
|
+
"maxMs": 15000,
|
|
755
|
+
"settleMs": 600
|
|
756
|
+
}
|
|
718
757
|
}
|
|
719
758
|
```
|
|
720
759
|
|
|
760
|
+
**`scroll` (任意, v4.7.0〜):** 無限スクロール / 遅延ロードのページで「どこまでコンテンツを読み込むか」を**呼び出し側が宣言**するためのオプション。省略時はスクロールしない (従来動作)。
|
|
761
|
+
|
|
762
|
+
| フィールド | 説明 |
|
|
763
|
+
|-----------|------|
|
|
764
|
+
| `strategy` | `"count"` (件数到達まで) / `"untilStable"` (件数、または `itemSelector` が無い場合はページ高さの増加が止まるまで) / `"fixed"` (回数固定)。未指定/不正ならスクロールしない |
|
|
765
|
+
| `targetItems` | `count` の目標件数。`itemSelector` が解決できる場合のみ有効 |
|
|
766
|
+
| `itemSelector` | 件数を数える CSS セレクタ。省略時はレシピ内の最初の `multiple: true` セレクタを流用 |
|
|
767
|
+
| `maxScrolls` | スクロール回数の上限 (default 10、サーバー上限 50) |
|
|
768
|
+
| `maxMs` | スクロールに使う最大時間 (default 15000、サーバー上限 45000) |
|
|
769
|
+
| `settleMs` | 1スクロールごとの描画待ち静止時間 (default 600) |
|
|
770
|
+
| `times` | `fixed` のスクロール回数 (default 10) |
|
|
771
|
+
|
|
772
|
+
> 値はサーバー側で上限にクランプされる。スクロール有効時はリクエスト全体のタイムアウトが 60s→120s に拡張される。
|
|
773
|
+
|
|
721
774
|
**レスポンス (success):**
|
|
722
775
|
```json
|
|
723
776
|
{
|
|
@@ -732,15 +785,24 @@ Web ページの読み取り・要約・情報抽出をミニオン内のサブ
|
|
|
732
785
|
"title": "...",
|
|
733
786
|
"content": "Markdown 本文...",
|
|
734
787
|
"structured": { "title": "...", "author": "...", "publishedAt": "..." },
|
|
735
|
-
"selectors": { "title": { "selector": "h1" }, "author": { "selector": "a[rel=author]" } }
|
|
788
|
+
"selectors": { "title": { "selector": "h1" }, "author": { "selector": "a[rel=author]" } },
|
|
789
|
+
"scrollInfo": { "scrolls": 12, "items": 50, "reachedTarget": true, "stoppedReason": "reachedTarget" }
|
|
736
790
|
}
|
|
737
791
|
```
|
|
738
792
|
|
|
793
|
+
- `scrollInfo` は `scroll` 指定時のみ含まれる。目標未達で上限打ち切りの場合は `reachedTarget: false` と `warning` が返るので、`maxScrolls` / `maxMs` を上げて再試行できる (サイレントに打ち切らない)。
|
|
794
|
+
|
|
739
795
|
**動作:**
|
|
740
|
-
- 初回アクセス (cold): Playwright でレンダリング → Readability で本文抽出 → Anthropic Haiku
|
|
741
|
-
- 2回目以降 (hot): URL 正規化・テンプレート化 → DOM フィンガープリントで保存済みレシピを照合 →
|
|
796
|
+
- 初回アクセス (cold): Playwright でレンダリング (**DOM が静止するまで待機**) → Readability で本文抽出 → Anthropic Haiku でセレクタ + `ready_selector` (描画完了の合図となる要素) を生成 → SQLite (`page_recipes`) に保存 → セレクタで再抽出して返却
|
|
797
|
+
- 2回目以降 (hot): URL 正規化・テンプレート化 → DOM フィンガープリントで保存済みレシピを照合 → **`ready_selector` の出現を待機**してからセレクタで抽出 (LLM 呼び出しなし)
|
|
742
798
|
- セルフヒール: hot 実行で空結果が返ったら `fail_count++`、3回失敗で破棄して次回 cold 再生成
|
|
743
799
|
|
|
800
|
+
**SPA (クライアントレンダリング) への対応 (v4.7.0〜):**
|
|
801
|
+
- `page.goto` は `domcontentloaded` で解決するが、SPA はその時点では中身が空のシェルなので、ナビゲーション後に追加で描画完了を待つ:
|
|
802
|
+
1. レシピに `ready_selector` があればその要素の出現を待つ
|
|
803
|
+
2. 無ければ **DOM が `settleMs` の間ミューテーションしなくなるまで待つ** (MutationObserver、コンテンツ量に依存せず自己校正)
|
|
804
|
+
- これにより「SPA の描画が始まる前に空 DOM を掴んでタイムアウト/空結果になる」問題を回避する
|
|
805
|
+
|
|
744
806
|
**URL 正規化ルール:**
|
|
745
807
|
- `utm_*` `fbclid` `gclid` `ref` 等のトラッキングクエリは除去
|
|
746
808
|
- `page` `p` `offset` 等のページネーション値は `:n` プレースホルダ化
|