@geekbeer/minion 4.4.0 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/db/migrations/20260607000000_chat_runs.js +48 -0
- package/core/db/migrations/20260607120000_page_recipes_ready_selector.js +22 -0
- package/core/lib/chat-run-manager.js +406 -0
- package/core/lib/web-extract/extractor.js +27 -7
- package/core/lib/web-extract/playwright-runner.js +199 -1
- package/core/lib/web-extract/recipe-generator.js +19 -2
- package/core/routes/variables.js +47 -5
- package/core/routes/web.js +12 -3
- package/core/stores/chat-store.js +119 -2
- package/core/stores/page-recipe-store.js +9 -7
- package/core/stores/variable-store.js +63 -0
- package/docs/api-reference.md +82 -4
- package/docs/task-guides.md +20 -2
- package/linux/routes/chat.js +159 -193
- package/package.json +1 -1
- package/rules/core.md +12 -2
- package/win/routes/chat.js +155 -157
|
@@ -8,14 +8,37 @@
|
|
|
8
8
|
*
|
|
9
9
|
* Each call spins up a fresh chromium instance. Pooling can come later
|
|
10
10
|
* once the API stabilizes — for the experimental MVP, simple is better.
|
|
11
|
+
*
|
|
12
|
+
* Wait strategy (SPA-aware, v4.7.0):
|
|
13
|
+
* `page.goto` resolves on `domcontentloaded`, which for a client-rendered
|
|
14
|
+
* SPA fires *before* the framework has mounted and fetched its data. So
|
|
15
|
+
* after navigation we additionally wait for the content to actually appear:
|
|
16
|
+
* 1. If a `readySelector` is known (from the recipe), wait for it.
|
|
17
|
+
* 2. Otherwise wait for the DOM to *settle* — i.e. no MutationObserver
|
|
18
|
+
* events for `settleMs` — which is self-calibrating and works whether
|
|
19
|
+
* the page renders 50 chars or 50,000.
|
|
20
|
+
* `scroll` (optional) then drives infinite-scroll / lazy-load pages up to
|
|
21
|
+
* caller-declared limits, with hard server-side caps.
|
|
11
22
|
*/
|
|
12
23
|
|
|
13
24
|
const DEFAULT_NAV_TIMEOUT_MS = 20_000
|
|
14
25
|
const DEFAULT_EVAL_TIMEOUT_MS = 5_000
|
|
26
|
+
const DEFAULT_READY_TIMEOUT_MS = 8_000
|
|
27
|
+
const DEFAULT_SETTLE_MS = 500
|
|
28
|
+
const DEFAULT_SETTLE_MAX_MS = 8_000
|
|
15
29
|
const DEFAULT_USER_AGENT =
|
|
16
30
|
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' +
|
|
17
31
|
'Chrome/124.0.0.0 Safari/537.36 MinionWebExtract/0.1'
|
|
18
32
|
|
|
33
|
+
// Scroll defaults (caller may override) and hard caps (server enforces).
|
|
34
|
+
const SCROLL_DEFAULT_MAX_SCROLLS = 10
|
|
35
|
+
const SCROLL_HARD_MAX_SCROLLS = 50
|
|
36
|
+
const SCROLL_DEFAULT_MAX_MS = 15_000
|
|
37
|
+
const SCROLL_HARD_MAX_MS = 45_000
|
|
38
|
+
const SCROLL_DEFAULT_SETTLE_MS = 600
|
|
39
|
+
const SCROLL_STABLE_ROUNDS = 2
|
|
40
|
+
const SCROLL_STRATEGIES = ['count', 'untilStable', 'fixed']
|
|
41
|
+
|
|
19
42
|
function loadChromium() {
|
|
20
43
|
let playwright
|
|
21
44
|
try {
|
|
@@ -49,12 +72,172 @@ async function withPage(fn, opts = {}) {
|
|
|
49
72
|
}
|
|
50
73
|
}
|
|
51
74
|
|
|
75
|
+
/**
 * Block until the page's DOM has gone quiet.
 *
 * Inside the page a MutationObserver watches childList/characterData changes
 * (subtree-wide) on the first `main`, `article` or `[role="main"]` element,
 * falling back to `document.body`. The wait ends when `settleMs` passes
 * without a mutation, or when `settleMaxMs` has elapsed in total, whichever
 * happens first. With nothing to observe it ends immediately.
 *
 * Best-effort: any error raised by `page.evaluate` (e.g. a navigation racing
 * the call) is ignored so the caller can carry on with whatever rendered.
 *
 * @param {object} page - object exposing `evaluate(fn, arg)`
 * @param {{ settleMs?: number, settleMaxMs?: number }} [opts] - quiet window
 *   and hard cap in ms; nullish values fall back to the module defaults
 * @returns {Promise<void>}
 */
async function waitForSettle(page, opts = {}) {
  const windows = {
    quiet: opts.settleMs ?? DEFAULT_SETTLE_MS,
    max: opts.settleMaxMs ?? DEFAULT_SETTLE_MAX_MS,
  }

  // Executed inside the page, so it must not close over anything out here.
  const untilQuiet = ({ quiet, max }) =>
    new Promise(resolve => {
      const root = document.querySelector('main, article, [role="main"]') || document.body
      if (!root) {
        resolve()
        return
      }

      let idleTimer = null
      let capTimer = null
      let observer = null

      const done = () => {
        clearTimeout(idleTimer)
        clearTimeout(capTimer)
        observer.disconnect()
        resolve()
      }
      // Every mutation restarts the quiet window.
      const armIdleTimer = () => {
        clearTimeout(idleTimer)
        idleTimer = setTimeout(done, quiet)
      }

      armIdleTimer()
      capTimer = setTimeout(done, max)
      observer = new MutationObserver(armIdleTimer)
      observer.observe(root, { childList: true, subtree: true, characterData: true })
    })

  try {
    await page.evaluate(untilQuiet, windows)
  } catch {
    // navigation/teardown raced us — caller proceeds with whatever rendered.
  }
}
|
|
106
|
+
|
|
107
|
+
/**
 * Post-navigation readiness gate.
 *
 * When `opts.readySelector` is set, wait for that element to become visible
 * (up to `opts.readyTimeoutMs`, or the module default). If it shows up we are
 * done. If the wait fails for any reason, or no selector was given, fall back
 * to `waitForSettle` with the same options so whatever did render is captured.
 *
 * @param {object} page - object exposing `waitForSelector(selector, options)`
 * @param {object} [opts] - may carry `readySelector`, `readyTimeoutMs` and the
 *   settle options, which are forwarded unchanged to `waitForSettle`
 * @returns {Promise<void>}
 */
async function waitForReady(page, opts = {}) {
  if (opts.readySelector) {
    let appeared = true
    try {
      const waitOptions = {
        state: 'visible',
        timeout: opts.readyTimeoutMs ?? DEFAULT_READY_TIMEOUT_MS,
      }
      await page.waitForSelector(opts.readySelector, waitOptions)
    } catch {
      // readySelector never showed — use settle detection instead of
      // handing back the bare shell.
      appeared = false
    }
    if (appeared) return
  }

  await waitForSettle(page, opts)
}
|
|
126
|
+
|
|
127
|
+
/**
 * Turn a caller-supplied scroll request into a clamped config object.
 *
 * Returns null when `scroll` is not an object or its `strategy` is not one of
 * SCROLL_STRATEGIES. The item selector is the trimmed `scroll.itemSelector`
 * when that is a non-blank string; otherwise the trimmed selector of the first
 * entry in `selectors` that is flagged `multiple` and has a non-blank string
 * selector; otherwise null.
 *
 * Numeric fields are floored and clamped to their bounds; anything that is not
 * a finite number is replaced by the field's default before clamping.
 *
 * @param {object|null|undefined} scroll - raw scroll request
 * @param {object|null|undefined} selectors - recipe selector map
 *   (`{ field: { selector, multiple? } }`)
 * @returns {{ strategy: string, itemSelector: string|null, targetItems: number,
 *   maxScrolls: number, maxMs: number, settleMs: number, times: number }|null}
 */
function normalizeScroll(scroll, selectors) {
  const requested =
    scroll && typeof scroll === 'object' && SCROLL_STRATEGIES.includes(scroll.strategy)
  if (!requested) return null

  // '' for anything that is not a string, so blank and missing look alike.
  const cleaned = value => (typeof value === 'string' ? value.trim() : '')

  let itemSelector = cleaned(scroll.itemSelector) || null
  if (!itemSelector && selectors && typeof selectors === 'object') {
    const listSpec = Object.values(selectors).find(
      spec => spec && spec.multiple && cleaned(spec.selector),
    )
    if (listSpec) itemSelector = listSpec.selector.trim()
  }

  const bounded = (raw, fallback, lo, hi) => {
    const whole = Number.isFinite(raw) ? Math.floor(raw) : fallback
    return Math.max(lo, Math.min(hi, whole))
  }

  const { strategy } = scroll
  const targetItems = bounded(scroll.targetItems, 0, 0, 100_000)
  const maxScrolls = bounded(scroll.maxScrolls, SCROLL_DEFAULT_MAX_SCROLLS, 1, SCROLL_HARD_MAX_SCROLLS)
  const maxMs = bounded(scroll.maxMs, SCROLL_DEFAULT_MAX_MS, 500, SCROLL_HARD_MAX_MS)
  const settleMs = bounded(scroll.settleMs, SCROLL_DEFAULT_SETTLE_MS, 100, 5_000)
  const times = bounded(scroll.times, SCROLL_DEFAULT_MAX_SCROLLS, 1, SCROLL_HARD_MAX_SCROLLS)

  return { strategy, itemSelector, targetItems, maxScrolls, maxMs, settleMs, times }
}
|
|
165
|
+
|
|
166
|
+
/**
 * Drive an infinite-scroll / lazy-load page within the caller's limits.
 *
 * Each round scrolls to the bottom of `document.body`, waits for the DOM to
 * settle, then re-measures progress: the number of elements matching
 * `cfg.itemSelector` when one is set, otherwise `document.body.scrollHeight`.
 * A measurement that throws counts as 0.
 *
 * The loop stops with one of these `stoppedReason` values:
 *   - 'reachedTarget' — strategy 'count' with an item selector and a positive
 *     `targetItems`, and the measured count has reached it
 *   - 'fixedDone'     — strategy 'fixed' and `cfg.times` scrolls were done
 *   - 'maxScrolls'    — `cfg.maxScrolls` scrolls were done
 *   - 'maxMs'         — `cfg.maxMs` of wall-clock time has elapsed
 *   - 'stable'        — (non-'fixed' strategies) SCROLL_STABLE_ROUNDS
 *     consecutive rounds showed no growth
 *
 * `items` and `reachedTarget` are derived from the measurement that decided
 * the stop, not from a fresh query afterwards, so they cannot contradict
 * `stoppedReason` (a late re-measure could fail or see a smaller count).
 *
 * @param {object} page - object exposing `evaluate(fn, arg)`
 * @param {{ strategy: string, itemSelector: string|null, targetItems: number,
 *   maxScrolls: number, maxMs: number, settleMs: number, times: number }} cfg
 *   a config as produced by `normalizeScroll`
 * @returns {Promise<{ scrolls: number, items: number|null,
 *   reachedTarget: boolean|null, stoppedReason: string }>} `items` is null
 *   without an item selector; `reachedTarget` is null unless a count target
 *   applies
 */
async function scrollToLoad(page, cfg) {
  const startedAt = Date.now()
  const elapsed = () => Date.now() - startedAt
  const hasTarget = cfg.strategy === 'count' && cfg.itemSelector && cfg.targetItems > 0

  const measure = async () => {
    try {
      if (cfg.itemSelector) {
        return await page.evaluate(sel => document.querySelectorAll(sel).length, cfg.itemSelector)
      }
      return await page.evaluate(() => document.body.scrollHeight)
    } catch {
      return 0
    }
  }

  let last = await measure()
  let scrolls = 0
  let stableRounds = 0
  let stoppedReason

  while (true) {
    if (hasTarget && last >= cfg.targetItems) {
      stoppedReason = 'reachedTarget'
      break
    }
    if (cfg.strategy === 'fixed' && scrolls >= cfg.times) {
      stoppedReason = 'fixedDone'
      break
    }
    if (scrolls >= cfg.maxScrolls) {
      stoppedReason = 'maxScrolls'
      break
    }
    if (elapsed() >= cfg.maxMs) {
      stoppedReason = 'maxMs'
      break
    }

    try {
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight))
    } catch {
      // Best-effort: a failed scroll simply shows up as "no growth" below.
    }
    scrolls++

    // Allow up to 4 quiet windows per round, but no more than the remaining
    // time budget (and never less than a single quiet window).
    await waitForSettle(page, {
      settleMs: cfg.settleMs,
      settleMaxMs: Math.min(cfg.settleMs * 4, Math.max(cfg.maxMs - elapsed(), cfg.settleMs)),
    })

    const cur = await measure()
    if (cfg.strategy !== 'fixed') {
      // No growth this round counts toward "settled"; enough flat rounds = done.
      stableRounds = cur <= last ? stableRounds + 1 : 0
    }
    last = cur
    if (cfg.strategy !== 'fixed' && stableRounds >= SCROLL_STABLE_ROUNDS) {
      stoppedReason = 'stable'
      break
    }
  }

  // `last` is the measurement the stop decision was based on.
  const items = cfg.itemSelector ? last : null
  return {
    scrolls,
    items,
    reachedTarget: hasTarget ? items >= cfg.targetItems : null,
    stoppedReason,
  }
}
|
|
233
|
+
|
|
52
234
|
async function renderPage(url, opts = {}) {
|
|
53
235
|
return withPage(async page => {
|
|
54
236
|
const response = await page.goto(url, {
|
|
55
237
|
waitUntil: 'domcontentloaded',
|
|
56
238
|
timeout: opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS,
|
|
57
239
|
})
|
|
240
|
+
await waitForReady(page, opts)
|
|
58
241
|
const html = await page.content()
|
|
59
242
|
return {
|
|
60
243
|
html,
|
|
@@ -77,6 +260,11 @@ async function renderPage(url, opts = {}) {
|
|
|
77
260
|
*
|
|
78
261
|
* `attr` defaults to 'text' (innerText). Special value 'html' returns
|
|
79
262
|
* innerHTML. Any other string is read as an HTML attribute.
|
|
263
|
+
*
|
|
264
|
+
* `opts.readySelector` waits for that element before extracting; `opts.scroll`
|
|
265
|
+
* (a config from `normalizeScroll`) drives lazy-load pages first.
|
|
266
|
+
*
|
|
267
|
+
* Returns `{ data, scrollInfo }` — scrollInfo is null when no scroll ran.
|
|
80
268
|
*/
|
|
81
269
|
async function extractWithSelectors(url, selectors, opts = {}) {
|
|
82
270
|
return withPage(async page => {
|
|
@@ -84,7 +272,14 @@ async function extractWithSelectors(url, selectors, opts = {}) {
|
|
|
84
272
|
waitUntil: 'domcontentloaded',
|
|
85
273
|
timeout: opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS,
|
|
86
274
|
})
|
|
87
|
-
|
|
275
|
+
await waitForReady(page, opts)
|
|
276
|
+
|
|
277
|
+
let scrollInfo = null
|
|
278
|
+
if (opts.scroll) {
|
|
279
|
+
scrollInfo = await scrollToLoad(page, opts.scroll)
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
const data = await page.evaluate(
|
|
88
283
|
({ selectorMap, evalTimeoutMs }) => {
|
|
89
284
|
const start = Date.now()
|
|
90
285
|
const result = {}
|
|
@@ -120,10 +315,13 @@ async function extractWithSelectors(url, selectors, opts = {}) {
|
|
|
120
315
|
},
|
|
121
316
|
{ selectorMap: selectors, evalTimeoutMs: opts.evalTimeoutMs ?? DEFAULT_EVAL_TIMEOUT_MS },
|
|
122
317
|
)
|
|
318
|
+
|
|
319
|
+
return { data, scrollInfo }
|
|
123
320
|
}, opts)
|
|
124
321
|
}
|
|
125
322
|
|
|
126
323
|
module.exports = {
|
|
127
324
|
renderPage,
|
|
128
325
|
extractWithSelectors,
|
|
326
|
+
normalizeScroll,
|
|
129
327
|
}
|
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
* {
|
|
18
18
|
* pageType: 'article' | 'listing' | 'product' | 'profile' | 'form' | 'other',
|
|
19
19
|
* selectors: { fieldName: { selector, attr?, multiple? }, ... },
|
|
20
|
+
* readySelector: <css selector whose presence signals content is rendered>,
|
|
20
21
|
* extracted: { fieldName: <value already pulled from this page> }
|
|
21
22
|
* }
|
|
22
23
|
*/
|
|
@@ -59,6 +60,15 @@ const ANTHROPIC_TOOLS = [{
|
|
|
59
60
|
},
|
|
60
61
|
},
|
|
61
62
|
},
|
|
63
|
+
ready_selector: {
|
|
64
|
+
type: 'string',
|
|
65
|
+
description:
|
|
66
|
+
'A single CSS selector for an element that exists ONLY once the primary ' +
|
|
67
|
+
'content has rendered (e.g. the article body, the first list item, a price). ' +
|
|
68
|
+
'On later visits the extractor waits for this element before reading the page, ' +
|
|
69
|
+
'so client-rendered (SPA) pages are captured after hydration rather than as an ' +
|
|
70
|
+
'empty shell. Pick a stable, semantic element; avoid spinners/skeletons.',
|
|
71
|
+
},
|
|
62
72
|
extracted: {
|
|
63
73
|
type: 'object',
|
|
64
74
|
description:
|
|
@@ -77,7 +87,10 @@ Given a cleaned Markdown rendering of one page, you must:
|
|
|
77
87
|
- Prefer semantic selectors (article, h1, time[datetime], a[rel="author"]) over class names where possible.
|
|
78
88
|
- Use class-based selectors only when semantic ones are unavailable.
|
|
79
89
|
- Avoid fragile attribute selectors like data-react-* or auto-generated hashes.
|
|
80
|
-
3.
|
|
90
|
+
3. Pick a "ready_selector": one CSS selector for an element that only exists once the
|
|
91
|
+
primary content has rendered (the article body, the first list item, the price, etc.).
|
|
92
|
+
Prefer a stable semantic element; never pick a loading spinner or skeleton placeholder.
|
|
93
|
+
4. Fill the "extracted" object with the values pulled from this exact page so the caller can verify your recipe works.
|
|
81
94
|
|
|
82
95
|
The same recipe will be reused for structurally similar pages, so think about what generalizes.`
|
|
83
96
|
|
|
@@ -132,6 +145,7 @@ async function generateViaPlugin(plugin, { url, cleanedMarkdown, hint }) {
|
|
|
132
145
|
return {
|
|
133
146
|
pageType: json.page_type || 'other',
|
|
134
147
|
selectors: json.selectors || {},
|
|
148
|
+
readySelector: typeof json.ready_selector === 'string' ? json.ready_selector : null,
|
|
135
149
|
extracted: json.extracted || {},
|
|
136
150
|
source: `primary:${plugin.name}`,
|
|
137
151
|
}
|
|
@@ -175,10 +189,11 @@ async function generateViaAnthropicDirect({ url, cleanedMarkdown, hint }) {
|
|
|
175
189
|
throw new Error('Anthropic API returned no tool_use block for page_extraction')
|
|
176
190
|
}
|
|
177
191
|
|
|
178
|
-
const { page_type, selectors, extracted } = toolUse.input
|
|
192
|
+
const { page_type, selectors, ready_selector, extracted } = toolUse.input
|
|
179
193
|
return {
|
|
180
194
|
pageType: page_type || 'other',
|
|
181
195
|
selectors: selectors || {},
|
|
196
|
+
readySelector: typeof ready_selector === 'string' ? ready_selector : null,
|
|
182
197
|
extracted: extracted || {},
|
|
183
198
|
source: 'anthropic-direct',
|
|
184
199
|
}
|
|
@@ -199,12 +214,14 @@ function buildTextPrompt({ url, cleanedMarkdown, hint }) {
|
|
|
199
214
|
' "selectors": {',
|
|
200
215
|
' "<fieldName>": { "selector": "<css>", "attr"?: "<text|html|attribute-name>", "multiple"?: <boolean> }',
|
|
201
216
|
' },',
|
|
217
|
+
' "ready_selector": "<css selector that only exists once the main content has rendered>",',
|
|
202
218
|
' "extracted": { "<fieldName>": "<string or array of strings>" }',
|
|
203
219
|
'}',
|
|
204
220
|
'',
|
|
205
221
|
'Notes:',
|
|
206
222
|
'- attr defaults to "text" (innerText). Use "html" or an HTML attribute name to override.',
|
|
207
223
|
'- Set multiple=true for list fields (returns array).',
|
|
224
|
+
'- "ready_selector" should target a stable content element (article body, first list item, price); never a spinner/skeleton.',
|
|
208
225
|
'- "extracted" must contain the values you actually read from THIS page using those selectors.',
|
|
209
226
|
'',
|
|
210
227
|
'--- Cleaned Markdown ---',
|
package/core/routes/variables.js
CHANGED
|
@@ -108,6 +108,29 @@ function variableRoutes(fastify, _opts, done) {
|
|
|
108
108
|
return { success: true, scopes: variableStore.listVariableScopes() }
|
|
109
109
|
})
|
|
110
110
|
|
|
111
|
+
// POST /api/variables/:key/move — relocate one variable between workspace
// scopes. Body: { from_workspace_id, to_workspace_id }; an omitted or
// non-string id is passed to the store as '' (the minion-wide bucket).
// Store outcomes map to HTTP as: same_scope → 400, not_found → 404,
// conflict → 409 (the destination is never overwritten), anything else → success.
fastify.post('/api/variables/:key/move', async (request, reply) => {
  if (!verifyToken(request)) {
    return reply.code(401).send({ error: 'Unauthorized' })
  }

  const { key } = request.params
  const scopeFrom = field => {
    const raw = request.body?.[field]
    return typeof raw === 'string' ? raw : ''
  }

  const outcome = variableStore.moveVariable(
    scopeFrom('from_workspace_id'),
    scopeFrom('to_workspace_id'),
    key,
  )

  switch (outcome.status) {
    case 'same_scope':
      return reply.code(400).send({ error: 'Source and destination scopes are the same.' })
    case 'not_found':
      return reply.code(404).send({ error: `Variable not found: ${key}` })
    case 'conflict':
      return reply.code(409).send({ error: `A variable with the same key already exists in the destination scope: ${key}` })
    default:
      return { success: true, key, from_workspace_id: outcome.from, to_workspace_id: outcome.to }
  }
})
|
|
133
|
+
|
|
111
134
|
// ─── Secrets (sensitive, workspace-scoped) ────────────────────────────
|
|
112
135
|
//
|
|
113
136
|
// Secrets are scoped per workspace. Pass ?workspace_id=<uuid> to target a
|
|
@@ -118,11 +141,6 @@ function variableRoutes(fastify, _opts, done) {
|
|
|
118
141
|
// Values are never returned via the API by design — only key names. Secrets
|
|
119
142
|
// never leave the minion: the HQ proxy is a pure pass-through.
|
|
120
143
|
|
|
121
|
-
function readWorkspaceId(request) {
|
|
122
|
-
const rawWs = request.query?.workspace_id
|
|
123
|
-
return (typeof rawWs === 'string') ? rawWs : ''
|
|
124
|
-
}
|
|
125
|
-
|
|
126
144
|
fastify.get('/api/secrets', async (request, reply) => {
|
|
127
145
|
if (!verifyToken(request)) {
|
|
128
146
|
return reply.code(401).send({ error: 'Unauthorized' })
|
|
@@ -175,6 +193,30 @@ function variableRoutes(fastify, _opts, done) {
|
|
|
175
193
|
return { success: true, scopes: variableStore.listSecretScopes() }
|
|
176
194
|
})
|
|
177
195
|
|
|
196
|
+
// POST /api/secrets/:key/move — relocate one secret between workspace scopes.
// Body: { from_workspace_id, to_workspace_id }; an omitted or non-string id is
// passed to the store as '' (the minion-wide bucket). The response carries the
// key and scope ids only, never the secret value.
// Store outcomes map to HTTP as: same_scope → 400, not_found → 404,
// conflict → 409 (the destination is never overwritten), anything else → success.
fastify.post('/api/secrets/:key/move', async (request, reply) => {
  if (!verifyToken(request)) {
    return reply.code(401).send({ error: 'Unauthorized' })
  }

  const { key } = request.params
  const scopeFrom = field => {
    const raw = request.body?.[field]
    return typeof raw === 'string' ? raw : ''
  }

  const outcome = variableStore.moveSecret(
    scopeFrom('from_workspace_id'),
    scopeFrom('to_workspace_id'),
    key,
  )

  switch (outcome.status) {
    case 'same_scope':
      return reply.code(400).send({ error: 'Source and destination scopes are the same.' })
    case 'not_found':
      return reply.code(404).send({ error: `Secret not found: ${key}` })
    case 'conflict':
      return reply.code(409).send({ error: `A secret with the same key already exists in the destination scope: ${key}` })
    default:
      return { success: true, key, from_workspace_id: outcome.from, to_workspace_id: outcome.to }
  }
})
|
|
219
|
+
|
|
178
220
|
done()
|
|
179
221
|
}
|
|
180
222
|
|
package/core/routes/web.js
CHANGED
|
@@ -19,6 +19,9 @@ const { extract } = require('../lib/web-extract')
|
|
|
19
19
|
const pageRecipeStore = require('../stores/page-recipe-store')
|
|
20
20
|
|
|
21
21
|
const REQUEST_TIMEOUT_MS = 60_000
|
|
22
|
+
// Scrolling adds an extra navigation plus an in-page scroll loop, so give
|
|
23
|
+
// scroll-enabled requests a wider ceiling than the plain extract path.
|
|
24
|
+
const SCROLL_REQUEST_TIMEOUT_MS = 120_000
|
|
22
25
|
|
|
23
26
|
async function webRoutes(fastify) {
|
|
24
27
|
fastify.post('/api/web/extract', async (request, reply) => {
|
|
@@ -28,7 +31,7 @@ async function webRoutes(fastify) {
|
|
|
28
31
|
}
|
|
29
32
|
|
|
30
33
|
const body = request.body || {}
|
|
31
|
-
const { url, hint } = body
|
|
34
|
+
const { url, hint, scroll } = body
|
|
32
35
|
|
|
33
36
|
if (!url || typeof url !== 'string') {
|
|
34
37
|
reply.code(400)
|
|
@@ -40,11 +43,17 @@ async function webRoutes(fastify) {
|
|
|
40
43
|
reply.code(400)
|
|
41
44
|
return { success: false, error: 'url is not a valid URL' }
|
|
42
45
|
}
|
|
46
|
+
if (scroll != null && typeof scroll !== 'object') {
|
|
47
|
+
reply.code(400)
|
|
48
|
+
return { success: false, error: 'scroll must be an object when provided' }
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
const requestTimeoutMs = scroll ? SCROLL_REQUEST_TIMEOUT_MS : REQUEST_TIMEOUT_MS
|
|
43
52
|
|
|
44
53
|
try {
|
|
45
54
|
const result = await Promise.race([
|
|
46
|
-
extract({ url, hint: typeof hint === 'string' ? hint : null }),
|
|
47
|
-
new Promise((_, rej) => setTimeout(() => rej(new Error('extract timeout')),
|
|
55
|
+
extract({ url, hint: typeof hint === 'string' ? hint : null, scroll: scroll || null }),
|
|
56
|
+
new Promise((_, rej) => setTimeout(() => rej(new Error('extract timeout')), requestTimeoutMs)),
|
|
48
57
|
])
|
|
49
58
|
return { success: true, ...result }
|
|
50
59
|
} catch (err) {
|
|
@@ -200,8 +200,18 @@ function rekeySession(oldSessionId, newSessionId) {
|
|
|
200
200
|
.run(Date.now(), oldSession.turn_count || 0, newSessionId)
|
|
201
201
|
db.prepare('DELETE FROM chat_sessions WHERE session_id = ?').run(oldSessionId)
|
|
202
202
|
} else {
|
|
203
|
-
|
|
203
|
+
// Insert the new parent row first, repoint the children, then drop the
|
|
204
|
+
// old parent. Updating chat_sessions.session_id in place would orphan the
|
|
205
|
+
// existing chat_messages mid-statement — the FK is ON UPDATE NO ACTION, so
|
|
206
|
+
// SQLite raises "FOREIGN KEY constraint failed" before the follow-up
|
|
207
|
+
// UPDATE can repoint them. (The final DELETE cascades to nothing because
|
|
208
|
+
// the messages were already moved.)
|
|
209
|
+
db.prepare(
|
|
210
|
+
'INSERT INTO chat_sessions (session_id, workspace_id, turn_count, created_at, updated_at) ' +
|
|
211
|
+
'SELECT ?, workspace_id, turn_count, created_at, updated_at FROM chat_sessions WHERE session_id = ?'
|
|
212
|
+
).run(newSessionId, oldSessionId)
|
|
204
213
|
db.prepare('UPDATE chat_messages SET session_id = ? WHERE session_id = ?').run(newSessionId, oldSessionId)
|
|
214
|
+
db.prepare('DELETE FROM chat_sessions WHERE session_id = ?').run(oldSessionId)
|
|
205
215
|
}
|
|
206
216
|
return true
|
|
207
217
|
})
|
|
@@ -240,4 +250,111 @@ function deleteSession(sessionId) {
|
|
|
240
250
|
return result.changes > 0
|
|
241
251
|
}
|
|
242
252
|
|
|
243
|
-
|
|
253
|
+
// ---------------------------------------------------------------------------
|
|
254
|
+
// Chat runs — durable index for detached chat execution (see chat-run-manager).
|
|
255
|
+
// A run is owned by the run manager, not the HTTP request. These rows let a
|
|
256
|
+
// reconnecting client find the in-flight run for a workspace and resume tailing.
|
|
257
|
+
// ---------------------------------------------------------------------------
|
|
258
|
+
|
|
259
|
+
/**
 * Insert a `chat_runs` row for a newly started detached run.
 *
 * The row starts with status 'running', `started_at` set to the current time
 * and `last_seq` 0. Falsy session/workspace ids are stored as NULL.
 *
 * @param {{ runId: string, sessionId?: string|null, pendingSessionId?: string|null, workspaceId?: string|null }} run
 * @returns {void}
 */
function createRun({ runId, sessionId, pendingSessionId, workspaceId }) {
  const insert = getDb().prepare(
    `INSERT INTO chat_runs (run_id, session_id, pending_session_id, workspace_id, status, started_at, last_seq)
     VALUES (?, ?, ?, ?, 'running', ?, 0)`
  )
  insert.run(
    runId,
    sessionId || null,
    pendingSessionId || null,
    workspaceId || null,
    Date.now(),
  )
}
|
|
270
|
+
|
|
271
|
+
/**
 * Patch a run row; only the fields present in `patch` are written.
 *
 * - `status` (anything but undefined) is stored; a status other than
 *   'running' also stamps `ended_at` with the current time.
 * - `sessionId` (anything but undefined, so null is allowed) is stored.
 * - `lastSeq` is stored only when it is a number.
 * With nothing to write, no statement is executed.
 *
 * @param {string} runId
 * @param {{ status?: string, sessionId?: string|null, lastSeq?: number }} patch
 * @returns {void}
 */
function updateRun(runId, { status, sessionId, lastSeq } = {}) {
  const db = getDb()

  // [SET clause, bound value] pairs, in the order they appear in the SQL.
  const assignments = []
  if (status !== undefined) {
    assignments.push(['status = ?', status])
    if (status !== 'running') assignments.push(['ended_at = ?', Date.now()])
  }
  if (sessionId !== undefined) assignments.push(['session_id = ?', sessionId])
  if (typeof lastSeq === 'number') assignments.push(['last_seq = ?', lastSeq])

  if (assignments.length === 0) return

  const setClause = assignments.map(([clause]) => clause).join(', ')
  const params = assignments.map(([, value]) => value)
  db.prepare(`UPDATE chat_runs SET ${setClause} WHERE run_id = ?`).run(...params, runId)
}
|
|
300
|
+
|
|
301
|
+
/**
 * Find the most recently started run that is still 'running' in one scope
 * (the reconnect target).
 *
 * A truthy `workspaceId` matches rows with that workspace id; a falsy one
 * matches rows whose `workspace_id` IS NULL.
 *
 * @param {string|null} workspaceId
 * @returns {object|null} the run row, or null when none is running
 */
function getActiveRun(workspaceId) {
  const db = getDb()
  const row = workspaceId
    ? db.prepare(
      `SELECT * FROM chat_runs WHERE workspace_id = ? AND status = 'running' ORDER BY started_at DESC LIMIT 1`
    ).get(workspaceId)
    : db.prepare(
      `SELECT * FROM chat_runs WHERE workspace_id IS NULL AND status = 'running' ORDER BY started_at DESC LIMIT 1`
    ).get()
  return row || null
}
|
|
317
|
+
|
|
318
|
+
/**
 * Fetch a single `chat_runs` row by its run id.
 *
 * @param {string} runId
 * @returns {object|null} the row, or null when no row matches
 */
function getRun(runId) {
  const row = getDb().prepare('SELECT * FROM chat_runs WHERE run_id = ?').get(runId)
  return row || null
}
|
|
327
|
+
|
|
328
|
+
/**
 * Boot-time sweep: flip every run still marked 'running' to 'interrupted' and
 * stamp `ended_at` with the current time. Such rows were left behind by an
 * earlier server process, so nothing will ever resume their streams.
 *
 * @returns {number} how many rows were updated (0 when the driver reports none)
 */
function markRunningInterrupted() {
  const sweep = getDb().prepare(
    `UPDATE chat_runs SET status = 'interrupted', ended_at = ? WHERE status = 'running'`
  )
  const { changes } = sweep.run(Date.now())
  return changes || 0
}
|
|
341
|
+
|
|
342
|
+
/**
 * Delete finished run rows older than `maxAgeMs`.
 *
 * Only rows whose status is not 'running' are eligible. Age is measured from
 * `ended_at`, falling back to `started_at` when `ended_at` is NULL. This keeps
 * the table from growing without bound; the NDJSON event logs are pruned
 * separately.
 *
 * @param {number} maxAgeMs - maximum age to keep, in milliseconds
 * @returns {number} how many rows were removed (0 when the driver reports none)
 */
function pruneRuns(maxAgeMs) {
  const db = getDb()
  const olderThan = Date.now() - maxAgeMs
  const { changes } = db
    .prepare(`DELETE FROM chat_runs WHERE status != 'running' AND COALESCE(ended_at, started_at) < ?`)
    .run(olderThan)
  return changes || 0
}
|
|
356
|
+
|
|
357
|
+
module.exports = {
|
|
358
|
+
load, loadById, listSessions, save, addMessage, rekeySession, clear, deleteSession,
|
|
359
|
+
createRun, updateRun, getActiveRun, getRun, markRunningInterrupted, pruneRuns,
|
|
360
|
+
}
|
|
@@ -12,7 +12,7 @@ const MAX_FAIL_COUNT = 3
|
|
|
12
12
|
function find({ urlTemplate, domFingerprint }) {
|
|
13
13
|
const db = getDb()
|
|
14
14
|
const row = db.prepare(`
|
|
15
|
-
SELECT url_template, dom_fingerprint, selectors_json, page_type,
|
|
15
|
+
SELECT url_template, dom_fingerprint, selectors_json, page_type, ready_selector,
|
|
16
16
|
hit_count, fail_count, last_verified_at, created_at
|
|
17
17
|
FROM page_recipes
|
|
18
18
|
WHERE url_template = ? AND dom_fingerprint = ?
|
|
@@ -24,7 +24,7 @@ function find({ urlTemplate, domFingerprint }) {
|
|
|
24
24
|
function findByTemplate(urlTemplate) {
|
|
25
25
|
const db = getDb()
|
|
26
26
|
const rows = db.prepare(`
|
|
27
|
-
SELECT url_template, dom_fingerprint, selectors_json, page_type,
|
|
27
|
+
SELECT url_template, dom_fingerprint, selectors_json, page_type, ready_selector,
|
|
28
28
|
hit_count, fail_count, last_verified_at, created_at
|
|
29
29
|
FROM page_recipes
|
|
30
30
|
WHERE url_template = ?
|
|
@@ -33,19 +33,20 @@ function findByTemplate(urlTemplate) {
|
|
|
33
33
|
return rows.map(parseRow)
|
|
34
34
|
}
|
|
35
35
|
|
|
36
|
-
function upsert({ urlTemplate, domFingerprint, selectors, pageType }) {
|
|
36
|
+
function upsert({ urlTemplate, domFingerprint, selectors, pageType, readySelector }) {
|
|
37
37
|
const db = getDb()
|
|
38
38
|
const json = JSON.stringify(selectors || {})
|
|
39
39
|
const now = new Date().toISOString()
|
|
40
40
|
db.prepare(`
|
|
41
|
-
INSERT INTO page_recipes (url_template, dom_fingerprint, selectors_json, page_type, hit_count, fail_count, last_verified_at, created_at)
|
|
42
|
-
VALUES (?, ?, ?, ?, 0, 0, ?, ?)
|
|
41
|
+
INSERT INTO page_recipes (url_template, dom_fingerprint, selectors_json, page_type, ready_selector, hit_count, fail_count, last_verified_at, created_at)
|
|
42
|
+
VALUES (?, ?, ?, ?, ?, 0, 0, ?, ?)
|
|
43
43
|
ON CONFLICT(url_template, dom_fingerprint) DO UPDATE SET
|
|
44
44
|
selectors_json = excluded.selectors_json,
|
|
45
45
|
page_type = excluded.page_type,
|
|
46
|
+
ready_selector = excluded.ready_selector,
|
|
46
47
|
fail_count = 0,
|
|
47
48
|
last_verified_at = excluded.last_verified_at
|
|
48
|
-
`).run(urlTemplate, domFingerprint, json, pageType || null, now, now)
|
|
49
|
+
`).run(urlTemplate, domFingerprint, json, pageType || null, readySelector || null, now, now)
|
|
49
50
|
return find({ urlTemplate, domFingerprint })
|
|
50
51
|
}
|
|
51
52
|
|
|
@@ -102,7 +103,7 @@ function remove({ urlTemplate, domFingerprint }) {
|
|
|
102
103
|
function listAll({ limit = 100 } = {}) {
|
|
103
104
|
const db = getDb()
|
|
104
105
|
const rows = db.prepare(`
|
|
105
|
-
SELECT url_template, dom_fingerprint, selectors_json, page_type,
|
|
106
|
+
SELECT url_template, dom_fingerprint, selectors_json, page_type, ready_selector,
|
|
106
107
|
hit_count, fail_count, last_verified_at, created_at
|
|
107
108
|
FROM page_recipes
|
|
108
109
|
ORDER BY last_verified_at DESC NULLS LAST, created_at DESC
|
|
@@ -123,6 +124,7 @@ function parseRow(row) {
|
|
|
123
124
|
dom_fingerprint: row.dom_fingerprint,
|
|
124
125
|
selectors,
|
|
125
126
|
page_type: row.page_type,
|
|
127
|
+
ready_selector: row.ready_selector || null,
|
|
126
128
|
hit_count: row.hit_count,
|
|
127
129
|
fail_count: row.fail_count,
|
|
128
130
|
last_verified_at: row.last_verified_at,
|