@geekbeer/minion 3.52.0 → 3.55.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,129 @@
+ /**
+  * Headless browser fetch + selector-based extraction.
+  *
+  * `playwright` is an optionalDependency: if it's missing (e.g. ARM host
+  * where the chromium binary failed to install), require() throws and the
+  * route layer surfaces a 503 "browser unavailable" error instead of
+  * crashing the agent.
+  *
+  * Each call spins up a fresh chromium instance. Pooling can come later
+  * once the API stabilizes — for the experimental MVP, simple is better.
+  */
+
+ const DEFAULT_NAV_TIMEOUT_MS = 20_000
+ const DEFAULT_EVAL_TIMEOUT_MS = 5_000
+ const DEFAULT_USER_AGENT =
+   'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' +
+   'Chrome/124.0.0.0 Safari/537.36 MinionWebExtract/0.1'
+
+ function loadChromium() {
+   let playwright
+   try {
+     playwright = require('playwright')
+   } catch (err) {
+     const e = new Error(
+       'playwright is not installed. Run `npx playwright install chromium` ' +
+       'on the minion host to enable POST /api/web/extract.'
+     )
+     e.code = 'PLAYWRIGHT_UNAVAILABLE'
+     throw e
+   }
+   return playwright.chromium
+ }
+
+ async function withPage(fn, opts = {}) {
+   const chromium = loadChromium()
+   const browser = await chromium.launch({
+     headless: true,
+     args: ['--no-sandbox', '--disable-dev-shm-usage'],
+   })
+   try {
+     const context = await browser.newContext({
+       userAgent: opts.userAgent || DEFAULT_USER_AGENT,
+       viewport: { width: 1280, height: 800 },
+     })
+     const page = await context.newPage()
+     return await fn(page)
+   } finally {
+     await browser.close().catch(() => {})
+   }
+ }
+
+ async function renderPage(url, opts = {}) {
+   return withPage(async page => {
+     const response = await page.goto(url, {
+       waitUntil: 'domcontentloaded',
+       timeout: opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS,
+     })
+     const html = await page.content()
+     return {
+       html,
+       finalUrl: page.url(),
+       statusCode: response?.status() ?? null,
+     }
+   }, opts)
+ }
+
+ /**
+  * Run a recipe against a freshly loaded page.
+  *
+  * `selectors` shape (each value is an object):
+  *   {
+  *     title: { selector: 'h1', attr: 'text' },
+  *     body: { selector: 'article', attr: 'text' },
+  *     items: { selector: '.list-item .title', attr: 'text', multiple: true },
+  *     link: { selector: 'a.permalink', attr: 'href' }
+  *   }
+  *
+  * `attr` defaults to 'text' (innerText). Special value 'html' returns
+  * innerHTML. Any other string is read as an HTML attribute.
+  */
+ async function extractWithSelectors(url, selectors, opts = {}) {
+   return withPage(async page => {
+     await page.goto(url, {
+       waitUntil: 'domcontentloaded',
+       timeout: opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS,
+     })
+     return await page.evaluate(
+       ({ selectorMap, evalTimeoutMs }) => {
+         const start = Date.now()
+         const result = {}
+         for (const [field, spec] of Object.entries(selectorMap)) {
+           if (Date.now() - start > evalTimeoutMs) {
+             result[field] = null
+             continue
+           }
+           if (!spec || !spec.selector) {
+             result[field] = null
+             continue
+           }
+           const attr = spec.attr || 'text'
+           const readOne = (el) => {
+             if (!el) return null
+             if (attr === 'text') return (el.innerText || el.textContent || '').trim()
+             if (attr === 'html') return el.innerHTML
+             return el.getAttribute(attr)
+           }
+           try {
+             if (spec.multiple) {
+               const nodes = document.querySelectorAll(spec.selector)
+               result[field] = Array.from(nodes).map(readOne).filter(v => v != null && v !== '')
+             } else {
+               const node = document.querySelector(spec.selector)
+               result[field] = readOne(node)
+             }
+           } catch {
+             result[field] = null
+           }
+         }
+         return result
+       },
+       { selectorMap: selectors, evalTimeoutMs: opts.evalTimeoutMs ?? DEFAULT_EVAL_TIMEOUT_MS },
+     )
+   }, opts)
+ }
+
+ module.exports = {
+   renderPage,
+   extractWithSelectors,
+ }
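
For reference, a minimal usage sketch of the two exports above. The require path and the example URL, selectors, and field names are illustrative assumptions; only the function signatures and return shapes come from the file in this diff.

// Hypothetical usage sketch; module path and selectors are assumptions.
const { renderPage, extractWithSelectors } = require('./browser-extract')

async function demo() {
  // Full render: raw HTML plus the post-redirect URL and HTTP status.
  const rendered = await renderPage('https://example.com/work/proposal/123456')
  console.log(rendered.statusCode, rendered.finalUrl, rendered.html.length)

  // Recipe run: each field resolves to innerText, innerHTML, or an attribute value.
  const data = await extractWithSelectors('https://example.com/work/proposal/123456', {
    title: { selector: 'h1', attr: 'text' },
    link: { selector: 'a.permalink', attr: 'href' },
    tags: { selector: '.tag', attr: 'text', multiple: true },
  }, { timeoutMs: 15_000 })
  console.log(data) // -> { title: '...', link: '...', tags: ['...'] }
}

demo().catch(err => {
  // err.code === 'PLAYWRIGHT_UNAVAILABLE' when the optional dependency is missing.
  console.error(err.code || '', err.message)
})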
@@ -0,0 +1,247 @@
+ /**
+  * Cold-path recipe generator.
+  *
+  * Resolution order (first available wins):
+  *   1. Primary LLM plugin (PUT /api/llm/config) — uses claude / gemini / codex
+  *      CLI subprocess via the same plugin contract that runQuickLlmCall does.
+  *      Prompt asks for plain JSON; we extract+parse it.
+  *   2. ANTHROPIC_API_KEY env var — direct Anthropic Messages API with
+  *      tool_use schema enforcement (same fetch pattern as
+  *      core/lib/revision-watcher.js).
+  *   3. Otherwise: throw LLM_UNAVAILABLE so the route layer can surface a 503.
+  *
+  * The model only ever sees cleaned Markdown — never the raw HTML — and only
+  * runs on cold-cache misses, so the cost is bounded.
+  *
+  * Returns:
+  *   {
+  *     pageType: 'article' | 'listing' | 'product' | 'profile' | 'form' | 'other',
+  *     selectors: { fieldName: { selector, attr?, multiple? }, ... },
+  *     extracted: { fieldName: <value already pulled from this page> }
+  *   }
+  */
+
+ const { getActivePrimary } = require('../../llm-plugins/lib/active')
+
+ const ANTHROPIC_MODEL = 'claude-haiku-4-5-20251001'
+ const MAX_TOKENS = 2048
+ const PLUGIN_TIMEOUT_MS = 60_000
+
+ const TOOL_DESCRIPTION =
+   'Classify the page and propose CSS selectors for the most useful fields. ' +
+   'Also produce the extracted values directly so the caller can verify the recipe.'
+
+ const ANTHROPIC_TOOLS = [{
+   name: 'page_extraction',
+   description: TOOL_DESCRIPTION,
+   input_schema: {
+     type: 'object',
+     required: ['page_type', 'selectors', 'extracted'],
+     properties: {
+       page_type: {
+         type: 'string',
+         enum: ['article', 'listing', 'product', 'profile', 'form', 'other'],
+         description: 'High-level classification of the page.',
+       },
+       selectors: {
+         type: 'object',
+         description:
+           'Map of fieldName -> { selector, attr?, multiple? }. Use plain CSS selectors. ' +
+           'attr defaults to "text" (innerText); use "html" or an HTML attribute name to override. ' +
+           'Set multiple=true for list fields. Aim for 3-8 fields covering the page\'s primary content.',
+         additionalProperties: {
+           type: 'object',
+           required: ['selector'],
+           properties: {
+             selector: { type: 'string' },
+             attr: { type: 'string' },
+             multiple: { type: 'boolean' },
+           },
+         },
+       },
+       extracted: {
+         type: 'object',
+         description:
+           'Values extracted from this specific page using the selectors above. ' +
+           'Strings or arrays of strings.',
+       },
+     },
+   },
+ }]
+
+ const SYSTEM_PROMPT = `You design CSS selector recipes for extracting structured content from web pages.
+
+ Given a cleaned Markdown rendering of one page, you must:
+ 1. Classify the page (article / listing / product / profile / form / other).
+ 2. Propose 3-8 CSS selectors that capture the page's primary information.
+    - Prefer semantic selectors (article, h1, time[datetime], a[rel="author"]) over class names where possible.
+    - Use class-based selectors only when semantic ones are unavailable.
+    - Avoid fragile attribute selectors like data-react-* or auto-generated hashes.
+ 3. Fill the "extracted" object with the values pulled from this exact page so the caller can verify your recipe works.
+
+ The same recipe will be reused for structurally similar pages, so think about what generalizes.`
+
+ async function generateRecipe({ url, cleanedMarkdown, hint }) {
+   const primary = getActivePrimary()
+   if (primary) {
+     return await generateViaPlugin(primary, { url, cleanedMarkdown, hint })
+   }
+   if (process.env.ANTHROPIC_API_KEY) {
+     return await generateViaAnthropicDirect({ url, cleanedMarkdown, hint })
+   }
+   const e = new Error(
+     'No LLM available for cold-path recipe generation. Configure a primary LLM via ' +
+     'PUT /api/llm/config (recommended) or set ANTHROPIC_API_KEY as a fallback.'
+   )
+   e.code = 'LLM_UNAVAILABLE'
+   throw e
+ }
+
+ async function generateViaPlugin(plugin, { url, cleanedMarkdown, hint }) {
+   const prompt = buildTextPrompt({ url, cleanedMarkdown, hint })
+   const input = {
+     prompt,
+     timeoutMs: PLUGIN_TIMEOUT_MS,
+   }
+   // Claude Code CLI accepts model aliases like 'haiku'. Other plugins ignore this.
+   if (plugin.name === 'claude') input.model = 'haiku'
+
+   let output
+   try {
+     output = await plugin.invoke(input)
+   } catch (err) {
+     const e = new Error(`Primary LLM (${plugin.name}) invoke failed: ${err.message}`)
+     e.code = 'PRIMARY_LLM_FAILED'
+     throw e
+   }
+   if (output.error) {
+     const e = new Error(`Primary LLM (${plugin.name}) returned error: ${output.error.message}`)
+     e.code = 'PRIMARY_LLM_FAILED'
+     throw e
+   }
+
+   const json = extractJson(output.text || '')
+   if (!json) {
+     const e = new Error(
+       `Primary LLM (${plugin.name}) did not return parseable JSON. ` +
+       `Raw output (first 500 chars): ${(output.text || '').slice(0, 500)}`
+     )
+     e.code = 'PRIMARY_LLM_BAD_JSON'
+     throw e
+   }
+   return {
+     pageType: json.page_type || 'other',
+     selectors: json.selectors || {},
+     extracted: json.extracted || {},
+     source: `primary:${plugin.name}`,
+   }
+ }
+
+ async function generateViaAnthropicDirect({ url, cleanedMarkdown, hint }) {
+   const apiKey = process.env.ANTHROPIC_API_KEY
+   const userParts = [
+     `URL: ${url}`,
+     hint ? `Caller hint: ${hint}` : null,
+     '',
+     '--- Cleaned Markdown (Readability output) ---',
+     cleanedMarkdown || '(empty)',
+   ].filter(Boolean).join('\n')
+
+   const resp = await fetch('https://api.anthropic.com/v1/messages', {
+     method: 'POST',
+     headers: {
+       'Content-Type': 'application/json',
+       'x-api-key': apiKey,
+       'anthropic-version': '2023-06-01',
+     },
+     body: JSON.stringify({
+       model: ANTHROPIC_MODEL,
+       max_tokens: MAX_TOKENS,
+       system: SYSTEM_PROMPT,
+       tools: ANTHROPIC_TOOLS,
+       tool_choice: { type: 'tool', name: 'page_extraction' },
+       messages: [{ role: 'user', content: userParts }],
+     }),
+   })
+
+   if (!resp.ok) {
+     const text = await resp.text()
+     throw new Error(`Anthropic API error: ${resp.status} ${text}`)
+   }
+
+   const data = await resp.json()
+   const toolUse = (data.content || []).find(block => block.type === 'tool_use' && block.name === 'page_extraction')
+   if (!toolUse || !toolUse.input) {
+     throw new Error('Anthropic API returned no tool_use block for page_extraction')
+   }
+
+   const { page_type, selectors, extracted } = toolUse.input
+   return {
+     pageType: page_type || 'other',
+     selectors: selectors || {},
+     extracted: extracted || {},
+     source: 'anthropic-direct',
+   }
+ }
+
+ function buildTextPrompt({ url, cleanedMarkdown, hint }) {
+   return [
+     SYSTEM_PROMPT,
+     '',
+     `URL: ${url}`,
+     hint ? `Caller hint: ${hint}` : null,
+     '',
+     'Output ONLY a JSON object — no prose, no explanations, no code fences.',
+     'The JSON must have exactly this shape:',
+     '',
+     '{',
+     '  "page_type": "article" | "listing" | "product" | "profile" | "form" | "other",',
+     '  "selectors": {',
+     '    "<fieldName>": { "selector": "<css>", "attr"?: "<text|html|attribute-name>", "multiple"?: <boolean> }',
+     '  },',
+     '  "extracted": { "<fieldName>": "<string or array of strings>" }',
+     '}',
+     '',
+     'Notes:',
+     '- attr defaults to "text" (innerText). Use "html" or an HTML attribute name to override.',
+     '- Set multiple=true for list fields (returns array).',
+     '- "extracted" must contain the values you actually read from THIS page using those selectors.',
+     '',
+     '--- Cleaned Markdown ---',
+     cleanedMarkdown || '(empty)',
+   ].filter(s => s !== null).join('\n')
+ }
+
+ /**
+  * Extract a JSON object from arbitrary LLM text. Handles three common shapes:
+  *   1. Raw JSON
+  *   2. Fenced code block (```json ... ``` or ``` ... ```)
+  *   3. Prose with embedded {...} — uses the outermost braces
+  */
+ function extractJson(text) {
+   if (!text || typeof text !== 'string') return null
+   const trimmed = text.trim()
+   if (!trimmed) return null
+
+   try { return JSON.parse(trimmed) } catch {}
+
+   const fence = trimmed.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
+   if (fence && fence[1]) {
+     try { return JSON.parse(fence[1].trim()) } catch {}
+   }
+
+   const first = trimmed.indexOf('{')
+   const last = trimmed.lastIndexOf('}')
+   if (first >= 0 && last > first) {
+     try { return JSON.parse(trimmed.slice(first, last + 1)) } catch {}
+   }
+
+   return null
+ }
+
+ module.exports = {
+   generateRecipe,
+   ANTHROPIC_MODEL,
+   // exported for tests
+   extractJson,
+ }
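
Since extractJson is exported for tests, here is a small sketch of the three text shapes it is written to tolerate. The require path and the sample strings are assumptions for illustration, not real model output.

// Illustrative inputs only; module path is an assumption.
const { extractJson } = require('./recipe-generator')

const raw = '{"page_type":"article","selectors":{},"extracted":{}}'
const fenced = '```json\n{"page_type":"listing","selectors":{},"extracted":{}}\n```'
const prose = 'Here is the recipe: {"page_type":"product","selectors":{},"extracted":{}} Hope that helps!'

for (const text of [raw, fenced, prose]) {
  const json = extractJson(text)
  console.log(json && json.page_type) // 'article', then 'listing', then 'product'
}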
@@ -0,0 +1,90 @@
+ /**
+  * URL normalization for the page recipe cache.
+  *
+  * Two URLs that resolve to the same template + the same DOM fingerprint are
+  * treated as structurally identical and share an extraction recipe. The
+  * template captures the path/query *shape* (with IDs / slugs / pagination
+  * placeholdered out) so the cache hits across e.g.
+  *   /work/proposal/123456?utm_source=foo
+  *   /work/proposal/789012?utm_source=bar&fbclid=...
+  * Tracking params are stripped, remaining query keys are sorted, and known
+  * pagination params keep their key but lose their value.
+  */
+
+ const STRIP_QUERY_PARAMS = new Set([
+   'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
+   'fbclid', 'gclid', 'mc_cid', 'mc_eid', 'ref', 'referrer',
+ ])
+
+ const PAGINATION_QUERY_PARAMS = new Set([
+   'page', 'p', 'pagenumber', 'pagenum', 'offset', 'start',
+ ])
+
+ const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i
+ const HEX32_RE = /^[0-9a-f]{32}$/i
+ const ALL_DIGITS_RE = /^\d+$/
+ const LONG_ALNUM_RE = /^(?=.*\d)(?=.*[a-zA-Z])[A-Za-z0-9]{20,}$/
+
+ function placeholderForSegment(segment) {
+   if (!segment) return segment
+   if (ALL_DIGITS_RE.test(segment)) return ':id'
+   if (UUID_RE.test(segment) || HEX32_RE.test(segment)) return ':uuid'
+   if (LONG_ALNUM_RE.test(segment)) return ':slug'
+   return segment
+ }
+
+ function normalizeUrl(rawUrl) {
+   let parsed
+   try {
+     parsed = new URL(rawUrl)
+   } catch (err) {
+     throw new Error(`Invalid URL: ${rawUrl}`)
+   }
+
+   // Lowercase host (case-insensitive per RFC 3986)
+   const host = parsed.host.toLowerCase()
+
+   // Path: keep separators, placeholder each segment.
+   const pathSegments = parsed.pathname.split('/').map(seg => seg ? placeholderForSegment(seg) : seg)
+   const templatePath = pathSegments.join('/')
+
+   // Query: filter, sort, and placeholder pagination values.
+   const queryEntries = []
+   for (const [key, value] of parsed.searchParams.entries()) {
+     const lowerKey = key.toLowerCase()
+     if (STRIP_QUERY_PARAMS.has(lowerKey)) continue
+     if (PAGINATION_QUERY_PARAMS.has(lowerKey)) {
+       queryEntries.push([key, ':n'])
+     } else {
+       queryEntries.push([key, value])
+     }
+   }
+   queryEntries.sort((a, b) => a[0] < b[0] ? -1 : a[0] > b[0] ? 1 : 0)
+
+   const templateQuery = queryEntries.length
+     ? '?' + queryEntries.map(([k, v]) => `${k}=${v}`).join('&')
+     : ''
+
+   const template = `${host}${templatePath}${templateQuery}`
+
+   // Canonical URL: original protocol + host + path, query reduced to non-tracking params
+   // (keeping their *real* values so the actual fetch stays correct).
+   const canonical = new URL(parsed.toString())
+   for (const param of [...canonical.searchParams.keys()]) {
+     if (STRIP_QUERY_PARAMS.has(param.toLowerCase())) {
+       canonical.searchParams.delete(param)
+     }
+   }
+   canonical.hash = ''
+
+   return {
+     template,
+     canonicalUrl: canonical.toString(),
+   }
+ }
+
+ module.exports = {
+   normalizeUrl,
+   STRIP_QUERY_PARAMS,
+   PAGINATION_QUERY_PARAMS,
+ }
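
A worked example of the normalization described in the header comment above. The require path and the example host are assumptions; the template and canonicalUrl values follow from the code as written.

// Both URLs collapse to the same template (so they share one cached recipe),
// while canonicalUrl keeps the real values needed to perform the actual fetch.
const { normalizeUrl } = require('./normalize-url') // path assumed; not shown in this diff

const a = normalizeUrl('https://Example.com/work/proposal/123456?utm_source=foo&page=2')
const b = normalizeUrl('https://example.com/work/proposal/789012?fbclid=xyz&page=7')

console.log(a.template)     // 'example.com/work/proposal/:id?page=:n'
console.log(b.template)     // 'example.com/work/proposal/:id?page=:n'
console.log(a.canonicalUrl) // 'https://example.com/work/proposal/123456?page=2'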
@@ -0,0 +1,94 @@
+ /**
+  * Web Page Extraction (experimental — v3.53.0)
+  *
+  * Endpoints:
+  *   POST   /api/web/extract  - Fetch a URL and return structured JSON
+  *   GET    /api/web/recipes  - List cached recipes (debug)
+  *   DELETE /api/web/recipes?template=...&fingerprint=...  - Drop a recipe
+  *
+  * The extract endpoint runs Playwright + Readability + an Anthropic Haiku
+  * recipe-generation step in this minion process, returning only structured
+  * JSON to the caller. Designed to keep large DOM payloads off the main
+  * Claude Code chat session that issued the request.
+  *
+  * Marked experimental: response shape may change.
+  */
+
+ const { verifyToken } = require('../lib/auth')
+ const { extract } = require('../lib/web-extract')
+ const pageRecipeStore = require('../stores/page-recipe-store')
+
+ const REQUEST_TIMEOUT_MS = 60_000
+
+ async function webRoutes(fastify) {
+   fastify.post('/api/web/extract', async (request, reply) => {
+     if (!verifyToken(request)) {
+       reply.code(401)
+       return { success: false, error: 'Unauthorized' }
+     }
+
+     const body = request.body || {}
+     const { url, hint } = body
+
+     if (!url || typeof url !== 'string') {
+       reply.code(400)
+       return { success: false, error: 'url (string) is required' }
+     }
+     try {
+       new URL(url)
+     } catch {
+       reply.code(400)
+       return { success: false, error: 'url is not a valid URL' }
+     }
+
+     try {
+       const result = await Promise.race([
+         extract({ url, hint: typeof hint === 'string' ? hint : null }),
+         new Promise((_, rej) => setTimeout(() => rej(new Error('extract timeout')), REQUEST_TIMEOUT_MS)),
+       ])
+       return { success: true, ...result }
+     } catch (err) {
+       if (
+         err.code === 'PLAYWRIGHT_UNAVAILABLE' ||
+         err.code === 'LLM_UNAVAILABLE' ||
+         err.code === 'ANTHROPIC_KEY_MISSING'
+       ) {
+         reply.code(503)
+         return { success: false, error: err.message, code: err.code }
+       }
+       if (err.code === 'PRIMARY_LLM_FAILED' || err.code === 'PRIMARY_LLM_BAD_JSON') {
+         reply.code(502)
+         return { success: false, error: err.message, code: err.code }
+       }
+       request.log.error({ err }, '[web/extract] failed')
+       reply.code(500)
+       return { success: false, error: err.message || String(err) }
+     }
+   })
+
+   // Debug helpers — list / delete cached recipes.
+   fastify.get('/api/web/recipes', async (request, reply) => {
+     if (!verifyToken(request)) {
+       reply.code(401)
+       return { success: false, error: 'Unauthorized' }
+     }
+     const recipes = pageRecipeStore.listAll({ limit: 200 })
+     return { success: true, experimental: true, recipes }
+   })
+
+   fastify.delete('/api/web/recipes', async (request, reply) => {
+     if (!verifyToken(request)) {
+       reply.code(401)
+       return { success: false, error: 'Unauthorized' }
+     }
+     const { template, fingerprint } = request.query || {}
+     if (!template || !fingerprint) {
+       reply.code(400)
+       return { success: false, error: 'template and fingerprint query params are required' }
+     }
+     const removed = pageRecipeStore.remove({ urlTemplate: template, domFingerprint: fingerprint })
+     return { success: true, removed }
+   })
+ }
+
+ module.exports = { webRoutes }
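
A hypothetical client call against the extract endpoint. The minion base URL, port, and Authorization header shape are assumptions; the token format expected by verifyToken() is not part of this diff. The status-code handling mirrors the route above.

// Sketch only; adjust base URL and auth to the actual minion deployment.
const MINION_URL = process.env.MINION_URL || 'http://localhost:3000'

async function extractPage(url, hint) {
  const resp = await fetch(`${MINION_URL}/api/web/extract`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${process.env.MINION_TOKEN}`, // header shape assumed
    },
    body: JSON.stringify({ url, hint }),
  })
  const data = await resp.json()
  if (!data.success) {
    // 503: browser or LLM unavailable; 502: primary LLM failed; 500: anything else.
    throw new Error(`${resp.status} ${data.code || ''} ${data.error}`)
  }
  return data
}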
@@ -0,0 +1,143 @@
+ /**
+  * Page Recipe Store (SQLite, experimental — v3.53.0)
+  *
+  * Backs POST /api/web/extract. See core/lib/web-extract/extractor.js for the
+  * orchestrator that decides hot vs cold paths.
+  */
+
+ const { getDb } = require('../db')
+
+ const MAX_FAIL_COUNT = 3
+
+ function find({ urlTemplate, domFingerprint }) {
+   const db = getDb()
+   const row = db.prepare(`
+     SELECT url_template, dom_fingerprint, selectors_json, page_type,
+            hit_count, fail_count, last_verified_at, created_at
+     FROM page_recipes
+     WHERE url_template = ? AND dom_fingerprint = ?
+   `).get(urlTemplate, domFingerprint)
+   if (!row) return null
+   return parseRow(row)
+ }
+
+ function findByTemplate(urlTemplate) {
+   const db = getDb()
+   const rows = db.prepare(`
+     SELECT url_template, dom_fingerprint, selectors_json, page_type,
+            hit_count, fail_count, last_verified_at, created_at
+     FROM page_recipes
+     WHERE url_template = ?
+     ORDER BY last_verified_at DESC NULLS LAST, created_at DESC
+   `).all(urlTemplate)
+   return rows.map(parseRow)
+ }
+
+ function upsert({ urlTemplate, domFingerprint, selectors, pageType }) {
+   const db = getDb()
+   const json = JSON.stringify(selectors || {})
+   const now = new Date().toISOString()
+   db.prepare(`
+     INSERT INTO page_recipes (url_template, dom_fingerprint, selectors_json, page_type, hit_count, fail_count, last_verified_at, created_at)
+     VALUES (?, ?, ?, ?, 0, 0, ?, ?)
+     ON CONFLICT(url_template, dom_fingerprint) DO UPDATE SET
+       selectors_json = excluded.selectors_json,
+       page_type = excluded.page_type,
+       fail_count = 0,
+       last_verified_at = excluded.last_verified_at
+   `).run(urlTemplate, domFingerprint, json, pageType || null, now, now)
+   return find({ urlTemplate, domFingerprint })
+ }
+
+ function incrementHit({ urlTemplate, domFingerprint }) {
+   const db = getDb()
+   db.prepare(`
+     UPDATE page_recipes
+     SET hit_count = hit_count + 1
+     WHERE url_template = ? AND dom_fingerprint = ?
+   `).run(urlTemplate, domFingerprint)
+ }
+
+ function setLastVerified({ urlTemplate, domFingerprint }) {
+   const db = getDb()
+   db.prepare(`
+     UPDATE page_recipes
+     SET last_verified_at = ?, fail_count = 0
+     WHERE url_template = ? AND dom_fingerprint = ?
+   `).run(new Date().toISOString(), urlTemplate, domFingerprint)
+ }
+
+ /**
+  * Increment fail count. Returns true if the recipe was deleted (>= MAX_FAIL_COUNT).
+  */
+ function incrementFail({ urlTemplate, domFingerprint }) {
+   const db = getDb()
+   const row = db.prepare(`
+     SELECT fail_count FROM page_recipes
+     WHERE url_template = ? AND dom_fingerprint = ?
+   `).get(urlTemplate, domFingerprint)
+   if (!row) return false
+   const next = (row.fail_count || 0) + 1
+   if (next >= MAX_FAIL_COUNT) {
+     remove({ urlTemplate, domFingerprint })
+     return true
+   }
+   db.prepare(`
+     UPDATE page_recipes
+     SET fail_count = ?
+     WHERE url_template = ? AND dom_fingerprint = ?
+   `).run(next, urlTemplate, domFingerprint)
+   return false
+ }
+
+ function remove({ urlTemplate, domFingerprint }) {
+   const db = getDb()
+   const result = db.prepare(`
+     DELETE FROM page_recipes
+     WHERE url_template = ? AND dom_fingerprint = ?
+   `).run(urlTemplate, domFingerprint)
+   return result.changes > 0
+ }
+
+ function listAll({ limit = 100 } = {}) {
+   const db = getDb()
+   const rows = db.prepare(`
+     SELECT url_template, dom_fingerprint, selectors_json, page_type,
+            hit_count, fail_count, last_verified_at, created_at
+     FROM page_recipes
+     ORDER BY last_verified_at DESC NULLS LAST, created_at DESC
+     LIMIT ?
+   `).all(limit)
+   return rows.map(parseRow)
+ }
+
+ function parseRow(row) {
+   let selectors = {}
+   try {
+     selectors = JSON.parse(row.selectors_json || '{}')
+   } catch {
+     selectors = {}
+   }
+   return {
+     url_template: row.url_template,
+     dom_fingerprint: row.dom_fingerprint,
+     selectors,
+     page_type: row.page_type,
+     hit_count: row.hit_count,
+     fail_count: row.fail_count,
+     last_verified_at: row.last_verified_at,
+     created_at: row.created_at,
+   }
+ }
+
+ module.exports = {
+   find,
+   findByTemplate,
+   upsert,
+   incrementHit,
+   incrementFail,
+   setLastVerified,
+   remove,
+   listAll,
+   MAX_FAIL_COUNT,
+ }
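
A rough sketch of how the hot/cold decision mentioned in the store's header comment could consume this API. extractor.js itself is not included in this diff, so the function and argument names below are assumptions; only the store calls are taken from the file above.

// Hypothetical orchestration sketch around the page-recipe-store exports.
const recipes = require('./page-recipe-store')

async function resolveRecipe({ urlTemplate, domFingerprint, generateRecipe }) {
  // Hot path: a cached recipe already exists for this template + fingerprint.
  const cached = recipes.find({ urlTemplate, domFingerprint })
  if (cached) {
    recipes.incrementHit({ urlTemplate, domFingerprint })
    return cached
  }
  // Cold path: ask the LLM, then persist the recipe for the next structurally similar page.
  const fresh = await generateRecipe()
  return recipes.upsert({
    urlTemplate,
    domFingerprint,
    selectors: fresh.selectors,
    pageType: fresh.pageType,
  })
}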