@geekbeer/minion 3.51.2 → 3.53.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +7 -0
- package/core/api.js +26 -2
- package/core/db/migrations/20260508000000_page_recipes.js +33 -0
- package/core/lib/board-task-poller.js +12 -1
- package/core/lib/dag-cron-poller.js +14 -1
- package/core/lib/dag-step-poller.js +17 -1
- package/core/lib/frozen-state.js +64 -0
- package/core/lib/revision-watcher.js +5 -1
- package/core/lib/step-poller.js +5 -1
- package/core/lib/thread-watcher.js +5 -1
- package/core/lib/web-extract/extractor.js +142 -0
- package/core/lib/web-extract/fingerprint.js +63 -0
- package/core/lib/web-extract/html-cleaner.js +72 -0
- package/core/lib/web-extract/index.js +21 -0
- package/core/lib/web-extract/playwright-runner.js +129 -0
- package/core/lib/web-extract/recipe-generator.js +247 -0
- package/core/lib/web-extract/url-normalize.js +90 -0
- package/core/routes/admin.js +49 -0
- package/core/routes/web.js +94 -0
- package/core/stores/page-recipe-store.js +143 -0
- package/docs/api-reference.md +83 -0
- package/docs/task-guides.md +58 -0
- package/linux/routes/chat.js +36 -4
- package/linux/server.js +4 -0
- package/mac/server.js +2 -0
- package/package.json +6 -2
- package/rules/core.md +29 -1
- package/win/routes/chat.js +37 -2
- package/win/server.js +4 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Headless browser fetch + selector-based extraction.
|
|
3
|
+
*
|
|
4
|
+
* `playwright` is an optionalDependency: if it's missing (e.g. ARM host
|
|
5
|
+
* where the chromium binary failed to install), require() throws and the
|
|
6
|
+
* route layer surfaces a 503 "browser unavailable" error instead of
|
|
7
|
+
* crashing the agent.
|
|
8
|
+
*
|
|
9
|
+
* Each call spins up a fresh chromium instance. Pooling can come later
|
|
10
|
+
* once the API stabilizes — for the experimental MVP, simple is better.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
const DEFAULT_NAV_TIMEOUT_MS = 20_000
|
|
14
|
+
const DEFAULT_EVAL_TIMEOUT_MS = 5_000
|
|
15
|
+
const DEFAULT_USER_AGENT =
|
|
16
|
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' +
|
|
17
|
+
'Chrome/124.0.0.0 Safari/537.36 MinionWebExtract/0.1'
|
|
18
|
+
|
|
19
|
+
/**
 * Resolve Playwright's chromium launcher.
 *
 * `playwright` is an optionalDependency: if it's missing (e.g. an ARM host
 * where the chromium binary failed to install), require() throws and we
 * convert that into a coded error (`PLAYWRIGHT_UNAVAILABLE`) that the route
 * layer maps to a 503 "browser unavailable" response instead of crashing
 * the agent.
 *
 * @returns {object} playwright's `chromium` browser type
 * @throws {Error} with `code === 'PLAYWRIGHT_UNAVAILABLE'` when playwright
 *   cannot be loaded; the original require() failure is attached as `cause`.
 */
function loadChromium() {
  let playwright
  try {
    playwright = require('playwright')
  } catch (err) {
    const e = new Error(
      'playwright is not installed. Run `npx playwright install chromium` ' +
      'on the minion host to enable POST /api/web/extract.',
      // Preserve the underlying require() failure for debugging (was dropped).
      { cause: err },
    )
    e.code = 'PLAYWRIGHT_UNAVAILABLE'
    throw e
  }
  return playwright.chromium
}
|
|
33
|
+
|
|
34
|
+
/**
 * Launch a throwaway chromium instance, hand a fresh page to `fn`, and
 * guarantee the browser is closed again no matter how `fn` exits.
 *
 * Each call spins up its own browser (no pooling — simple over fast for
 * the experimental MVP).
 *
 * @param {(page: object) => Promise<*>} fn - receives a fresh page
 * @param {{userAgent?: string}} [opts] - optional context overrides
 * @returns {Promise<*>} whatever `fn` resolves with
 */
async function withPage(fn, opts = {}) {
  const launcher = loadChromium()
  const instance = await launcher.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-dev-shm-usage'],
  })
  try {
    const ctx = await instance.newContext({
      userAgent: opts.userAgent || DEFAULT_USER_AGENT,
      viewport: { width: 1280, height: 800 },
    })
    const freshPage = await ctx.newPage()
    return await fn(freshPage)
  } finally {
    // Best-effort shutdown; a close() failure must not mask fn's result/error.
    await instance.close().catch(() => {})
  }
}
|
|
51
|
+
|
|
52
|
+
/**
 * Navigate to `url` in a fresh headless browser and return the rendered HTML.
 *
 * @param {string} url - page to load
 * @param {{timeoutMs?: number, userAgent?: string}} [opts]
 * @returns {Promise<{html: string, finalUrl: string, statusCode: number|null}>}
 *   finalUrl reflects any redirects; statusCode is null when no response
 *   object was produced.
 */
async function renderPage(url, opts = {}) {
  const navTimeout = opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS
  return withPage(async page => {
    const nav = await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout: navTimeout,
    })
    return {
      html: await page.content(),
      finalUrl: page.url(),
      statusCode: nav?.status() ?? null,
    }
  }, opts)
}
|
|
66
|
+
|
|
67
|
+
/**
 * Load `url` and pull fields out of the live DOM using a selector recipe.
 *
 * `selectors` maps field names to `{ selector, attr?, multiple? }`:
 *   - `attr` defaults to 'text' (innerText); 'html' returns innerHTML; any
 *     other string is read as an HTML attribute via getAttribute.
 *   - `multiple: true` collects every match into an array (empty/null values
 *     filtered out).
 * A field whose spec is missing, whose selector throws, or that is reached
 * after the eval time budget expires comes back as null.
 *
 * @param {string} url
 * @param {Object<string, {selector: string, attr?: string, multiple?: boolean}>} selectors
 * @param {{timeoutMs?: number, evalTimeoutMs?: number, userAgent?: string}} [opts]
 * @returns {Promise<Object<string, (string|string[]|null)>>}
 */
async function extractWithSelectors(url, selectors, opts = {}) {
  return withPage(async page => {
    await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout: opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS,
    })
    // The callback below is serialized and executed *inside* the page, so it
    // may only use DOM globals plus the single argument object we pass in.
    return await page.evaluate(
      ({ selectorMap, evalTimeoutMs }) => {
        const startedAt = Date.now()
        const out = {}
        for (const [name, spec] of Object.entries(selectorMap)) {
          // Budget check first, then spec validation — either way: null.
          if (Date.now() - startedAt > evalTimeoutMs) {
            out[name] = null
            continue
          }
          if (!spec || !spec.selector) {
            out[name] = null
            continue
          }
          const mode = spec.attr || 'text'
          const valueOf = node => {
            if (!node) return null
            if (mode === 'text') return (node.innerText || node.textContent || '').trim()
            return mode === 'html' ? node.innerHTML : node.getAttribute(mode)
          }
          try {
            if (spec.multiple) {
              out[name] = Array.from(document.querySelectorAll(spec.selector))
                .map(valueOf)
                .filter(v => v != null && v !== '')
            } else {
              out[name] = valueOf(document.querySelector(spec.selector))
            }
          } catch {
            // Invalid selector syntax etc. — treat as "field not found".
            out[name] = null
          }
        }
        return out
      },
      { selectorMap: selectors, evalTimeoutMs: opts.evalTimeoutMs ?? DEFAULT_EVAL_TIMEOUT_MS },
    )
  }, opts)
}
|
|
125
|
+
|
|
126
|
+
module.exports = {
|
|
127
|
+
renderPage,
|
|
128
|
+
extractWithSelectors,
|
|
129
|
+
}
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cold-path recipe generator.
|
|
3
|
+
*
|
|
4
|
+
* Resolution order (first available wins):
|
|
5
|
+
* 1. Primary LLM plugin (PUT /api/llm/config) — uses claude / gemini / codex
|
|
6
|
+
* CLI subprocess via the same plugin contract that runQuickLlmCall does.
|
|
7
|
+
* Prompt asks for plain JSON; we extract+parse it.
|
|
8
|
+
* 2. ANTHROPIC_API_KEY env var — direct Anthropic Messages API with
|
|
9
|
+
* tool_use schema enforcement (same fetch pattern as
|
|
10
|
+
* core/lib/revision-watcher.js).
|
|
11
|
+
* 3. Otherwise: throw LLM_UNAVAILABLE so the route layer can surface a 503.
|
|
12
|
+
*
|
|
13
|
+
* The model only ever sees cleaned Markdown — never the raw HTML — and only
|
|
14
|
+
* runs on cold-cache misses, so the cost is bounded.
|
|
15
|
+
*
|
|
16
|
+
* Returns:
|
|
17
|
+
* {
|
|
18
|
+
* pageType: 'article' | 'listing' | 'product' | 'profile' | 'form' | 'other',
|
|
19
|
+
* selectors: { fieldName: { selector, attr?, multiple? }, ... },
|
|
20
|
+
* extracted: { fieldName: <value already pulled from this page> }
|
|
21
|
+
* }
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
const { getActivePrimary } = require('../../llm-plugins/lib/active')
|
|
25
|
+
|
|
26
|
+
const ANTHROPIC_MODEL = 'claude-haiku-4-5-20251001'
|
|
27
|
+
const MAX_TOKENS = 2048
|
|
28
|
+
const PLUGIN_TIMEOUT_MS = 60_000
|
|
29
|
+
|
|
30
|
+
const TOOL_DESCRIPTION =
|
|
31
|
+
'Classify the page and propose CSS selectors for the most useful fields. ' +
|
|
32
|
+
'Also produce the extracted values directly so the caller can verify the recipe.'
|
|
33
|
+
|
|
34
|
+
const ANTHROPIC_TOOLS = [{
|
|
35
|
+
name: 'page_extraction',
|
|
36
|
+
description: TOOL_DESCRIPTION,
|
|
37
|
+
input_schema: {
|
|
38
|
+
type: 'object',
|
|
39
|
+
required: ['page_type', 'selectors', 'extracted'],
|
|
40
|
+
properties: {
|
|
41
|
+
page_type: {
|
|
42
|
+
type: 'string',
|
|
43
|
+
enum: ['article', 'listing', 'product', 'profile', 'form', 'other'],
|
|
44
|
+
description: 'High-level classification of the page.',
|
|
45
|
+
},
|
|
46
|
+
selectors: {
|
|
47
|
+
type: 'object',
|
|
48
|
+
description:
|
|
49
|
+
'Map of fieldName -> { selector, attr?, multiple? }. Use plain CSS selectors. ' +
|
|
50
|
+
'attr defaults to "text" (innerText); use "html" or an HTML attribute name to override. ' +
|
|
51
|
+
'Set multiple=true for list fields. Aim for 3-8 fields covering the page\'s primary content.',
|
|
52
|
+
additionalProperties: {
|
|
53
|
+
type: 'object',
|
|
54
|
+
required: ['selector'],
|
|
55
|
+
properties: {
|
|
56
|
+
selector: { type: 'string' },
|
|
57
|
+
attr: { type: 'string' },
|
|
58
|
+
multiple: { type: 'boolean' },
|
|
59
|
+
},
|
|
60
|
+
},
|
|
61
|
+
},
|
|
62
|
+
extracted: {
|
|
63
|
+
type: 'object',
|
|
64
|
+
description:
|
|
65
|
+
'Values extracted from this specific page using the selectors above. ' +
|
|
66
|
+
'Strings or arrays of strings.',
|
|
67
|
+
},
|
|
68
|
+
},
|
|
69
|
+
},
|
|
70
|
+
}]
|
|
71
|
+
|
|
72
|
+
const SYSTEM_PROMPT = `You design CSS selector recipes for extracting structured content from web pages.
|
|
73
|
+
|
|
74
|
+
Given a cleaned Markdown rendering of one page, you must:
|
|
75
|
+
1. Classify the page (article / listing / product / profile / form / other).
|
|
76
|
+
2. Propose 3-8 CSS selectors that capture the page's primary information.
|
|
77
|
+
- Prefer semantic selectors (article, h1, time[datetime], a[rel="author"]) over class names where possible.
|
|
78
|
+
- Use class-based selectors only when semantic ones are unavailable.
|
|
79
|
+
- Avoid fragile attribute selectors like data-react-* or auto-generated hashes.
|
|
80
|
+
3. Fill the "extracted" object with the values pulled from this exact page so the caller can verify your recipe works.
|
|
81
|
+
|
|
82
|
+
The same recipe will be reused for structurally similar pages, so think about what generalizes.`
|
|
83
|
+
|
|
84
|
+
/**
 * Produce an extraction recipe for one page, trying each LLM backend in
 * priority order:
 *   1. the configured primary LLM plugin (PUT /api/llm/config),
 *   2. direct Anthropic API when ANTHROPIC_API_KEY is set,
 *   3. otherwise fail with a coded LLM_UNAVAILABLE error (route → 503).
 *
 * @param {{url: string, cleanedMarkdown: string, hint?: string}} args
 * @returns {Promise<{pageType: string, selectors: object, extracted: object, source: string}>}
 * @throws {Error} with `code === 'LLM_UNAVAILABLE'` when no backend is configured
 */
async function generateRecipe({ url, cleanedMarkdown, hint }) {
  const payload = { url, cleanedMarkdown, hint }

  const plugin = getActivePrimary()
  if (plugin) {
    return generateViaPlugin(plugin, payload)
  }
  if (process.env.ANTHROPIC_API_KEY) {
    return generateViaAnthropicDirect(payload)
  }

  const err = new Error(
    'No LLM available for cold-path recipe generation. Configure a primary LLM via ' +
    'PUT /api/llm/config (recommended) or set ANTHROPIC_API_KEY as a fallback.'
  )
  err.code = 'LLM_UNAVAILABLE'
  throw err
}
|
|
99
|
+
|
|
100
|
+
/**
 * Cold-path recipe generation through the configured primary LLM plugin.
 *
 * Builds a plain-text prompt (plugins can't enforce a tool schema), invokes
 * the plugin's CLI contract, then parses the JSON object out of the
 * free-form text response.
 *
 * @param {{name: string, invoke: Function}} plugin - active primary LLM plugin
 * @param {{url: string, cleanedMarkdown: string, hint?: string}} args
 * @returns {Promise<{pageType: string, selectors: object, extracted: object, source: string}>}
 * @throws {Error} `code === 'PRIMARY_LLM_FAILED'` on invoke/transport failure,
 *   `code === 'PRIMARY_LLM_BAD_JSON'` when the output has no parseable JSON.
 */
async function generateViaPlugin(plugin, { url, cleanedMarkdown, hint }) {
  const input = {
    prompt: buildTextPrompt({ url, cleanedMarkdown, hint }),
    timeoutMs: PLUGIN_TIMEOUT_MS,
  }
  // Claude Code CLI accepts model aliases like 'haiku'. Other plugins ignore this.
  if (plugin.name === 'claude') input.model = 'haiku'

  let output
  try {
    output = await plugin.invoke(input)
  } catch (err) {
    const e = new Error(
      `Primary LLM (${plugin.name}) invoke failed: ${err.message}`,
      { cause: err },  // keep the original failure for debugging
    )
    e.code = 'PRIMARY_LLM_FAILED'
    throw e
  }
  if (output.error) {
    // Plugins are supposed to return { error: { message } }, but don't trust
    // the shape — a bare string / malformed error must not crash this path.
    const detail = output.error.message ?? String(output.error)
    const e = new Error(`Primary LLM (${plugin.name}) returned error: ${detail}`)
    e.code = 'PRIMARY_LLM_FAILED'
    throw e
  }

  const json = extractJson(output.text || '')
  if (!json) {
    const e = new Error(
      `Primary LLM (${plugin.name}) did not return parseable JSON. ` +
      `Raw output (first 500 chars): ${(output.text || '').slice(0, 500)}`
    )
    e.code = 'PRIMARY_LLM_BAD_JSON'
    throw e
  }
  return {
    pageType: json.page_type || 'other',
    selectors: json.selectors || {},
    extracted: json.extracted || {},
    source: `primary:${plugin.name}`,
  }
}
|
|
139
|
+
|
|
140
|
+
/**
 * Cold-path recipe generation straight against the Anthropic Messages API,
 * using a forced tool_use call to get a schema-conforming response. This is
 * the fallback when no primary LLM plugin is configured but
 * ANTHROPIC_API_KEY is set.
 *
 * @param {{url: string, cleanedMarkdown: string, hint?: string}} args
 * @returns {Promise<{pageType: string, selectors: object, extracted: object, source: string}>}
 * @throws {Error} on HTTP failure or when no `page_extraction` tool_use
 *   block comes back.
 */
async function generateViaAnthropicDirect({ url, cleanedMarkdown, hint }) {
  const apiKey = process.env.ANTHROPIC_API_KEY
  // NOTE: the previous `.filter(Boolean)` also stripped the intentional ''
  // blank-line separator (unlike buildTextPrompt's `s !== null` filter).
  // Conditional spread keeps the hint optional without eating blank lines.
  const userParts = [
    `URL: ${url}`,
    ...(hint ? [`Caller hint: ${hint}`] : []),
    '',
    '--- Cleaned Markdown (Readability output) ---',
    cleanedMarkdown || '(empty)',
  ].join('\n')

  const resp = await fetch('https://api.anthropic.com/v1/messages', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-api-key': apiKey,
      'anthropic-version': '2023-06-01',
    },
    body: JSON.stringify({
      model: ANTHROPIC_MODEL,
      max_tokens: MAX_TOKENS,
      system: SYSTEM_PROMPT,
      tools: ANTHROPIC_TOOLS,
      // Force the model to answer through the tool schema.
      tool_choice: { type: 'tool', name: 'page_extraction' },
      messages: [{ role: 'user', content: userParts }],
    }),
  })

  if (!resp.ok) {
    const text = await resp.text()
    throw new Error(`Anthropic API error: ${resp.status} ${text}`)
  }

  const data = await resp.json()
  const toolUse = (data.content || []).find(block => block.type === 'tool_use' && block.name === 'page_extraction')
  if (!toolUse || !toolUse.input) {
    throw new Error('Anthropic API returned no tool_use block for page_extraction')
  }

  const { page_type, selectors, extracted } = toolUse.input
  return {
    pageType: page_type || 'other',
    selectors: selectors || {},
    extracted: extracted || {},
    source: 'anthropic-direct',
  }
}
|
|
186
|
+
|
|
187
|
+
/**
 * Render the plain-text prompt used by CLI-plugin backends, which cannot
 * enforce a tool schema — the expected JSON shape is spelled out inline
 * instead. Blank-line separators are deliberate and preserved.
 *
 * @param {{url: string, cleanedMarkdown: string, hint?: string}} args
 * @returns {string} the full prompt text
 */
function buildTextPrompt({ url, cleanedMarkdown, hint }) {
  const lines = [
    SYSTEM_PROMPT,
    '',
    `URL: ${url}`,
    // Hint is optional; spread keeps it out of the prompt entirely when absent.
    ...(hint ? [`Caller hint: ${hint}`] : []),
    '',
    'Output ONLY a JSON object — no prose, no explanations, no code fences.',
    'The JSON must have exactly this shape:',
    '',
    '{',
    '  "page_type": "article" | "listing" | "product" | "profile" | "form" | "other",',
    '  "selectors": {',
    '    "<fieldName>": { "selector": "<css>", "attr"?: "<text|html|attribute-name>", "multiple"?: <boolean> }',
    '  },',
    '  "extracted": { "<fieldName>": "<string or array of strings>" }',
    '}',
    '',
    'Notes:',
    '- attr defaults to "text" (innerText). Use "html" or an HTML attribute name to override.',
    '- Set multiple=true for list fields (returns array).',
    '- "extracted" must contain the values you actually read from THIS page using those selectors.',
    '',
    '--- Cleaned Markdown ---',
    cleanedMarkdown || '(empty)',
  ]
  return lines.join('\n')
}
|
|
214
|
+
|
|
215
|
+
/**
 * Best-effort recovery of a JSON value from arbitrary LLM output.
 *
 * Candidate spans are tried in order:
 *   1. the whole trimmed text as raw JSON,
 *   2. the contents of the first fenced code block (``` or ```json),
 *   3. the span between the outermost braces embedded in prose.
 * Returns null when nothing parses (or when input is not a non-empty string).
 *
 * @param {string} text - raw model output
 * @returns {object|null} parsed JSON value, or null
 */
function extractJson(text) {
  if (typeof text !== 'string') return null
  const body = text.trim()
  if (!body) return null

  const candidates = [body]

  const fenced = body.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
  if (fenced?.[1]) candidates.push(fenced[1].trim())

  const open = body.indexOf('{')
  const close = body.lastIndexOf('}')
  if (open >= 0 && close > open) candidates.push(body.slice(open, close + 1))

  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate)
    } catch {
      // Fall through to the next candidate span.
    }
  }
  return null
}
|
|
241
|
+
|
|
242
|
+
module.exports = {
|
|
243
|
+
generateRecipe,
|
|
244
|
+
ANTHROPIC_MODEL,
|
|
245
|
+
// exported for tests
|
|
246
|
+
extractJson,
|
|
247
|
+
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* URL normalization for the page recipe cache.
|
|
3
|
+
*
|
|
4
|
+
* Two URLs that resolve to the same template + the same DOM fingerprint are
|
|
5
|
+
* treated as structurally identical and share an extraction recipe. The
|
|
6
|
+
* template captures the path/query *shape* (with IDs / slugs / pagination
|
|
7
|
+
* placeholdered out) so the cache hits across e.g.
|
|
8
|
+
* /work/proposal/123456?utm_source=foo
|
|
9
|
+
* /work/proposal/789012?utm_source=bar&fbclid=...
|
|
10
|
+
* Tracking params are stripped, remaining query keys are sorted, and known
|
|
11
|
+
* pagination params keep their key but lose their value.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
// Query params that are pure tracking noise — removed from both the cache
// template and the canonical URL.
const STRIP_QUERY_PARAMS = new Set([
  'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
  'fbclid', 'gclid', 'mc_cid', 'mc_eid', 'ref', 'referrer',
])

// Query params that select a page within the same template — the key is
// kept but the value collapses to ':n'.
const PAGINATION_QUERY_PARAMS = new Set([
  'page', 'p', 'pagenumber', 'pagenum', 'offset', 'start',
])

// Path-segment shapes that are almost certainly per-entity identifiers.
const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i
const HEX32_RE = /^[0-9a-f]{32}$/i
const ALL_DIGITS_RE = /^\d+$/
const LONG_ALNUM_RE = /^(?=.*\d)(?=.*[a-zA-Z])[A-Za-z0-9]{20,}$/

/**
 * Replace an identifier-looking path segment with a stable placeholder so
 * structurally identical URLs share one template. Non-identifier segments
 * pass through unchanged.
 *
 * @param {string} segment - one path segment (no slashes)
 * @returns {string} ':id' | ':uuid' | ':slug' | the original segment
 */
function placeholderForSegment(segment) {
  if (!segment) return segment
  if (ALL_DIGITS_RE.test(segment)) return ':id'
  if (UUID_RE.test(segment) || HEX32_RE.test(segment)) return ':uuid'
  if (LONG_ALNUM_RE.test(segment)) return ':slug'
  return segment
}

/**
 * Normalize a URL for the page recipe cache.
 *
 * Returns:
 *   - template: host + placeholdered path + filtered/sorted query *shape*
 *     (IDs/slugs/pagination values replaced), used together with the DOM
 *     fingerprint as the cache key;
 *   - canonicalUrl: the original URL minus tracking params and fragment,
 *     with real values intact so the actual fetch stays correct.
 *
 * @param {string} rawUrl
 * @returns {{template: string, canonicalUrl: string}}
 * @throws {Error} when rawUrl cannot be parsed (parser error kept as `cause`)
 */
function normalizeUrl(rawUrl) {
  let parsed
  try {
    parsed = new URL(rawUrl)
  } catch (err) {
    // Preserve the parser's original error instead of discarding it.
    throw new Error(`Invalid URL: ${rawUrl}`, { cause: err })
  }

  // Lowercase host (case-insensitive per RFC 3986)
  const host = parsed.host.toLowerCase()

  // Path: keep separators, placeholder each segment.
  const templatePath = parsed.pathname
    .split('/')
    .map(seg => (seg ? placeholderForSegment(seg) : seg))
    .join('/')

  // Query: drop tracking params, blank out pagination values, sort by key
  // so param order never splits the cache.
  const queryEntries = []
  for (const [key, value] of parsed.searchParams.entries()) {
    const lowerKey = key.toLowerCase()
    if (STRIP_QUERY_PARAMS.has(lowerKey)) continue
    queryEntries.push([key, PAGINATION_QUERY_PARAMS.has(lowerKey) ? ':n' : value])
  }
  queryEntries.sort((a, b) => (a[0] < b[0] ? -1 : a[0] > b[0] ? 1 : 0))

  const templateQuery = queryEntries.length
    ? '?' + queryEntries.map(([k, v]) => `${k}=${v}`).join('&')
    : ''

  // Canonical URL: original protocol + host + path, query reduced to
  // non-tracking params (keeping their *real* values so the fetch works).
  const canonical = new URL(parsed.toString())
  for (const param of [...canonical.searchParams.keys()]) {
    if (STRIP_QUERY_PARAMS.has(param.toLowerCase())) {
      canonical.searchParams.delete(param)
    }
  }
  canonical.hash = ''

  return {
    template: `${host}${templatePath}${templateQuery}`,
    canonicalUrl: canonical.toString(),
  }
}
|
|
85
|
+
|
|
86
|
+
module.exports = {
|
|
87
|
+
normalizeUrl,
|
|
88
|
+
STRIP_QUERY_PARAMS,
|
|
89
|
+
PAGINATION_QUERY_PARAMS,
|
|
90
|
+
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Admin routes
|
|
3
|
+
*
|
|
4
|
+
* HQ-only operations pushed via Cloudflare Tunnel. Currently exposes the
|
|
5
|
+
* billing-driven freeze endpoint. Recovery is intentionally NOT exposed
|
|
6
|
+
* here — HQ recovers a frozen minion by sending the existing
|
|
7
|
+
* `restart-agent` command (commands.js), which terminates the process and
|
|
8
|
+
* lets the in-memory frozen flag clear naturally.
|
|
9
|
+
*
|
|
10
|
+
* Endpoints:
|
|
11
|
+
* POST /api/admin/freeze - Set in-memory frozen state
|
|
12
|
+
* GET /api/admin/state - Diagnostic: report frozen state
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
const { verifyToken } = require('../lib/auth')
|
|
16
|
+
const frozenState = require('../lib/frozen-state')
|
|
17
|
+
|
|
18
|
+
/**
 * Register the HQ-only admin endpoints on the fastify instance.
 *
 * POST /api/admin/freeze - set the in-memory frozen flag; `reason` defaults
 *                          to 'past_due' when absent or non-string. Recovery
 *                          is NOT exposed here (HQ sends restart-agent).
 * GET  /api/admin/state  - diagnostic readout of the frozen state.
 *
 * Both endpoints require a valid token and answer 401 otherwise.
 */
async function adminRoutes(fastify) {
  // Shared 401 response shape for both endpoints.
  const unauthorized = reply => {
    reply.code(401)
    return { success: false, error: 'Unauthorized' }
  }

  fastify.post('/api/admin/freeze', async (request, reply) => {
    if (!verifyToken(request)) return unauthorized(reply)

    const { reason } = request.body || {}
    frozenState.setFrozen({
      reason: typeof reason === 'string' ? reason : 'past_due',
    })

    return { success: true, state: frozenState.getState() }
  })

  fastify.get('/api/admin/state', async (request, reply) => {
    if (!verifyToken(request)) return unauthorized(reply)
    return { success: true, state: frozenState.getState() }
  })
}

module.exports = { adminRoutes }
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web Page Extraction (experimental — v3.53.0)
|
|
3
|
+
*
|
|
4
|
+
* Endpoints:
|
|
5
|
+
* POST /api/web/extract - Fetch a URL and return structured JSON
|
|
6
|
+
* GET /api/web/recipes - List cached recipes (debug)
|
|
7
|
+
* DELETE /api/web/recipes/:template/:fingerprint - Drop a recipe
|
|
8
|
+
*
|
|
9
|
+
* The extract endpoint runs Playwright + Readability + an Anthropic Haiku
|
|
10
|
+
* recipe-generation step in this minion process, returning only structured
|
|
11
|
+
* JSON to the caller. Designed to keep large DOM payloads off the main
|
|
12
|
+
* Claude Code chat session that issued the request.
|
|
13
|
+
*
|
|
14
|
+
* Marked experimental: response shape may change.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
const { verifyToken } = require('../lib/auth')
|
|
18
|
+
const { extract } = require('../lib/web-extract')
|
|
19
|
+
const pageRecipeStore = require('../stores/page-recipe-store')
|
|
20
|
+
|
|
21
|
+
const REQUEST_TIMEOUT_MS = 60_000
|
|
22
|
+
|
|
23
|
+
/**
 * Register the experimental web-extraction endpoints.
 *
 * POST   /api/web/extract - fetch a URL and return structured JSON
 * GET    /api/web/recipes - list cached recipes (debug)
 * DELETE /api/web/recipes - drop one cached recipe (template + fingerprint
 *                           query params)
 *
 * All endpoints require a valid token (401 otherwise). Extract error mapping:
 *   503 - browser or LLM backend unavailable
 *   502 - primary LLM invoked but failed / returned bad JSON
 *   500 - anything else
 */
async function webRoutes(fastify) {
  fastify.post('/api/web/extract', async (request, reply) => {
    if (!verifyToken(request)) {
      reply.code(401)
      return { success: false, error: 'Unauthorized' }
    }

    const body = request.body || {}
    const { url, hint } = body

    if (!url || typeof url !== 'string') {
      reply.code(400)
      return { success: false, error: 'url (string) is required' }
    }
    try {
      new URL(url)
    } catch {
      reply.code(400)
      return { success: false, error: 'url is not a valid URL' }
    }

    let timer
    try {
      const work = extract({ url, hint: typeof hint === 'string' ? hint : null })
      // If the timeout wins the race, the losing extract() promise may still
      // reject later — pre-attach a no-op handler so that can never surface
      // as an unhandled rejection.
      work.catch(() => {})
      const result = await Promise.race([
        work,
        new Promise((_, rej) => {
          timer = setTimeout(() => rej(new Error('extract timeout')), REQUEST_TIMEOUT_MS)
        }),
      ])
      return { success: true, ...result }
    } catch (err) {
      if (
        err.code === 'PLAYWRIGHT_UNAVAILABLE' ||
        err.code === 'LLM_UNAVAILABLE' ||
        err.code === 'ANTHROPIC_KEY_MISSING'
      ) {
        reply.code(503)
        return { success: false, error: err.message, code: err.code }
      }
      if (err.code === 'PRIMARY_LLM_FAILED' || err.code === 'PRIMARY_LLM_BAD_JSON') {
        reply.code(502)
        return { success: false, error: err.message, code: err.code }
      }
      request.log.error({ err }, '[web/extract] failed')
      reply.code(500)
      return { success: false, error: err.message || String(err) }
    } finally {
      // The pending timer used to leak: it kept the event loop alive for up
      // to 60s after a successful extract and then fired an unhandled
      // rejection from the losing race branch. Always clear it.
      clearTimeout(timer)
    }
  })

  // Debug helpers — list / delete cached recipes.
  fastify.get('/api/web/recipes', async (request, reply) => {
    if (!verifyToken(request)) {
      reply.code(401)
      return { success: false, error: 'Unauthorized' }
    }
    const recipes = pageRecipeStore.listAll({ limit: 200 })
    return { success: true, experimental: true, recipes }
  })

  fastify.delete('/api/web/recipes', async (request, reply) => {
    if (!verifyToken(request)) {
      reply.code(401)
      return { success: false, error: 'Unauthorized' }
    }
    const { template, fingerprint } = request.query || {}
    if (!template || !fingerprint) {
      reply.code(400)
      return { success: false, error: 'template and fingerprint query params are required' }
    }
    const removed = pageRecipeStore.remove({ urlTemplate: template, domFingerprint: fingerprint })
    return { success: true, removed }
  })
}

module.exports = { webRoutes }
|