@geekbeer/minion 3.52.0 → 3.53.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +7 -0
- package/core/db/migrations/20260508000000_page_recipes.js +33 -0
- package/core/lib/web-extract/extractor.js +142 -0
- package/core/lib/web-extract/fingerprint.js +63 -0
- package/core/lib/web-extract/html-cleaner.js +72 -0
- package/core/lib/web-extract/index.js +21 -0
- package/core/lib/web-extract/playwright-runner.js +129 -0
- package/core/lib/web-extract/recipe-generator.js +247 -0
- package/core/lib/web-extract/url-normalize.js +90 -0
- package/core/routes/web.js +94 -0
- package/core/stores/page-recipe-store.js +143 -0
- package/docs/api-reference.md +66 -0
- package/docs/task-guides.md +58 -0
- package/linux/routes/chat.js +36 -4
- package/linux/server.js +2 -0
- package/mac/server.js +2 -0
- package/package.json +6 -2
- package/rules/core.md +21 -1
- package/win/routes/chat.js +37 -2
- package/win/server.js +2 -0
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cold-path recipe generator.
|
|
3
|
+
*
|
|
4
|
+
* Resolution order (first available wins):
|
|
5
|
+
* 1. Primary LLM plugin (PUT /api/llm/config) — uses claude / gemini / codex
|
|
6
|
+
* CLI subprocess via the same plugin contract that runQuickLlmCall does.
|
|
7
|
+
* Prompt asks for plain JSON; we extract+parse it.
|
|
8
|
+
* 2. ANTHROPIC_API_KEY env var — direct Anthropic Messages API with
|
|
9
|
+
* tool_use schema enforcement (same fetch pattern as
|
|
10
|
+
* core/lib/revision-watcher.js).
|
|
11
|
+
* 3. Otherwise: throw LLM_UNAVAILABLE so the route layer can surface a 503.
|
|
12
|
+
*
|
|
13
|
+
* The model only ever sees cleaned Markdown — never the raw HTML — and only
|
|
14
|
+
* runs on cold-cache misses, so the cost is bounded.
|
|
15
|
+
*
|
|
16
|
+
* Returns:
|
|
17
|
+
* {
|
|
18
|
+
* pageType: 'article' | 'listing' | 'product' | 'profile' | 'form' | 'other',
|
|
19
|
+
* selectors: { fieldName: { selector, attr?, multiple? }, ... },
|
|
20
|
+
* extracted: { fieldName: <value already pulled from this page> }
|
|
21
|
+
* }
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
const { getActivePrimary } = require('../../llm-plugins/lib/active')

// Model used by the direct-Anthropic fallback path (generateViaAnthropicDirect).
const ANTHROPIC_MODEL = 'claude-haiku-4-5-20251001'
// Response cap for the direct Anthropic Messages API call.
const MAX_TOKENS = 2048
// How long generateViaPlugin waits for the primary LLM plugin's subprocess.
const PLUGIN_TIMEOUT_MS = 60_000

// Description attached to the page_extraction tool definition below.
const TOOL_DESCRIPTION =
  'Classify the page and propose CSS selectors for the most useful fields. ' +
  'Also produce the extracted values directly so the caller can verify the recipe.'
|
|
33
|
+
|
|
34
|
+
// Tool definition for the direct Anthropic Messages API path. Combined with
// tool_choice { type: 'tool', name: 'page_extraction' } this forces the model
// to answer through the tool, so the reply is schema-constrained JSON rather
// than free text.
const ANTHROPIC_TOOLS = [{
  name: 'page_extraction',
  description: TOOL_DESCRIPTION,
  input_schema: {
    type: 'object',
    required: ['page_type', 'selectors', 'extracted'],
    properties: {
      page_type: {
        type: 'string',
        enum: ['article', 'listing', 'product', 'profile', 'form', 'other'],
        description: 'High-level classification of the page.',
      },
      selectors: {
        type: 'object',
        description:
          'Map of fieldName -> { selector, attr?, multiple? }. Use plain CSS selectors. ' +
          'attr defaults to "text" (innerText); use "html" or an HTML attribute name to override. ' +
          'Set multiple=true for list fields. Aim for 3-8 fields covering the page\'s primary content.',
        // Per-field entry shape inside the selectors map.
        additionalProperties: {
          type: 'object',
          required: ['selector'],
          properties: {
            selector: { type: 'string' },
            attr: { type: 'string' },
            multiple: { type: 'boolean' },
          },
        },
      },
      extracted: {
        type: 'object',
        description:
          'Values extracted from this specific page using the selectors above. ' +
          'Strings or arrays of strings.',
      },
    },
  },
}]
|
|
71
|
+
|
|
72
|
+
const SYSTEM_PROMPT = `You design CSS selector recipes for extracting structured content from web pages.
|
|
73
|
+
|
|
74
|
+
Given a cleaned Markdown rendering of one page, you must:
|
|
75
|
+
1. Classify the page (article / listing / product / profile / form / other).
|
|
76
|
+
2. Propose 3-8 CSS selectors that capture the page's primary information.
|
|
77
|
+
- Prefer semantic selectors (article, h1, time[datetime], a[rel="author"]) over class names where possible.
|
|
78
|
+
- Use class-based selectors only when semantic ones are unavailable.
|
|
79
|
+
- Avoid fragile attribute selectors like data-react-* or auto-generated hashes.
|
|
80
|
+
3. Fill the "extracted" object with the values pulled from this exact page so the caller can verify your recipe works.
|
|
81
|
+
|
|
82
|
+
The same recipe will be reused for structurally similar pages, so think about what generalizes.`
|
|
83
|
+
|
|
84
|
+
/**
 * Generate an extraction recipe for one page, using the first available
 * LLM backend: the configured primary plugin, then the ANTHROPIC_API_KEY
 * direct path. Throws an Error with code 'LLM_UNAVAILABLE' when neither
 * backend is configured, so the route layer can answer 503.
 */
async function generateRecipe({ url, cleanedMarkdown, hint }) {
  const job = { url, cleanedMarkdown, hint }

  const primaryPlugin = getActivePrimary()
  if (primaryPlugin) {
    return await generateViaPlugin(primaryPlugin, job)
  }

  if (process.env.ANTHROPIC_API_KEY) {
    return await generateViaAnthropicDirect(job)
  }

  const unavailable = new Error(
    'No LLM available for cold-path recipe generation. Configure a primary LLM via ' +
    'PUT /api/llm/config (recommended) or set ANTHROPIC_API_KEY as a fallback.'
  )
  unavailable.code = 'LLM_UNAVAILABLE'
  throw unavailable
}
|
|
99
|
+
|
|
100
|
+
/**
 * Cold-path generation through the configured primary LLM plugin (CLI
 * subprocess contract). The prompt asks for plain JSON, which is extracted
 * from the raw text output with extractJson().
 *
 * Throws with code 'PRIMARY_LLM_FAILED' when invoke() rejects or the plugin
 * reports an error, and 'PRIMARY_LLM_BAD_JSON' when the output cannot be
 * parsed as JSON.
 */
async function generateViaPlugin(plugin, { url, cleanedMarkdown, hint }) {
  const prompt = buildTextPrompt({ url, cleanedMarkdown, hint })
  const input = {
    prompt,
    timeoutMs: PLUGIN_TIMEOUT_MS,
  }
  // Claude Code CLI accepts model aliases like 'haiku'. Other plugins ignore this.
  if (plugin.name === 'claude') input.model = 'haiku'

  let output
  try {
    output = await plugin.invoke(input)
  } catch (err) {
    // Preserve the original failure for callers that log err.cause.
    const e = new Error(`Primary LLM (${plugin.name}) invoke failed: ${err.message}`, { cause: err })
    e.code = 'PRIMARY_LLM_FAILED'
    throw e
  }
  if (output.error) {
    // Plugins may report errors as an Error-like object or a bare string;
    // the original `output.error.message` printed "undefined" for strings.
    const detail = output.error?.message ?? String(output.error)
    const e = new Error(`Primary LLM (${plugin.name}) returned error: ${detail}`)
    e.code = 'PRIMARY_LLM_FAILED'
    throw e
  }

  const json = extractJson(output.text || '')
  if (!json) {
    const e = new Error(
      `Primary LLM (${plugin.name}) did not return parseable JSON. ` +
      `Raw output (first 500 chars): ${(output.text || '').slice(0, 500)}`
    )
    e.code = 'PRIMARY_LLM_BAD_JSON'
    throw e
  }
  return {
    pageType: json.page_type || 'other',
    selectors: json.selectors || {},
    extracted: json.extracted || {},
    source: `primary:${plugin.name}`,
  }
}
|
|
139
|
+
|
|
140
|
+
/**
 * Fallback cold path: call the Anthropic Messages API directly using
 * ANTHROPIC_API_KEY, forcing the page_extraction tool so the reply is
 * schema-constrained JSON (no free-text parsing needed).
 *
 * Throws a plain Error on a non-2xx response or when no tool_use block for
 * page_extraction comes back.
 */
async function generateViaAnthropicDirect({ url, cleanedMarkdown, hint }) {
  const apiKey = process.env.ANTHROPIC_API_KEY
  const userParts = [
    `URL: ${url}`,
    hint ? `Caller hint: ${hint}` : null,
    '',
    '--- Cleaned Markdown (Readability output) ---',
    cleanedMarkdown || '(empty)',
    // Filter only the null hint slot. The previous `.filter(Boolean)` also
    // dropped the intentional '' spacer line, unlike buildTextPrompt.
  ].filter(s => s !== null).join('\n')

  const resp = await fetch('https://api.anthropic.com/v1/messages', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-api-key': apiKey,
      'anthropic-version': '2023-06-01',
    },
    body: JSON.stringify({
      model: ANTHROPIC_MODEL,
      max_tokens: MAX_TOKENS,
      system: SYSTEM_PROMPT,
      tools: ANTHROPIC_TOOLS,
      // Force the model to answer via the tool so input matches the schema.
      tool_choice: { type: 'tool', name: 'page_extraction' },
      messages: [{ role: 'user', content: userParts }],
    }),
  })

  if (!resp.ok) {
    const text = await resp.text()
    throw new Error(`Anthropic API error: ${resp.status} ${text}`)
  }

  const data = await resp.json()
  const toolUse = (data.content || []).find(block => block.type === 'tool_use' && block.name === 'page_extraction')
  if (!toolUse || !toolUse.input) {
    throw new Error('Anthropic API returned no tool_use block for page_extraction')
  }

  const { page_type, selectors, extracted } = toolUse.input
  return {
    pageType: page_type || 'other',
    selectors: selectors || {},
    extracted: extracted || {},
    source: 'anthropic-direct',
  }
}
|
|
186
|
+
|
|
187
|
+
/**
 * Build the plain-text prompt used by CLI plugins (which have no tool_use
 * schema enforcement): system prompt, page context, then an explicit
 * JSON-only output contract mirroring the page_extraction tool schema.
 */
function buildTextPrompt({ url, cleanedMarkdown, hint }) {
  const lines = [
    SYSTEM_PROMPT,
    '',
    `URL: ${url}`,
  ]
  if (hint) lines.push(`Caller hint: ${hint}`)
  lines.push(
    '',
    'Output ONLY a JSON object — no prose, no explanations, no code fences.',
    'The JSON must have exactly this shape:',
    '',
    '{',
    '  "page_type": "article" | "listing" | "product" | "profile" | "form" | "other",',
    '  "selectors": {',
    '    "<fieldName>": { "selector": "<css>", "attr"?: "<text|html|attribute-name>", "multiple"?: <boolean> }',
    '  },',
    '  "extracted": { "<fieldName>": "<string or array of strings>" }',
    '}',
    '',
    'Notes:',
    '- attr defaults to "text" (innerText). Use "html" or an HTML attribute name to override.',
    '- Set multiple=true for list fields (returns array).',
    '- "extracted" must contain the values you actually read from THIS page using those selectors.',
    '',
    '--- Cleaned Markdown ---',
    cleanedMarkdown || '(empty)',
  )
  return lines.join('\n')
}
|
|
214
|
+
|
|
215
|
+
/**
 * Extract a JSON object from arbitrary LLM text. Handles three common shapes:
 * 1. Raw JSON
 * 2. Fenced code block (```json ... ``` or ``` ... ```)
 * 3. Prose with embedded {...} — uses the outermost braces
 *
 * Returns the parsed value, or null when nothing parses.
 */
function extractJson(text) {
  if (!text || typeof text !== 'string') return null
  const body = text.trim()
  if (!body) return null

  // Build candidates in the original priority order, then parse the first
  // one that succeeds.
  const candidates = [body]

  const fenced = body.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
  if (fenced && fenced[1]) candidates.push(fenced[1].trim())

  const open = body.indexOf('{')
  const close = body.lastIndexOf('}')
  if (open >= 0 && close > open) candidates.push(body.slice(open, close + 1))

  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate)
    } catch {
      // try the next, looser candidate
    }
  }
  return null
}
|
|
241
|
+
|
|
242
|
+
// Public surface: generateRecipe is the only entry point used by the
// extractor orchestrator; the rest is exposed for tests.
module.exports = {
  generateRecipe,
  ANTHROPIC_MODEL,
  // exported for tests
  extractJson,
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* URL normalization for the page recipe cache.
|
|
3
|
+
*
|
|
4
|
+
* Two URLs that resolve to the same template + the same DOM fingerprint are
|
|
5
|
+
* treated as structurally identical and share an extraction recipe. The
|
|
6
|
+
* template captures the path/query *shape* (with IDs / slugs / pagination
|
|
7
|
+
* placeholdered out) so the cache hits across e.g.
|
|
8
|
+
* /work/proposal/123456?utm_source=foo
|
|
9
|
+
* /work/proposal/789012?utm_source=bar&fbclid=...
|
|
10
|
+
* Tracking params are stripped, remaining query keys are sorted, and known
|
|
11
|
+
* pagination params keep their key but lose their value.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
// Tracking query params removed from both the template and the canonical URL.
const STRIP_QUERY_PARAMS = new Set([
  'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
  'fbclid', 'gclid', 'mc_cid', 'mc_eid', 'ref', 'referrer',
])

// Pagination params keep their key in the template but lose their value (':n').
const PAGINATION_QUERY_PARAMS = new Set([
  'page', 'p', 'pagenumber', 'pagenum', 'offset', 'start',
])

// Patterns that mark a path segment as volatile (ID-like) rather than structural.
const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i
const HEX32_RE = /^[0-9a-f]{32}$/i
const ALL_DIGITS_RE = /^\d+$/
const LONG_ALNUM_RE = /^(?=.*\d)(?=.*[a-zA-Z])[A-Za-z0-9]{20,}$/

/**
 * Map one path segment to its template placeholder: all-digits -> ':id',
 * UUID or 32-char hex -> ':uuid', long mixed alphanumeric -> ':slug'.
 * Structural segments (and empty strings) pass through unchanged.
 */
function placeholderForSegment(segment) {
  if (!segment) return segment

  // Checked in priority order; the first matching rule wins.
  const rules = [
    [ALL_DIGITS_RE, ':id'],
    [UUID_RE, ':uuid'],
    [HEX32_RE, ':uuid'],
    [LONG_ALNUM_RE, ':slug'],
  ]
  for (const [pattern, placeholder] of rules) {
    if (pattern.test(segment)) return placeholder
  }
  return segment
}
|
|
35
|
+
|
|
36
|
+
/**
 * Normalize a URL into:
 *   - template: host + placeholdered path + filtered/sorted query shape,
 *     used (with the DOM fingerprint) as the recipe cache key.
 *   - canonicalUrl: the original URL minus tracking params and hash, with
 *     real query values preserved so fetching it still works.
 *
 * Throws on unparseable input.
 */
function normalizeUrl(rawUrl) {
  let url
  try {
    url = new URL(rawUrl)
  } catch {
    throw new Error(`Invalid URL: ${rawUrl}`)
  }

  // Host is case-insensitive per RFC 3986.
  const host = url.host.toLowerCase()

  // Path: keep the separators, placeholder each non-empty segment.
  const templatePath = url.pathname
    .split('/')
    .map(segment => (segment ? placeholderForSegment(segment) : segment))
    .join('/')

  // Query: drop tracking keys, blank out pagination values, keep the rest.
  const kept = []
  for (const [key, value] of url.searchParams.entries()) {
    const lower = key.toLowerCase()
    if (STRIP_QUERY_PARAMS.has(lower)) continue
    kept.push([key, PAGINATION_QUERY_PARAMS.has(lower) ? ':n' : value])
  }
  // Sort by key so templates are order-independent.
  kept.sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0))

  const templateQuery = kept.length === 0
    ? ''
    : '?' + kept.map(([k, v]) => `${k}=${v}`).join('&')

  const template = `${host}${templatePath}${templateQuery}`

  // Canonical URL: original protocol + host + path; only tracking params and
  // the fragment are removed, so real values survive for the actual fetch.
  const canonical = new URL(url.toString())
  for (const key of [...canonical.searchParams.keys()]) {
    if (STRIP_QUERY_PARAMS.has(key.toLowerCase())) {
      canonical.searchParams.delete(key)
    }
  }
  canonical.hash = ''

  return {
    template,
    canonicalUrl: canonical.toString(),
  }
}
|
|
85
|
+
|
|
86
|
+
// Param sets are exported so tests (and callers) can inspect the filtering rules.
module.exports = {
  normalizeUrl,
  STRIP_QUERY_PARAMS,
  PAGINATION_QUERY_PARAMS,
}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web Page Extraction (experimental — v3.53.0)
|
|
3
|
+
*
|
|
4
|
+
* Endpoints:
|
|
5
|
+
* POST /api/web/extract - Fetch a URL and return structured JSON
|
|
6
|
+
* GET /api/web/recipes - List cached recipes (debug)
|
|
7
|
+
 *   DELETE /api/web/recipes?template=...&fingerprint=... - Drop a recipe
|
|
8
|
+
*
|
|
9
|
+
* The extract endpoint runs Playwright + Readability + an Anthropic Haiku
|
|
10
|
+
* recipe-generation step in this minion process, returning only structured
|
|
11
|
+
* JSON to the caller. Designed to keep large DOM payloads off the main
|
|
12
|
+
* Claude Code chat session that issued the request.
|
|
13
|
+
*
|
|
14
|
+
* Marked experimental: response shape may change.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
const { verifyToken } = require('../lib/auth')
|
|
18
|
+
const { extract } = require('../lib/web-extract')
|
|
19
|
+
const pageRecipeStore = require('../stores/page-recipe-store')
|
|
20
|
+
|
|
21
|
+
// Hard ceiling for one extract() run (Playwright render + optional LLM call).
const REQUEST_TIMEOUT_MS = 60_000
|
|
22
|
+
|
|
23
|
+
/**
 * Register the /api/web/* routes on the given Fastify instance.
 *
 *   POST   /api/web/extract  — fetch a URL and return structured JSON
 *   GET    /api/web/recipes  — list cached recipes (debug)
 *   DELETE /api/web/recipes  — drop one recipe (template + fingerprint query params)
 *
 * All routes require a valid API token (verifyToken).
 */
async function webRoutes(fastify) {
  fastify.post('/api/web/extract', async (request, reply) => {
    if (!verifyToken(request)) {
      reply.code(401)
      return { success: false, error: 'Unauthorized' }
    }

    const body = request.body || {}
    const { url, hint } = body

    if (!url || typeof url !== 'string') {
      reply.code(400)
      return { success: false, error: 'url (string) is required' }
    }
    try {
      new URL(url)
    } catch {
      reply.code(400)
      return { success: false, error: 'url is not a valid URL' }
    }

    let timer
    try {
      const extraction = extract({ url, hint: typeof hint === 'string' ? hint : null })
      // If the timeout wins the race, the losing extraction promise would
      // otherwise reject later with no handler (unhandledRejection).
      extraction.catch(() => {})
      const timeout = new Promise((_, reject) => {
        timer = setTimeout(() => reject(new Error('extract timeout')), REQUEST_TIMEOUT_MS)
      })
      const result = await Promise.race([extraction, timeout])
      return { success: true, ...result }
    } catch (err) {
      // 503: a prerequisite is missing (Playwright browser or an LLM backend).
      if (
        err.code === 'PLAYWRIGHT_UNAVAILABLE' ||
        err.code === 'LLM_UNAVAILABLE' ||
        err.code === 'ANTHROPIC_KEY_MISSING'
      ) {
        reply.code(503)
        return { success: false, error: err.message, code: err.code }
      }
      // 502: the upstream LLM failed or produced unusable output.
      if (err.code === 'PRIMARY_LLM_FAILED' || err.code === 'PRIMARY_LLM_BAD_JSON') {
        reply.code(502)
        return { success: false, error: err.message, code: err.code }
      }
      request.log.error({ err }, '[web/extract] failed')
      reply.code(500)
      return { success: false, error: err.message || String(err) }
    } finally {
      // Without this the 60s timer keeps the event loop alive after every request.
      clearTimeout(timer)
    }
  })

  // Debug helpers — list / delete cached recipes.
  fastify.get('/api/web/recipes', async (request, reply) => {
    if (!verifyToken(request)) {
      reply.code(401)
      return { success: false, error: 'Unauthorized' }
    }
    const recipes = pageRecipeStore.listAll({ limit: 200 })
    return { success: true, experimental: true, recipes }
  })

  fastify.delete('/api/web/recipes', async (request, reply) => {
    if (!verifyToken(request)) {
      reply.code(401)
      return { success: false, error: 'Unauthorized' }
    }
    const { template, fingerprint } = request.query || {}
    if (!template || !fingerprint) {
      reply.code(400)
      return { success: false, error: 'template and fingerprint query params are required' }
    }
    const removed = pageRecipeStore.remove({ urlTemplate: template, domFingerprint: fingerprint })
    return { success: true, removed }
  })
}

module.exports = { webRoutes }
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Page Recipe Store (SQLite, experimental — v3.53.0)
|
|
3
|
+
*
|
|
4
|
+
* Backs POST /api/web/extract. See core/lib/web-extract/extractor.js for the
|
|
5
|
+
* orchestrator that decides hot vs cold paths.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
const { getDb } = require('../db')
|
|
9
|
+
|
|
10
|
+
// A recipe is dropped (forcing a cold regeneration) after this many
// consecutive failures recorded via incrementFail().
const MAX_FAIL_COUNT = 3
|
|
11
|
+
|
|
12
|
+
/**
 * Look up one recipe by its (url_template, dom_fingerprint) key.
 * Returns the parsed recipe object, or null when none is cached.
 */
function find({ urlTemplate, domFingerprint }) {
  const row = getDb().prepare(`
    SELECT url_template, dom_fingerprint, selectors_json, page_type,
           hit_count, fail_count, last_verified_at, created_at
    FROM page_recipes
    WHERE url_template = ? AND dom_fingerprint = ?
  `).get(urlTemplate, domFingerprint)
  return row ? parseRow(row) : null
}
|
|
23
|
+
|
|
24
|
+
/**
 * List every cached recipe for one URL template (all DOM fingerprints),
 * most recently verified first.
 */
function findByTemplate(urlTemplate) {
  const stmt = getDb().prepare(`
    SELECT url_template, dom_fingerprint, selectors_json, page_type,
           hit_count, fail_count, last_verified_at, created_at
    FROM page_recipes
    WHERE url_template = ?
    ORDER BY last_verified_at DESC NULLS LAST, created_at DESC
  `)
  return stmt.all(urlTemplate).map(parseRow)
}
|
|
35
|
+
|
|
36
|
+
/**
 * Insert a recipe, or refresh an existing one: a refresh overwrites the
 * selectors and page type, resets fail_count, and bumps last_verified_at
 * while preserving hit_count. Returns the stored recipe (re-read via find).
 */
function upsert({ urlTemplate, domFingerprint, selectors, pageType }) {
  const selectorsJson = JSON.stringify(selectors || {})
  const timestamp = new Date().toISOString()
  getDb().prepare(`
    INSERT INTO page_recipes (url_template, dom_fingerprint, selectors_json, page_type, hit_count, fail_count, last_verified_at, created_at)
    VALUES (?, ?, ?, ?, 0, 0, ?, ?)
    ON CONFLICT(url_template, dom_fingerprint) DO UPDATE SET
      selectors_json = excluded.selectors_json,
      page_type = excluded.page_type,
      fail_count = 0,
      last_verified_at = excluded.last_verified_at
  `).run(urlTemplate, domFingerprint, selectorsJson, pageType || null, timestamp, timestamp)
  return find({ urlTemplate, domFingerprint })
}
|
|
51
|
+
|
|
52
|
+
/**
 * Bump the hit counter for one recipe (hot-path cache hit). No-op when the
 * recipe does not exist.
 */
function incrementHit({ urlTemplate, domFingerprint }) {
  const stmt = getDb().prepare(`
    UPDATE page_recipes
    SET hit_count = hit_count + 1
    WHERE url_template = ? AND dom_fingerprint = ?
  `)
  stmt.run(urlTemplate, domFingerprint)
}
|
|
60
|
+
|
|
61
|
+
/**
 * Mark a recipe as verified right now and clear its failure streak.
 */
function setLastVerified({ urlTemplate, domFingerprint }) {
  const verifiedAt = new Date().toISOString()
  getDb().prepare(`
    UPDATE page_recipes
    SET last_verified_at = ?, fail_count = 0
    WHERE url_template = ? AND dom_fingerprint = ?
  `).run(verifiedAt, urlTemplate, domFingerprint)
}
|
|
69
|
+
|
|
70
|
+
/**
 * Record one extraction failure for a recipe.
 * Returns true if the recipe was deleted (fail streak reached MAX_FAIL_COUNT),
 * false when the count was merely persisted or the recipe does not exist.
 */
function incrementFail({ urlTemplate, domFingerprint }) {
  const db = getDb()
  const current = db.prepare(`
    SELECT fail_count FROM page_recipes
    WHERE url_template = ? AND dom_fingerprint = ?
  `).get(urlTemplate, domFingerprint)
  if (!current) return false

  const failCount = (current.fail_count || 0) + 1
  if (failCount >= MAX_FAIL_COUNT) {
    // Too many consecutive failures: drop the recipe so the next request
    // regenerates it on the cold path.
    remove({ urlTemplate, domFingerprint })
    return true
  }

  db.prepare(`
    UPDATE page_recipes
    SET fail_count = ?
    WHERE url_template = ? AND dom_fingerprint = ?
  `).run(failCount, urlTemplate, domFingerprint)
  return false
}
|
|
92
|
+
|
|
93
|
+
/**
 * Delete one recipe. Returns true when a row was actually removed.
 */
function remove({ urlTemplate, domFingerprint }) {
  const outcome = getDb().prepare(`
    DELETE FROM page_recipes
    WHERE url_template = ? AND dom_fingerprint = ?
  `).run(urlTemplate, domFingerprint)
  return outcome.changes > 0
}
|
|
101
|
+
|
|
102
|
+
/**
 * List cached recipes, most recently verified first, capped at `limit`
 * (default 100). Used by the GET /api/web/recipes debug endpoint.
 */
function listAll({ limit = 100 } = {}) {
  const stmt = getDb().prepare(`
    SELECT url_template, dom_fingerprint, selectors_json, page_type,
           hit_count, fail_count, last_verified_at, created_at
    FROM page_recipes
    ORDER BY last_verified_at DESC NULLS LAST, created_at DESC
    LIMIT ?
  `)
  return stmt.all(limit).map(parseRow)
}
|
|
113
|
+
|
|
114
|
+
/**
 * Convert a raw page_recipes row into the public recipe shape, decoding
 * selectors_json and falling back to {} when the stored JSON is corrupt.
 */
function parseRow(row) {
  let selectors
  try {
    selectors = JSON.parse(row.selectors_json || '{}')
  } catch {
    // Corrupt JSON in the DB should not take the whole listing down.
    selectors = {}
  }

  const {
    url_template,
    dom_fingerprint,
    page_type,
    hit_count,
    fail_count,
    last_verified_at,
    created_at,
  } = row

  return {
    url_template,
    dom_fingerprint,
    selectors,
    page_type,
    hit_count,
    fail_count,
    last_verified_at,
    created_at,
  }
}
|
|
132
|
+
|
|
133
|
+
// Store API consumed by the web-extract orchestrator and the debug routes.
// MAX_FAIL_COUNT is exported so callers/tests can reference the self-heal threshold.
module.exports = {
  find,
  findByTemplate,
  upsert,
  incrementHit,
  incrementFail,
  setLastVerified,
  remove,
  listAll,
  MAX_FAIL_COUNT,
}
|
package/docs/api-reference.md
CHANGED
|
@@ -696,6 +696,72 @@ PUT `/api/email/inbox/:id` body:
|
|
|
696
696
|
|
|
697
697
|
Note: 既読メールは受信後90日で自動削除される。未読メールは保持される。
|
|
698
698
|
|
|
699
|
+
### Web Page Extraction 🧪 (experimental, v3.53.0〜)
|
|
700
|
+
|
|
701
|
+
Web ページの読み取り・要約・情報抽出をミニオン内のサブプロセスで完結させ、メインの Claude Code セッションには結果 JSON だけを返すための実験的 API。Playwright MCP で DOM 全体がチャットに流れ込みトークン肥大化を起こす問題への対処として導入。
|
|
702
|
+
|
|
703
|
+
| Method | Endpoint | Description |
|
|
704
|
+
|--------|----------|-------------|
|
|
705
|
+
| POST | `/api/web/extract` | URL を抽出済み JSON に変換 (タイトル・本文・主要構造化データ) |
|
|
706
|
+
| GET | `/api/web/recipes` | キャッシュされたレシピ一覧 (debug) |
|
|
707
|
+
| DELETE | `/api/web/recipes?template=...&fingerprint=...` | レシピを削除 |
|
|
708
|
+
|
|
709
|
+
**前提条件 (LLM の解決順):**
|
|
710
|
+
1. **primary LLM プラグインが設定されていればそれを使用** (推奨)。`PUT /api/llm/config` で `claude` 等を primary に指定すると、CLI の認証情報 (`~/.claude/.credentials.json`) がそのまま使われる。API キーの別途設定は不要。
|
|
711
|
+
2. primary 未設定で `ANTHROPIC_API_KEY` シークレットが設定されていれば、そちらを fallback として使用 (Anthropic Messages API の tool_use で JSON Schema を強制)。
|
|
712
|
+
3. どちらもなければ 503 (`LLM_UNAVAILABLE`) を返す。
|
|
713
|
+
|
|
714
|
+
その他の前提:
|
|
715
|
+
- ホスト上で `npx playwright install chromium` を実行済みであること (未実行の場合 503 が返る)
|
|
716
|
+
|
|
717
|
+
**`POST /api/web/extract` リクエスト:**
|
|
718
|
+
```json
|
|
719
|
+
{
|
|
720
|
+
"url": "https://example.com/article/123",
|
|
721
|
+
"hint": "本文と著者を抽出してほしい (任意, 抽出フィールドのヒント)"
|
|
722
|
+
}
|
|
723
|
+
```
|
|
724
|
+
|
|
725
|
+
**レスポンス (success):**
|
|
726
|
+
```json
|
|
727
|
+
{
|
|
728
|
+
"success": true,
|
|
729
|
+
"experimental": true,
|
|
730
|
+
"url": "https://example.com/article/123",
|
|
731
|
+
"finalUrl": "https://example.com/article/123",
|
|
732
|
+
"statusCode": 200,
|
|
733
|
+
"recipeMode": "cold",
|
|
734
|
+
"recipeId": "example.com/article/:id#abc123def456",
|
|
735
|
+
"pageType": "article",
|
|
736
|
+
"title": "...",
|
|
737
|
+
"content": "Markdown 本文...",
|
|
738
|
+
"structured": { "title": "...", "author": "...", "publishedAt": "..." },
|
|
739
|
+
"selectors": { "title": { "selector": "h1" }, "author": { "selector": "a[rel=author]" } }
|
|
740
|
+
}
|
|
741
|
+
```
|
|
742
|
+
|
|
743
|
+
**動作:**
|
|
744
|
+
- 初回アクセス (cold): Playwright でレンダリング → Readability で本文抽出 → Anthropic Haiku でセレクタ生成 → SQLite (`page_recipes`) に保存 → セレクタで再抽出して返却
|
|
745
|
+
- 2回目以降 (hot): URL 正規化・テンプレート化 → DOM フィンガープリントで保存済みレシピを照合 → セレクタで抽出のみ (LLM 呼び出しなし)
|
|
746
|
+
- セルフヒール: hot 実行で空結果が返ったら `fail_count++`、3回失敗で破棄して次回 cold 再生成
|
|
747
|
+
|
|
748
|
+
**URL 正規化ルール:**
|
|
749
|
+
- `utm_*` `fbclid` `gclid` `ref` 等のトラッキングクエリは除去
|
|
750
|
+
- `page` `p` `offset` 等のページネーション値は `:n` プレースホルダ化
|
|
751
|
+
- パスセグメントは数値→`:id`、UUID→`:uuid`、20文字以上の英数字→`:slug` に置換
|
|
752
|
+
- 例: `https://www.lancers.jp/work/proposal/123456?utm_source=foo&page=2` → `lancers.jp/work/proposal/:id?page=:n`
|
|
753
|
+
|
|
754
|
+
**エラー:**
|
|
755
|
+
- 401: APIトークン不正
|
|
756
|
+
- 400: URL 欠落・不正
|
|
757
|
+
- 502: primary LLM 呼び出し失敗 (`PRIMARY_LLM_FAILED`) / 返却 JSON が parse 不能 (`PRIMARY_LLM_BAD_JSON`)
|
|
758
|
+
- 503: Playwright 未インストール (`PLAYWRIGHT_UNAVAILABLE`) / LLM 未設定 (`LLM_UNAVAILABLE`)
|
|
759
|
+
- 500: 抽出失敗 (タイムアウト等)
|
|
760
|
+
|
|
761
|
+
**注意 (experimental):**
|
|
762
|
+
- レスポンス形状・URL 正規化ルール・キャッシュスキーマは予告なく変更される可能性がある
|
|
763
|
+
- 認証付きページ (ログイン必須) は対応外。Cookie や対話的操作が必要な場合は Playwright MCP を使用すること
|
|
764
|
+
|
|
699
765
|
### Commands
|
|
700
766
|
|
|
701
767
|
| Method | Endpoint | Description |
|