@geekbeer/minion 3.52.0 → 3.53.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,247 @@
1
+ /**
2
+ * Cold-path recipe generator.
3
+ *
4
+ * Resolution order (first available wins):
5
+ * 1. Primary LLM plugin (PUT /api/llm/config) — uses claude / gemini / codex
6
+ * CLI subprocess via the same plugin contract that runQuickLlmCall does.
7
+ * Prompt asks for plain JSON; we extract+parse it.
8
+ * 2. ANTHROPIC_API_KEY env var — direct Anthropic Messages API with
9
+ * tool_use schema enforcement (same fetch pattern as
10
+ * core/lib/revision-watcher.js).
11
+ * 3. Otherwise: throw LLM_UNAVAILABLE so the route layer can surface a 503.
12
+ *
13
+ * The model only ever sees cleaned Markdown — never the raw HTML — and only
14
+ * runs on cold-cache misses, so the cost is bounded.
15
+ *
16
+ * Returns:
17
+ * {
18
+ * pageType: 'article' | 'listing' | 'product' | 'profile' | 'form' | 'other',
19
+ * selectors: { fieldName: { selector, attr?, multiple? }, ... },
20
+ * extracted: { fieldName: <value already pulled from this page> }
21
+ * }
22
+ */
23
+
24
+ const { getActivePrimary } = require('../../llm-plugins/lib/active')
25
+
26
// Model + request limits for the direct-Anthropic fallback path.
const ANTHROPIC_MODEL = 'claude-haiku-4-5-20251001'
const MAX_TOKENS = 2048
// Upper bound for one primary-plugin subprocess invocation.
const PLUGIN_TIMEOUT_MS = 60_000

// Tool description shared by the tool_use schema below.
const TOOL_DESCRIPTION =
  'Classify the page and propose CSS selectors for the most useful fields. ' +
  'Also produce the extracted values directly so the caller can verify the recipe.'

// tool_use schema for the direct-Anthropic path: forces the model to emit
// page_type / selectors / extracted as structured JSON instead of prose.
const ANTHROPIC_TOOLS = [{
  name: 'page_extraction',
  description: TOOL_DESCRIPTION,
  input_schema: {
    type: 'object',
    required: ['page_type', 'selectors', 'extracted'],
    properties: {
      page_type: {
        type: 'string',
        enum: ['article', 'listing', 'product', 'profile', 'form', 'other'],
        description: 'High-level classification of the page.',
      },
      selectors: {
        type: 'object',
        description:
          'Map of fieldName -> { selector, attr?, multiple? }. Use plain CSS selectors. ' +
          'attr defaults to "text" (innerText); use "html" or an HTML attribute name to override. ' +
          'Set multiple=true for list fields. Aim for 3-8 fields covering the page\'s primary content.',
        additionalProperties: {
          type: 'object',
          required: ['selector'],
          properties: {
            selector: { type: 'string' },
            attr: { type: 'string' },
            multiple: { type: 'boolean' },
          },
        },
      },
      extracted: {
        type: 'object',
        description:
          'Values extracted from this specific page using the selectors above. ' +
          'Strings or arrays of strings.',
      },
    },
  },
}]

// System prompt shared by both backends: sent as `system` on the direct
// Anthropic path, and inlined at the top of the text prompt on the plugin path.
const SYSTEM_PROMPT = `You design CSS selector recipes for extracting structured content from web pages.

Given a cleaned Markdown rendering of one page, you must:
1. Classify the page (article / listing / product / profile / form / other).
2. Propose 3-8 CSS selectors that capture the page's primary information.
   - Prefer semantic selectors (article, h1, time[datetime], a[rel="author"]) over class names where possible.
   - Use class-based selectors only when semantic ones are unavailable.
   - Avoid fragile attribute selectors like data-react-* or auto-generated hashes.
3. Fill the "extracted" object with the values pulled from this exact page so the caller can verify your recipe works.

The same recipe will be reused for structurally similar pages, so think about what generalizes.`
83
+
84
/**
 * Generate an extraction recipe for one page.
 *
 * Backend resolution order: configured primary LLM plugin, then the
 * ANTHROPIC_API_KEY direct path, otherwise throw LLM_UNAVAILABLE so the
 * route layer can surface a 503.
 *
 * @param {{ url: string, cleanedMarkdown: string, hint?: string|null }} args
 * @returns {Promise<{ pageType: string, selectors: object, extracted: object, source: string }>}
 * @throws {Error} with code 'LLM_UNAVAILABLE' when no backend is configured.
 */
async function generateRecipe({ url, cleanedMarkdown, hint }) {
  const primary = getActivePrimary()
  if (primary) {
    return generateViaPlugin(primary, { url, cleanedMarkdown, hint })
  }
  if (process.env.ANTHROPIC_API_KEY) {
    return generateViaAnthropicDirect({ url, cleanedMarkdown, hint })
  }
  const err = new Error(
    'No LLM available for cold-path recipe generation. Configure a primary LLM via ' +
    'PUT /api/llm/config (recommended) or set ANTHROPIC_API_KEY as a fallback.'
  )
  err.code = 'LLM_UNAVAILABLE'
  throw err
}
99
+
100
/**
 * Cold-path generation through the configured primary LLM plugin.
 *
 * Builds a plain-text prompt (the plugin contract has no tool_use), invokes
 * the plugin subprocess, then extracts and parses the JSON reply.
 *
 * @throws {Error} code 'PRIMARY_LLM_FAILED' when invoke throws or the plugin
 *                 reports an error; 'PRIMARY_LLM_BAD_JSON' when no parseable
 *                 JSON can be found in the reply.
 */
async function generateViaPlugin(plugin, { url, cleanedMarkdown, hint }) {
  const input = {
    prompt: buildTextPrompt({ url, cleanedMarkdown, hint }),
    timeoutMs: PLUGIN_TIMEOUT_MS,
  }
  // Claude Code CLI accepts model aliases like 'haiku'. Other plugins ignore this.
  if (plugin.name === 'claude') input.model = 'haiku'

  let output
  try {
    output = await plugin.invoke(input)
  } catch (err) {
    const failure = new Error(`Primary LLM (${plugin.name}) invoke failed: ${err.message}`)
    failure.code = 'PRIMARY_LLM_FAILED'
    throw failure
  }
  if (output.error) {
    const failure = new Error(`Primary LLM (${plugin.name}) returned error: ${output.error.message}`)
    failure.code = 'PRIMARY_LLM_FAILED'
    throw failure
  }

  const raw = output.text || ''
  const json = extractJson(raw)
  if (!json) {
    const failure = new Error(
      `Primary LLM (${plugin.name}) did not return parseable JSON. ` +
      `Raw output (first 500 chars): ${raw.slice(0, 500)}`
    )
    failure.code = 'PRIMARY_LLM_BAD_JSON'
    throw failure
  }

  return {
    pageType: json.page_type || 'other',
    selectors: json.selectors || {},
    extracted: json.extracted || {},
    source: `primary:${plugin.name}`,
  }
}
139
+
140
/**
 * Cold-path generation via the Anthropic Messages API.
 *
 * Uses tool_use with a JSON schema (ANTHROPIC_TOOLS) plus a forced
 * tool_choice, so the reply is structured by construction — no text parsing.
 * Requires ANTHROPIC_API_KEY in the environment (checked by the caller).
 *
 * @throws {Error} on a non-2xx API response or a missing tool_use block.
 */
async function generateViaAnthropicDirect({ url, cleanedMarkdown, hint }) {
  const parts = [`URL: ${url}`]
  if (hint) parts.push(`Caller hint: ${hint}`)
  parts.push('--- Cleaned Markdown (Readability output) ---')
  parts.push(cleanedMarkdown || '(empty)')
  const userMessage = parts.join('\n')

  const resp = await fetch('https://api.anthropic.com/v1/messages', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-api-key': process.env.ANTHROPIC_API_KEY,
      'anthropic-version': '2023-06-01',
    },
    body: JSON.stringify({
      model: ANTHROPIC_MODEL,
      max_tokens: MAX_TOKENS,
      system: SYSTEM_PROMPT,
      tools: ANTHROPIC_TOOLS,
      tool_choice: { type: 'tool', name: 'page_extraction' },
      messages: [{ role: 'user', content: userMessage }],
    }),
  })

  if (!resp.ok) {
    const text = await resp.text()
    throw new Error(`Anthropic API error: ${resp.status} ${text}`)
  }

  const data = await resp.json()
  const blocks = data.content || []
  const toolUse = blocks.find(b => b.type === 'tool_use' && b.name === 'page_extraction')
  if (!toolUse || !toolUse.input) {
    throw new Error('Anthropic API returned no tool_use block for page_extraction')
  }

  const input = toolUse.input
  return {
    pageType: input.page_type || 'other',
    selectors: input.selectors || {},
    extracted: input.extracted || {},
    source: 'anthropic-direct',
  }
}
186
+
187
/**
 * Render the full plain-text prompt for plugin-based generation:
 * system instructions, page context, and a strict JSON output contract
 * (the plugin path has no tool_use schema to enforce structure).
 */
function buildTextPrompt({ url, cleanedMarkdown, hint }) {
  const lines = [SYSTEM_PROMPT, '', `URL: ${url}`]
  if (hint) lines.push(`Caller hint: ${hint}`)
  lines.push(
    '',
    'Output ONLY a JSON object — no prose, no explanations, no code fences.',
    'The JSON must have exactly this shape:',
    '',
    '{',
    '  "page_type": "article" | "listing" | "product" | "profile" | "form" | "other",',
    '  "selectors": {',
    '    "<fieldName>": { "selector": "<css>", "attr"?: "<text|html|attribute-name>", "multiple"?: <boolean> }',
    '  },',
    '  "extracted": { "<fieldName>": "<string or array of strings>" }',
    '}',
    '',
    'Notes:',
    '- attr defaults to "text" (innerText). Use "html" or an HTML attribute name to override.',
    '- Set multiple=true for list fields (returns array).',
    '- "extracted" must contain the values you actually read from THIS page using those selectors.',
    '',
    '--- Cleaned Markdown ---',
    cleanedMarkdown || '(empty)',
  )
  return lines.join('\n')
}
214
+
215
/**
 * Pull a JSON object out of free-form LLM text. Tries, in order:
 * 1. the whole (trimmed) text as raw JSON,
 * 2. the contents of the first fenced code block (```json ... ``` or ``` ... ```),
 * 3. the span between the first '{' and the last '}'.
 * Returns the first candidate that parses, or null when none does.
 */
function extractJson(text) {
  if (!text || typeof text !== 'string') return null
  const body = text.trim()
  if (!body) return null

  const candidates = [body]

  const fenced = body.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
  if (fenced && fenced[1]) candidates.push(fenced[1].trim())

  const open = body.indexOf('{')
  const close = body.lastIndexOf('}')
  if (open >= 0 && close > open) candidates.push(body.slice(open, close + 1))

  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate)
    } catch {
      // not valid JSON in this shape — fall through to the next candidate
    }
  }
  return null
}
241
+
242
// generateRecipe is the only production entry point; ANTHROPIC_MODEL is
// exposed so callers can report which model the fallback path uses.
module.exports = {
  generateRecipe,
  ANTHROPIC_MODEL,
  // exported for tests
  extractJson,
}
@@ -0,0 +1,90 @@
1
+ /**
2
+ * URL normalization for the page recipe cache.
3
+ *
4
+ * Two URLs that resolve to the same template + the same DOM fingerprint are
5
+ * treated as structurally identical and share an extraction recipe. The
6
+ * template captures the path/query *shape* (with IDs / slugs / pagination
7
+ * placeholdered out) so the cache hits across e.g.
8
+ * /work/proposal/123456?utm_source=foo
9
+ * /work/proposal/789012?utm_source=bar&fbclid=...
10
+ * Tracking params are stripped, remaining query keys are sorted, and known
11
+ * pagination params keep their key but lose their value.
12
+ */
13
+
14
// Tracking parameters that never affect page structure; dropped from both the
// template and the canonical URL.
const STRIP_QUERY_PARAMS = new Set([
  'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
  'fbclid', 'gclid', 'mc_cid', 'mc_eid', 'ref', 'referrer',
])

// Parameters whose presence matters for the template but whose value does not
// (every page of a paginated listing shares one recipe).
const PAGINATION_QUERY_PARAMS = new Set([
  'page', 'p', 'pagenumber', 'pagenum', 'offset', 'start',
])

// ID-like path-segment detectors, checked in placeholderForSegment.
const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i
const HEX32_RE = /^[0-9a-f]{32}$/i
const ALL_DIGITS_RE = /^\d+$/
const LONG_ALNUM_RE = /^(?=.*\d)(?=.*[a-zA-Z])[A-Za-z0-9]{20,}$/

/**
 * Map one path segment to a template placeholder: all digits -> ':id',
 * UUID or 32-char hex -> ':uuid', long mixed alphanumerics -> ':slug'.
 * Anything else (including the empty string) passes through untouched.
 */
function placeholderForSegment(segment) {
  if (!segment) return segment
  if (ALL_DIGITS_RE.test(segment)) return ':id'
  if (UUID_RE.test(segment) || HEX32_RE.test(segment)) return ':uuid'
  return LONG_ALNUM_RE.test(segment) ? ':slug' : segment
}

/**
 * Normalize a URL into a structural template (recipe-cache key) plus a
 * canonical URL that is still safe to fetch.
 *
 * @param {string} rawUrl
 * @returns {{ template: string, canonicalUrl: string }}
 * @throws {Error} when rawUrl cannot be parsed by the WHATWG URL parser.
 */
function normalizeUrl(rawUrl) {
  let parsed
  try {
    parsed = new URL(rawUrl)
  } catch {
    throw new Error(`Invalid URL: ${rawUrl}`)
  }

  // Host is case-insensitive per RFC 3986.
  const host = parsed.host.toLowerCase()

  // Placeholder each non-empty path segment, preserving the '/' separators.
  const templatePath = parsed.pathname
    .split('/')
    .map(seg => (seg ? placeholderForSegment(seg) : seg))
    .join('/')

  // Drop tracking params, blank out pagination values, then sort by key so
  // equivalent query strings collapse to one template.
  const kept = []
  for (const [key, value] of parsed.searchParams) {
    const lower = key.toLowerCase()
    if (STRIP_QUERY_PARAMS.has(lower)) continue
    kept.push([key, PAGINATION_QUERY_PARAMS.has(lower) ? ':n' : value])
  }
  kept.sort((a, b) => (a[0] < b[0] ? -1 : a[0] > b[0] ? 1 : 0))
  const templateQuery = kept.length
    ? '?' + kept.map(pair => `${pair[0]}=${pair[1]}`).join('&')
    : ''

  // Canonical URL keeps the real values of the surviving params (so an actual
  // fetch of canonicalUrl still hits the same resource) and clears the hash.
  const canonical = new URL(parsed.toString())
  for (const key of [...canonical.searchParams.keys()]) {
    if (STRIP_QUERY_PARAMS.has(key.toLowerCase())) {
      canonical.searchParams.delete(key)
    }
  }
  canonical.hash = ''

  return {
    template: `${host}${templatePath}${templateQuery}`,
    canonicalUrl: canonical.toString(),
  }
}
85
+
86
// normalizeUrl is the production entry point; the two Sets are exported so
// tests and callers can inspect the stripping/pagination rules.
module.exports = {
  normalizeUrl,
  STRIP_QUERY_PARAMS,
  PAGINATION_QUERY_PARAMS,
}
@@ -0,0 +1,94 @@
1
+ /**
2
+ * Web Page Extraction (experimental — v3.53.0)
3
+ *
4
+ * Endpoints:
5
+ * POST /api/web/extract - Fetch a URL and return structured JSON
6
+ * GET /api/web/recipes - List cached recipes (debug)
7
+ * DELETE /api/web/recipes?template=...&fingerprint=... - Drop a recipe
8
+ *
9
+ * The extract endpoint runs Playwright + Readability + an Anthropic Haiku
10
+ * recipe-generation step in this minion process, returning only structured
11
+ * JSON to the caller. Designed to keep large DOM payloads off the main
12
+ * Claude Code chat session that issued the request.
13
+ *
14
+ * Marked experimental: response shape may change.
15
+ */
16
+
17
+ const { verifyToken } = require('../lib/auth')
18
+ const { extract } = require('../lib/web-extract')
19
+ const pageRecipeStore = require('../stores/page-recipe-store')
20
+
21
+ const REQUEST_TIMEOUT_MS = 60_000
22
+
23
/**
 * Register the experimental web-extraction routes on the fastify instance.
 * All routes require a valid API token (verifyToken).
 */
async function webRoutes(fastify) {
  // POST /api/web/extract — fetch a URL and return structured JSON.
  fastify.post('/api/web/extract', async (request, reply) => {
    if (!verifyToken(request)) {
      reply.code(401)
      return { success: false, error: 'Unauthorized' }
    }

    const body = request.body || {}
    const { url, hint } = body

    if (!url || typeof url !== 'string') {
      reply.code(400)
      return { success: false, error: 'url (string) is required' }
    }
    try {
      new URL(url)
    } catch {
      reply.code(400)
      return { success: false, error: 'url is not a valid URL' }
    }

    // Race the extraction against a hard deadline. The timer handle is kept
    // so it can be cleared once the race settles — a fire-and-forget timeout
    // promise would otherwise reject ~60s after every successful extract and
    // surface as an unhandled promise rejection.
    let timer
    const work = extract({ url, hint: typeof hint === 'string' ? hint : null })
    const deadline = new Promise((_, rej) => {
      timer = setTimeout(() => rej(new Error('extract timeout')), REQUEST_TIMEOUT_MS)
    })
    try {
      const result = await Promise.race([work, deadline])
      return { success: true, ...result }
    } catch (err) {
      // If the deadline won the race, the still-running extract may reject
      // later; attach a no-op handler so that rejection is not unhandled.
      work.catch(() => {})
      if (
        err.code === 'PLAYWRIGHT_UNAVAILABLE' ||
        err.code === 'LLM_UNAVAILABLE' ||
        err.code === 'ANTHROPIC_KEY_MISSING'
      ) {
        // Missing prerequisite (browser binary / LLM config) — retryable once fixed.
        reply.code(503)
        return { success: false, error: err.message, code: err.code }
      }
      if (err.code === 'PRIMARY_LLM_FAILED' || err.code === 'PRIMARY_LLM_BAD_JSON') {
        // Upstream LLM misbehaved — bad gateway rather than server fault.
        reply.code(502)
        return { success: false, error: err.message, code: err.code }
      }
      request.log.error({ err }, '[web/extract] failed')
      reply.code(500)
      return { success: false, error: err.message || String(err) }
    } finally {
      clearTimeout(timer)
    }
  })

  // Debug helpers — list / delete cached recipes.
  fastify.get('/api/web/recipes', async (request, reply) => {
    if (!verifyToken(request)) {
      reply.code(401)
      return { success: false, error: 'Unauthorized' }
    }
    const recipes = pageRecipeStore.listAll({ limit: 200 })
    return { success: true, experimental: true, recipes }
  })

  // DELETE /api/web/recipes?template=...&fingerprint=... — drop one recipe.
  fastify.delete('/api/web/recipes', async (request, reply) => {
    if (!verifyToken(request)) {
      reply.code(401)
      return { success: false, error: 'Unauthorized' }
    }
    const { template, fingerprint } = request.query || {}
    if (!template || !fingerprint) {
      reply.code(400)
      return { success: false, error: 'template and fingerprint query params are required' }
    }
    const removed = pageRecipeStore.remove({ urlTemplate: template, domFingerprint: fingerprint })
    return { success: true, removed }
  })
}
93
+
94
// Registered by the server bootstrap alongside the other route plugins.
module.exports = { webRoutes }
@@ -0,0 +1,143 @@
1
+ /**
2
+ * Page Recipe Store (SQLite, experimental — v3.53.0)
3
+ *
4
+ * Backs POST /api/web/extract. See core/lib/web-extract/extractor.js for the
5
+ * orchestrator that decides hot vs cold paths.
6
+ */
7
+
8
+ const { getDb } = require('../db')
9
+
10
// A recipe is evicted after this many consecutive verification failures.
const MAX_FAIL_COUNT = 3
11
+
12
/**
 * Look up a single recipe by its composite key.
 * @returns {object|null} parsed recipe, or null when no row matches.
 */
function find({ urlTemplate, domFingerprint }) {
  const row = getDb().prepare(`
    SELECT url_template, dom_fingerprint, selectors_json, page_type,
           hit_count, fail_count, last_verified_at, created_at
    FROM page_recipes
    WHERE url_template = ? AND dom_fingerprint = ?
  `).get(urlTemplate, domFingerprint)
  return row ? parseRow(row) : null
}
23
+
24
/**
 * All cached recipes for one URL template, most recently verified first
 * (never-verified rows sort last).
 * @returns {object[]} parsed recipes (possibly empty).
 */
function findByTemplate(urlTemplate) {
  const rows = getDb().prepare(`
    SELECT url_template, dom_fingerprint, selectors_json, page_type,
           hit_count, fail_count, last_verified_at, created_at
    FROM page_recipes
    WHERE url_template = ?
    ORDER BY last_verified_at DESC NULLS LAST, created_at DESC
  `).all(urlTemplate)
  return rows.map(parseRow)
}
35
+
36
/**
 * Insert a freshly generated recipe, or overwrite the existing one for the
 * same (template, fingerprint) — resetting fail_count and re-stamping
 * last_verified_at. hit_count is preserved on conflict.
 * @returns {object|null} the stored row, re-read via find().
 */
function upsert({ urlTemplate, domFingerprint, selectors, pageType }) {
  const selectorsJson = JSON.stringify(selectors || {})
  const now = new Date().toISOString()
  getDb().prepare(`
    INSERT INTO page_recipes (url_template, dom_fingerprint, selectors_json, page_type, hit_count, fail_count, last_verified_at, created_at)
    VALUES (?, ?, ?, ?, 0, 0, ?, ?)
    ON CONFLICT(url_template, dom_fingerprint) DO UPDATE SET
      selectors_json = excluded.selectors_json,
      page_type = excluded.page_type,
      fail_count = 0,
      last_verified_at = excluded.last_verified_at
  `).run(urlTemplate, domFingerprint, selectorsJson, pageType || null, now, now)
  return find({ urlTemplate, domFingerprint })
}
51
+
52
/** Bump hit_count for a recipe (hot-path bookkeeping; no-op when absent). */
function incrementHit({ urlTemplate, domFingerprint }) {
  getDb().prepare(`
    UPDATE page_recipes
    SET hit_count = hit_count + 1
    WHERE url_template = ? AND dom_fingerprint = ?
  `).run(urlTemplate, domFingerprint)
}
60
+
61
/** Stamp a successful verification: refresh last_verified_at, clear fail_count. */
function setLastVerified({ urlTemplate, domFingerprint }) {
  const verifiedAt = new Date().toISOString()
  getDb().prepare(`
    UPDATE page_recipes
    SET last_verified_at = ?, fail_count = 0
    WHERE url_template = ? AND dom_fingerprint = ?
  `).run(verifiedAt, urlTemplate, domFingerprint)
}
69
+
70
/**
 * Record one extraction failure for a recipe. Once the count reaches
 * MAX_FAIL_COUNT the recipe is evicted so the next request regenerates it
 * on the cold path.
 *
 * @returns {boolean} true when the recipe was deleted, false otherwise
 *                    (including when no such recipe exists).
 */
function incrementFail({ urlTemplate, domFingerprint }) {
  const db = getDb()
  const existing = db.prepare(`
    SELECT fail_count FROM page_recipes
    WHERE url_template = ? AND dom_fingerprint = ?
  `).get(urlTemplate, domFingerprint)
  if (!existing) return false

  const failCount = (existing.fail_count || 0) + 1
  if (failCount >= MAX_FAIL_COUNT) {
    remove({ urlTemplate, domFingerprint })
    return true
  }
  db.prepare(`
    UPDATE page_recipes
    SET fail_count = ?
    WHERE url_template = ? AND dom_fingerprint = ?
  `).run(failCount, urlTemplate, domFingerprint)
  return false
}
92
+
93
/** Delete one recipe; returns true when a row was actually removed. */
function remove({ urlTemplate, domFingerprint }) {
  const { changes } = getDb().prepare(`
    DELETE FROM page_recipes
    WHERE url_template = ? AND dom_fingerprint = ?
  `).run(urlTemplate, domFingerprint)
  return changes > 0
}
101
+
102
/**
 * Debug listing of cached recipes, most recently verified first.
 * @param {{ limit?: number }} [opts] row cap, defaults to 100.
 */
function listAll({ limit = 100 } = {}) {
  const rows = getDb().prepare(`
    SELECT url_template, dom_fingerprint, selectors_json, page_type,
           hit_count, fail_count, last_verified_at, created_at
    FROM page_recipes
    ORDER BY last_verified_at DESC NULLS LAST, created_at DESC
    LIMIT ?
  `).all(limit)
  return rows.map(parseRow)
}
113
+
114
/**
 * Convert a raw page_recipes row into the public recipe shape.
 * Malformed selectors_json degrades to an empty selector map instead of throwing.
 */
function parseRow(row) {
  let selectors
  try {
    selectors = JSON.parse(row.selectors_json || '{}')
  } catch {
    selectors = {}
  }
  const {
    url_template,
    dom_fingerprint,
    page_type,
    hit_count,
    fail_count,
    last_verified_at,
    created_at,
  } = row
  return {
    url_template,
    dom_fingerprint,
    selectors,
    page_type,
    hit_count,
    fail_count,
    last_verified_at,
    created_at,
  }
}
132
+
133
// Store API consumed by core/lib/web-extract/extractor.js and the debug routes.
module.exports = {
  find,
  findByTemplate,
  upsert,
  incrementHit,
  incrementFail,
  setLastVerified,
  remove,
  listAll,
  // exposed so callers/tests can reason about the eviction threshold
  MAX_FAIL_COUNT,
}
@@ -696,6 +696,72 @@ PUT `/api/email/inbox/:id` body:
696
696
 
697
697
  Note: 既読メールは受信後90日で自動削除される。未読メールは保持される。
698
698
 
699
+ ### Web Page Extraction 🧪 (experimental, v3.53.0〜)
700
+
701
+ Web ページの読み取り・要約・情報抽出をミニオン内のサブプロセスで完結させ、メインの Claude Code セッションには結果 JSON だけを返すための実験的 API。Playwright MCP で DOM 全体がチャットに流れ込みトークン肥大化を起こす問題への対処として導入。
702
+
703
+ | Method | Endpoint | Description |
704
+ |--------|----------|-------------|
705
+ | POST | `/api/web/extract` | URL を抽出済み JSON に変換 (タイトル・本文・主要構造化データ) |
706
+ | GET | `/api/web/recipes` | キャッシュされたレシピ一覧 (debug) |
707
+ | DELETE | `/api/web/recipes?template=...&fingerprint=...` | レシピを削除 |
708
+
709
+ **前提条件 (LLM の解決順):**
710
+ 1. **primary LLM プラグインが設定されていればそれを使用** (推奨)。`PUT /api/llm/config` で `claude` 等を primary に指定すると、CLI の認証情報 (`~/.claude/.credentials.json`) がそのまま使われる。API キーの別途設定は不要。
711
+ 2. primary 未設定で `ANTHROPIC_API_KEY` シークレットが設定されていれば、そちらを fallback として使用 (Anthropic Messages API の tool_use で JSON Schema を強制)。
712
+ 3. どちらもなければ 503 (`LLM_UNAVAILABLE`) を返す。
713
+
714
+ その他の前提:
715
+ - ホスト上で `npx playwright install chromium` を実行済みであること (未実行の場合 503 が返る)
716
+
717
+ **`POST /api/web/extract` リクエスト:**
718
+ ```json
719
+ {
720
+ "url": "https://example.com/article/123",
721
+ "hint": "本文と著者を抽出してほしい (任意, 抽出フィールドのヒント)"
722
+ }
723
+ ```
724
+
725
+ **レスポンス (success):**
726
+ ```json
727
+ {
728
+ "success": true,
729
+ "experimental": true,
730
+ "url": "https://example.com/article/123",
731
+ "finalUrl": "https://example.com/article/123",
732
+ "statusCode": 200,
733
+ "recipeMode": "cold",
734
+ "recipeId": "example.com/article/:id#abc123def456",
735
+ "pageType": "article",
736
+ "title": "...",
737
+ "content": "Markdown 本文...",
738
+ "structured": { "title": "...", "author": "...", "publishedAt": "..." },
739
+ "selectors": { "title": { "selector": "h1" }, "author": { "selector": "a[rel=author]" } }
740
+ }
741
+ ```
742
+
743
+ **動作:**
744
+ - 初回アクセス (cold): Playwright でレンダリング → Readability で本文抽出 → Anthropic Haiku でセレクタ生成 → SQLite (`page_recipes`) に保存 → セレクタで再抽出して返却
745
+ - 2回目以降 (hot): URL 正規化・テンプレート化 → DOM フィンガープリントで保存済みレシピを照合 → セレクタで抽出のみ (LLM 呼び出しなし)
746
+ - セルフヒール: hot 実行で空結果が返ったら `fail_count++`、3回失敗で破棄して次回 cold 再生成
747
+
748
+ **URL 正規化ルール:**
749
+ - `utm_*` `fbclid` `gclid` `ref` 等のトラッキングクエリは除去
750
+ - `page` `p` `offset` 等のページネーション値は `:n` プレースホルダ化
751
+ - パスセグメントは数値→`:id`、UUID→`:uuid`、20文字以上の英数字→`:slug` に置換
752
+ - 例: `https://www.lancers.jp/work/proposal/123456?utm_source=foo&page=2` → `www.lancers.jp/work/proposal/:id?page=:n` (ホストは小文字化のみで `www.` は除去されない)
753
+
754
+ **エラー:**
755
+ - 401: APIトークン不正
756
+ - 400: URL 欠落・不正
757
+ - 502: primary LLM 呼び出し失敗 (`PRIMARY_LLM_FAILED`) / 返却 JSON が parse 不能 (`PRIMARY_LLM_BAD_JSON`)
758
+ - 503: Playwright 未インストール (`PLAYWRIGHT_UNAVAILABLE`) / LLM 未設定 (`LLM_UNAVAILABLE`)
759
+ - 500: 抽出失敗 (タイムアウト等)
760
+
761
+ **注意 (experimental):**
762
+ - レスポンス形状・URL 正規化ルール・キャッシュスキーマは予告なく変更される可能性がある
763
+ - 認証付きページ (ログイン必須) は対応外。Cookie や対話的操作が必要な場合は Playwright MCP を使用すること
764
+
699
765
  ### Commands
700
766
 
701
767
  | Method | Endpoint | Description |