@geekbeer/minion 3.51.2 → 3.53.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +7 -0
- package/core/api.js +26 -2
- package/core/db/migrations/20260508000000_page_recipes.js +33 -0
- package/core/lib/board-task-poller.js +12 -1
- package/core/lib/dag-cron-poller.js +14 -1
- package/core/lib/dag-step-poller.js +17 -1
- package/core/lib/frozen-state.js +64 -0
- package/core/lib/revision-watcher.js +5 -1
- package/core/lib/step-poller.js +5 -1
- package/core/lib/thread-watcher.js +5 -1
- package/core/lib/web-extract/extractor.js +142 -0
- package/core/lib/web-extract/fingerprint.js +63 -0
- package/core/lib/web-extract/html-cleaner.js +72 -0
- package/core/lib/web-extract/index.js +21 -0
- package/core/lib/web-extract/playwright-runner.js +129 -0
- package/core/lib/web-extract/recipe-generator.js +247 -0
- package/core/lib/web-extract/url-normalize.js +90 -0
- package/core/routes/admin.js +49 -0
- package/core/routes/web.js +94 -0
- package/core/stores/page-recipe-store.js +143 -0
- package/docs/api-reference.md +83 -0
- package/docs/task-guides.md +58 -0
- package/linux/routes/chat.js +36 -4
- package/linux/server.js +4 -0
- package/mac/server.js +2 -0
- package/package.json +6 -2
- package/rules/core.md +29 -1
- package/win/routes/chat.js +37 -2
- package/win/server.js +4 -0
package/.env.example
CHANGED
|
@@ -17,3 +17,10 @@ MINION_ID=
|
|
|
17
17
|
|
|
18
18
|
# Agent port (optional, default: 8080)
|
|
19
19
|
AGENT_PORT=8080
|
|
20
|
+
|
|
21
|
+
# Anthropic API key (optional, experimental, fallback only) —
|
|
22
|
+
# POST /api/web/extract prefers the primary LLM plugin (see PUT /api/llm/config)
|
|
23
|
+
# and only uses ANTHROPIC_API_KEY if no primary plugin is configured. Set via:
|
|
24
|
+
# curl -X PUT http://localhost:8080/api/secrets/ANTHROPIC_API_KEY \
|
|
25
|
+
# -H "Authorization: Bearer $API_TOKEN" -d '{"value": "sk-ant-..."}'
|
|
26
|
+
ANTHROPIC_API_KEY=
|
package/core/api.js
CHANGED
|
@@ -6,6 +6,16 @@
|
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
8
|
const { config, isHqConfigured } = require('./config')
|
|
9
|
+
const frozenState = require('./lib/frozen-state')
|
|
10
|
+
|
|
11
|
+
/**
 * Error thrown when the minion is billing-frozen.
 *
 * Besides the message, every instance carries `statusCode` 402 and a
 * `billingFrozen` flag set to `true`, so callers can branch on the flag
 * instead of on the class.
 */
class BillingFrozenError extends Error {
  name = 'BillingFrozenError'
  statusCode = 402
  billingFrozen = true

  /**
   * @param {string} [reason] - Freeze reason; any falsy value is reported as "unknown".
   */
  constructor(reason) {
    const label = reason || 'unknown'
    super(`Minion is billing-frozen: ${label}`)
  }
}
|
|
9
19
|
|
|
10
20
|
/**
|
|
11
21
|
* Send HTTP request to the HQ server
|
|
@@ -17,6 +27,10 @@ async function request(endpoint, options = {}) {
|
|
|
17
27
|
return { skipped: true, reason: 'HQ not configured' }
|
|
18
28
|
}
|
|
19
29
|
|
|
30
|
+
if (frozenState.isFrozen()) {
|
|
31
|
+
throw new BillingFrozenError(frozenState.getState().reason)
|
|
32
|
+
}
|
|
33
|
+
|
|
20
34
|
const url = `${config.HQ_URL}/api/minion${endpoint}`
|
|
21
35
|
|
|
22
36
|
const response = await fetch(url, {
|
|
@@ -28,10 +42,19 @@ async function request(endpoint, options = {}) {
|
|
|
28
42
|
},
|
|
29
43
|
})
|
|
30
44
|
|
|
31
|
-
|
|
45
|
+
let data = null
|
|
46
|
+
try {
|
|
47
|
+
data = await response.json()
|
|
48
|
+
} catch {
|
|
49
|
+
data = {}
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
if (frozenState.maybeFreezeFromResponse(response, data)) {
|
|
53
|
+
throw new BillingFrozenError(data && data.reason)
|
|
54
|
+
}
|
|
32
55
|
|
|
33
56
|
if (!response.ok) {
|
|
34
|
-
const err = new Error(data.error || `API request failed: ${response.status}`)
|
|
57
|
+
const err = new Error((data && data.error) || `API request failed: ${response.status}`)
|
|
35
58
|
err.statusCode = response.status
|
|
36
59
|
throw err
|
|
37
60
|
}
|
|
@@ -203,4 +226,5 @@ module.exports = {
|
|
|
203
226
|
deleteThread,
|
|
204
227
|
createProjectMemory,
|
|
205
228
|
searchProjectMemories,
|
|
229
|
+
BillingFrozenError,
|
|
206
230
|
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* page_recipes — Web page extraction recipe cache (experimental, v3.53.0).
|
|
3
|
+
*
|
|
4
|
+
* Stores selectors learned from a first-time visit so subsequent visits to
|
|
5
|
+
* structurally similar pages skip the LLM round trip. Keyed by URL template
|
|
6
|
+
* (after normalization) + DOM fingerprint to tolerate A/B variants.
|
|
7
|
+
*
|
|
8
|
+
* Marked experimental: schema may change before the API stabilizes.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
module.exports = {
|
|
12
|
+
version: 20260508000000,
|
|
13
|
+
name: 'page_recipes',
|
|
14
|
+
|
|
15
|
+
up(db, { tableExists }) {
|
|
16
|
+
if (tableExists(db, 'page_recipes')) return
|
|
17
|
+
|
|
18
|
+
db.exec(`
|
|
19
|
+
CREATE TABLE page_recipes (
|
|
20
|
+
url_template TEXT NOT NULL,
|
|
21
|
+
dom_fingerprint TEXT NOT NULL,
|
|
22
|
+
selectors_json TEXT NOT NULL,
|
|
23
|
+
page_type TEXT,
|
|
24
|
+
hit_count INTEGER NOT NULL DEFAULT 0,
|
|
25
|
+
fail_count INTEGER NOT NULL DEFAULT 0,
|
|
26
|
+
last_verified_at TEXT,
|
|
27
|
+
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
|
28
|
+
PRIMARY KEY (url_template, dom_fingerprint)
|
|
29
|
+
);
|
|
30
|
+
CREATE INDEX idx_page_recipes_template ON page_recipes(url_template);
|
|
31
|
+
`)
|
|
32
|
+
},
|
|
33
|
+
}
|
|
@@ -21,6 +21,8 @@
|
|
|
21
21
|
|
|
22
22
|
const { config, isHqConfigured } = require('../config')
|
|
23
23
|
const concurrency = require('./concurrency-manager')
|
|
24
|
+
const frozenState = require('./frozen-state')
|
|
25
|
+
const api = require('../api')
|
|
24
26
|
|
|
25
27
|
// Polling interval: 30 seconds (matches dag-step-poller).
|
|
26
28
|
const POLL_INTERVAL_MS = 30_000
|
|
@@ -45,6 +47,9 @@ async function hqRequest(endpoint, options = {}) {
|
|
|
45
47
|
if (!isHqConfigured()) {
|
|
46
48
|
return { skipped: true, reason: 'HQ not configured' }
|
|
47
49
|
}
|
|
50
|
+
if (frozenState.isFrozen()) {
|
|
51
|
+
throw new api.BillingFrozenError(frozenState.getState().reason)
|
|
52
|
+
}
|
|
48
53
|
const url = `${config.HQ_URL}${endpoint}`
|
|
49
54
|
const resp = await fetch(url, {
|
|
50
55
|
...options,
|
|
@@ -61,6 +66,9 @@ async function hqRequest(endpoint, options = {}) {
|
|
|
61
66
|
} catch {
|
|
62
67
|
data = { raw: text }
|
|
63
68
|
}
|
|
69
|
+
if (frozenState.maybeFreezeFromResponse(resp, data)) {
|
|
70
|
+
throw new api.BillingFrozenError(data && data.reason)
|
|
71
|
+
}
|
|
64
72
|
if (!resp.ok) {
|
|
65
73
|
const err = new Error(data.error || `HQ ${endpoint} failed: ${resp.status}`)
|
|
66
74
|
err.statusCode = resp.status
|
|
@@ -72,6 +80,7 @@ async function hqRequest(endpoint, options = {}) {
|
|
|
72
80
|
|
|
73
81
|
async function pollOnce() {
|
|
74
82
|
if (!isHqConfigured()) return
|
|
83
|
+
if (frozenState.isFrozen()) return
|
|
75
84
|
if (!runner) {
|
|
76
85
|
console.warn('[BoardTaskPoller] No runner injected, skipping poll')
|
|
77
86
|
return
|
|
@@ -117,7 +126,9 @@ async function pollOnce() {
|
|
|
117
126
|
})
|
|
118
127
|
}
|
|
119
128
|
} catch (err) {
|
|
120
|
-
if (err.
|
|
129
|
+
if (err.billingFrozen) {
|
|
130
|
+
console.log('[BoardTaskPoller] Billing frozen, suspending poll')
|
|
131
|
+
} else if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
|
|
121
132
|
console.log('[BoardTaskPoller] HQ unreachable, will retry next cycle')
|
|
122
133
|
} else {
|
|
123
134
|
console.error(`[BoardTaskPoller] Poll error: ${err.message}`)
|
|
@@ -12,6 +12,8 @@
|
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
14
|
const { config, isHqConfigured } = require('../config')
|
|
15
|
+
const frozenState = require('./frozen-state')
|
|
16
|
+
const api = require('../api')
|
|
15
17
|
|
|
16
18
|
const POLL_INTERVAL_MS = 60_000
|
|
17
19
|
|
|
@@ -22,6 +24,7 @@ let lastFiredCount = 0
|
|
|
22
24
|
|
|
23
25
|
async function pollOnce() {
|
|
24
26
|
if (!isHqConfigured()) return
|
|
27
|
+
if (frozenState.isFrozen()) return
|
|
25
28
|
if (polling) return
|
|
26
29
|
|
|
27
30
|
polling = true
|
|
@@ -35,6 +38,14 @@ async function pollOnce() {
|
|
|
35
38
|
},
|
|
36
39
|
})
|
|
37
40
|
|
|
41
|
+
let payload = null
|
|
42
|
+
if (resp.status === 402) {
|
|
43
|
+
try { payload = await resp.json() } catch { payload = {} }
|
|
44
|
+
if (frozenState.maybeFreezeFromResponse(resp, payload)) {
|
|
45
|
+
throw new api.BillingFrozenError(payload && payload.reason)
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
|
|
38
49
|
if (!resp.ok) {
|
|
39
50
|
throw new Error(`dag-cron-tick failed: ${resp.status}`)
|
|
40
51
|
}
|
|
@@ -58,7 +69,9 @@ async function pollOnce() {
|
|
|
58
69
|
}
|
|
59
70
|
}
|
|
60
71
|
} catch (err) {
|
|
61
|
-
if (err.
|
|
72
|
+
if (err.billingFrozen) {
|
|
73
|
+
console.log('[DagCronPoller] Billing frozen, suspending poll')
|
|
74
|
+
} else if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
|
|
62
75
|
console.log('[DagCronPoller] HQ unreachable, will retry next cycle')
|
|
63
76
|
} else {
|
|
64
77
|
console.error(`[DagCronPoller] Poll error: ${err.message}`)
|
|
@@ -13,6 +13,7 @@ const { config, isHqConfigured } = require('../config')
|
|
|
13
13
|
const api = require('../api')
|
|
14
14
|
const variableStore = require('../stores/variable-store')
|
|
15
15
|
const concurrency = require('./concurrency-manager')
|
|
16
|
+
const frozenState = require('./frozen-state')
|
|
16
17
|
|
|
17
18
|
// Polling interval: 30 seconds (matches step-poller)
|
|
18
19
|
const POLL_INTERVAL_MS = 30_000
|
|
@@ -38,6 +39,10 @@ async function dagRequest(endpoint, options = {}) {
|
|
|
38
39
|
return { skipped: true, reason: 'HQ not configured' }
|
|
39
40
|
}
|
|
40
41
|
|
|
42
|
+
if (frozenState.isFrozen()) {
|
|
43
|
+
throw new api.BillingFrozenError(frozenState.getState().reason)
|
|
44
|
+
}
|
|
45
|
+
|
|
41
46
|
const url = `${config.HQ_URL}/api/dag/minion${endpoint}`
|
|
42
47
|
const resp = await fetch(url, {
|
|
43
48
|
...options,
|
|
@@ -48,6 +53,14 @@ async function dagRequest(endpoint, options = {}) {
|
|
|
48
53
|
},
|
|
49
54
|
})
|
|
50
55
|
|
|
56
|
+
let payload = null
|
|
57
|
+
if (resp.status === 402) {
|
|
58
|
+
try { payload = await resp.json() } catch { payload = {} }
|
|
59
|
+
if (frozenState.maybeFreezeFromResponse(resp, payload)) {
|
|
60
|
+
throw new api.BillingFrozenError(payload && payload.reason)
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
|
|
51
64
|
if (!resp.ok) {
|
|
52
65
|
const err = new Error(`DAG API ${endpoint} failed: ${resp.status}`)
|
|
53
66
|
err.statusCode = resp.status
|
|
@@ -62,6 +75,7 @@ async function dagRequest(endpoint, options = {}) {
|
|
|
62
75
|
*/
|
|
63
76
|
async function pollOnce() {
|
|
64
77
|
if (!isHqConfigured()) return
|
|
78
|
+
if (frozenState.isFrozen()) return
|
|
65
79
|
if (polling) return
|
|
66
80
|
|
|
67
81
|
polling = true
|
|
@@ -90,7 +104,9 @@ async function pollOnce() {
|
|
|
90
104
|
promise
|
|
91
105
|
}
|
|
92
106
|
} catch (err) {
|
|
93
|
-
if (err.
|
|
107
|
+
if (err.billingFrozen) {
|
|
108
|
+
console.log('[DagPoller] Billing frozen, suspending poll')
|
|
109
|
+
} else if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
|
|
94
110
|
console.log('[DagPoller] HQ unreachable, will retry next cycle')
|
|
95
111
|
} else {
|
|
96
112
|
console.error(`[DagPoller] Poll error: ${err.message}`)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Frozen State Module
|
|
3
|
+
*
|
|
4
|
+
* Tracks billing-driven freeze state in-memory. When frozen, all pollers
|
|
5
|
+
* skip their work and the shared HTTP wrappers refuse outbound calls.
|
|
6
|
+
*
|
|
7
|
+
* State is intentionally NOT persisted to disk:
|
|
8
|
+
* - Recovery is driven by HQ pushing `restart-agent` after payment success.
|
|
9
|
+
* - Process restart naturally clears the in-memory flag.
|
|
10
|
+
* - On restart, if billing is still past_due, the next request will receive
|
|
11
|
+
* 402 from HQ and self-freeze again. Self-healing.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
// Module-local freeze record. It lives only in memory, so a process restart
// starts again from the unfrozen defaults below.
const state = { frozen: false, frozenAt: null, reason: null }

/**
 * @returns {boolean} Whether the freeze flag is currently set.
 */
function isFrozen() {
  return state.frozen
}

/**
 * Set the freeze flag, recording the time and reason. A call made while
 * already frozen is ignored, so the first freeze's timestamp and reason win.
 *
 * @param {{ reason?: string }} [opts] - Optional freeze details; a falsy
 *   `reason` is stored as "unknown".
 */
function setFrozen(opts = {}) {
  if (state.frozen) return
  state.frozen = true
  state.frozenAt = new Date().toISOString()
  state.reason = opts.reason || 'unknown'
  console.log(`[FrozenState] Minion frozen: reason=${state.reason} at=${state.frozenAt}`)
}

/**
 * Reset the freeze record to its defaults. No-op when not frozen.
 */
function clearFrozen() {
  if (!state.frozen) return
  state.frozen = false
  state.frozenAt = null
  state.reason = null
  console.log('[FrozenState] Minion unfrozen (in-memory state cleared)')
}

/**
 * @returns {{ frozen: boolean, frozenAt: (string|null), reason: (string|null) }}
 *   A fresh snapshot of the freeze record.
 */
function getState() {
  return { ...state }
}

/**
 * Check a response for the billing-frozen signal and freeze when it matches.
 *
 * The signal is a 402 status combined with a payload whose `error` is
 * "billing_frozen", e.g. `{ "error": "billing_frozen", "reason": "past_due" }`.
 *
 * @param {{ status: number }} response - Response-like object; only `status` is read.
 * @param {object} payload - Parsed response body.
 * @returns {boolean} True when the signal matched (even if already frozen), false otherwise.
 */
function maybeFreezeFromResponse(response, payload) {
  const paymentRequired = Boolean(response) && response.status === 402
  if (!paymentRequired) return false
  if (!payload || payload.error !== 'billing_frozen') return false

  setFrozen({ reason: payload.reason || 'billing_frozen' })
  return true
}
|
|
57
|
+
|
|
58
|
+
module.exports = {
|
|
59
|
+
isFrozen,
|
|
60
|
+
setFrozen,
|
|
61
|
+
clearFrozen,
|
|
62
|
+
getState,
|
|
63
|
+
maybeFreezeFromResponse,
|
|
64
|
+
}
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
|
|
16
16
|
const { config, isHqConfigured } = require('../config')
|
|
17
17
|
const api = require('../api')
|
|
18
|
+
const frozenState = require('./frozen-state')
|
|
18
19
|
|
|
19
20
|
// Poll every 30 seconds (same frequency as step-poller)
|
|
20
21
|
const POLL_INTERVAL_MS = 30_000
|
|
@@ -33,6 +34,7 @@ const processingRevisions = new Set()
|
|
|
33
34
|
*/
|
|
34
35
|
async function pollOnce() {
|
|
35
36
|
if (!isHqConfigured()) return
|
|
37
|
+
if (frozenState.isFrozen()) return
|
|
36
38
|
if (polling) return
|
|
37
39
|
|
|
38
40
|
polling = true
|
|
@@ -59,7 +61,9 @@ async function pollOnce() {
|
|
|
59
61
|
}
|
|
60
62
|
}
|
|
61
63
|
} catch (err) {
|
|
62
|
-
if (err.
|
|
64
|
+
if (err.billingFrozen) {
|
|
65
|
+
console.log(`[RevisionWatcher] Billing frozen, suspending poll`)
|
|
66
|
+
} else if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
|
|
63
67
|
console.log(`[RevisionWatcher] HQ unreachable, will retry next cycle`)
|
|
64
68
|
} else {
|
|
65
69
|
console.error(`[RevisionWatcher] Poll error: ${err.message}`)
|
package/core/lib/step-poller.js
CHANGED
|
@@ -21,6 +21,7 @@
|
|
|
21
21
|
const { config, isHqConfigured } = require('../config')
|
|
22
22
|
const api = require('../api')
|
|
23
23
|
const variableStore = require('../stores/variable-store')
|
|
24
|
+
const frozenState = require('./frozen-state')
|
|
24
25
|
|
|
25
26
|
// Polling interval: 30 seconds (matches heartbeat frequency)
|
|
26
27
|
const POLL_INTERVAL_MS = 30_000
|
|
@@ -42,6 +43,7 @@ let lastPollAt = null
|
|
|
42
43
|
*/
|
|
43
44
|
async function pollOnce() {
|
|
44
45
|
if (!isHqConfigured()) return
|
|
46
|
+
if (frozenState.isFrozen()) return
|
|
45
47
|
if (polling) return
|
|
46
48
|
|
|
47
49
|
polling = true
|
|
@@ -70,7 +72,9 @@ async function pollOnce() {
|
|
|
70
72
|
}
|
|
71
73
|
} catch (err) {
|
|
72
74
|
// Don't log network errors at error level — they're expected when HQ is temporarily unreachable
|
|
73
|
-
if (err.
|
|
75
|
+
if (err.billingFrozen) {
|
|
76
|
+
console.log(`[StepPoller] Billing frozen, suspending poll`)
|
|
77
|
+
} else if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
|
|
74
78
|
console.log(`[StepPoller] HQ unreachable, will retry next cycle`)
|
|
75
79
|
} else {
|
|
76
80
|
console.error(`[StepPoller] Poll error: ${err.message}`)
|
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
|
|
21
21
|
const { config, isHqConfigured, isLlmConfigured } = require('../config')
|
|
22
22
|
const api = require('../api')
|
|
23
|
+
const frozenState = require('./frozen-state')
|
|
23
24
|
|
|
24
25
|
// Poll every 15 seconds
|
|
25
26
|
const POLL_INTERVAL_MS = 15_000
|
|
@@ -94,6 +95,7 @@ function isMentioned(thread, messages, myRole) {
|
|
|
94
95
|
*/
|
|
95
96
|
async function pollOnce() {
|
|
96
97
|
if (!isHqConfigured()) return
|
|
98
|
+
if (frozenState.isFrozen()) return
|
|
97
99
|
if (polling) return
|
|
98
100
|
|
|
99
101
|
polling = true
|
|
@@ -115,7 +117,9 @@ async function pollOnce() {
|
|
|
115
117
|
}
|
|
116
118
|
}
|
|
117
119
|
} catch (err) {
|
|
118
|
-
if (err.
|
|
120
|
+
if (err.billingFrozen) {
|
|
121
|
+
// Billing frozen — silent
|
|
122
|
+
} else if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
|
|
119
123
|
// HQ unreachable — silent retry
|
|
120
124
|
} else {
|
|
121
125
|
console.error(`[ThreadWatcher] Poll error: ${err.message}`)
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web extraction orchestrator (experimental — v3.53.0).
|
|
3
|
+
*
|
|
4
|
+
* Cold path: Playwright fetch -> Readability/Turndown clean -> Anthropic
|
|
5
|
+
* Haiku selects fields -> store recipe -> verify by replaying
|
|
6
|
+
* selectors against the same page.
|
|
7
|
+
*
|
|
8
|
+
* Hot path: Playwright fetch -> fingerprint -> recipe lookup -> selector
|
|
9
|
+
* replay. No LLM call.
|
|
10
|
+
*
|
|
11
|
+
* Self-heal: hot replays that come back empty bump fail_count; the recipe
|
|
12
|
+
* is dropped after MAX_FAIL_COUNT and the next request retries
|
|
13
|
+
* cold. A single in-request fall-through from hot -> cold is
|
|
14
|
+
* allowed so callers don't see transient breakage.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
const { normalizeUrl } = require('./url-normalize')
|
|
18
|
+
const { computeFingerprint } = require('./fingerprint')
|
|
19
|
+
const { renderPage, extractWithSelectors } = require('./playwright-runner')
|
|
20
|
+
const { cleanHtml } = require('./html-cleaner')
|
|
21
|
+
const { generateRecipe } = require('./recipe-generator')
|
|
22
|
+
const pageRecipeStore = require('../../stores/page-recipe-store')
|
|
23
|
+
|
|
24
|
+
/**
 * Decide whether an extraction result carries no usable data.
 *
 * A result is empty when it is not an object, has no values, or every value
 * is null/undefined, a whitespace-only string, or an empty array. Any other
 * value (number, boolean, nested object, ...) makes the result non-empty.
 *
 * @param {*} data - Extraction result, expected to be a field → value object.
 * @returns {boolean} True when nothing usable was extracted.
 */
function isEmptyResult(data) {
  const isRecord = Boolean(data) && typeof data === 'object'
  if (!isRecord) return true

  for (const value of Object.values(data)) {
    if (value == null) continue
    if (typeof value === 'string') {
      if (value.trim() !== '') return false
      continue
    }
    if (Array.isArray(value)) {
      if (value.length > 0) return false
      continue
    }
    return false
  }
  return true
}
|
|
35
|
+
|
|
36
|
+
/**
 * Extract structured data from a web page, replaying a cached recipe when
 * one matches and generating a new recipe otherwise.
 *
 * The page is always rendered once so its fingerprint can be computed. If a
 * recipe exists for (URL template, fingerprint) its selectors are replayed;
 * a non-empty replay is returned as a "hot" result. An empty replay bumps the
 * recipe's fail counter and falls through to the "cold" path, which cleans
 * the rendered HTML, generates a recipe, and persists it only when replaying
 * its selectors yields a non-empty result.
 *
 * @param {object} params
 * @param {string} params.url - Page URL; normalized before use.
 * @param {*} [params.hint] - Passed through unchanged to recipe generation.
 * @returns {Promise<object>} The response object built by `shape`.
 */
async function extract({ url, hint }) {
  const { template, canonicalUrl } = normalizeUrl(url)

  // One render up-front: the fingerprint is needed whether or not a recipe is cached.
  const rendered = await renderPage(canonicalUrl)
  const fingerprint = computeFingerprint(rendered.html)

  // Fresh key object per store call, as each call site originally built its own.
  const recipeKey = () => ({ urlTemplate: template, domFingerprint: fingerprint })

  // Fields shared by the hot and cold responses.
  const respond = (fields) =>
    shape({
      url: canonicalUrl,
      finalUrl: rendered.finalUrl,
      statusCode: rendered.statusCode,
      urlTemplate: template,
      fingerprint,
      ...fields,
    })

  const cached = pageRecipeStore.find(recipeKey())
  if (cached) {
    const replayed = await extractWithSelectors(canonicalUrl, cached.selectors)
    if (isEmptyResult(replayed)) {
      // Hot replay produced nothing: record the failure, then try the cold path.
      pageRecipeStore.incrementFail(recipeKey())
    } else {
      pageRecipeStore.incrementHit(recipeKey())
      pageRecipeStore.setLastVerified(recipeKey())
      return respond({
        recipeMode: 'hot',
        pageType: cached.page_type,
        selectors: cached.selectors,
        data: replayed,
        cleaned: null,
      })
    }
  }

  // Cold path: reuse the HTML that was already rendered.
  const cleaned = cleanHtml(rendered.html, canonicalUrl)
  const recipe = await generateRecipe({
    url: canonicalUrl,
    cleanedMarkdown: cleaned.contentMarkdown,
    hint,
  })

  // Only persist a recipe whose selectors actually work on this page.
  const verifyData = await extractWithSelectors(canonicalUrl, recipe.selectors)
  const verified = !isEmptyResult(verifyData)
  if (verified) {
    pageRecipeStore.upsert({
      ...recipeKey(),
      selectors: recipe.selectors,
      pageType: recipe.pageType,
    })
    pageRecipeStore.incrementHit(recipeKey())
  }

  return respond({
    recipeMode: 'cold',
    pageType: recipe.pageType,
    selectors: recipe.selectors,
    data: verified ? verifyData : recipe.extracted,
    cleaned,
    recipePersisted: verified,
  })
}
|
|
107
|
+
|
|
108
|
+
/**
 * Build the response object returned by the extractor.
 *
 * `title` and `content` come from the first usable field in `data`, then from
 * the cleaned page, then fall back to null. A `warning` is added only for a
 * cold result whose recipe was explicitly not persisted.
 *
 * @param {object} fields
 * @param {string} fields.url
 * @param {string} fields.finalUrl
 * @param {number} fields.statusCode
 * @param {string} fields.recipeMode - 'hot' or 'cold'.
 * @param {string} fields.urlTemplate
 * @param {string} fields.fingerprint
 * @param {string} [fields.pageType]
 * @param {object} [fields.selectors]
 * @param {object} [fields.data]
 * @param {object|null} [fields.cleaned] - Cleaned page; `title` and `contentMarkdown` are read.
 * @param {boolean} [fields.recipePersisted]
 * @returns {object} Response payload; always has `experimental: true`.
 */
function shape({ url, finalUrl, statusCode, recipeMode, urlTemplate, fingerprint, pageType, selectors, data, cleaned, recipePersisted }) {
  const recipeId = `${urlTemplate}#${fingerprint}`
  const title = pickField(data, ['title', 'headline', 'name']) || cleaned?.title || null
  const content = pickField(data, ['body', 'content', 'article', 'description']) || cleaned?.contentMarkdown || null

  const result = {
    experimental: true,
    url,
    finalUrl,
    statusCode,
    recipeMode,
    recipeId,
    pageType: pageType || null,
    title,
    content,
    structured: data || {},
    selectors: selectors || {},
  }

  // Strict `=== false`: hot results never pass recipePersisted and must not warn.
  const unverifiedCold = recipeMode === 'cold' && recipePersisted === false
  if (unverifiedCold) {
    result.warning = 'Recipe verification failed (selectors returned empty). Result reflects LLM extraction; recipe was not persisted.'
  }
  return result
}
|
|
127
|
+
|
|
128
|
+
/**
 * Return the first usable value among `candidates` keys of `obj`.
 *
 * A non-blank string is returned as is; a non-empty array is joined with
 * blank lines. Every other value (missing, blank, number, object, ...) is
 * skipped.
 *
 * @param {*} obj - Source object; anything that is not a non-null object yields null.
 * @param {string[]} candidates - Keys to try, in priority order.
 * @returns {string|null} The selected text, or null when no candidate is usable.
 */
function pickField(obj, candidates) {
  if (!obj || typeof obj !== 'object') return null

  for (const name of candidates) {
    const value = obj[name]
    if (typeof value === 'string') {
      if (value.trim() !== '') return value
    } else if (Array.isArray(value) && value.length > 0) {
      return value.join('\n\n')
    }
  }
  return null
}
|
|
138
|
+
|
|
139
|
+
module.exports = {
|
|
140
|
+
extract,
|
|
141
|
+
isEmptyResult,
|
|
142
|
+
}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lightweight DOM structure fingerprint.
|
|
3
|
+
*
|
|
4
|
+
* Two pages with the same template URL but materially different layouts
|
|
5
|
+
* (A/B test, logged-in vs logged-out, mobile vs desktop served) need to
|
|
6
|
+
* use different recipes. We hash a minimal structural signature instead
|
|
7
|
+
* of the full HTML so the fingerprint stays stable against trivial copy
|
|
8
|
+
* changes but flips when block-level structure shifts.
|
|
9
|
+
*
|
|
10
|
+
* Signature inputs:
|
|
11
|
+
* - Order of structural landmark tags (header/nav/main/article/...)
|
|
12
|
+
* - Top 5 most frequent class names on <div> elements
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
const crypto = require('crypto')
|
|
16
|
+
|
|
17
|
+
const LANDMARK_TAGS = ['header', 'nav', 'main', 'article', 'section', 'aside', 'footer', 'form']

/**
 * Compute a short structural fingerprint for an HTML document.
 *
 * The signature combines (a) the landmark tags present, ordered by first
 * appearance, and (b) the five most frequent class names on `<div>`
 * elements (ties broken alphabetically). The result is the first 12 hex
 * characters of the signature's SHA-1.
 *
 * @param {string} html - Rendered page HTML.
 * @returns {string} 'empty' for non-string or empty input; 'fallback-' plus a
 *   12-char hash of the first 4096 characters when the HTML cannot be parsed;
 *   otherwise the 12-char structural hash.
 */
function computeFingerprint(html) {
  if (typeof html !== 'string' || html.length === 0) return 'empty'

  const shortSha1 = (text) => crypto.createHash('sha1').update(text).digest('hex').slice(0, 12)

  let document
  try {
    const { parseHTML } = require('linkedom')
    document = parseHTML(html).document
  } catch {
    // Loading or running the parser failed: hash the leading 4096 characters instead.
    return 'fallback-' + shortSha1(html.slice(0, 4096))
  }

  // A Set keeps insertion order, giving each landmark tag once in document order.
  const landmarks = new Set()
  for (const el of document.querySelectorAll(LANDMARK_TAGS.join(','))) {
    landmarks.add(el.tagName.toLowerCase())
  }

  // Count every class token that appears on a <div>.
  const classCounts = new Map()
  for (const div of document.querySelectorAll('div[class]')) {
    const tokens = (div.getAttribute('class') || '').split(/\s+/).filter(Boolean)
    for (const token of tokens) {
      classCounts.set(token, (classCounts.get(token) || 0) + 1)
    }
  }

  // Highest count first; equal counts are ordered by ascending class name.
  const byCountThenName = ([nameA, countA], [nameB, countB]) => {
    if (countA !== countB) return countB - countA
    return nameA < nameB ? -1 : 1
  }
  const topClasses = [...classCounts.entries()]
    .sort(byCountThenName)
    .slice(0, 5)
    .map(([name]) => name)

  const signature = `tags:${[...landmarks].join(',')}|cls:${topClasses.join(',')}`
  return shortSha1(signature)
}
|
|
62
|
+
|
|
63
|
+
module.exports = { computeFingerprint }
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML → cleaned content (Readability) → Markdown (Turndown).
|
|
3
|
+
*
|
|
4
|
+
* The cleaned Markdown is the *only* page representation handed to the
|
|
5
|
+
* recipe-generation LLM. Keeping the input small and structured is what
|
|
6
|
+
* makes this experiment cheap enough to be worth running on every
|
|
7
|
+
* cold-cache miss.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
const MAX_MARKDOWN_LENGTH = 50_000

/**
 * Reduce raw HTML to its main content and convert that content to Markdown.
 *
 * Each stage degrades gracefully: if the HTML cannot be parsed an all-empty
 * result is returned; if Readability fails the whole `<body>` is used; if
 * Turndown fails the Markdown is left empty. Markdown longer than
 * MAX_MARKDOWN_LENGTH is cut and suffixed with a truncation marker.
 *
 * @param {string} html - Rendered page HTML.
 * @param {string} url - Page URL (accepted for interface compatibility; not read here).
 * @returns {{ title: (string|null), contentHtml: string, contentMarkdown: string,
 *   byline: (string|null), excerpt: (string|null), length: number }}
 */
function cleanHtml(html, url) {
  let parsedDocument
  try {
    const { parseHTML } = require('linkedom')
    parsedDocument = parseHTML(html).document
  } catch {
    // Without a DOM there is nothing to clean.
    return {
      title: null,
      contentHtml: '',
      contentMarkdown: '',
      byline: null,
      excerpt: null,
      length: 0,
    }
  }

  // Readability is best-effort; a failure just means "no article found".
  let article
  try {
    const { Readability } = require('@mozilla/readability')
    article = new Readability(parsedDocument).parse()
  } catch {
    article = null
  }

  const contentHtml = article?.content || parsedDocument.body?.innerHTML || ''

  let contentMarkdown
  try {
    const TurndownService = require('turndown')
    const converter = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced',
      bulletListMarker: '-',
    })
    converter.remove(['script', 'style', 'noscript', 'iframe'])
    contentMarkdown = converter.turndown(contentHtml)
  } catch {
    contentMarkdown = ''
  }

  const tooLong = contentMarkdown.length > MAX_MARKDOWN_LENGTH
  if (tooLong) {
    contentMarkdown = contentMarkdown.slice(0, MAX_MARKDOWN_LENGTH) + '\n\n[... truncated ...]'
  }

  return {
    title: article?.title || parsedDocument.title || null,
    contentHtml,
    contentMarkdown,
    byline: article?.byline || null,
    excerpt: article?.excerpt || null,
    length: article?.length || contentMarkdown.length,
  }
}
|
|
68
|
+
|
|
69
|
+
module.exports = {
|
|
70
|
+
cleanHtml,
|
|
71
|
+
MAX_MARKDOWN_LENGTH,
|
|
72
|
+
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web extraction (experimental, v3.53.0).
|
|
3
|
+
*
|
|
4
|
+
* Public surface for core/routes/web.js. Internal modules:
|
|
5
|
+
* - url-normalize.js URL → template + canonical URL
|
|
6
|
+
* - fingerprint.js DOM structural hash
|
|
7
|
+
* - playwright-runner.js headless fetch + selector replay
|
|
8
|
+
* - html-cleaner.js Readability + Turndown
|
|
9
|
+
* - recipe-generator.js Anthropic Haiku cold path
|
|
10
|
+
* - extractor.js orchestrator (hot/cold + self-heal)
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
const { extract } = require('./extractor')
|
|
14
|
+
const { normalizeUrl } = require('./url-normalize')
|
|
15
|
+
const { computeFingerprint } = require('./fingerprint')
|
|
16
|
+
|
|
17
|
+
module.exports = {
|
|
18
|
+
extract,
|
|
19
|
+
normalizeUrl,
|
|
20
|
+
computeFingerprint,
|
|
21
|
+
}
|