@geekbeer/minion 3.51.2 → 3.53.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.env.example CHANGED
@@ -17,3 +17,10 @@ MINION_ID=
17
17
 
18
18
  # Agent port (optional, default: 8080)
19
19
  AGENT_PORT=8080
20
+
21
+ # Anthropic API key (optional, experimental, fallback only) —
22
+ # POST /api/web/extract prefers the primary LLM plugin (see PUT /api/llm/config)
23
+ # and only uses ANTHROPIC_API_KEY if no primary plugin is configured. Set via:
24
+ # curl -X PUT http://localhost:8080/api/secrets/ANTHROPIC_API_KEY \
25
+ # -H "Authorization: Bearer $API_TOKEN" -d '{"value": "sk-ant-..."}'
26
+ ANTHROPIC_API_KEY=
package/core/api.js CHANGED
@@ -6,6 +6,16 @@
6
6
  */
7
7
 
8
8
  const { config, isHqConfigured } = require('./config')
9
+ const frozenState = require('./lib/frozen-state')
10
+
11
/**
 * Error thrown when the minion is billing-frozen and an HQ call is refused.
 *
 * Instances carry `statusCode` 402 and `billingFrozen: true` so callers can
 * recognise the condition by duck-typing instead of `instanceof`.
 */
class BillingFrozenError extends Error {
  /**
   * @param {string} [reason] Freeze reason; any falsy value is reported as 'unknown'.
   */
  constructor(reason) {
    const why = reason ? reason : 'unknown'
    super(`Minion is billing-frozen: ${why}`)
    Object.assign(this, {
      name: 'BillingFrozenError',
      statusCode: 402,
      billingFrozen: true,
    })
  }
}
9
19
 
10
20
  /**
11
21
  * Send HTTP request to the HQ server
@@ -17,6 +27,10 @@ async function request(endpoint, options = {}) {
17
27
  return { skipped: true, reason: 'HQ not configured' }
18
28
  }
19
29
 
30
+ if (frozenState.isFrozen()) {
31
+ throw new BillingFrozenError(frozenState.getState().reason)
32
+ }
33
+
20
34
  const url = `${config.HQ_URL}/api/minion${endpoint}`
21
35
 
22
36
  const response = await fetch(url, {
@@ -28,10 +42,19 @@ async function request(endpoint, options = {}) {
28
42
  },
29
43
  })
30
44
 
31
- const data = await response.json()
45
+ let data = null
46
+ try {
47
+ data = await response.json()
48
+ } catch {
49
+ data = {}
50
+ }
51
+
52
+ if (frozenState.maybeFreezeFromResponse(response, data)) {
53
+ throw new BillingFrozenError(data && data.reason)
54
+ }
32
55
 
33
56
  if (!response.ok) {
34
- const err = new Error(data.error || `API request failed: ${response.status}`)
57
+ const err = new Error((data && data.error) || `API request failed: ${response.status}`)
35
58
  err.statusCode = response.status
36
59
  throw err
37
60
  }
@@ -203,4 +226,5 @@ module.exports = {
203
226
  deleteThread,
204
227
  createProjectMemory,
205
228
  searchProjectMemories,
229
+ BillingFrozenError,
206
230
  }
@@ -0,0 +1,33 @@
1
+ /**
2
+ * page_recipes — Web page extraction recipe cache (experimental, v3.53.0).
3
+ *
4
+ * Stores selectors learned from a first-time visit so subsequent visits to
5
+ * structurally similar pages skip the LLM round trip. Keyed by URL template
6
+ * (after normalization) + DOM fingerprint to tolerate A/B variants.
7
+ *
8
+ * Marked experimental: schema may change before the API stabilizes.
9
+ */
10
+
11
+ module.exports = {
12
+ version: 20260508000000,
13
+ name: 'page_recipes',
14
+
15
+ up(db, { tableExists }) {
16
+ if (tableExists(db, 'page_recipes')) return
17
+
18
+ db.exec(`
19
+ CREATE TABLE page_recipes (
20
+ url_template TEXT NOT NULL,
21
+ dom_fingerprint TEXT NOT NULL,
22
+ selectors_json TEXT NOT NULL,
23
+ page_type TEXT,
24
+ hit_count INTEGER NOT NULL DEFAULT 0,
25
+ fail_count INTEGER NOT NULL DEFAULT 0,
26
+ last_verified_at TEXT,
27
+ created_at TEXT NOT NULL DEFAULT (datetime('now')),
28
+ PRIMARY KEY (url_template, dom_fingerprint)
29
+ );
30
+ CREATE INDEX idx_page_recipes_template ON page_recipes(url_template);
31
+ `)
32
+ },
33
+ }
@@ -21,6 +21,8 @@
21
21
 
22
22
  const { config, isHqConfigured } = require('../config')
23
23
  const concurrency = require('./concurrency-manager')
24
+ const frozenState = require('./frozen-state')
25
+ const api = require('../api')
24
26
 
25
27
  // Polling interval: 30 seconds (matches dag-step-poller).
26
28
  const POLL_INTERVAL_MS = 30_000
@@ -45,6 +47,9 @@ async function hqRequest(endpoint, options = {}) {
45
47
  if (!isHqConfigured()) {
46
48
  return { skipped: true, reason: 'HQ not configured' }
47
49
  }
50
+ if (frozenState.isFrozen()) {
51
+ throw new api.BillingFrozenError(frozenState.getState().reason)
52
+ }
48
53
  const url = `${config.HQ_URL}${endpoint}`
49
54
  const resp = await fetch(url, {
50
55
  ...options,
@@ -61,6 +66,9 @@ async function hqRequest(endpoint, options = {}) {
61
66
  } catch {
62
67
  data = { raw: text }
63
68
  }
69
+ if (frozenState.maybeFreezeFromResponse(resp, data)) {
70
+ throw new api.BillingFrozenError(data && data.reason)
71
+ }
64
72
  if (!resp.ok) {
65
73
  const err = new Error(data.error || `HQ ${endpoint} failed: ${resp.status}`)
66
74
  err.statusCode = resp.status
@@ -72,6 +80,7 @@ async function hqRequest(endpoint, options = {}) {
72
80
 
73
81
  async function pollOnce() {
74
82
  if (!isHqConfigured()) return
83
+ if (frozenState.isFrozen()) return
75
84
  if (!runner) {
76
85
  console.warn('[BoardTaskPoller] No runner injected, skipping poll')
77
86
  return
@@ -117,7 +126,9 @@ async function pollOnce() {
117
126
  })
118
127
  }
119
128
  } catch (err) {
120
- if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
129
+ if (err.billingFrozen) {
130
+ console.log('[BoardTaskPoller] Billing frozen, suspending poll')
131
+ } else if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
121
132
  console.log('[BoardTaskPoller] HQ unreachable, will retry next cycle')
122
133
  } else {
123
134
  console.error(`[BoardTaskPoller] Poll error: ${err.message}`)
@@ -12,6 +12,8 @@
12
12
  */
13
13
 
14
14
  const { config, isHqConfigured } = require('../config')
15
+ const frozenState = require('./frozen-state')
16
+ const api = require('../api')
15
17
 
16
18
  const POLL_INTERVAL_MS = 60_000
17
19
 
@@ -22,6 +24,7 @@ let lastFiredCount = 0
22
24
 
23
25
  async function pollOnce() {
24
26
  if (!isHqConfigured()) return
27
+ if (frozenState.isFrozen()) return
25
28
  if (polling) return
26
29
 
27
30
  polling = true
@@ -35,6 +38,14 @@ async function pollOnce() {
35
38
  },
36
39
  })
37
40
 
41
+ let payload = null
42
+ if (resp.status === 402) {
43
+ try { payload = await resp.json() } catch { payload = {} }
44
+ if (frozenState.maybeFreezeFromResponse(resp, payload)) {
45
+ throw new api.BillingFrozenError(payload && payload.reason)
46
+ }
47
+ }
48
+
38
49
  if (!resp.ok) {
39
50
  throw new Error(`dag-cron-tick failed: ${resp.status}`)
40
51
  }
@@ -58,7 +69,9 @@ async function pollOnce() {
58
69
  }
59
70
  }
60
71
  } catch (err) {
61
- if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
72
+ if (err.billingFrozen) {
73
+ console.log('[DagCronPoller] Billing frozen, suspending poll')
74
+ } else if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
62
75
  console.log('[DagCronPoller] HQ unreachable, will retry next cycle')
63
76
  } else {
64
77
  console.error(`[DagCronPoller] Poll error: ${err.message}`)
@@ -13,6 +13,7 @@ const { config, isHqConfigured } = require('../config')
13
13
  const api = require('../api')
14
14
  const variableStore = require('../stores/variable-store')
15
15
  const concurrency = require('./concurrency-manager')
16
+ const frozenState = require('./frozen-state')
16
17
 
17
18
  // Polling interval: 30 seconds (matches step-poller)
18
19
  const POLL_INTERVAL_MS = 30_000
@@ -38,6 +39,10 @@ async function dagRequest(endpoint, options = {}) {
38
39
  return { skipped: true, reason: 'HQ not configured' }
39
40
  }
40
41
 
42
+ if (frozenState.isFrozen()) {
43
+ throw new api.BillingFrozenError(frozenState.getState().reason)
44
+ }
45
+
41
46
  const url = `${config.HQ_URL}/api/dag/minion${endpoint}`
42
47
  const resp = await fetch(url, {
43
48
  ...options,
@@ -48,6 +53,14 @@ async function dagRequest(endpoint, options = {}) {
48
53
  },
49
54
  })
50
55
 
56
+ let payload = null
57
+ if (resp.status === 402) {
58
+ try { payload = await resp.json() } catch { payload = {} }
59
+ if (frozenState.maybeFreezeFromResponse(resp, payload)) {
60
+ throw new api.BillingFrozenError(payload && payload.reason)
61
+ }
62
+ }
63
+
51
64
  if (!resp.ok) {
52
65
  const err = new Error(`DAG API ${endpoint} failed: ${resp.status}`)
53
66
  err.statusCode = resp.status
@@ -62,6 +75,7 @@ async function dagRequest(endpoint, options = {}) {
62
75
  */
63
76
  async function pollOnce() {
64
77
  if (!isHqConfigured()) return
78
+ if (frozenState.isFrozen()) return
65
79
  if (polling) return
66
80
 
67
81
  polling = true
@@ -90,7 +104,9 @@ async function pollOnce() {
90
104
  promise
91
105
  }
92
106
  } catch (err) {
93
- if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
107
+ if (err.billingFrozen) {
108
+ console.log('[DagPoller] Billing frozen, suspending poll')
109
+ } else if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
94
110
  console.log('[DagPoller] HQ unreachable, will retry next cycle')
95
111
  } else {
96
112
  console.error(`[DagPoller] Poll error: ${err.message}`)
@@ -0,0 +1,64 @@
1
+ /**
2
+ * Frozen State Module
3
+ *
4
+ * Tracks billing-driven freeze state in-memory. When frozen, all pollers
5
+ * skip their work and the shared HTTP wrappers refuse outbound calls.
6
+ *
7
+ * State is intentionally NOT persisted to disk:
8
+ * - Recovery is driven by HQ pushing `restart-agent` after payment success.
9
+ * - Process restart naturally clears the in-memory flag.
10
+ * - On restart, if billing is still past_due, the next request will receive
11
+ * 402 from HQ and self-freeze again. Self-healing.
12
+ */
13
+
14
// In-memory freeze record. Lives only for the lifetime of the process.
const state = { frozen: false, frozenAt: null, reason: null }

/**
 * @returns {boolean} Whether the minion is currently marked frozen.
 */
function isFrozen() {
  return state.frozen
}

/**
 * Enter the frozen state. A no-op when already frozen, so the first
 * timestamp and reason are preserved.
 *
 * @param {{ reason?: string }} [opts] A falsy `reason` is stored as 'unknown'.
 */
function setFrozen(opts = {}) {
  if (state.frozen) return
  state.frozen = true
  state.frozenAt = new Date().toISOString()
  state.reason = opts.reason || 'unknown'
  console.log(`[FrozenState] Minion frozen: reason=${state.reason} at=${state.frozenAt}`)
}

/**
 * Leave the frozen state and reset the timestamp and reason.
 * A no-op when not frozen.
 */
function clearFrozen() {
  if (!state.frozen) return
  state.frozen = false
  state.frozenAt = null
  state.reason = null
  console.log('[FrozenState] Minion unfrozen (in-memory state cleared)')
}

/**
 * @returns {{ frozen: boolean, frozenAt: (string|null), reason: (string|null) }}
 *   A snapshot copy of the current freeze record.
 */
function getState() {
  return { ...state }
}

/**
 * Check a fetch Response for the 402 billing-frozen signal and self-freeze
 * when it matches.
 *
 * Expected payload: `{ "error": "billing_frozen", "reason": "past_due", ... }`
 *
 * @param {{ status: number }} response Response (or response-like) object.
 * @param {object} payload Parsed response body.
 * @returns {boolean} true when the signal matched (including when the minion
 *   was already frozen), false otherwise.
 */
function maybeFreezeFromResponse(response, payload) {
  const paymentRequired = Boolean(response) && response.status === 402
  if (!paymentRequired) return false
  if (!payload || payload.error !== 'billing_frozen') return false

  setFrozen({ reason: payload.reason || 'billing_frozen' })
  return true
}
57
+
58
+ module.exports = {
59
+ isFrozen,
60
+ setFrozen,
61
+ clearFrozen,
62
+ getState,
63
+ maybeFreezeFromResponse,
64
+ }
@@ -15,6 +15,7 @@
15
15
 
16
16
  const { config, isHqConfigured } = require('../config')
17
17
  const api = require('../api')
18
+ const frozenState = require('./frozen-state')
18
19
 
19
20
  // Poll every 30 seconds (same frequency as step-poller)
20
21
  const POLL_INTERVAL_MS = 30_000
@@ -33,6 +34,7 @@ const processingRevisions = new Set()
33
34
  */
34
35
  async function pollOnce() {
35
36
  if (!isHqConfigured()) return
37
+ if (frozenState.isFrozen()) return
36
38
  if (polling) return
37
39
 
38
40
  polling = true
@@ -59,7 +61,9 @@ async function pollOnce() {
59
61
  }
60
62
  }
61
63
  } catch (err) {
62
- if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
64
+ if (err.billingFrozen) {
65
+ console.log(`[RevisionWatcher] Billing frozen, suspending poll`)
66
+ } else if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
63
67
  console.log(`[RevisionWatcher] HQ unreachable, will retry next cycle`)
64
68
  } else {
65
69
  console.error(`[RevisionWatcher] Poll error: ${err.message}`)
@@ -21,6 +21,7 @@
21
21
  const { config, isHqConfigured } = require('../config')
22
22
  const api = require('../api')
23
23
  const variableStore = require('../stores/variable-store')
24
+ const frozenState = require('./frozen-state')
24
25
 
25
26
  // Polling interval: 30 seconds (matches heartbeat frequency)
26
27
  const POLL_INTERVAL_MS = 30_000
@@ -42,6 +43,7 @@ let lastPollAt = null
42
43
  */
43
44
  async function pollOnce() {
44
45
  if (!isHqConfigured()) return
46
+ if (frozenState.isFrozen()) return
45
47
  if (polling) return
46
48
 
47
49
  polling = true
@@ -70,7 +72,9 @@ async function pollOnce() {
70
72
  }
71
73
  } catch (err) {
72
74
  // Don't log network errors at error level — they're expected when HQ is temporarily unreachable
73
- if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
75
+ if (err.billingFrozen) {
76
+ console.log(`[StepPoller] Billing frozen, suspending poll`)
77
+ } else if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
74
78
  console.log(`[StepPoller] HQ unreachable, will retry next cycle`)
75
79
  } else {
76
80
  console.error(`[StepPoller] Poll error: ${err.message}`)
@@ -20,6 +20,7 @@
20
20
 
21
21
  const { config, isHqConfigured, isLlmConfigured } = require('../config')
22
22
  const api = require('../api')
23
+ const frozenState = require('./frozen-state')
23
24
 
24
25
  // Poll every 15 seconds
25
26
  const POLL_INTERVAL_MS = 15_000
@@ -94,6 +95,7 @@ function isMentioned(thread, messages, myRole) {
94
95
  */
95
96
  async function pollOnce() {
96
97
  if (!isHqConfigured()) return
98
+ if (frozenState.isFrozen()) return
97
99
  if (polling) return
98
100
 
99
101
  polling = true
@@ -115,7 +117,9 @@ async function pollOnce() {
115
117
  }
116
118
  }
117
119
  } catch (err) {
118
- if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
120
+ if (err.billingFrozen) {
121
+ // Billing frozen — silent
122
+ } else if (err.message?.includes('fetch failed') || err.message?.includes('ECONNREFUSED')) {
119
123
  // HQ unreachable — silent retry
120
124
  } else {
121
125
  console.error(`[ThreadWatcher] Poll error: ${err.message}`)
@@ -0,0 +1,142 @@
1
+ /**
2
+ * Web extraction orchestrator (experimental — v3.53.0).
3
+ *
4
+ * Cold path: Playwright fetch -> Readability/Turndown clean -> Anthropic
5
+ * Haiku selects fields -> store recipe -> verify by replaying
6
+ * selectors against the same page.
7
+ *
8
+ * Hot path: Playwright fetch -> fingerprint -> recipe lookup -> selector
9
+ * replay. No LLM call.
10
+ *
11
+ * Self-heal: hot replays that come back empty bump fail_count; the recipe
12
+ * is dropped after MAX_FAIL_COUNT and the next request retries
13
+ * cold. A single in-request fall-through from hot -> cold is
14
+ * allowed so callers don't see transient breakage.
15
+ */
16
+
17
+ const { normalizeUrl } = require('./url-normalize')
18
+ const { computeFingerprint } = require('./fingerprint')
19
+ const { renderPage, extractWithSelectors } = require('./playwright-runner')
20
+ const { cleanHtml } = require('./html-cleaner')
21
+ const { generateRecipe } = require('./recipe-generator')
22
+ const pageRecipeStore = require('../../stores/page-recipe-store')
23
+
24
/**
 * Decide whether a selector-replay result carries no usable data.
 *
 * A result is empty when it is not an object, has no values, or every value
 * is null/undefined, a whitespace-only string, or an empty array. Any other
 * value (number, boolean, nested object, ...) counts as data.
 *
 * @param {*} data Extraction result keyed by field name.
 * @returns {boolean}
 */
function isEmptyResult(data) {
  if (!data || typeof data !== 'object') return true

  for (const value of Object.values(data)) {
    const blank =
      value == null ||
      (typeof value === 'string' && value.trim() === '') ||
      (Array.isArray(value) && value.length === 0)
    if (!blank) return false
  }
  return true
}
35
+
36
/**
 * Extract structured data from a web page, replaying a stored recipe when
 * one matches and generating a new one otherwise.
 *
 * @param {{ url: string, hint: * }} params `hint` is forwarded untouched to
 *   the recipe generator.
 * @returns {Promise<object>} The shaped extraction result.
 */
async function extract({ url, hint }) {
  const { template, canonicalUrl } = normalizeUrl(url)

  // The page is rendered before the cache lookup because the lookup key
  // includes the DOM fingerprint. Only the cold path reuses the HTML.
  const rendered = await renderPage(canonicalUrl)
  const fingerprint = computeFingerprint(rendered.html)

  // Fresh key object per store call.
  const recipeKey = () => ({ urlTemplate: template, domFingerprint: fingerprint })
  // Fields common to the hot and cold result.
  const pageInfo = () => ({
    url: canonicalUrl,
    finalUrl: rendered.finalUrl,
    statusCode: rendered.statusCode,
    urlTemplate: template,
    fingerprint,
  })

  const cached = pageRecipeStore.find(recipeKey())
  if (cached) {
    const hotData = await extractWithSelectors(canonicalUrl, cached.selectors)
    if (isEmptyResult(hotData)) {
      // Hot replay produced nothing: record the failure, then continue cold.
      pageRecipeStore.incrementFail(recipeKey())
    } else {
      pageRecipeStore.incrementHit(recipeKey())
      pageRecipeStore.setLastVerified(recipeKey())
      return shape({
        ...pageInfo(),
        recipeMode: 'hot',
        pageType: cached.page_type,
        selectors: cached.selectors,
        data: hotData,
        cleaned: null,
      })
    }
  }

  // Cold path: clean the already-rendered HTML and ask for a new recipe.
  const cleaned = cleanHtml(rendered.html, canonicalUrl)
  const recipe = await generateRecipe({
    url: canonicalUrl,
    cleanedMarkdown: cleaned.contentMarkdown,
    hint,
  })

  // Only persist a recipe whose selectors return data on this very page.
  const verifyData = await extractWithSelectors(canonicalUrl, recipe.selectors)
  const verified = !isEmptyResult(verifyData)

  if (verified) {
    pageRecipeStore.upsert({
      ...recipeKey(),
      selectors: recipe.selectors,
      pageType: recipe.pageType,
    })
    pageRecipeStore.incrementHit(recipeKey())
  }

  return shape({
    ...pageInfo(),
    recipeMode: 'cold',
    pageType: recipe.pageType,
    selectors: recipe.selectors,
    // Without verified selector output, return the generator's own extraction.
    data: verified ? verifyData : recipe.extracted,
    cleaned,
    recipePersisted: verified,
  })
}
107
+
108
/**
 * Build the response object returned to callers of the extractor.
 *
 * `title` and `content` are taken from the extracted fields first and fall
 * back to the cleaned document (when one is supplied). A `warning` is added
 * only for a cold run whose recipe was explicitly not persisted.
 *
 * @returns {object} Result object flagged `experimental: true`.
 */
function shape({ url, finalUrl, statusCode, recipeMode, urlTemplate, fingerprint, pageType, selectors, data, cleaned, recipePersisted }) {
  const title = pickField(data, ['title', 'headline', 'name']) || cleaned?.title || null
  const content =
    pickField(data, ['body', 'content', 'article', 'description']) ||
    cleaned?.contentMarkdown ||
    null

  const result = {
    experimental: true,
    url,
    finalUrl,
    statusCode,
    recipeMode,
    recipeId: `${urlTemplate}#${fingerprint}`,
    pageType: pageType || null,
    title,
    content,
    structured: data || {},
    selectors: selectors || {},
  }

  // Strict `=== false`: hot results never pass recipePersisted, so they never warn.
  const verificationFailed = recipeMode === 'cold' && recipePersisted === false
  if (!verificationFailed) return result

  return {
    ...result,
    warning: 'Recipe verification failed (selectors returned empty). Result reflects LLM extraction; recipe was not persisted.',
  }
}
127
+
128
/**
 * Return the first usable value among `candidates` keys of `obj`.
 *
 * A non-blank string is returned as-is; a non-empty array is joined with a
 * blank line between items. Every other value type is skipped.
 *
 * @param {*} obj Extracted fields keyed by name.
 * @param {string[]} candidates Keys to try, in priority order.
 * @returns {string|null} The chosen text, or null when nothing qualifies.
 */
function pickField(obj, candidates) {
  const isRecord = Boolean(obj) && typeof obj === 'object'
  if (!isRecord) return null

  for (const key of candidates) {
    const value = obj[key]
    if (typeof value === 'string') {
      if (value.trim() !== '') return value
    } else if (Array.isArray(value) && value.length > 0) {
      return value.join('\n\n')
    }
  }
  return null
}
138
+
139
+ module.exports = {
140
+ extract,
141
+ isEmptyResult,
142
+ }
@@ -0,0 +1,63 @@
1
+ /**
2
+ * Lightweight DOM structure fingerprint.
3
+ *
4
+ * Two pages with the same template URL but materially different layouts
5
+ * (A/B test, logged-in vs logged-out, mobile vs desktop served) need to
6
+ * use different recipes. We hash a minimal structural signature instead
7
+ * of the full HTML so the fingerprint stays stable against trivial copy
8
+ * changes but flips when block-level structure shifts.
9
+ *
10
+ * Signature inputs:
11
+ * - Order of structural landmark tags (header/nav/main/article/...)
12
+ * - Top 5 most frequent class names on <div> elements
13
+ */
14
+
15
+ const crypto = require('crypto')
16
+
17
+ const LANDMARK_TAGS = ['header', 'nav', 'main', 'article', 'section', 'aside', 'footer', 'form']
18
+
19
/**
 * Compute a short structural fingerprint for an HTML document.
 *
 * @param {string} html Rendered page markup.
 * @returns {string} 'empty' for a non-string or empty input,
 *   'fallback-<12 hex>' when the markup cannot be parsed, otherwise the
 *   first 12 hex characters of the SHA-1 of the structural signature.
 */
function computeFingerprint(html) {
  if (typeof html !== 'string' || html.length === 0) {
    return 'empty'
  }

  const shortSha1 = (text) =>
    crypto.createHash('sha1').update(text).digest('hex').slice(0, 12)

  let document
  try {
    const { parseHTML } = require('linkedom')
    document = parseHTML(html).document
  } catch (err) {
    // Parser unavailable or markup unparseable: hash the first 4096
    // characters of the raw HTML instead of a structural signature.
    return 'fallback-' + shortSha1(html.slice(0, 4096))
  }

  // Distinct landmark tags, ordered by first appearance in the document.
  const landmarkOrder = []
  for (const el of document.querySelectorAll(LANDMARK_TAGS.join(','))) {
    const tag = el.tagName.toLowerCase()
    if (!landmarkOrder.includes(tag)) landmarkOrder.push(tag)
  }

  // Tally every class token that appears on a <div>.
  const classCounts = new Map()
  for (const div of document.querySelectorAll('div[class]')) {
    const tokens = (div.getAttribute('class') || '').split(/\s+/)
    for (const token of tokens) {
      if (!token) continue
      classCounts.set(token, (classCounts.get(token) || 0) + 1)
    }
  }

  // Five most frequent classes; ties are broken by ascending class name so
  // the signature is deterministic.
  const ranked = [...classCounts.entries()].sort(([nameA, countA], [nameB, countB]) => {
    if (countA !== countB) return countB - countA
    return nameA < nameB ? -1 : 1
  })
  const topClasses = ranked.slice(0, 5).map(([name]) => name)

  const signature = `tags:${landmarkOrder.join(',')}|cls:${topClasses.join(',')}`
  return shortSha1(signature)
}
62
+
63
+ module.exports = { computeFingerprint }
@@ -0,0 +1,72 @@
1
+ /**
2
+ * HTML → cleaned content (Readability) → Markdown (Turndown).
3
+ *
4
+ * The cleaned Markdown is the *only* page representation handed to the
5
+ * recipe-generation LLM. Keeping the input small and structured is what
6
+ * makes this experiment cheap enough to be worth running on every
7
+ * cold-cache miss.
8
+ */
9
+
10
+ const MAX_MARKDOWN_LENGTH = 50_000
11
+
12
/**
 * Reduce raw page HTML to its main content and convert it to Markdown.
 *
 * Steps: parse with linkedom, extract the article with Readability, then
 * convert the chosen HTML to Markdown with Turndown. Each step is
 * best-effort and failures degrade to emptier output instead of throwing.
 *
 * @param {string} html Raw page markup.
 * @param {string} url Page URL. Accepted for interface compatibility; not
 *   currently used by the cleaning steps.
 * @returns {{ title: (string|null), contentHtml: string, contentMarkdown: string,
 *   byline: (string|null), excerpt: (string|null), length: number }}
 *   `contentMarkdown` is capped at MAX_MARKDOWN_LENGTH plus a truncation marker.
 */
function cleanHtml(html, url) {
  let parseHTML
  let parsedDocument
  try {
    const linkedom = require('linkedom')
    parseHTML = linkedom.parseHTML
    parsedDocument = parseHTML(html).document
  } catch (err) {
    // Parser unavailable or markup unparseable: nothing can be extracted.
    return {
      title: null,
      contentHtml: '',
      contentMarkdown: '',
      byline: null,
      excerpt: null,
      length: 0,
    }
  }

  let article = null
  try {
    const { Readability } = require('@mozilla/readability')
    article = new Readability(parsedDocument).parse()
  } catch {
    article = null
  }

  let contentHtml = (article && article.content) || ''
  if (!contentHtml) {
    // Readability.parse() modifies the document it is given, even when it
    // returns null or throws, so parsedDocument.body may be missing content
    // by now. Re-parse the original markup for the whole-body fallback.
    // This costs a second parse only on the fallback path.
    try {
      contentHtml = parseHTML(html).document.body?.innerHTML || ''
    } catch {
      contentHtml = parsedDocument.body?.innerHTML || ''
    }
  }

  let contentMarkdown = ''
  try {
    const TurndownService = require('turndown')
    const td = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced',
      bulletListMarker: '-',
    })
    // Drop non-content elements so they never reach the Markdown output.
    td.remove(['script', 'style', 'noscript', 'iframe'])
    contentMarkdown = td.turndown(contentHtml)
  } catch {
    contentMarkdown = ''
  }

  if (contentMarkdown.length > MAX_MARKDOWN_LENGTH) {
    contentMarkdown = contentMarkdown.slice(0, MAX_MARKDOWN_LENGTH) + '\n\n[... truncated ...]'
  }

  return {
    title: article?.title || parsedDocument.title || null,
    contentHtml,
    contentMarkdown,
    byline: article?.byline || null,
    excerpt: article?.excerpt || null,
    length: article?.length || contentMarkdown.length,
  }
}
68
+
69
+ module.exports = {
70
+ cleanHtml,
71
+ MAX_MARKDOWN_LENGTH,
72
+ }
@@ -0,0 +1,21 @@
1
+ /**
2
+ * Web extraction (experimental, v3.53.0).
3
+ *
4
+ * Public surface for core/routes/web.js. Internal modules:
5
+ * - url-normalize.js URL → template + canonical URL
6
+ * - fingerprint.js DOM structural hash
7
+ * - playwright-runner.js headless fetch + selector replay
8
+ * - html-cleaner.js Readability + Turndown
9
+ * - recipe-generator.js Anthropic Haiku cold path
10
+ * - extractor.js orchestrator (hot/cold + self-heal)
11
+ */
12
+
13
+ const { extract } = require('./extractor')
14
+ const { normalizeUrl } = require('./url-normalize')
15
+ const { computeFingerprint } = require('./fingerprint')
16
+
17
+ module.exports = {
18
+ extract,
19
+ normalizeUrl,
20
+ computeFingerprint,
21
+ }