@geekbeer/minion 3.52.0 → 3.53.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.env.example CHANGED
@@ -17,3 +17,10 @@ MINION_ID=
17
17
 
18
18
  # Agent port (optional, default: 8080)
19
19
  AGENT_PORT=8080
20
+
21
+ # Anthropic API key (optional, experimental, fallback only) —
22
+ # POST /api/web/extract prefers the primary LLM plugin (see PUT /api/llm/config)
23
+ # and only uses ANTHROPIC_API_KEY if no primary plugin is configured. Set via:
24
+ # curl -X PUT http://localhost:8080/api/secrets/ANTHROPIC_API_KEY \
25
+ # -H "Authorization: Bearer $API_TOKEN" -d '{"value": "sk-ant-..."}'
26
+ ANTHROPIC_API_KEY=
@@ -0,0 +1,33 @@
1
/**
 * page_recipes — Web page extraction recipe cache (experimental, v3.53.0).
 *
 * Stores selectors learned from a first-time visit so subsequent visits to
 * structurally similar pages skip the LLM round trip. Keyed by URL template
 * (after normalization) + DOM fingerprint to tolerate A/B variants.
 *
 * Marked experimental: schema may change before the API stabilizes.
 */

module.exports = {
  // Migration ordering key; appears to encode a UTC timestamp
  // (YYYYMMDDhhmmss, here 2026-05-08) — TODO confirm against the migration runner.
  version: 20260508000000,
  name: 'page_recipes',

  // Create the recipe cache table. Idempotent: a no-op when the table
  // already exists, so re-running the migration is safe.
  up(db, { tableExists }) {
    if (tableExists(db, 'page_recipes')) return

    // Columns:
    //   url_template + dom_fingerprint — composite key: normalized URL shape
    //                                    plus DOM structural hash (A/B variants)
    //   selectors_json                 — learned selector map, presumably the
    //                                    serialized recipe.selectors (verify in store)
    //   page_type                      — optional page classification
    //   hit_count / fail_count         — replay success/failure counters used by
    //                                    the self-heal logic in the extractor
    //   last_verified_at               — last time a replay returned non-empty data
    // NOTE(review): idx_page_recipes_template duplicates the left prefix of the
    // composite PRIMARY KEY index — SQLite can already serve url_template-only
    // lookups from the PK index, so this extra index is likely redundant.
    db.exec(`
      CREATE TABLE page_recipes (
        url_template TEXT NOT NULL,
        dom_fingerprint TEXT NOT NULL,
        selectors_json TEXT NOT NULL,
        page_type TEXT,
        hit_count INTEGER NOT NULL DEFAULT 0,
        fail_count INTEGER NOT NULL DEFAULT 0,
        last_verified_at TEXT,
        created_at TEXT NOT NULL DEFAULT (datetime('now')),
        PRIMARY KEY (url_template, dom_fingerprint)
      );
      CREATE INDEX idx_page_recipes_template ON page_recipes(url_template);
    `)
  },
}
@@ -0,0 +1,142 @@
1
+ /**
2
+ * Web extraction orchestrator (experimental — v3.53.0).
3
+ *
4
+ * Cold path: Playwright fetch -> Readability/Turndown clean -> Anthropic
5
+ * Haiku selects fields -> store recipe -> verify by replaying
6
+ * selectors against the same page.
7
+ *
8
+ * Hot path: Playwright fetch -> fingerprint -> recipe lookup -> selector
9
+ * replay. No LLM call.
10
+ *
11
+ * Self-heal: hot replays that come back empty bump fail_count; the recipe
12
+ * is dropped after MAX_FAIL_COUNT and the next request retries
13
+ * cold. A single in-request fall-through from hot -> cold is
14
+ * allowed so callers don't see transient breakage.
15
+ */
16
+
17
+ const { normalizeUrl } = require('./url-normalize')
18
+ const { computeFingerprint } = require('./fingerprint')
19
+ const { renderPage, extractWithSelectors } = require('./playwright-runner')
20
+ const { cleanHtml } = require('./html-cleaner')
21
+ const { generateRecipe } = require('./recipe-generator')
22
+ const pageRecipeStore = require('../../stores/page-recipe-store')
23
+
24
/**
 * Decide whether an extraction result is effectively empty.
 *
 * Non-objects count as empty; an object is empty when every value is
 * null/undefined, a whitespace-only string, or an empty array. Any other
 * value (numbers, booleans, nested objects) counts as content.
 *
 * @param {*} data - Raw extraction result (usually a field->value map).
 * @returns {boolean} true when there is no usable content.
 */
function isEmptyResult(data) {
  if (data == null || typeof data !== 'object') return true

  const hasContent = (value) => {
    if (value == null) return false
    if (typeof value === 'string') return value.trim().length > 0
    if (Array.isArray(value)) return value.length > 0
    return true
  }

  const values = Object.values(data)
  return values.length === 0 || !values.some(hasContent)
}
35
+
36
/**
 * Extract structured data from a web page.
 *
 * Hot path: a cached recipe matching (url template, DOM fingerprint) is
 * replayed against a fresh page load — no LLM call. Cold path: the page is
 * cleaned and an LLM-generated recipe is verified and persisted. A failed
 * hot replay falls through to cold within the same request.
 *
 * @param {object} params
 * @param {string} params.url - Page URL to extract from.
 * @param {*} [params.hint] - Caller hint forwarded to the recipe generator.
 * @returns {Promise<object>} Response envelope built by shape().
 */
async function extract({ url, hint }) {
  const { template, canonicalUrl } = normalizeUrl(url)

  // Always render once up-front so we can compute the fingerprint regardless
  // of cache state. Cold path reuses the HTML; hot path discards it.
  const rendered = await renderPage(canonicalUrl)
  const fingerprint = computeFingerprint(rendered.html)

  const cached = pageRecipeStore.find({
    urlTemplate: template,
    domFingerprint: fingerprint,
  })

  if (cached) {
    // Hot path: replay cached selectors against a fresh page load.
    const data = await extractWithSelectors(canonicalUrl, cached.selectors)
    if (!isEmptyResult(data)) {
      pageRecipeStore.incrementHit({ urlTemplate: template, domFingerprint: fingerprint })
      pageRecipeStore.setLastVerified({ urlTemplate: template, domFingerprint: fingerprint })
      // recipePersisted is intentionally omitted here (stays undefined), so
      // shape() never attaches the cold-path warning to hot results.
      return shape({
        url: canonicalUrl,
        finalUrl: rendered.finalUrl,
        statusCode: rendered.statusCode,
        recipeMode: 'hot',
        urlTemplate: template,
        fingerprint,
        pageType: cached.page_type,
        selectors: cached.selectors,
        data,
        cleaned: null,
      })
    }
    // Hot replay returned nothing — penalize and fall through to cold.
    pageRecipeStore.incrementFail({ urlTemplate: template, domFingerprint: fingerprint })
  }

  // Cold path
  const cleaned = cleanHtml(rendered.html, canonicalUrl)
  const recipe = await generateRecipe({
    url: canonicalUrl,
    cleanedMarkdown: cleaned.contentMarkdown,
    hint,
  })

  // Verify the recipe against this exact page before persisting.
  const verifyData = await extractWithSelectors(canonicalUrl, recipe.selectors)
  const verified = !isEmptyResult(verifyData)

  if (verified) {
    // NOTE(review): upsert presumably replaces any existing row for this key —
    // confirm in page-recipe-store whether fail_count is reset on upsert.
    pageRecipeStore.upsert({
      urlTemplate: template,
      domFingerprint: fingerprint,
      selectors: recipe.selectors,
      pageType: recipe.pageType,
    })
    pageRecipeStore.incrementHit({ urlTemplate: template, domFingerprint: fingerprint })
  }

  // On verification failure, fall back to the LLM's own extraction and do
  // not persist the recipe; shape() surfaces this via a warning.
  return shape({
    url: canonicalUrl,
    finalUrl: rendered.finalUrl,
    statusCode: rendered.statusCode,
    recipeMode: 'cold',
    urlTemplate: template,
    fingerprint,
    pageType: recipe.pageType,
    selectors: recipe.selectors,
    data: verified ? verifyData : recipe.extracted,
    cleaned,
    recipePersisted: verified,
  })
}
107
+
108
/**
 * Assemble the public response envelope shared by the hot and cold paths.
 * Extracted fields are preferred for title/content; the Readability-cleaned
 * page (when present) is the fallback.
 */
function shape({ url, finalUrl, statusCode, recipeMode, urlTemplate, fingerprint, pageType, selectors, data, cleaned, recipePersisted }) {
  // `||` (not `??`) on purpose: empty strings fall back too.
  const title = pickField(data, ['title', 'headline', 'name']) || cleaned?.title || null
  const content = pickField(data, ['body', 'content', 'article', 'description']) || cleaned?.contentMarkdown || null

  const response = {
    experimental: true,
    url,
    finalUrl,
    statusCode,
    recipeMode,
    recipeId: `${urlTemplate}#${fingerprint}`,
    pageType: pageType || null,
    title,
    content,
    structured: data || {},
    selectors: selectors || {},
  }

  // A cold extraction whose selector replay came back empty is still
  // returned, but flagged so the caller knows nothing was cached.
  if (recipeMode === 'cold' && recipePersisted === false) {
    response.warning =
      'Recipe verification failed (selectors returned empty). Result reflects LLM extraction; recipe was not persisted.'
  }

  return response
}

/**
 * Return the first usable value among `candidates` keys of `obj`:
 * a non-blank string as-is, or a non-empty array joined with blank lines.
 * Returns null when nothing qualifies.
 */
function pickField(obj, candidates) {
  if (obj == null || typeof obj !== 'object') return null

  for (const key of candidates) {
    const value = obj[key]
    if (value == null) continue
    if (Array.isArray(value)) {
      if (value.length > 0) return value.join('\n\n')
      continue
    }
    if (typeof value === 'string' && value.trim().length > 0) return value
  }

  return null
}
138
+
139
// Public surface: extract is the orchestrator entry point; isEmptyResult is
// exported alongside it (presumably for unit tests — verify before removing).
module.exports = {
  extract,
  isEmptyResult,
}
@@ -0,0 +1,63 @@
1
+ /**
2
+ * Lightweight DOM structure fingerprint.
3
+ *
4
+ * Two pages with the same template URL but materially different layouts
5
+ * (A/B test, logged-in vs logged-out, mobile vs desktop served) need to
6
+ * use different recipes. We hash a minimal structural signature instead
7
+ * of the full HTML so the fingerprint stays stable against trivial copy
8
+ * changes but flips when block-level structure shifts.
9
+ *
10
+ * Signature inputs:
11
+ * - Order of structural landmark tags (header/nav/main/article/...)
12
+ * - Top 5 most frequent class names on <div> elements
13
+ */
14
+
15
+ const crypto = require('crypto')
16
+
17
// Structural landmark tags whose document order contributes to the signature.
const LANDMARK_TAGS = ['header', 'nav', 'main', 'article', 'section', 'aside', 'footer', 'form']

/**
 * Hash a minimal structural signature of a rendered page.
 *
 * Returns 'empty' for non-string/empty input, a 'fallback-' + 12-hex-char
 * digest of the first 4 KiB when parsing fails, and otherwise a 12-hex-char
 * SHA-1 over the landmark-tag order plus the top 5 <div> class names.
 *
 * @param {string} html - Rendered page HTML.
 * @returns {string} Stable fingerprint string.
 */
function computeFingerprint(html) {
  if (typeof html !== 'string' || html.length === 0) return 'empty'

  let document
  try {
    const { parseHTML } = require('linkedom')
    document = parseHTML(html).document
  } catch {
    // linkedom choked (extremely malformed HTML) — degrade to a content-prefix hash.
    const digest = crypto.createHash('sha1').update(html.slice(0, 4096)).digest('hex')
    return 'fallback-' + digest.slice(0, 12)
  }

  // Landmark tag sequence: first occurrence of each tag, in document order.
  const landmarkOrder = []
  const recorded = new Set()
  for (const el of document.querySelectorAll(LANDMARK_TAGS.join(','))) {
    const tag = el.tagName.toLowerCase()
    if (recorded.has(tag)) continue
    recorded.add(tag)
    landmarkOrder.push(tag)
  }

  // Count class-name frequency across <div class="..."> elements.
  const classCounts = new Map()
  for (const div of document.querySelectorAll('div[class]')) {
    for (const cls of (div.getAttribute('class') || '').split(/\s+/)) {
      if (!cls) continue
      classCounts.set(cls, (classCounts.get(cls) || 0) + 1)
    }
  }

  // Top 5 classes by frequency; ties broken lexicographically for stability.
  const topClasses = Array.from(classCounts)
    .sort((a, b) => b[1] - a[1] || (a[0] < b[0] ? -1 : 1))
    .slice(0, 5)
    .map((entry) => entry[0])

  const signature = `tags:${landmarkOrder.join(',')}|cls:${topClasses.join(',')}`
  return crypto.createHash('sha1').update(signature).digest('hex').slice(0, 12)
}
62
+
63
// Sole export: structural fingerprint used as half of the recipe cache key.
module.exports = { computeFingerprint }
@@ -0,0 +1,72 @@
1
+ /**
2
+ * HTML → cleaned content (Readability) → Markdown (Turndown).
3
+ *
4
+ * The cleaned Markdown is the *only* page representation handed to the
5
+ * recipe-generation LLM. Keeping the input small and structured is what
6
+ * makes this experiment cheap enough to be worth running on every
7
+ * cold-cache miss.
8
+ */
9
+
10
// Hard cap on the Markdown handed to the LLM; longer output is truncated and tagged.
const MAX_MARKDOWN_LENGTH = 50_000

/**
 * Reduce rendered HTML to its main content (Readability) and convert it to
 * Markdown (Turndown). The Markdown is the only page representation the
 * recipe-generation LLM sees, so it is kept small and structured.
 *
 * All dependency/parse failures degrade gracefully to empty content rather
 * than throwing.
 *
 * @param {string} html - Rendered page HTML.
 * @param {string} url - Page URL. Currently unused; kept for interface
 *   stability (candidate for resolving relative links later).
 * @returns {{title: (string|null), contentHtml: string, contentMarkdown: string,
 *   byline: (string|null), excerpt: (string|null), length: number}}
 */
function cleanHtml(html, url) {
  let parsedDocument
  try {
    const { parseHTML } = require('linkedom')
    parsedDocument = parseHTML(html).document
  } catch (err) {
    // Unparseable input (or linkedom unavailable): empty result, no throw.
    return {
      title: null,
      contentHtml: '',
      contentMarkdown: '',
      byline: null,
      excerpt: null,
      length: 0,
    }
  }

  let article = null
  try {
    const { Readability } = require('@mozilla/readability')
    const { parseHTML } = require('linkedom')
    // FIX: Readability.parse() mutates the document it is given. Run it on a
    // fresh parse so `parsedDocument` stays pristine for the <body> fallback
    // below — previously the fallback could observe a Readability-gutted DOM.
    article = new Readability(parseHTML(html).document).parse()
  } catch {
    article = null
  }

  // Prefer Readability's extraction; otherwise fall back to the raw body.
  const contentHtml =
    (article && article.content) ||
    parsedDocument.body?.innerHTML ||
    ''

  let contentMarkdown = ''
  try {
    const TurndownService = require('turndown')
    const td = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced',
      bulletListMarker: '-',
    })
    // Strip non-content elements Turndown would otherwise render inline.
    td.remove(['script', 'style', 'noscript', 'iframe'])
    contentMarkdown = td.turndown(contentHtml)
  } catch {
    contentMarkdown = ''
  }

  if (contentMarkdown.length > MAX_MARKDOWN_LENGTH) {
    contentMarkdown = contentMarkdown.slice(0, MAX_MARKDOWN_LENGTH) + '\n\n[... truncated ...]'
  }

  return {
    title: article?.title || parsedDocument.title || null,
    contentHtml,
    contentMarkdown,
    byline: article?.byline || null,
    excerpt: article?.excerpt || null,
    length: article?.length || contentMarkdown.length,
  }
}
68
+
69
// MAX_MARKDOWN_LENGTH is exported alongside cleanHtml (the truncation cap).
module.exports = {
  cleanHtml,
  MAX_MARKDOWN_LENGTH,
}
@@ -0,0 +1,21 @@
1
+ /**
2
+ * Web extraction (experimental, v3.53.0).
3
+ *
4
+ * Public surface for core/routes/web.js. Internal modules:
5
+ * - url-normalize.js URL → template + canonical URL
6
+ * - fingerprint.js DOM structural hash
7
+ * - playwright-runner.js headless fetch + selector replay
8
+ * - html-cleaner.js Readability + Turndown
9
+ * - recipe-generator.js Anthropic Haiku cold path
10
+ * - extractor.js orchestrator (hot/cold + self-heal)
11
+ */
12
+
13
+ const { extract } = require('./extractor')
14
+ const { normalizeUrl } = require('./url-normalize')
15
+ const { computeFingerprint } = require('./fingerprint')
16
+
17
// Public surface: the orchestrator entry point plus the two pure helpers
// (URL normalization and DOM fingerprinting) re-exported from their modules.
module.exports = {
  extract,
  normalizeUrl,
  computeFingerprint,
}
@@ -0,0 +1,129 @@
1
+ /**
2
+ * Headless browser fetch + selector-based extraction.
3
+ *
4
+ * `playwright` is an optionalDependency: if it's missing (e.g. ARM host
5
+ * where the chromium binary failed to install), require() throws and the
6
+ * route layer surfaces a 503 "browser unavailable" error instead of
7
+ * crashing the agent.
8
+ *
9
+ * Each call spins up a fresh chromium instance. Pooling can come later
10
+ * once the API stabilizes — for the experimental MVP, simple is better.
11
+ */
12
+
13
// Navigation timeout for page.goto (per fetch; overridable via opts.timeoutMs).
const DEFAULT_NAV_TIMEOUT_MS = 20_000
// In-page time budget for selector replay inside page.evaluate
// (overridable via opts.evalTimeoutMs).
const DEFAULT_EVAL_TIMEOUT_MS = 5_000
// Stock desktop-Chrome UA with an explicit MinionWebExtract token appended.
const DEFAULT_USER_AGENT =
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' +
  'Chrome/124.0.0.0 Safari/537.36 MinionWebExtract/0.1'
18
+
19
/**
 * Resolve the Playwright chromium launcher.
 *
 * `playwright` is an optionalDependency; when it cannot be required we throw
 * a typed error (code 'PLAYWRIGHT_UNAVAILABLE') that the route layer maps to
 * a 503 instead of crashing the agent.
 *
 * @returns {object} `playwright.chromium` launcher.
 * @throws {Error} with code 'PLAYWRIGHT_UNAVAILABLE' when playwright is missing.
 */
function loadChromium() {
  let playwright
  try {
    playwright = require('playwright')
  } catch (err) {
    const e = new Error(
      'playwright is not installed. Run `npx playwright install chromium` ' +
      'on the minion host to enable POST /api/web/extract.',
      // FIX: preserve the original require failure for debugging instead of
      // discarding it (Error `cause` option, supported since Node 16.9).
      { cause: err }
    )
    e.code = 'PLAYWRIGHT_UNAVAILABLE'
    throw e
  }
  return playwright.chromium
}
33
+
34
/**
 * Launch a throwaway chromium instance, hand a fresh page to `fn`, and
 * always close the browser afterwards (closing the browser also tears down
 * its contexts and pages).
 *
 * @param {(page: object) => Promise<*>} fn - Callback receiving the page.
 * @param {{userAgent?: string}} [opts]
 * @returns {Promise<*>} Whatever `fn` resolves to.
 */
async function withPage(fn, opts = {}) {
  const browser = await loadChromium().launch({
    headless: true,
    args: ['--no-sandbox', '--disable-dev-shm-usage'],
  })
  try {
    const context = await browser.newContext({
      userAgent: opts.userAgent || DEFAULT_USER_AGENT,
      viewport: { width: 1280, height: 800 },
    })
    return await fn(await context.newPage())
  } finally {
    // Best-effort close; never let teardown mask the callback's own error.
    await browser.close().catch(() => {})
  }
}
51
+
52
/**
 * Load a URL in headless chromium and return the rendered HTML.
 *
 * @param {string} url - URL to navigate to.
 * @param {{timeoutMs?: number, userAgent?: string}} [opts]
 * @returns {Promise<{html: string, finalUrl: string, statusCode: (number|null)}>}
 *   finalUrl reflects redirects; statusCode is null when no response object
 *   was produced (e.g. about:blank-style navigations).
 */
async function renderPage(url, opts = {}) {
  const navTimeout = opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS
  return withPage(async (page) => {
    const response = await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout: navTimeout,
    })
    return {
      html: await page.content(),
      finalUrl: page.url(),
      statusCode: response ? response.status() : null,
    }
  }, opts)
}
66
+
67
+ /**
68
+ * Run a recipe against a freshly loaded page.
69
+ *
70
+ * `selectors` shape (each value is an object):
71
+ * {
72
+ * title: { selector: 'h1', attr: 'text' },
73
+ * body: { selector: 'article', attr: 'text' },
74
+ * items: { selector: '.list-item .title', attr: 'text', multiple: true },
75
+ * link: { selector: 'a.permalink', attr: 'href' }
76
+ * }
77
+ *
78
+ * `attr` defaults to 'text' (innerText). Special value 'html' returns
79
+ * innerHTML. Any other string is read as an HTML attribute.
80
+ */
81
/**
 * Replay a selector recipe against a freshly loaded page (see the comment
 * block above for the `selectors` shape).
 *
 * @param {string} url - URL to load.
 * @param {object} selectors - Field name -> {selector, attr?, multiple?} map.
 * @param {{timeoutMs?: number, evalTimeoutMs?: number, userAgent?: string}} [opts]
 * @returns {Promise<object>} Field name -> extracted value (string, string[],
 *   or null per field). Selector failures never reject; they yield null.
 */
async function extractWithSelectors(url, selectors, opts = {}) {
  return withPage(async page => {
    await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout: opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS,
    })
    // The callback below is serialized and run in the BROWSER context: it can
    // only see the single argument object passed to evaluate, never closure
    // variables from this module.
    return await page.evaluate(
      ({ selectorMap, evalTimeoutMs }) => {
        const start = Date.now()
        const result = {}
        for (const [field, spec] of Object.entries(selectorMap)) {
          // Overall evaluation budget exceeded: null out remaining fields
          // instead of running more queries.
          if (Date.now() - start > evalTimeoutMs) {
            result[field] = null
            continue
          }
          // Malformed spec (missing selector): field resolves to null.
          if (!spec || !spec.selector) {
            result[field] = null
            continue
          }
          const attr = spec.attr || 'text'
          // 'text' -> trimmed visible text, 'html' -> innerHTML,
          // anything else -> that HTML attribute's value.
          const readOne = (el) => {
            if (!el) return null
            if (attr === 'text') return (el.innerText || el.textContent || '').trim()
            if (attr === 'html') return el.innerHTML
            return el.getAttribute(attr)
          }
          try {
            if (spec.multiple) {
              const nodes = document.querySelectorAll(spec.selector)
              result[field] = Array.from(nodes).map(readOne).filter(v => v != null && v !== '')
            } else {
              const node = document.querySelector(spec.selector)
              result[field] = readOne(node)
            }
          } catch {
            // Invalid selector syntax etc. — treat as "nothing found".
            result[field] = null
          }
        }
        return result
      },
      { selectorMap: selectors, evalTimeoutMs: opts.evalTimeoutMs ?? DEFAULT_EVAL_TIMEOUT_MS },
    )
  }, opts)
}
125
+
126
// Public surface: renderPage (fetch + HTML) and extractWithSelectors (recipe
// replay). loadChromium/withPage remain internal helpers.
module.exports = {
  renderPage,
  extractWithSelectors,
}