@geekbeer/minion 3.52.0 → 3.53.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +7 -0
- package/core/db/migrations/20260508000000_page_recipes.js +33 -0
- package/core/lib/web-extract/extractor.js +142 -0
- package/core/lib/web-extract/fingerprint.js +63 -0
- package/core/lib/web-extract/html-cleaner.js +72 -0
- package/core/lib/web-extract/index.js +21 -0
- package/core/lib/web-extract/playwright-runner.js +129 -0
- package/core/lib/web-extract/recipe-generator.js +247 -0
- package/core/lib/web-extract/url-normalize.js +90 -0
- package/core/routes/web.js +94 -0
- package/core/stores/page-recipe-store.js +143 -0
- package/docs/api-reference.md +66 -0
- package/docs/task-guides.md +58 -0
- package/linux/routes/chat.js +36 -4
- package/linux/server.js +2 -0
- package/mac/server.js +2 -0
- package/package.json +6 -2
- package/rules/core.md +21 -1
- package/win/routes/chat.js +37 -2
- package/win/server.js +2 -0
package/.env.example
CHANGED
|
@@ -17,3 +17,10 @@ MINION_ID=
|
|
|
17
17
|
|
|
18
18
|
# Agent port (optional, default: 8080)
|
|
19
19
|
AGENT_PORT=8080
|
|
20
|
+
|
|
21
|
+
# Anthropic API key (optional, experimental, fallback only) —
|
|
22
|
+
# POST /api/web/extract prefers the primary LLM plugin (see PUT /api/llm/config)
|
|
23
|
+
# and only uses ANTHROPIC_API_KEY if no primary plugin is configured. Set via:
|
|
24
|
+
# curl -X PUT http://localhost:8080/api/secrets/ANTHROPIC_API_KEY \
|
|
25
|
+
# -H "Authorization: Bearer $API_TOKEN" -d '{"value": "sk-ant-..."}'
|
|
26
|
+
ANTHROPIC_API_KEY=
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
 * page_recipes — Web page extraction recipe cache (experimental, v3.53.0).
 *
 * Stores selectors learned from a first-time visit so subsequent visits to
 * structurally similar pages skip the LLM round trip. Keyed by URL template
 * (after normalization) + DOM fingerprint to tolerate A/B variants.
 *
 * Marked experimental: schema may change before the API stabilizes.
 */

module.exports = {
  // Timestamp-style migration version (YYYYMMDDHHMMSS).
  version: 20260508000000,
  name: 'page_recipes',

  /**
   * Create the page_recipes table and its lookup index.
   * Idempotent: returns early when the table is already present.
   * @param db - database handle exposing a synchronous exec() — presumably a
   *   better-sqlite3-style connection; confirm against the migration runner.
   * @param tableExists - (db, tableName) => boolean helper injected by the runner.
   */
  up(db, { tableExists }) {
    if (tableExists(db, 'page_recipes')) return

    // selectors_json: the recipe's selector map, presumably JSON-serialized
    //   by page-recipe-store (store not visible here — verify).
    // hit_count / fail_count: drive the extractor's self-heal logic (recipes
    //   are penalized on empty replays and eventually retried cold).
    // Composite PK: one template can carry several recipes, one per DOM
    //   variant (A/B tests, logged-in vs. logged-out layouts).
    db.exec(`
      CREATE TABLE page_recipes (
        url_template TEXT NOT NULL,
        dom_fingerprint TEXT NOT NULL,
        selectors_json TEXT NOT NULL,
        page_type TEXT,
        hit_count INTEGER NOT NULL DEFAULT 0,
        fail_count INTEGER NOT NULL DEFAULT 0,
        last_verified_at TEXT,
        created_at TEXT NOT NULL DEFAULT (datetime('now')),
        PRIMARY KEY (url_template, dom_fingerprint)
      );
      CREATE INDEX idx_page_recipes_template ON page_recipes(url_template);
    `)
  },
}
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web extraction orchestrator (experimental — v3.53.0).
|
|
3
|
+
*
|
|
4
|
+
* Cold path: Playwright fetch -> Readability/Turndown clean -> Anthropic
|
|
5
|
+
* Haiku selects fields -> store recipe -> verify by replaying
|
|
6
|
+
* selectors against the same page.
|
|
7
|
+
*
|
|
8
|
+
* Hot path: Playwright fetch -> fingerprint -> recipe lookup -> selector
|
|
9
|
+
* replay. No LLM call.
|
|
10
|
+
*
|
|
11
|
+
* Self-heal: hot replays that come back empty bump fail_count; the recipe
|
|
12
|
+
* is dropped after MAX_FAIL_COUNT and the next request retries
|
|
13
|
+
* cold. A single in-request fall-through from hot -> cold is
|
|
14
|
+
* allowed so callers don't see transient breakage.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
const { normalizeUrl } = require('./url-normalize')
|
|
18
|
+
const { computeFingerprint } = require('./fingerprint')
|
|
19
|
+
const { renderPage, extractWithSelectors } = require('./playwright-runner')
|
|
20
|
+
const { cleanHtml } = require('./html-cleaner')
|
|
21
|
+
const { generateRecipe } = require('./recipe-generator')
|
|
22
|
+
const pageRecipeStore = require('../../stores/page-recipe-store')
|
|
23
|
+
|
|
24
|
+
/**
 * True when an extraction result carries no usable content: not an object,
 * has no keys, or every value is null/undefined, a blank string, or an empty
 * array. Any other value (number, boolean, nested object) counts as content.
 */
function isEmptyResult(data) {
  if (data == null || typeof data !== 'object') return true
  const fieldValues = Object.values(data)
  if (fieldValues.length === 0) return true
  const hasContent = fieldValues.some(value => {
    if (value == null) return false
    if (typeof value === 'string') return value.trim() !== ''
    if (Array.isArray(value)) return value.length > 0
    return true
  })
  return !hasContent
}
|
|
35
|
+
|
|
36
|
+
/**
 * Extract structured content from `url`, replaying a cached selector recipe
 * when one matches (hot path) and falling back to LLM recipe generation
 * (cold path) otherwise.
 *
 * @param {Object} args
 * @param {string} args.url - target page URL; normalized before fetching
 * @param {string} [args.hint] - optional caller hint forwarded to the recipe
 *   generator
 * @returns {Promise<Object>} result shaped by shape(); recipeMode is 'hot'
 *   (selector replay, no LLM call) or 'cold' (fresh LLM-generated recipe)
 */
async function extract({ url, hint }) {
  const { template, canonicalUrl } = normalizeUrl(url)

  // Always render once up-front so we can compute the fingerprint regardless
  // of cache state. Cold path reuses the HTML; hot path discards it.
  const rendered = await renderPage(canonicalUrl)
  const fingerprint = computeFingerprint(rendered.html)

  // A cache hit requires BOTH the URL template and the DOM fingerprint to
  // match, so layout variants of the same template get separate recipes.
  const cached = pageRecipeStore.find({
    urlTemplate: template,
    domFingerprint: fingerprint,
  })

  if (cached) {
    // NOTE(review): this replays selectors against a second page load rather
    // than the HTML rendered above — the page is fetched twice on the hot path.
    const data = await extractWithSelectors(canonicalUrl, cached.selectors)
    if (!isEmptyResult(data)) {
      pageRecipeStore.incrementHit({ urlTemplate: template, domFingerprint: fingerprint })
      pageRecipeStore.setLastVerified({ urlTemplate: template, domFingerprint: fingerprint })
      return shape({
        url: canonicalUrl,
        finalUrl: rendered.finalUrl,
        statusCode: rendered.statusCode,
        recipeMode: 'hot',
        urlTemplate: template,
        fingerprint,
        pageType: cached.page_type,
        selectors: cached.selectors,
        data,
        cleaned: null, // hot path never runs the HTML cleaner
      })
    }
    // Hot replay returned nothing — penalize and fall through to cold.
    pageRecipeStore.incrementFail({ urlTemplate: template, domFingerprint: fingerprint })
  }

  // Cold path: clean the already-rendered HTML and ask the LLM for a recipe.
  const cleaned = cleanHtml(rendered.html, canonicalUrl)
  const recipe = await generateRecipe({
    url: canonicalUrl,
    cleanedMarkdown: cleaned.contentMarkdown,
    hint,
  })

  // Verify the recipe against this exact page before persisting.
  const verifyData = await extractWithSelectors(canonicalUrl, recipe.selectors)
  const verified = !isEmptyResult(verifyData)

  if (verified) {
    pageRecipeStore.upsert({
      urlTemplate: template,
      domFingerprint: fingerprint,
      selectors: recipe.selectors,
      pageType: recipe.pageType,
    })
    // The verification replay counts as the recipe's first hit.
    pageRecipeStore.incrementHit({ urlTemplate: template, domFingerprint: fingerprint })
  }

  return shape({
    url: canonicalUrl,
    finalUrl: rendered.finalUrl,
    statusCode: rendered.statusCode,
    recipeMode: 'cold',
    urlTemplate: template,
    fingerprint,
    pageType: recipe.pageType,
    selectors: recipe.selectors,
    // On verification failure, fall back to the LLM's direct extraction;
    // shape() attaches a warning for that combination.
    data: verified ? verifyData : recipe.extracted,
    cleaned,
    recipePersisted: verified,
  })
}
|
|
107
|
+
|
|
108
|
+
/**
 * Assemble the public response object for an extraction.
 * Falls back from extracted data fields to the Readability-cleaned page for
 * title/content, and flags unverified cold-path results with a warning.
 */
function shape({ url, finalUrl, statusCode, recipeMode, urlTemplate, fingerprint, pageType, selectors, data, cleaned, recipePersisted }) {
  const title =
    pickField(data, ['title', 'headline', 'name']) || cleaned?.title || null
  const content =
    pickField(data, ['body', 'content', 'article', 'description']) ||
    cleaned?.contentMarkdown ||
    null

  const response = {
    experimental: true,
    url,
    finalUrl,
    statusCode,
    recipeMode,
    recipeId: `${urlTemplate}#${fingerprint}`,
    pageType: pageType || null,
    title,
    content,
    structured: data || {},
    selectors: selectors || {},
  }

  const unverifiedCold = recipeMode === 'cold' && recipePersisted === false
  if (unverifiedCold) {
    response.warning = 'Recipe verification failed (selectors returned empty). Result reflects LLM extraction; recipe was not persisted.'
  }
  return response
}

/**
 * Return the first non-blank value among `candidates` keys of `obj`.
 * Strings must be non-whitespace; non-empty arrays are joined with blank
 * lines. Anything else (or a missing/non-object `obj`) yields null.
 */
function pickField(obj, candidates) {
  if (obj == null || typeof obj !== 'object') return null
  for (const candidate of candidates) {
    const value = obj[candidate]
    if (value == null) continue
    if (typeof value === 'string' && value.trim() !== '') return value
    if (Array.isArray(value) && value.length > 0) return value.join('\n\n')
  }
  return null
}
|
|
138
|
+
|
|
139
|
+
// Module surface: extract() is the orchestrator entry point; isEmptyResult
// is a pure helper exposed alongside it.
module.exports = {
  extract,
  isEmptyResult,
}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lightweight DOM structure fingerprint.
|
|
3
|
+
*
|
|
4
|
+
* Two pages with the same template URL but materially different layouts
|
|
5
|
+
* (A/B test, logged-in vs logged-out, mobile vs desktop served) need to
|
|
6
|
+
* use different recipes. We hash a minimal structural signature instead
|
|
7
|
+
* of the full HTML so the fingerprint stays stable against trivial copy
|
|
8
|
+
* changes but flips when block-level structure shifts.
|
|
9
|
+
*
|
|
10
|
+
* Signature inputs:
|
|
11
|
+
* - Order of structural landmark tags (header/nav/main/article/...)
|
|
12
|
+
* - Top 5 most frequent class names on <div> elements
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
const crypto = require('crypto')
|
|
16
|
+
|
|
17
|
+
// Block-level tags whose presence and order anchor the structural signature.
const LANDMARK_TAGS = ['header', 'nav', 'main', 'article', 'section', 'aside', 'footer', 'form']

/**
 * Hash a page's block-level structure into a short stable token.
 * Combines the landmark-tag order with the five most frequent <div> class
 * names, so trivial copy edits keep the fingerprint stable while layout
 * shifts change it. Returns 'empty' for non-string/empty input, and a
 * 'fallback-' prefixed hash of the raw markup head when parsing fails.
 */
function computeFingerprint(html) {
  if (typeof html !== 'string' || html.length === 0) return 'empty'

  let document
  try {
    const { parseHTML } = require('linkedom')
    document = parseHTML(html).document
  } catch (err) {
    // Parsing failed (extremely malformed HTML or linkedom unavailable):
    // hash the first 4 KiB of raw markup so the result stays deterministic.
    const head = html.slice(0, 4096)
    return 'fallback-' + crypto.createHash('sha1').update(head).digest('hex').slice(0, 12)
  }

  // Landmark tags, deduplicated, in first-occurrence document order
  // (a Set preserves insertion order).
  const orderedTags = new Set()
  for (const el of document.querySelectorAll(LANDMARK_TAGS.join(','))) {
    orderedTags.add(el.tagName.toLowerCase())
  }

  // Count occurrences of every class name across <div class="..."> elements.
  const classCounts = new Map()
  for (const div of document.querySelectorAll('div[class]')) {
    const names = (div.getAttribute('class') || '').split(/\s+/).filter(Boolean)
    for (const name of names) {
      classCounts.set(name, (classCounts.get(name) || 0) + 1)
    }
  }

  // Keep the five most frequent classes; equal counts break lexicographically
  // so the result does not depend on Map insertion order.
  const topClasses = [...classCounts.entries()]
    .sort((a, b) => b[1] - a[1] || (a[0] < b[0] ? -1 : 1))
    .slice(0, 5)
    .map(([name]) => name)

  const signature = `tags:${[...orderedTags].join(',')}|cls:${topClasses.join(',')}`
  return crypto.createHash('sha1').update(signature).digest('hex').slice(0, 12)
}
|
|
62
|
+
|
|
63
|
+
// LANDMARK_TAGS and the hashing details stay private to this module.
module.exports = { computeFingerprint }
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML → cleaned content (Readability) → Markdown (Turndown).
|
|
3
|
+
*
|
|
4
|
+
* The cleaned Markdown is the *only* page representation handed to the
|
|
5
|
+
* recipe-generation LLM. Keeping the input small and structured is what
|
|
6
|
+
* makes this experiment cheap enough to be worth running on every
|
|
7
|
+
* cold-cache miss.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
// Hard cap on the Markdown handed to the recipe-generation LLM.
const MAX_MARKDOWN_LENGTH = 50_000

/**
 * Reduce raw page HTML to article metadata and Markdown content.
 * Each third-party stage (linkedom parse, Readability, Turndown) is wrapped
 * so a failure degrades the output instead of throwing.
 * NOTE(review): the `url` parameter is accepted but never used here — kept
 * for interface stability; confirm before removing.
 */
function cleanHtml(html, url) {
  let doc
  try {
    const { parseHTML } = require('linkedom')
    doc = parseHTML(html).document
  } catch (err) {
    // Unparseable input: return an all-empty result rather than throwing.
    return {
      title: null,
      contentHtml: '',
      contentMarkdown: '',
      byline: null,
      excerpt: null,
      length: 0,
    }
  }

  // Best-effort Readability pass; null means "use the raw <body> instead".
  let article = null
  try {
    const { Readability } = require('@mozilla/readability')
    article = new Readability(doc).parse()
  } catch {
    article = null
  }

  const contentHtml = (article && article.content) || doc.body?.innerHTML || ''

  let contentMarkdown = ''
  try {
    const TurndownService = require('turndown')
    const turndown = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced',
      bulletListMarker: '-',
    })
    turndown.remove(['script', 'style', 'noscript', 'iframe'])
    contentMarkdown = turndown.turndown(contentHtml)
  } catch {
    contentMarkdown = ''
  }

  // Truncate oversized Markdown with an explicit marker.
  if (contentMarkdown.length > MAX_MARKDOWN_LENGTH) {
    contentMarkdown = contentMarkdown.slice(0, MAX_MARKDOWN_LENGTH) + '\n\n[... truncated ...]'
  }

  return {
    title: article?.title || doc.title || null,
    contentHtml,
    contentMarkdown,
    byline: article?.byline || null,
    excerpt: article?.excerpt || null,
    length: article?.length || contentMarkdown.length,
  }
}
|
|
68
|
+
|
|
69
|
+
// The truncation cap is exported together with the cleaner.
module.exports = {
  cleanHtml,
  MAX_MARKDOWN_LENGTH,
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web extraction (experimental, v3.53.0).
|
|
3
|
+
*
|
|
4
|
+
* Public surface for core/routes/web.js. Internal modules:
|
|
5
|
+
* - url-normalize.js URL → template + canonical URL
|
|
6
|
+
* - fingerprint.js DOM structural hash
|
|
7
|
+
* - playwright-runner.js headless fetch + selector replay
|
|
8
|
+
* - html-cleaner.js Readability + Turndown
|
|
9
|
+
* - recipe-generator.js Anthropic Haiku cold path
|
|
10
|
+
* - extractor.js orchestrator (hot/cold + self-heal)
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
const { extract } = require('./extractor')
|
|
14
|
+
const { normalizeUrl } = require('./url-normalize')
|
|
15
|
+
const { computeFingerprint } = require('./fingerprint')
|
|
16
|
+
|
|
17
|
+
// Public surface consumed by core/routes/web.js; the remaining modules
// (html-cleaner, playwright-runner, recipe-generator) stay internal.
module.exports = {
  extract,
  normalizeUrl,
  computeFingerprint,
}
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Headless browser fetch + selector-based extraction.
|
|
3
|
+
*
|
|
4
|
+
* `playwright` is an optionalDependency: if it's missing (e.g. ARM host
|
|
5
|
+
* where the chromium binary failed to install), require() throws and the
|
|
6
|
+
* route layer surfaces a 503 "browser unavailable" error instead of
|
|
7
|
+
* crashing the agent.
|
|
8
|
+
*
|
|
9
|
+
* Each call spins up a fresh chromium instance. Pooling can come later
|
|
10
|
+
* once the API stabilizes — for the experimental MVP, simple is better.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
// Navigation and in-page evaluation budgets, plus the default UA string.
const DEFAULT_NAV_TIMEOUT_MS = 20_000
const DEFAULT_EVAL_TIMEOUT_MS = 5_000
const DEFAULT_USER_AGENT =
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' +
  'Chrome/124.0.0.0 Safari/537.36 MinionWebExtract/0.1'

/**
 * Resolve the chromium launcher from the optional `playwright` dependency.
 * When the package cannot be required, throws an Error whose `code` is
 * 'PLAYWRIGHT_UNAVAILABLE' so the route layer can answer 503 instead of
 * crashing the agent.
 */
function loadChromium() {
  let playwright
  try {
    playwright = require('playwright')
  } catch (err) {
    const unavailable = new Error(
      'playwright is not installed. Run `npx playwright install chromium` ' +
      'on the minion host to enable POST /api/web/extract.'
    )
    unavailable.code = 'PLAYWRIGHT_UNAVAILABLE'
    throw unavailable
  }
  return playwright.chromium
}
|
|
33
|
+
|
|
34
|
+
/**
 * Launch a fresh headless chromium, hand a new page to `fn`, and always
 * close the browser afterwards — even when `fn` throws.
 * Only `opts.userAgent` is read here; other options belong to the callers.
 */
async function withPage(fn, opts = {}) {
  const chromium = loadChromium()
  const launchOptions = {
    headless: true,
    args: ['--no-sandbox', '--disable-dev-shm-usage'],
  }
  const browser = await chromium.launch(launchOptions)
  try {
    const contextOptions = {
      userAgent: opts.userAgent || DEFAULT_USER_AGENT,
      viewport: { width: 1280, height: 800 },
    }
    const browserContext = await browser.newContext(contextOptions)
    const page = await browserContext.newPage()
    return await fn(page)
  } finally {
    // Best-effort teardown: a close failure must not mask fn's result/error.
    await browser.close().catch(() => {})
  }
}
|
|
51
|
+
|
|
52
|
+
/**
 * Fetch `url` in headless chromium and return the rendered DOM.
 * @returns {Promise<{html: string, finalUrl: string, statusCode: number|null}>}
 *   `finalUrl` reflects any redirects; `statusCode` is null when navigation
 *   produced no response object.
 */
async function renderPage(url, opts = {}) {
  const navigationOptions = {
    waitUntil: 'domcontentloaded',
    timeout: opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS,
  }
  return withPage(async page => {
    const response = await page.goto(url, navigationOptions)
    return {
      html: await page.content(),
      finalUrl: page.url(),
      statusCode: response?.status() ?? null,
    }
  }, opts)
}
|
|
66
|
+
|
|
67
|
+
/**
 * Run a recipe against a freshly loaded page.
 *
 * `selectors` shape (each value is an object):
 *   {
 *     title: { selector: 'h1', attr: 'text' },
 *     body: { selector: 'article', attr: 'text' },
 *     items: { selector: '.list-item .title', attr: 'text', multiple: true },
 *     link: { selector: 'a.permalink', attr: 'href' }
 *   }
 *
 * `attr` defaults to 'text' (innerText). Special value 'html' returns
 * innerHTML. Any other string is read as an HTML attribute.
 *
 * @returns {Promise<Object>} one entry per field; null for fields whose
 *   selector is missing/invalid, matched nothing, or fell past the budget.
 */
async function extractWithSelectors(url, selectors, opts = {}) {
  return withPage(async page => {
    await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout: opts.timeoutMs ?? DEFAULT_NAV_TIMEOUT_MS,
    })
    // The callback below is serialized and executed inside the browser (see
    // Playwright page.evaluate): it cannot close over outer variables, so
    // everything it needs is passed in through the single argument object.
    return await page.evaluate(
      ({ selectorMap, evalTimeoutMs }) => {
        const start = Date.now()
        const result = {}
        for (const [field, spec] of Object.entries(selectorMap)) {
          // Soft time budget: once exceeded, remaining fields are nulled
          // rather than evaluated.
          if (Date.now() - start > evalTimeoutMs) {
            result[field] = null
            continue
          }
          if (!spec || !spec.selector) {
            result[field] = null
            continue
          }
          const attr = spec.attr || 'text'
          const readOne = (el) => {
            if (!el) return null
            if (attr === 'text') return (el.innerText || el.textContent || '').trim()
            if (attr === 'html') return el.innerHTML
            return el.getAttribute(attr)
          }
          try {
            if (spec.multiple) {
              const nodes = document.querySelectorAll(spec.selector)
              // Drop nulls and empty strings from multi-element reads.
              result[field] = Array.from(nodes).map(readOne).filter(v => v != null && v !== '')
            } else {
              const node = document.querySelector(spec.selector)
              result[field] = readOne(node)
            }
          } catch {
            // Invalid selector (querySelector throws) — treat as no match.
            result[field] = null
          }
        }
        return result
      },
      { selectorMap: selectors, evalTimeoutMs: opts.evalTimeoutMs ?? DEFAULT_EVAL_TIMEOUT_MS },
    )
  }, opts)
}
|
|
125
|
+
|
|
126
|
+
// renderPage fetches a page and returns its HTML; extractWithSelectors
// replays a recipe's selector map against a fresh load of the page.
module.exports = {
  renderPage,
  extractWithSelectors,
}
|