@geekbeer/minion 3.52.0 → 3.55.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +7 -0
- package/core/db/migrations/20260508000000_page_recipes.js +33 -0
- package/core/lib/dag-step-poller.js +245 -1
- package/core/lib/web-extract/extractor.js +142 -0
- package/core/lib/web-extract/fingerprint.js +63 -0
- package/core/lib/web-extract/html-cleaner.js +72 -0
- package/core/lib/web-extract/index.js +21 -0
- package/core/lib/web-extract/playwright-runner.js +129 -0
- package/core/lib/web-extract/recipe-generator.js +247 -0
- package/core/lib/web-extract/url-normalize.js +90 -0
- package/core/routes/web.js +94 -0
- package/core/stores/page-recipe-store.js +143 -0
- package/docs/api-reference.md +106 -297
- package/docs/task-guides.md +134 -75
- package/linux/routes/chat.js +37 -20
- package/linux/server.js +2 -0
- package/mac/server.js +2 -0
- package/package.json +6 -2
- package/rules/core.md +26 -9
- package/win/routes/chat.js +38 -16
- package/win/server.js +2 -0
package/.env.example
CHANGED
|
@@ -17,3 +17,10 @@ MINION_ID=
|
|
|
17
17
|
|
|
18
18
|
# Agent port (optional, default: 8080)
|
|
19
19
|
AGENT_PORT=8080
|
|
20
|
+
|
|
21
|
+
# Anthropic API key (optional, experimental, fallback only) —
|
|
22
|
+
# POST /api/web/extract prefers the primary LLM plugin (see PUT /api/llm/config)
|
|
23
|
+
# and only uses ANTHROPIC_API_KEY if no primary plugin is configured. Set via:
|
|
24
|
+
# curl -X PUT http://localhost:8080/api/secrets/ANTHROPIC_API_KEY \
|
|
25
|
+
# -H "Authorization: Bearer $API_TOKEN" -d '{"value": "sk-ant-..."}'
|
|
26
|
+
ANTHROPIC_API_KEY=
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* page_recipes — Web page extraction recipe cache (experimental, v3.53.0).
|
|
3
|
+
*
|
|
4
|
+
* Stores selectors learned from a first-time visit so subsequent visits to
|
|
5
|
+
* structurally similar pages skip the LLM round trip. Keyed by URL template
|
|
6
|
+
* (after normalization) + DOM fingerprint to tolerate A/B variants.
|
|
7
|
+
*
|
|
8
|
+
* Marked experimental: schema may change before the API stabilizes.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
module.exports = {
  // Numeric migration identifier; the digits read as a YYYYMMDDhhmmss timestamp.
  version: 20260508000000,
  name: 'page_recipes',

  /**
   * Create the page_recipes table and its url_template index.
   *
   * Returns without doing anything when the table already exists, so running
   * the migration again is harmless.
   *
   * @param {object} db - Database handle; only `db.exec(sql)` is used here.
   * @param {{ tableExists: Function }} helpers - `tableExists(db, name)` reports
   *   whether a table is already present.
   */
  up(db, { tableExists }) {
    if (tableExists(db, 'page_recipes')) return

    // One row per (url_template, dom_fingerprint) pair. Both statements are
    // sent in a single exec call.
    // NOTE(review): if the index statement failed after the table was created,
    // the guard above would skip the index on the next run — confirm whether
    // the migration runner wraps up() in a transaction.
    db.exec(`
      CREATE TABLE page_recipes (
        url_template TEXT NOT NULL,
        dom_fingerprint TEXT NOT NULL,
        selectors_json TEXT NOT NULL,
        page_type TEXT,
        hit_count INTEGER NOT NULL DEFAULT 0,
        fail_count INTEGER NOT NULL DEFAULT 0,
        last_verified_at TEXT,
        created_at TEXT NOT NULL DEFAULT (datetime('now')),
        PRIMARY KEY (url_template, dom_fingerprint)
      );
      CREATE INDEX idx_page_recipes_template ON page_recipes(url_template);
    `)
  },
}
|
|
@@ -119,12 +119,15 @@ async function pollOnce() {
|
|
|
119
119
|
|
|
120
120
|
/**
|
|
121
121
|
* Execute a single pending DAG node.
|
|
122
|
-
* Routes to skill
|
|
122
|
+
* Routes to skill / transform / script execution based on node_type.
|
|
123
123
|
*/
|
|
124
124
|
async function executeNode(node) {
  // Dispatch on node_type. Anything that is neither a transform nor a script
  // node is handled as a skill node.
  switch (node.node_type) {
    case 'transform':
      return executeTransformNode(node)
    case 'script':
      return executeScriptNode(node)
    default:
      return executeSkillNode(node)
  }
}
|
|
130
133
|
|
|
@@ -467,6 +470,247 @@ function appendContractTable(lines, fields) {
|
|
|
467
470
|
}
|
|
468
471
|
}
|
|
469
472
|
|
|
473
|
+
/**
 * Run a DAG node whose node_type is "script" and report the outcome.
 *
 * Sequence:
 *   1. Claim the node via POST /claim-node. A 409 response means the node is
 *      already claimed; it is skipped without reporting anything.
 *   2. Reject an empty script_source and map script_runtime to an interpreter
 *      ('python' → python3, 'node' → node).
 *   3. Run the script through runScriptProcess with input_data (or {}) and a
 *      timeout taken from clampTimeout(script_timeout_seconds).
 *   4. Parse stdout as JSON. Only a plain JSON object is accepted as
 *      output_data; arrays, null and primitives are rejected.
 *
 * Reported with status 'failed': empty source, unsupported runtime, timeout,
 * non-zero exit code, unparseable stdout, non-object output, and any error
 * thrown along the way (that last report is best-effort).
 *
 * @param {object} node - Pending node record; the fields read are the ones
 *   destructured at the top of the function.
 * @returns {Promise<void>}
 */
async function executeScriptNode(node) {
  const {
    node_execution_id,
    node_id,
    dag_workflow_name,
    scope_path,
    assigned_role,
    input_data,
    script_runtime,
    script_source,
    script_timeout_seconds,
  } = node

  // Every failure is reported the same way; only the message differs.
  const reportFailure = message =>
    reportNodeComplete(node_execution_id, 'failed', null, message)

  // A Map (rather than a plain object) so inherited keys such as
  // "constructor" can never resolve to an interpreter.
  const interpreters = new Map([
    ['python', 'python3'],
    ['node', 'node'],
  ])

  console.log(
    `[DagPoller] Executing script node "${node_id}" of DAG "${dag_workflow_name}" ` +
      `(runtime: ${script_runtime}, scope: "${scope_path || 'root'}", role: ${assigned_role})`
  )

  try {
    try {
      await dagRequest('/claim-node', {
        method: 'POST',
        body: JSON.stringify({ node_execution_id }),
      })
    } catch (claimErr) {
      if (claimErr.statusCode !== 409) throw claimErr
      console.log(`[DagPoller] Script node ${node_id} already claimed, skipping`)
      return
    }

    const hasSource = Boolean(script_source && script_source.trim())
    if (!hasSource) {
      await reportFailure('Script node has empty script_source')
      return
    }

    const interpreter = interpreters.get(script_runtime)
    if (!interpreter) {
      await reportFailure(`Unsupported script_runtime: ${script_runtime} (allowed: python, node)`)
      return
    }

    const timeoutMs = clampTimeout(script_timeout_seconds) * 1000
    const result = await runScriptProcess(interpreter, script_source, input_data || {}, timeoutMs)

    // The timeout check comes first: a killed process also carries a non-zero exit code.
    if (result.timedOut) {
      await reportFailure(
        `Script timed out after ${timeoutMs / 1000}s. stderr: ${truncate(result.stderr, 2000)}`
      )
      return
    }

    if (result.exitCode !== 0) {
      await reportFailure(
        `Script exited with code ${result.exitCode}. stderr: ${truncate(result.stderr, 2000)}`
      )
      return
    }

    let outputData
    try {
      outputData = JSON.parse(result.stdout)
    } catch (parseErr) {
      await reportFailure(
        `Script stdout is not valid JSON: ${parseErr.message}. stdout (first 500 chars): ${truncate(result.stdout, 500)}`
      )
      return
    }

    const outputIsArray = Array.isArray(outputData)
    const outputIsPlainObject =
      typeof outputData === 'object' && outputData !== null && !outputIsArray
    if (!outputIsPlainObject) {
      await reportFailure(
        `Script output_data must be a JSON object (got ${outputIsArray ? 'array' : typeof outputData})`
      )
      return
    }

    await reportNodeComplete(
      node_execution_id,
      'completed',
      outputData,
      buildScriptSummary(outputData, result.stderr)
    )
    console.log(`[DagPoller] Script node "${node_id}" completed`)
  } catch (err) {
    console.error(`[DagPoller] Failed to execute script node ${node_id}: ${err.message}`)
    try {
      await reportFailure(err.message)
    } catch {
      // Best-effort: the failure report itself could not be delivered.
    }
  }
}
|
|
606
|
+
|
|
607
|
+
/**
 * Compose the output_summary text for a script node that finished
 * successfully.
 *
 * The summary always contains the pretty-printed output_data inside a
 * fenced json block (capped at 8000 characters) and, when the script wrote
 * anything other than whitespace to stderr, a trailing "stderr:" section
 * (capped at 2000 characters).
 *
 * @param {object} outputData - Parsed JSON object the script printed.
 * @param {string} stderr - Raw stderr captured from the script.
 * @returns {string} Newline-joined summary text.
 */
function buildScriptSummary(outputData, stderr) {
  let json
  try {
    json = JSON.stringify(outputData, null, 2)
  } catch {
    json = '(failed to serialize output_data)'
  }

  const parts = ['output_data:', '```json', truncate(json, 8000), '```']

  const hasStderr = Boolean(stderr && stderr.trim())
  if (hasStderr) {
    parts.push('', 'stderr:', truncate(stderr, 2000))
  }

  return parts.join('\n')
}
|
|
633
|
+
|
|
634
|
+
/**
 * Normalize a requested script timeout to a whole number of seconds in the
 * range 1..600.
 *
 * Anything that is not a finite number (undefined, strings, NaN, ±Infinity)
 * falls back to 60 seconds; fractional values are rounded down.
 *
 * @param {*} seconds - Requested timeout in seconds.
 * @returns {number} Integer between 1 and 600 inclusive.
 */
function clampTimeout(seconds) {
  const isUsable = typeof seconds === 'number' && Number.isFinite(seconds)
  const requested = isUsable ? seconds : 60
  return Math.floor(Math.min(600, Math.max(1, requested)))
}
|
|
640
|
+
|
|
641
|
+
/**
 * Shorten a string to at most `max` UTF-16 code units, appending
 * '…(truncated)' when anything was cut off.
 *
 * The cut never lands between the two halves of a surrogate pair: if it
 * would, the whole character is dropped, so the result does not contain a
 * lone surrogate (which is not valid Unicode text).
 *
 * @param {string} str - Text to shorten; any falsy value yields ''.
 * @param {number} max - Maximum number of code units to keep.
 * @returns {string} The original string, or its prefix plus the marker.
 */
function truncate(str, max) {
  if (!str) return ''
  if (str.length <= max) return str

  let end = max
  const lastKept = str.charCodeAt(end - 1)
  const firstDropped = str.charCodeAt(end)
  const endsOnHighSurrogate = lastKept >= 0xd800 && lastKept <= 0xdbff
  const nextIsLowSurrogate = firstDropped >= 0xdc00 && firstDropped <= 0xdfff
  if (endsOnHighSurrogate && nextIsLowSurrogate) {
    // Step back one unit so the pair is removed as a whole.
    end -= 1
  }

  return str.slice(0, end) + '…(truncated)'
}
|
|
645
|
+
|
|
646
|
+
/**
 * Run a script in a child interpreter and collect its output.
 *
 * The script source is written to a temporary file whose path is passed as
 * the interpreter's only argument, which leaves stdin free for the input:
 * `inputData` is serialized as JSON and written to the child's stdin, which
 * is then closed. The temp file is removed once the run settles.
 *
 * The returned promise never rejects. It resolves with:
 *   - exitCode: the child's exit code, or -1 when the process was killed by
 *     the timeout, ended without an exit code, or failed with a process
 *     'error' event (for example, the interpreter could not be spawned)
 *   - stdout / stderr: everything the child wrote, decoded as UTF-8
 *   - timedOut: true when the wall-clock timeout fired (the child is then
 *     sent SIGKILL)
 *
 * Writing the temp file is synchronous, so a failure there is thrown to the
 * caller rather than delivered through the promise.
 *
 * @param {string} interpreter - Executable to spawn ('python3' selects a
 *   .py temp file, anything else a .js one).
 * @param {string} source - Script source code.
 * @param {*} inputData - JSON-serializable value sent on stdin.
 * @param {number} timeoutMs - Wall-clock limit in milliseconds.
 * @returns {Promise<{exitCode: number, stdout: string, stderr: string, timedOut: boolean}>}
 */
function runScriptProcess(interpreter, source, inputData, timeoutMs) {
  const { spawn } = require('child_process')
  const fs = require('fs')
  const os = require('os')
  const path = require('path')
  const crypto = require('crypto')

  const ext = interpreter === 'python3' ? 'py' : 'js'
  const tmpFile = path.join(
    os.tmpdir(),
    `dag-script-${crypto.randomBytes(8).toString('hex')}.${ext}`,
  )
  // The temp directory is shared, so keep the script private to this user
  // (0o600) and refuse to write through a pre-existing path ('wx').
  fs.writeFileSync(tmpFile, source, { encoding: 'utf-8', mode: 0o600, flag: 'wx' })

  return new Promise(resolve => {
    const child = spawn(interpreter, [tmpFile], {
      stdio: ['pipe', 'pipe', 'pipe'],
    })

    let stdout = ''
    let stderr = ''
    let timedOut = false
    let settled = false

    const timer = setTimeout(() => {
      timedOut = true
      try { child.kill('SIGKILL') } catch { /* noop */ }
    }, timeoutMs)

    // 'error' and 'close' can both fire for one child; settle exactly once.
    const finish = exitCode => {
      if (settled) return
      settled = true
      clearTimeout(timer)
      cleanup(tmpFile)
      resolve({ exitCode, stdout, stderr, timedOut })
    }

    // Decode at the stream level so a multi-byte character split across two
    // chunks is reassembled instead of being turned into replacement chars.
    child.stdout.setEncoding('utf8')
    child.stderr.setEncoding('utf8')
    child.stdout.on('data', chunk => { stdout += chunk })
    child.stderr.on('data', chunk => { stderr += chunk })

    // Without a listener, a write to a child that already exited (or never
    // reads stdin) raises an unhandled stream error. A script that simply
    // ignores its input (EPIPE) is not an error worth reporting.
    child.stdin.on('error', err => {
      if (err.code !== 'EPIPE') {
        stderr += `\nstdin write error: ${err.message}`
      }
    })

    child.on('error', err => {
      stderr += `\nspawn error: ${err.message}`
      finish(-1)
    })

    child.on('close', code => {
      finish(timedOut ? -1 : (code ?? -1))
    })

    let payload = ''
    try {
      payload = JSON.stringify(inputData) ?? ''
    } catch (writeErr) {
      stderr += `\nstdin write error: ${writeErr.message}`
    }
    try {
      // Always close stdin, even when serialization failed, so a script that
      // reads until EOF does not sit waiting until the timeout.
      child.stdin.end(payload)
    } catch (writeErr) {
      stderr += `\nstdin write error: ${writeErr.message}`
    }
  })
}
|
|
709
|
+
|
|
710
|
+
/**
 * Delete a temporary script file, ignoring every failure (for example when
 * the file has already been removed).
 *
 * @param {string} tmpFile - Path of the file to delete.
 */
function cleanup(tmpFile) {
  try {
    const { unlinkSync } = require('fs')
    unlinkSync(tmpFile)
  } catch {
    // Best-effort removal: nothing useful can be done if it fails.
  }
}
|
|
713
|
+
|
|
470
714
|
/**
|
|
471
715
|
* Resolve skill_version_id to skill name via HQ API.
|
|
472
716
|
*/
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web extraction orchestrator (experimental — v3.53.0).
|
|
3
|
+
*
|
|
4
|
+
* Cold path: Playwright fetch -> Readability/Turndown clean -> Anthropic
|
|
5
|
+
* Haiku selects fields -> store recipe -> verify by replaying
|
|
6
|
+
* selectors against the same page.
|
|
7
|
+
*
|
|
8
|
+
* Hot path: Playwright fetch -> fingerprint -> recipe lookup -> selector
|
|
9
|
+
* replay. No LLM call.
|
|
10
|
+
*
|
|
11
|
+
* Self-heal: hot replays that come back empty bump fail_count; the recipe
|
|
12
|
+
* is dropped after MAX_FAIL_COUNT and the next request retries
|
|
13
|
+
* cold. A single in-request fall-through from hot -> cold is
|
|
14
|
+
* allowed so callers don't see transient breakage.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
const { normalizeUrl } = require('./url-normalize')
|
|
18
|
+
const { computeFingerprint } = require('./fingerprint')
|
|
19
|
+
const { renderPage, extractWithSelectors } = require('./playwright-runner')
|
|
20
|
+
const { cleanHtml } = require('./html-cleaner')
|
|
21
|
+
const { generateRecipe } = require('./recipe-generator')
|
|
22
|
+
const pageRecipeStore = require('../../stores/page-recipe-store')
|
|
23
|
+
|
|
24
|
+
/**
 * Decide whether an extraction result carries no usable data.
 *
 * A result is empty when it is not an object, has no values, or when every
 * value is null/undefined, a whitespace-only string, or an empty array.
 * Any other value (numbers, booleans, nested objects) counts as data.
 *
 * @param {*} data - Field-name → value map produced by selector replay.
 * @returns {boolean} true when nothing meaningful was extracted.
 */
function isEmptyResult(data) {
  if (!data || typeof data !== 'object') return true

  const isBlank = value => {
    if (value == null) return true
    if (typeof value === 'string') return value.trim() === ''
    return Array.isArray(value) && value.length === 0
  }

  // With no values at all the loop never runs and the result is empty.
  for (const value of Object.values(data)) {
    if (!isBlank(value)) return false
  }
  return true
}
|
|
35
|
+
|
|
36
|
+
/**
 * Extract structured content from a web page, using a cached selector
 * recipe when one matches and generating a new one otherwise.
 *
 * The page is rendered once up-front so its DOM fingerprint can be computed.
 * If a recipe exists for (URL template, fingerprint) its selectors are
 * replayed ("hot"); a non-empty replay is returned immediately. An empty
 * replay bumps the recipe's fail counter and falls through to the "cold"
 * path, where a recipe is generated from the cleaned page, replayed once to
 * verify it, and stored only when that verification yields data.
 *
 * @param {{ url: string, hint: * }} params - Page URL and a hint that is
 *   forwarded unchanged to generateRecipe.
 * @returns {Promise<object>} Response object built by shape().
 */
async function extract({ url, hint }) {
  const { template, canonicalUrl } = normalizeUrl(url)

  const rendered = await renderPage(canonicalUrl)
  const fingerprint = computeFingerprint(rendered.html)

  // A fresh key object for every store call.
  const recipeKey = () => ({ urlTemplate: template, domFingerprint: fingerprint })

  // Fields shared by the hot and cold responses.
  const respond = fields =>
    shape({
      url: canonicalUrl,
      finalUrl: rendered.finalUrl,
      statusCode: rendered.statusCode,
      urlTemplate: template,
      fingerprint,
      ...fields,
    })

  // Hot path: replay the cached selectors against the page.
  const cached = pageRecipeStore.find(recipeKey())
  if (cached) {
    const hotData = await extractWithSelectors(canonicalUrl, cached.selectors)
    if (isEmptyResult(hotData)) {
      // The replay came back empty: penalize the recipe and continue cold.
      pageRecipeStore.incrementFail(recipeKey())
    } else {
      pageRecipeStore.incrementHit(recipeKey())
      pageRecipeStore.setLastVerified(recipeKey())
      return respond({
        recipeMode: 'hot',
        pageType: cached.page_type,
        selectors: cached.selectors,
        data: hotData,
        cleaned: null,
      })
    }
  }

  // Cold path: build a recipe from the cleaned page content.
  const cleaned = cleanHtml(rendered.html, canonicalUrl)
  const recipe = await generateRecipe({
    url: canonicalUrl,
    cleanedMarkdown: cleaned.contentMarkdown,
    hint,
  })

  // Only persist a recipe whose selectors produce data on this very page.
  const verifyData = await extractWithSelectors(canonicalUrl, recipe.selectors)
  const verified = !isEmptyResult(verifyData)
  if (verified) {
    pageRecipeStore.upsert({
      ...recipeKey(),
      selectors: recipe.selectors,
      pageType: recipe.pageType,
    })
    pageRecipeStore.incrementHit(recipeKey())
  }

  return respond({
    recipeMode: 'cold',
    pageType: recipe.pageType,
    selectors: recipe.selectors,
    // Fall back to the generator's own extraction when the replay was empty.
    data: verified ? verifyData : recipe.extracted,
    cleaned,
    recipePersisted: verified,
  })
}
|
|
107
|
+
|
|
108
|
+
/**
 * Build the response object returned by extract().
 *
 * `title` and `content` are taken from the first matching field of `data`,
 * then from the cleaned page, and are null when neither provides a value.
 * A `warning` is attached only for a cold run whose recipe was explicitly
 * not persisted (recipePersisted === false).
 *
 * @param {object} fields - Named pieces of the response; see the parameter list.
 * @returns {object} Response object flagged `experimental: true`.
 */
function shape({ url, finalUrl, statusCode, recipeMode, urlTemplate, fingerprint, pageType, selectors, data, cleaned, recipePersisted }) {
  const title =
    pickField(data, ['title', 'headline', 'name']) || cleaned?.title || null
  const content =
    pickField(data, ['body', 'content', 'article', 'description']) ||
    cleaned?.contentMarkdown ||
    null
  const verificationFailed = recipeMode === 'cold' && recipePersisted === false

  return {
    experimental: true,
    url,
    finalUrl,
    statusCode,
    recipeMode,
    recipeId: `${urlTemplate}#${fingerprint}`,
    pageType: pageType || null,
    title,
    content,
    structured: data || {},
    selectors: selectors || {},
    ...(verificationFailed
      ? { warning: 'Recipe verification failed (selectors returned empty). Result reflects LLM extraction; recipe was not persisted.' }
      : {}),
  }
}
|
|
127
|
+
|
|
128
|
+
/**
 * Return the first usable value among several candidate keys of an object.
 *
 * A non-blank string is returned as-is (not trimmed); a non-empty array is
 * returned with its items joined by a blank line. Values of any other kind
 * are skipped.
 *
 * @param {*} obj - Object to read from; non-objects yield null.
 * @param {string[]} candidates - Keys to try, in priority order.
 * @returns {string|null} The chosen text, or null when no key qualifies.
 */
function pickField(obj, candidates) {
  if (!obj || typeof obj !== 'object') return null

  for (const key of candidates) {
    const value = obj[key]

    const isText = typeof value === 'string' && value.trim() !== ''
    if (isText) return value

    const isList = Array.isArray(value) && value.length > 0
    if (isList) return value.join('\n\n')
  }

  return null
}
|
|
138
|
+
|
|
139
|
+
module.exports = {
|
|
140
|
+
extract,
|
|
141
|
+
isEmptyResult,
|
|
142
|
+
}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lightweight DOM structure fingerprint.
|
|
3
|
+
*
|
|
4
|
+
* Two pages with the same template URL but materially different layouts
|
|
5
|
+
* (A/B test, logged-in vs logged-out, mobile vs desktop served) need to
|
|
6
|
+
* use different recipes. We hash a minimal structural signature instead
|
|
7
|
+
* of the full HTML so the fingerprint stays stable against trivial copy
|
|
8
|
+
* changes but flips when block-level structure shifts.
|
|
9
|
+
*
|
|
10
|
+
* Signature inputs:
|
|
11
|
+
* - Order of structural landmark tags (header/nav/main/article/...)
|
|
12
|
+
* - Top 5 most frequent class names on <div> elements
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
const crypto = require('crypto')
|
|
16
|
+
|
|
17
|
+
const LANDMARK_TAGS = ['header', 'nav', 'main', 'article', 'section', 'aside', 'footer', 'form']
|
|
18
|
+
|
|
19
|
+
/**
 * Compute a short structural fingerprint of an HTML document.
 *
 * The fingerprint is the first 12 hex characters of the SHA-1 of a signature
 * made of (a) the landmark tags present, in order of first appearance, and
 * (b) the five class names used most often on <div> elements, ties broken
 * alphabetically.
 *
 * Special results:
 *   - 'empty' for a non-string or zero-length input
 *   - 'fallback-<hash>' when linkedom cannot be loaded or fails to parse;
 *     the hash then covers the first 4096 characters of the raw HTML
 *
 * @param {*} html - Rendered page HTML.
 * @returns {string} Fingerprint string.
 */
function computeFingerprint(html) {
  const shortSha1 = text =>
    crypto.createHash('sha1').update(text).digest('hex').slice(0, 12)

  if (typeof html !== 'string' || html.length === 0) return 'empty'

  let document
  try {
    document = require('linkedom').parseHTML(html).document
  } catch (err) {
    // No DOM available: fall back to hashing a prefix of the raw markup.
    return 'fallback-' + shortSha1(html.slice(0, 4096))
  }

  // A Set keeps insertion order, so it records each landmark tag once, at
  // the position of its first occurrence in the document.
  const landmarkOrder = new Set()
  for (const el of document.querySelectorAll(LANDMARK_TAGS.join(','))) {
    landmarkOrder.add(el.tagName.toLowerCase())
  }

  // Count how often each class name appears on a <div>.
  const classCounts = new Map()
  for (const div of document.querySelectorAll('div[class]')) {
    const classNames = (div.getAttribute('class') || '').split(/\s+/).filter(Boolean)
    for (const className of classNames) {
      classCounts.set(className, (classCounts.get(className) || 0) + 1)
    }
  }

  // Most frequent first; equal counts are ordered by name so the result is stable.
  const byCountThenName = ([nameA, countA], [nameB, countB]) =>
    countB - countA || (nameA < nameB ? -1 : 1)
  const topClasses = [...classCounts]
    .sort(byCountThenName)
    .slice(0, 5)
    .map(([className]) => className)

  return shortSha1(`tags:${[...landmarkOrder].join(',')}|cls:${topClasses.join(',')}`)
}
|
|
62
|
+
|
|
63
|
+
module.exports = { computeFingerprint }
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML → cleaned content (Readability) → Markdown (Turndown).
|
|
3
|
+
*
|
|
4
|
+
* The cleaned Markdown is the *only* page representation handed to the
|
|
5
|
+
* recipe-generation LLM. Keeping the input small and structured is what
|
|
6
|
+
* makes this experiment cheap enough to be worth running on every
|
|
7
|
+
* cold-cache miss.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
const MAX_MARKDOWN_LENGTH = 50_000
|
|
11
|
+
|
|
12
|
+
/**
 * Reduce a page's HTML to its main content and convert that to Markdown.
 *
 * Steps: parse with linkedom, extract the article with Readability, convert
 * the resulting HTML with Turndown, and cap the Markdown at
 * MAX_MARKDOWN_LENGTH characters (a truncation marker is appended after the
 * cap). When Readability yields nothing, the whole <body> is used instead.
 * Every step is fail-soft: if parsing is impossible an all-empty result is
 * returned, and a failing Readability or Turndown step degrades to the
 * fallback content or an empty Markdown string.
 *
 * @param {string} html - Raw page HTML.
 * @param {string} url - Page URL. Accepted for the callers' benefit; it is
 *   not used by this function.
 * @returns {{title: (string|null), contentHtml: string, contentMarkdown: string,
 *   byline: (string|null), excerpt: (string|null), length: number}}
 */
function cleanHtml(html, url) {
  let parsedDocument
  try {
    const { parseHTML } = require('linkedom')
    parsedDocument = parseHTML(html).document
  } catch (err) {
    return {
      title: null,
      contentHtml: '',
      contentMarkdown: '',
      byline: null,
      excerpt: null,
      length: 0,
    }
  }

  // Readability.parse() rewrites the document it is given (it strips and
  // moves nodes). Snapshot the fallback values first so that, if no article
  // is found, the fallback reflects the page as it was parsed rather than
  // whatever Readability left behind.
  const fallbackHtml = parsedDocument.body?.innerHTML || ''
  const fallbackTitle = parsedDocument.title || null

  let article = null
  try {
    const { Readability } = require('@mozilla/readability')
    article = new Readability(parsedDocument).parse()
  } catch {
    article = null
  }

  const contentHtml = (article && article.content) || fallbackHtml

  let contentMarkdown = ''
  try {
    const TurndownService = require('turndown')
    const td = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced',
      bulletListMarker: '-',
    })
    // Non-content elements would otherwise leak their text into the Markdown.
    td.remove(['script', 'style', 'noscript', 'iframe'])
    contentMarkdown = td.turndown(contentHtml)
  } catch {
    contentMarkdown = ''
  }

  if (contentMarkdown.length > MAX_MARKDOWN_LENGTH) {
    contentMarkdown = contentMarkdown.slice(0, MAX_MARKDOWN_LENGTH) + '\n\n[... truncated ...]'
  }

  return {
    title: article?.title || fallbackTitle,
    contentHtml,
    contentMarkdown,
    byline: article?.byline || null,
    excerpt: article?.excerpt || null,
    length: article?.length || contentMarkdown.length,
  }
}
|
|
68
|
+
|
|
69
|
+
module.exports = {
|
|
70
|
+
cleanHtml,
|
|
71
|
+
MAX_MARKDOWN_LENGTH,
|
|
72
|
+
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web extraction (experimental, v3.53.0).
|
|
3
|
+
*
|
|
4
|
+
* Public surface for core/routes/web.js. Internal modules:
|
|
5
|
+
* - url-normalize.js URL → template + canonical URL
|
|
6
|
+
* - fingerprint.js DOM structural hash
|
|
7
|
+
* - playwright-runner.js headless fetch + selector replay
|
|
8
|
+
* - html-cleaner.js Readability + Turndown
|
|
9
|
+
* - recipe-generator.js Anthropic Haiku cold path
|
|
10
|
+
* - extractor.js orchestrator (hot/cold + self-heal)
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
const { extract } = require('./extractor')
|
|
14
|
+
const { normalizeUrl } = require('./url-normalize')
|
|
15
|
+
const { computeFingerprint } = require('./fingerprint')
|
|
16
|
+
|
|
17
|
+
module.exports = {
|
|
18
|
+
extract,
|
|
19
|
+
normalizeUrl,
|
|
20
|
+
computeFingerprint,
|
|
21
|
+
}
|