@geekbeer/minion 3.52.0 → 3.55.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.env.example CHANGED
@@ -17,3 +17,10 @@ MINION_ID=
17
17
 
18
18
  # Agent port (optional, default: 8080)
19
19
  AGENT_PORT=8080
20
+
21
+ # Anthropic API key (optional, experimental, fallback only) —
22
+ # POST /api/web/extract prefers the primary LLM plugin (see PUT /api/llm/config)
23
+ # and only uses ANTHROPIC_API_KEY if no primary plugin is configured. Set via:
24
+ # curl -X PUT http://localhost:8080/api/secrets/ANTHROPIC_API_KEY \
25
+ # -H "Authorization: Bearer $API_TOKEN" -d '{"value": "sk-ant-..."}'
26
+ ANTHROPIC_API_KEY=
@@ -0,0 +1,33 @@
1
/**
 * page_recipes — Web page extraction recipe cache (experimental, v3.53.0).
 *
 * Stores selectors learned from a first-time visit so subsequent visits to
 * structurally similar pages skip the LLM round trip. Keyed by URL template
 * (after normalization) + DOM fingerprint to tolerate A/B variants.
 *
 * Marked experimental: schema may change before the API stabilizes.
 */

module.exports = {
  // Migration ordering key (timestamp-style: 2026-05-08).
  version: 20260508000000,
  name: 'page_recipes',

  // Creates the recipe cache table. Idempotent: a previous (possibly
  // interrupted) run may already have created it, so we check first
  // rather than relying on CREATE TABLE IF NOT EXISTS.
  up(db, { tableExists }) {
    if (tableExists(db, 'page_recipes')) return

    // hit_count / fail_count drive the self-heal logic in the extractor:
    // repeated empty replays bump fail_count until the recipe is dropped.
    // NOTE(review): idx_page_recipes_template duplicates the leftmost
    // prefix of the composite PRIMARY KEY (SQLite's PK index already
    // serves url_template-only lookups) — likely redundant, kept as-is
    // to preserve the published schema.
    db.exec(`
      CREATE TABLE page_recipes (
        url_template TEXT NOT NULL,
        dom_fingerprint TEXT NOT NULL,
        selectors_json TEXT NOT NULL,
        page_type TEXT,
        hit_count INTEGER NOT NULL DEFAULT 0,
        fail_count INTEGER NOT NULL DEFAULT 0,
        last_verified_at TEXT,
        created_at TEXT NOT NULL DEFAULT (datetime('now')),
        PRIMARY KEY (url_template, dom_fingerprint)
      );
      CREATE INDEX idx_page_recipes_template ON page_recipes(url_template);
    `)
  },
}
@@ -119,12 +119,15 @@ async function pollOnce() {
119
119
 
120
120
/**
 * Execute a single pending DAG node.
 * Routes to skill / transform / script execution based on node_type.
 */
async function executeNode(node) {
  switch (node.node_type) {
    case 'transform':
      return executeTransformNode(node)
    case 'script':
      return executeScriptNode(node)
    default:
      // Anything else is treated as a skill node (the historical default).
      return executeSkillNode(node)
  }
}
130
133
 
@@ -467,6 +470,247 @@ function appendContractTable(lines, fields) {
467
470
  }
468
471
  }
469
472
 
473
/**
 * Execute a script node:
 * 1. Claim the node
 * 2. Spawn python3 / node as a child process
 * 3. Pipe input_data as JSON on stdin
 * 4. Parse stdout as JSON → output_data, report completion
 *
 * Failure modes (all → status 'failed'):
 * - Unsupported runtime
 * - Non-zero exit code (stderr → error_message)
 * - stdout is not parseable as JSON
 * - Wall-clock timeout exceeded (process is SIGKILL'd)
 *
 * No tmux session is involved — this is a synchronous in-process child.
 * The Output Contract on the outgoing edge is enforced by HQ on
 * /node-complete, identical to skill / transform nodes.
 *
 * @param {object} node pending node row from the HQ poll response; reads
 *   node_execution_id, node_id, dag_workflow_name, scope_path,
 *   assigned_role, input_data, script_runtime, script_source and
 *   script_timeout_seconds.
 */
async function executeScriptNode(node) {
  const {
    node_execution_id,
    node_id,
    dag_workflow_name,
    scope_path,
    assigned_role,
    input_data,
    script_runtime,
    script_source,
    script_timeout_seconds,
  } = node

  console.log(
    `[DagPoller] Executing script node "${node_id}" of DAG "${dag_workflow_name}" ` +
    `(runtime: ${script_runtime}, scope: "${scope_path || 'root'}", role: ${assigned_role})`
  )

  try {
    // Claim before doing any work so two concurrent pollers never run the
    // same node. A 409 means another poller won the race — not an error.
    try {
      await dagRequest('/claim-node', {
        method: 'POST',
        body: JSON.stringify({ node_execution_id }),
      })
    } catch (claimErr) {
      if (claimErr.statusCode === 409) {
        console.log(`[DagPoller] Script node ${node_id} already claimed, skipping`)
        return
      }
      throw claimErr
    }

    // From here on the node is claimed, so every early-exit path must
    // report a terminal status or HQ would see the node as stuck.
    if (!script_source || !script_source.trim()) {
      await reportNodeComplete(node_execution_id, 'failed', null, 'Script node has empty script_source')
      return
    }

    // Map the declared runtime to a concrete interpreter binary.
    let interpreter
    if (script_runtime === 'python') {
      interpreter = 'python3'
    } else if (script_runtime === 'node') {
      interpreter = 'node'
    } else {
      await reportNodeComplete(
        node_execution_id,
        'failed',
        null,
        `Unsupported script_runtime: ${script_runtime} (allowed: python, node)`,
      )
      return
    }

    // clampTimeout bounds the value to [1, 600] seconds (default 60).
    const timeoutMs = clampTimeout(script_timeout_seconds) * 1000

    const result = await runScriptProcess(interpreter, script_source, input_data || {}, timeoutMs)

    // Check timeout before exit code: a SIGKILL'd child also exits non-zero
    // and we want the more specific message to win.
    if (result.timedOut) {
      await reportNodeComplete(
        node_execution_id,
        'failed',
        null,
        `Script timed out after ${timeoutMs / 1000}s. stderr: ${truncate(result.stderr, 2000)}`,
      )
      return
    }

    if (result.exitCode !== 0) {
      await reportNodeComplete(
        node_execution_id,
        'failed',
        null,
        `Script exited with code ${result.exitCode}. stderr: ${truncate(result.stderr, 2000)}`,
      )
      return
    }

    // The script's entire stdout is the result payload; anything the script
    // prints for humans must go to stderr or the parse below fails.
    let outputData
    try {
      outputData = JSON.parse(result.stdout)
    } catch (parseErr) {
      await reportNodeComplete(
        node_execution_id,
        'failed',
        null,
        `Script stdout is not valid JSON: ${parseErr.message}. stdout (first 500 chars): ${truncate(result.stdout, 500)}`,
      )
      return
    }

    // output_data must be a plain JSON object (not a scalar or array) so
    // downstream edge contracts can address named fields.
    if (typeof outputData !== 'object' || outputData === null || Array.isArray(outputData)) {
      await reportNodeComplete(
        node_execution_id,
        'failed',
        null,
        `Script output_data must be a JSON object (got ${Array.isArray(outputData) ? 'array' : typeof outputData})`,
      )
      return
    }

    await reportNodeComplete(
      node_execution_id,
      'completed',
      outputData,
      buildScriptSummary(outputData, result.stderr),
    )
    console.log(`[DagPoller] Script node "${node_id}" completed`)

  } catch (err) {
    console.error(`[DagPoller] Failed to execute script node ${node_id}: ${err.message}`)
    // Best-effort terminal report; if this also fails the node will sit
    // claimed until HQ-side recovery handles it.
    try {
      await reportNodeComplete(node_execution_id, 'failed', null, err.message)
    } catch {
      // best-effort
    }
  }
}
606
+
607
/**
 * Build the output_summary text shown in the HQ execution detail UI for a
 * successful script node. The UI renders output_summary as a collapsible
 * code block under "Output"; output_data itself is only visible from the
 * Pin-fixture dialog. Always emit at least the JSON output so users can
 * inspect what the script returned without round-tripping through fixtures.
 */
function buildScriptSummary(outputData, stderr) {
  let serialized
  try {
    serialized = JSON.stringify(outputData, null, 2)
  } catch {
    // e.g. circular references — fall back to a placeholder.
    serialized = '(failed to serialize output_data)'
  }

  const parts = [
    'output_data:',
    '```json',
    truncate(serialized, 8000),
    '```',
  ]

  if (stderr && stderr.trim()) {
    parts.push('', 'stderr:', truncate(stderr, 2000))
  }

  return parts.join('\n')
}
633
+
634
/**
 * Clamp a requested script timeout (seconds) to the allowed [1, 600]
 * window, flooring fractional values. Non-numeric or non-finite input
 * falls back to the 60-second default.
 */
function clampTimeout(seconds) {
  const requested =
    typeof seconds === 'number' && Number.isFinite(seconds) ? seconds : 60
  return Math.min(600, Math.max(1, Math.floor(requested)))
}
640
+
641
/**
 * Cap a string at `max` characters, appending an ellipsis marker when cut.
 * Falsy input (null / undefined / '') yields the empty string.
 */
function truncate(str, max) {
  if (!str) return ''
  if (str.length <= max) return str
  return `${str.slice(0, max)}…(truncated)`
}
645
+
646
/**
 * Run `source` under `interpreter` as a child process.
 *
 * The script source is written to a temp file passed as argv[1] — this works
 * uniformly for python3 and node — which keeps stdin free to carry
 * input_data as a JSON payload on both runtimes.
 *
 * Always resolves (never rejects) with { exitCode, stdout, stderr, timedOut }:
 * - timedOut: wall clock exceeded timeoutMs; the child was SIGKILL'd
 * - exitCode -1: killed, spawn failure, or exit code unavailable
 *
 * @param {string} interpreter 'python3' or 'node' (validated by caller)
 * @param {string} source      script body to execute
 * @param {object} inputData   JSON-serializable payload piped to stdin
 * @param {number} timeoutMs   wall-clock limit in milliseconds
 */
function runScriptProcess(interpreter, source, inputData, timeoutMs) {
  const { spawn } = require('child_process')
  const fs = require('fs')
  const os = require('os')
  const path = require('path')
  const crypto = require('crypto')

  const ext = interpreter === 'python3' ? 'py' : 'js'
  const tmpFile = path.join(
    os.tmpdir(),
    `dag-script-${crypto.randomBytes(8).toString('hex')}.${ext}`,
  )
  fs.writeFileSync(tmpFile, source, 'utf-8')

  return new Promise(resolve => {
    const child = spawn(interpreter, [tmpFile], {
      stdio: ['pipe', 'pipe', 'pipe'],
    })

    let stdout = ''
    let stderr = ''
    let timedOut = false
    let killed = false
    let settled = false

    // Resolve exactly once: on spawn failure both 'error' and 'close' may
    // fire, and the first outcome should win.
    const finish = result => {
      if (settled) return
      settled = true
      clearTimeout(timer)
      cleanup(tmpFile)
      resolve(result)
    }

    const timer = setTimeout(() => {
      timedOut = true
      killed = true
      try { child.kill('SIGKILL') } catch { /* noop */ }
    }, timeoutMs)

    child.stdout.on('data', chunk => { stdout += chunk.toString('utf-8') })
    child.stderr.on('data', chunk => { stderr += chunk.toString('utf-8') })

    // Without this handler, a write failure on stdin (e.g. EPIPE when the
    // child exits before consuming its input) is emitted as an async
    // 'error' event that the try/catch below cannot see — and an unhandled
    // stream 'error' would crash the whole agent process.
    child.stdin.on('error', err => {
      stderr += `\nstdin write error: ${err.message}`
    })

    child.on('error', err => {
      // Spawn-level failure (e.g. interpreter binary not found).
      finish({ exitCode: -1, stdout, stderr: stderr + `\nspawn error: ${err.message}`, timedOut })
    })

    child.on('close', code => {
      finish({ exitCode: killed ? -1 : (code ?? -1), stdout, stderr, timedOut })
    })

    try {
      child.stdin.write(JSON.stringify(inputData))
      child.stdin.end()
    } catch (writeErr) {
      // Synchronous write failures (async ones surface via 'error' above).
      stderr += `\nstdin write error: ${writeErr.message}`
    }
  })
}
709
+
710
/**
 * Best-effort removal of the temp script file; all errors (already
 * deleted, permissions, missing fs) are deliberately swallowed.
 */
function cleanup(tmpFile) {
  try {
    require('fs').unlinkSync(tmpFile)
  } catch {
    /* noop — nothing useful to do if the file is already gone */
  }
}
713
+
470
714
  /**
471
715
  * Resolve skill_version_id to skill name via HQ API.
472
716
  */
@@ -0,0 +1,142 @@
1
+ /**
2
+ * Web extraction orchestrator (experimental — v3.53.0).
3
+ *
4
+ * Cold path: Playwright fetch -> Readability/Turndown clean -> Anthropic
5
+ * Haiku selects fields -> store recipe -> verify by replaying
6
+ * selectors against the same page.
7
+ *
8
+ * Hot path: Playwright fetch -> fingerprint -> recipe lookup -> selector
9
+ * replay. No LLM call.
10
+ *
11
+ * Self-heal: hot replays that come back empty bump fail_count; the recipe
12
+ * is dropped after MAX_FAIL_COUNT and the next request retries
13
+ * cold. A single in-request fall-through from hot -> cold is
14
+ * allowed so callers don't see transient breakage.
15
+ */
16
+
17
+ const { normalizeUrl } = require('./url-normalize')
18
+ const { computeFingerprint } = require('./fingerprint')
19
+ const { renderPage, extractWithSelectors } = require('./playwright-runner')
20
+ const { cleanHtml } = require('./html-cleaner')
21
+ const { generateRecipe } = require('./recipe-generator')
22
+ const pageRecipeStore = require('../../stores/page-recipe-store')
23
+
24
/**
 * True when an extraction result carries no usable content: not an object,
 * no keys, or every value is null/undefined, a blank string, or an empty
 * array. Numbers, booleans and nested objects count as content.
 */
function isEmptyResult(data) {
  if (data == null || typeof data !== 'object') return true

  const values = Object.values(data)
  if (values.length === 0) return true

  const hasContent = values.some(value => {
    if (value == null) return false
    if (typeof value === 'string') return value.trim().length > 0
    if (Array.isArray(value)) return value.length > 0
    return true
  })
  return !hasContent
}
35
+
36
/**
 * Extract structured content from a web page, using a cached selector
 * recipe when one matches (hot path) and falling back to LLM-driven
 * recipe generation (cold path) otherwise.
 *
 * @param {object} opts
 * @param {string} opts.url  page URL; normalized before use
 * @param {*}      opts.hint forwarded to generateRecipe on the cold path
 *                           (semantics defined in recipe-generator.js)
 * @returns result object built by shape() — see that function for fields.
 */
async function extract({ url, hint }) {
  const { template, canonicalUrl } = normalizeUrl(url)

  // Always render once up-front so we can compute the fingerprint regardless
  // of cache state. Cold path reuses the HTML; hot path discards it.
  const rendered = await renderPage(canonicalUrl)
  const fingerprint = computeFingerprint(rendered.html)

  const cached = pageRecipeStore.find({
    urlTemplate: template,
    domFingerprint: fingerprint,
  })

  if (cached) {
    // Hot path: replay the cached selectors; no LLM involved.
    // NOTE(review): extractWithSelectors takes the URL, so this re-renders
    // the page rather than reusing rendered.html — confirm intended.
    const data = await extractWithSelectors(canonicalUrl, cached.selectors)
    if (!isEmptyResult(data)) {
      pageRecipeStore.incrementHit({ urlTemplate: template, domFingerprint: fingerprint })
      pageRecipeStore.setLastVerified({ urlTemplate: template, domFingerprint: fingerprint })
      return shape({
        url: canonicalUrl,
        finalUrl: rendered.finalUrl,
        statusCode: rendered.statusCode,
        recipeMode: 'hot',
        urlTemplate: template,
        fingerprint,
        pageType: cached.page_type,
        selectors: cached.selectors,
        data,
        cleaned: null,
      })
    }
    // Hot replay returned nothing — penalize and fall through to cold.
    // (The store drops the recipe once fail_count passes its threshold.)
    pageRecipeStore.incrementFail({ urlTemplate: template, domFingerprint: fingerprint })
  }

  // Cold path: clean the already-rendered HTML and ask the LLM for a recipe.
  const cleaned = cleanHtml(rendered.html, canonicalUrl)
  const recipe = await generateRecipe({
    url: canonicalUrl,
    cleanedMarkdown: cleaned.contentMarkdown,
    hint,
  })

  // Verify the recipe against this exact page before persisting.
  const verifyData = await extractWithSelectors(canonicalUrl, recipe.selectors)
  const verified = !isEmptyResult(verifyData)

  if (verified) {
    pageRecipeStore.upsert({
      urlTemplate: template,
      domFingerprint: fingerprint,
      selectors: recipe.selectors,
      pageType: recipe.pageType,
    })
    pageRecipeStore.incrementHit({ urlTemplate: template, domFingerprint: fingerprint })
  }

  return shape({
    url: canonicalUrl,
    finalUrl: rendered.finalUrl,
    statusCode: rendered.statusCode,
    recipeMode: 'cold',
    urlTemplate: template,
    fingerprint,
    pageType: recipe.pageType,
    selectors: recipe.selectors,
    // When selector verification failed, fall back to the LLM's direct
    // extraction (recipe.extracted — presumably produced alongside the
    // selectors; confirm against recipe-generator.js).
    data: verified ? verifyData : recipe.extracted,
    cleaned,
    recipePersisted: verified,
  })
}
107
+
108
/**
 * Assemble the public response object for POST /api/web/extract.
 * Title/content prefer the recipe-extracted fields and fall back to the
 * Readability-cleaned document; a warning is attached when a cold-path
 * recipe failed selector verification and was not persisted.
 */
function shape({ url, finalUrl, statusCode, recipeMode, urlTemplate, fingerprint, pageType, selectors, data, cleaned, recipePersisted }) {
  const title =
    pickField(data, ['title', 'headline', 'name']) || cleaned?.title || null
  const content =
    pickField(data, ['body', 'content', 'article', 'description']) || cleaned?.contentMarkdown || null

  const result = {
    experimental: true,
    url,
    finalUrl,
    statusCode,
    recipeMode,
    recipeId: `${urlTemplate}#${fingerprint}`,
    pageType: pageType || null,
    title,
    content,
    structured: data || {},
    selectors: selectors || {},
  }

  const verificationFailed = recipeMode === 'cold' && recipePersisted === false
  if (verificationFailed) {
    result.warning = 'Recipe verification failed (selectors returned empty). Result reflects LLM extraction; recipe was not persisted.'
  }

  return result
}
127
+
128
/**
 * Return the first usable value among candidate keys of `obj`:
 * a non-blank string as-is, or a non-empty array joined with blank lines.
 * Other types (numbers, objects, ...) are skipped; null when nothing fits.
 */
function pickField(obj, candidates) {
  if (!obj || typeof obj !== 'object') return null

  for (const key of candidates) {
    const value = obj[key]
    if (value == null) continue

    const usableString = typeof value === 'string' && value.trim() !== ''
    if (usableString) return value

    if (Array.isArray(value) && value.length > 0) {
      return value.join('\n\n')
    }
  }
  return null
}
138
+
139
// Public surface: extract() is the orchestrator entry point;
// isEmptyResult is exported so other code can apply the same
// emptiness heuristic used for recipe verification.
module.exports = {
  extract,
  isEmptyResult,
}
@@ -0,0 +1,63 @@
1
+ /**
2
+ * Lightweight DOM structure fingerprint.
3
+ *
4
+ * Two pages with the same template URL but materially different layouts
5
+ * (A/B test, logged-in vs logged-out, mobile vs desktop served) need to
6
+ * use different recipes. We hash a minimal structural signature instead
7
+ * of the full HTML so the fingerprint stays stable against trivial copy
8
+ * changes but flips when block-level structure shifts.
9
+ *
10
+ * Signature inputs:
11
+ * - Order of structural landmark tags (header/nav/main/article/...)
12
+ * - Top 5 most frequent class names on <div> elements
13
+ */
14
+
15
+ const crypto = require('crypto')
16
+
17
// Structural tags whose document-order sequence feeds the fingerprint.
const LANDMARK_TAGS = ['header', 'nav', 'main', 'article', 'section', 'aside', 'footer', 'form']

/**
 * Hash a page's block-level structure into a short stable fingerprint.
 * Stable under copy changes; flips when landmark order or the dominant
 * <div> class names change. Returns 'empty' for non-string/empty input,
 * and a 'fallback-' content hash when HTML parsing fails entirely.
 */
function computeFingerprint(html) {
  if (typeof html !== 'string' || html.length === 0) {
    return 'empty'
  }

  let document
  try {
    ({ document } = require('linkedom').parseHTML(html))
  } catch {
    // Extremely malformed HTML (or a missing parser) — hash a prefix of the
    // raw markup instead so callers still get a deterministic key.
    return 'fallback-' + crypto.createHash('sha1').update(html.slice(0, 4096)).digest('hex').slice(0, 12)
  }

  // First occurrence of each landmark tag, in document order.
  const orderedTags = []
  const knownTags = new Set()
  for (const el of document.querySelectorAll(LANDMARK_TAGS.join(','))) {
    const name = el.tagName.toLowerCase()
    if (knownTags.has(name)) continue
    knownTags.add(name)
    orderedTags.push(name)
  }

  // Frequency of every class name appearing on <div> elements.
  const classCounts = new Map()
  for (const div of document.querySelectorAll('div[class]')) {
    for (const cls of (div.getAttribute('class') || '').split(/\s+/)) {
      if (!cls) continue
      classCounts.set(cls, (classCounts.get(cls) || 0) + 1)
    }
  }

  // Top 5 classes: by descending count, then alphabetically for determinism.
  const topClasses = [...classCounts.entries()]
    .sort((a, b) => b[1] - a[1] || (a[0] < b[0] ? -1 : 1))
    .slice(0, 5)
    .map(entry => entry[0])

  const signature = `tags:${orderedTags.join(',')}|cls:${topClasses.join(',')}`
  return crypto.createHash('sha1').update(signature).digest('hex').slice(0, 12)
}
62
+
63
// Sole export — the signature format is an internal implementation detail.
module.exports = { computeFingerprint }
@@ -0,0 +1,72 @@
1
+ /**
2
+ * HTML → cleaned content (Readability) → Markdown (Turndown).
3
+ *
4
+ * The cleaned Markdown is the *only* page representation handed to the
5
+ * recipe-generation LLM. Keeping the input small and structured is what
6
+ * makes this experiment cheap enough to be worth running on every
7
+ * cold-cache miss.
8
+ */
9
+
10
// Hard cap on the Markdown handed to the recipe-generation LLM.
const MAX_MARKDOWN_LENGTH = 50_000

/**
 * Reduce raw HTML to article-like content: parse with linkedom, isolate the
 * main content with Readability, convert to Markdown with Turndown, and cap
 * the Markdown length. Every stage degrades gracefully — an unparseable
 * document yields an all-empty result rather than throwing.
 *
 * @param {string} html raw page markup
 * @param {string} url  page URL (currently unused; kept for interface parity)
 */
function cleanHtml(html, url) {
  let parsedDocument
  try {
    const { parseHTML } = require('linkedom')
    parsedDocument = parseHTML(html).document
  } catch {
    // No DOM, no content — return the empty shape.
    return {
      title: null,
      contentHtml: '',
      contentMarkdown: '',
      byline: null,
      excerpt: null,
      length: 0,
    }
  }

  // Readability is best-effort; a parse failure just means we fall back to
  // the whole <body> below.
  let article = null
  try {
    const { Readability } = require('@mozilla/readability')
    article = new Readability(parsedDocument).parse()
  } catch {
    article = null
  }

  const contentHtml =
    (article && article.content) || parsedDocument.body?.innerHTML || ''

  let contentMarkdown = ''
  try {
    const TurndownService = require('turndown')
    const turndown = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced',
      bulletListMarker: '-',
    })
    turndown.remove(['script', 'style', 'noscript', 'iframe'])
    contentMarkdown = turndown.turndown(contentHtml)
  } catch {
    contentMarkdown = ''
  }

  if (contentMarkdown.length > MAX_MARKDOWN_LENGTH) {
    contentMarkdown = `${contentMarkdown.slice(0, MAX_MARKDOWN_LENGTH)}\n\n[... truncated ...]`
  }

  return {
    title: article?.title || parsedDocument.title || null,
    contentHtml,
    contentMarkdown,
    byline: article?.byline || null,
    excerpt: article?.excerpt || null,
    length: article?.length || contentMarkdown.length,
  }
}
68
+
69
// MAX_MARKDOWN_LENGTH is exported alongside cleanHtml so callers can
// reason about (or test against) the truncation threshold.
module.exports = {
  cleanHtml,
  MAX_MARKDOWN_LENGTH,
}
@@ -0,0 +1,21 @@
1
+ /**
2
+ * Web extraction (experimental, v3.53.0).
3
+ *
4
+ * Public surface for core/routes/web.js. Internal modules:
5
+ * - url-normalize.js URL → template + canonical URL
6
+ * - fingerprint.js DOM structural hash
7
+ * - playwright-runner.js headless fetch + selector replay
8
+ * - html-cleaner.js Readability + Turndown
9
+ * - recipe-generator.js Anthropic Haiku cold path
10
+ * - extractor.js orchestrator (hot/cold + self-heal)
11
+ */
12
+
13
+ const { extract } = require('./extractor')
14
+ const { normalizeUrl } = require('./url-normalize')
15
+ const { computeFingerprint } = require('./fingerprint')
16
+
17
// Re-exports for core/routes/web.js: extract() is the main entry point;
// normalizeUrl and computeFingerprint are surfaced individually as well
// (see the module header for the internal file map).
module.exports = {
  extract,
  normalizeUrl,
  computeFingerprint,
}