mcp-scraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/README.md +56 -0
  2. package/dist/bin/api-server.cjs +9256 -0
  3. package/dist/bin/api-server.cjs.map +1 -0
  4. package/dist/bin/api-server.d.cts +1 -0
  5. package/dist/bin/api-server.d.ts +1 -0
  6. package/dist/bin/api-server.js +38 -0
  7. package/dist/bin/api-server.js.map +1 -0
  8. package/dist/bin/mcp-stdio-server.cjs +840 -0
  9. package/dist/bin/mcp-stdio-server.cjs.map +1 -0
  10. package/dist/bin/mcp-stdio-server.d.cts +1 -0
  11. package/dist/bin/mcp-stdio-server.d.ts +1 -0
  12. package/dist/bin/mcp-stdio-server.js +41 -0
  13. package/dist/bin/mcp-stdio-server.js.map +1 -0
  14. package/dist/bin/paa-harvest.cjs +1438 -0
  15. package/dist/bin/paa-harvest.cjs.map +1 -0
  16. package/dist/bin/paa-harvest.d.cts +1 -0
  17. package/dist/bin/paa-harvest.d.ts +1 -0
  18. package/dist/bin/paa-harvest.js +37 -0
  19. package/dist/bin/paa-harvest.js.map +1 -0
  20. package/dist/chunk-4API3ZCT.js +1387 -0
  21. package/dist/chunk-4API3ZCT.js.map +1 -0
  22. package/dist/chunk-LXZDJJXR.js +476 -0
  23. package/dist/chunk-LXZDJJXR.js.map +1 -0
  24. package/dist/chunk-ZBP4RHNW.js +805 -0
  25. package/dist/chunk-ZBP4RHNW.js.map +1 -0
  26. package/dist/db-IOYMX64U.js +87 -0
  27. package/dist/db-IOYMX64U.js.map +1 -0
  28. package/dist/index.cjs +1689 -0
  29. package/dist/index.cjs.map +1 -0
  30. package/dist/index.d.cts +210 -0
  31. package/dist/index.d.ts +210 -0
  32. package/dist/index.js +275 -0
  33. package/dist/index.js.map +1 -0
  34. package/dist/server-63DR2HE5.js +6062 -0
  35. package/dist/server-63DR2HE5.js.map +1 -0
  36. package/dist/worker-3ECJHPRE.js +88 -0
  37. package/dist/worker-3ECJHPRE.js.map +1 -0
  38. package/package.json +76 -0
@@ -0,0 +1,88 @@
1
+ import {
2
+ harvest
3
+ } from "./chunk-4API3ZCT.js";
4
+ import {
5
+ claimPendingJob,
6
+ completeJob,
7
+ failJob
8
+ } from "./chunk-LXZDJJXR.js";
9
+
10
+ // src/api/webhook.ts
11
/**
 * POSTs `payload` as JSON to `url`, retrying when an attempt fails.
 *
 * An attempt succeeds when the response status is 2xx (`res.ok`). Any other
 * status, a thrown fetch error, or hitting the 10 second per-attempt timeout
 * counts as a failure and is logged with `console.warn`. Between attempts the
 * function sleeps `2000 ms * attempt` (2 s after the first, 4 s after the
 * second, ...). When every attempt has failed the failure is logged with
 * `console.error` and the function returns normally, so callers are not told
 * whether delivery succeeded.
 *
 * @param url - Endpoint that receives the POST request.
 * @param payload - Value serialized with `JSON.stringify` as the request body.
 * @param retries - Maximum number of attempts (default 3).
 * @returns A promise that resolves with no value.
 */
async function deliverWebhook(url, payload, retries = 3) {
  for (let attempt = 1; attempt <= retries; attempt++) {
    try {
      const res = await fetch(url, {
        method: "POST",
        headers: { "content-type": "application/json" },
        body: JSON.stringify(payload),
        signal: AbortSignal.timeout(1e4)
      });
      // Only the status is used. Cancel the unread body so the underlying
      // connection is released right away instead of waiting for garbage
      // collection, which matters when several retries hit the same host.
      try {
        await res.body?.cancel();
      } catch {
        // A failed cancel must not turn a delivered webhook into a retry.
      }
      if (res.ok) return;
      console.warn(`[webhook] attempt ${attempt} \u2192 ${res.status} from ${url}`);
    } catch (err) {
      console.warn(`[webhook] attempt ${attempt} failed:`, err instanceof Error ? err.message : err);
    }
    // Linear backoff; skipped after the final attempt.
    if (attempt < retries) await new Promise((r) => setTimeout(r, 1e3 * attempt * 2));
  }
  console.error(`[webhook] gave up after ${retries} attempts for ${url}`);
}
29
+
30
+ // src/api/worker.ts
31
// Upper bound on jobs the polling worker processes at the same time.
const MAX_CONCURRENT = 2;
// Number of processJob() calls currently in flight.
let running = 0;
33
/**
 * Runs one claimed job to completion and records the outcome.
 *
 * `job.options` may be a JSON string or an already-parsed object. The parsed
 * options are passed to `harvest`, with `kernelApiKey`, `headless`, `format`
 * and `outputDir` always overriding whatever the job supplied. On success the
 * result is stored with `completeJob`; on any error the message is stored with
 * `failJob`. In both cases the outcome is posted to `job.callback_url` when
 * the job has one. The module-level `running` counter is incremented for the
 * duration of the call.
 *
 * @param job - Claimed job; `id`, `options` and `callback_url` are read.
 * @returns A promise that resolves once the outcome has been recorded.
 */
async function processJob(job) {
  running += 1;

  // Posts the outcome to the job's callback URL, if it has one.
  const notify = async (outcome) => {
    if (!job.callback_url) return;
    await deliverWebhook(job.callback_url, { job_id: job.id, ...outcome });
  };

  try {
    const harvestOptions =
      typeof job.options === "string" ? JSON.parse(job.options) : job.options;
    // Fixed settings come after the spread so they win over job-supplied ones.
    const harvestInput = {
      ...harvestOptions,
      kernelApiKey: process.env.KERNEL_API_KEY,
      headless: true,
      format: "json",
      outputDir: "/tmp/paa-output-api"
    };
    const result = await harvest(harvestInput);
    await completeJob(job.id, result);
    await notify({ status: "done", result });
  } catch (err) {
    let msg;
    if (err instanceof Error) {
      msg = err.message;
    } else {
      msg = String(err);
    }
    await failJob(job.id, msg);
    await notify({ status: "failed", error: msg });
  } finally {
    running -= 1;
  }
}
58
/**
 * Claims at most one pending job and processes it to the end.
 *
 * @returns `{ claimed: false }` when no job was available. Otherwise an object
 *   with `claimed: true`, the job's id as `jobId`, `completed: true`, and the
 *   wall-clock processing time in milliseconds as `durationMs`. `completed` is
 *   set whenever `processJob` returns; it does not say whether the harvest
 *   itself succeeded.
 */
async function tickOnce() {
  const job = await claimPendingJob();
  if (job) {
    const t0 = Date.now();
    await processJob(job);
    return {
      claimed: true,
      jobId: job.id,
      completed: true,
      durationMs: Date.now() - t0
    };
  }
  return { claimed: false };
}
65
/**
 * Processes pending jobs one after another until a budget limit is reached.
 *
 * Stops when `budget.maxJobs` jobs have been processed, when the current time
 * has reached `budget.deadlineMs` (compared against `Date.now()`, checked
 * before each job starts), or when `tickOnce` reports that nothing was
 * claimed.
 *
 * @param budget - Object with `maxJobs` and `deadlineMs`.
 * @returns The `tickOnce` results in order, including the final unclaimed
 *   result when the queue ran dry.
 */
async function drainQueue(budget) {
  const outcomes = [];
  let ticks = 0;
  while (ticks < budget.maxJobs) {
    if (Date.now() >= budget.deadlineMs) return outcomes;
    const outcome = await tickOnce();
    outcomes.push(outcome);
    if (!outcome.claimed) return outcomes;
    ticks += 1;
  }
  return outcomes;
}
75
/**
 * Starts the background worker: every 2 seconds it tries to claim one pending
 * job and, if it gets one, processes it without blocking later ticks.
 *
 * At most `MAX_CONCURRENT` jobs are in flight. Claims that are still being
 * awaited count towards that limit, so a slow `claimPendingJob()` cannot let
 * overlapping ticks push the worker past the cap. Errors from claiming or from
 * processing are logged rather than left as unhandled promise rejections,
 * which would otherwise bring the process down.
 *
 * The interval is never cleared; the worker runs for the process lifetime.
 */
function startWorker() {
  // Claims awaited right now whose job has not yet been handed to processJob.
  let claiming = 0;
  setInterval(async () => {
    if (running + claiming >= MAX_CONCURRENT) return;
    claiming++;
    let job;
    try {
      job = await claimPendingJob();
    } catch (err) {
      console.error("[worker] failed to claim job:", err instanceof Error ? err.message : err);
      return;
    } finally {
      claiming--;
    }
    if (!job) return;
    // processJob bumps `running` synchronously on entry, so the slot reserved
    // by `claiming` is handed over without a gap.
    processJob(job).catch((err) => {
      console.error("[worker] job processing failed:", err instanceof Error ? err.message : err);
    });
  }, 2e3);
  console.log(`[worker] started \u2014 polling every 2s, max ${MAX_CONCURRENT} concurrent`);
}
83
+ export {
84
+ drainQueue,
85
+ startWorker,
86
+ tickOnce
87
+ };
88
+ //# sourceMappingURL=worker-3ECJHPRE.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/api/webhook.ts","../src/api/worker.ts"],"sourcesContent":["export async function deliverWebhook(url: string, payload: object, retries = 3): Promise<void> {\n for (let attempt = 1; attempt <= retries; attempt++) {\n try {\n const res = await fetch(url, {\n method: 'POST',\n headers: { 'content-type': 'application/json' },\n body: JSON.stringify(payload),\n signal: AbortSignal.timeout(10_000),\n })\n if (res.ok) return\n console.warn(`[webhook] attempt ${attempt} → ${res.status} from ${url}`)\n } catch (err) {\n console.warn(`[webhook] attempt ${attempt} failed:`, err instanceof Error ? err.message : err)\n }\n if (attempt < retries) await new Promise((r) => setTimeout(r, 1000 * attempt * 2))\n }\n console.error(`[webhook] gave up after ${retries} attempts for ${url}`)\n}\n","import { claimPendingJob, completeJob, failJob } from './db.js'\nimport { harvest } from '../harvest.js'\nimport { deliverWebhook } from './webhook.js'\nimport type { HarvestOptions } from '../types.js'\n\nexport type TickResult = {\n claimed: boolean\n jobId?: string\n completed?: boolean\n durationMs?: number\n}\nexport type DrainBudget = {\n maxJobs: number\n deadlineMs: number\n}\n\nconst MAX_CONCURRENT = 2\nlet running = 0\n\nasync function processJob(job: Awaited<ReturnType<typeof claimPendingJob>> & object) {\n running++\n try {\n const opts = typeof job.options === 'string' ? JSON.parse(job.options) as Partial<HarvestOptions> : job.options as Partial<HarvestOptions>\n const result = await harvest({\n ...opts,\n kernelApiKey: process.env.KERNEL_API_KEY,\n headless: true,\n format: 'json',\n outputDir: '/tmp/paa-output-api',\n })\n await completeJob(job.id, result)\n if (job.callback_url) {\n await deliverWebhook(job.callback_url, { job_id: job.id, status: 'done', result })\n }\n } catch (err) {\n const msg = err instanceof Error ? 
err.message : String(err)\n await failJob(job.id, msg)\n if (job.callback_url) {\n await deliverWebhook(job.callback_url, { job_id: job.id, status: 'failed', error: msg })\n }\n } finally {\n running--\n }\n}\n\nexport async function tickOnce(): Promise<TickResult> {\n const job = await claimPendingJob()\n if (!job) return { claimed: false }\n const startedAt = Date.now()\n await processJob(job as NonNullable<typeof job>)\n return { claimed: true, jobId: (job as { id: string }).id, completed: true, durationMs: Date.now() - startedAt }\n}\n\nexport async function drainQueue(budget: DrainBudget): Promise<TickResult[]> {\n const results: TickResult[] = []\n for (let i = 0; i < budget.maxJobs; i++) {\n if (Date.now() >= budget.deadlineMs) break\n const r = await tickOnce()\n results.push(r)\n if (!r.claimed) break\n }\n return results\n}\n\nexport function startWorker(): void {\n setInterval(async () => {\n if (running >= MAX_CONCURRENT) return\n const job = await claimPendingJob()\n if (job) void processJob(job as NonNullable<typeof job>)\n }, 2000)\n console.log(`[worker] started — polling every 2s, max ${MAX_CONCURRENT} 
concurrent`)\n}\n"],"mappings":";;;;;;;;;;AAAA,eAAsB,eAAe,KAAa,SAAiB,UAAU,GAAkB;AAC7F,WAAS,UAAU,GAAG,WAAW,SAAS,WAAW;AACnD,QAAI;AACF,YAAM,MAAM,MAAM,MAAM,KAAK;AAAA,QAC3B,QAAQ;AAAA,QACR,SAAS,EAAE,gBAAgB,mBAAmB;AAAA,QAC9C,MAAM,KAAK,UAAU,OAAO;AAAA,QAC5B,QAAQ,YAAY,QAAQ,GAAM;AAAA,MACpC,CAAC;AACD,UAAI,IAAI,GAAI;AACZ,cAAQ,KAAK,qBAAqB,OAAO,WAAM,IAAI,MAAM,SAAS,GAAG,EAAE;AAAA,IACzE,SAAS,KAAK;AACZ,cAAQ,KAAK,qBAAqB,OAAO,YAAY,eAAe,QAAQ,IAAI,UAAU,GAAG;AAAA,IAC/F;AACA,QAAI,UAAU,QAAS,OAAM,IAAI,QAAQ,CAAC,MAAM,WAAW,GAAG,MAAO,UAAU,CAAC,CAAC;AAAA,EACnF;AACA,UAAQ,MAAM,2BAA2B,OAAO,iBAAiB,GAAG,EAAE;AACxE;;;ACDA,IAAM,iBAAiB;AACvB,IAAI,UAAU;AAEd,eAAe,WAAW,KAA2D;AACnF;AACA,MAAI;AACF,UAAM,OAAO,OAAO,IAAI,YAAY,WAAW,KAAK,MAAM,IAAI,OAAO,IAA+B,IAAI;AACxG,UAAM,SAAS,MAAM,QAAQ;AAAA,MAC3B,GAAG;AAAA,MACH,cAAc,QAAQ,IAAI;AAAA,MAC1B,UAAU;AAAA,MACV,QAAQ;AAAA,MACR,WAAW;AAAA,IACb,CAAC;AACD,UAAM,YAAY,IAAI,IAAI,MAAM;AAChC,QAAI,IAAI,cAAc;AACpB,YAAM,eAAe,IAAI,cAAc,EAAE,QAAQ,IAAI,IAAI,QAAQ,QAAQ,OAAO,CAAC;AAAA,IACnF;AAAA,EACF,SAAS,KAAK;AACZ,UAAM,MAAM,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC3D,UAAM,QAAQ,IAAI,IAAI,GAAG;AACzB,QAAI,IAAI,cAAc;AACpB,YAAM,eAAe,IAAI,cAAc,EAAE,QAAQ,IAAI,IAAI,QAAQ,UAAU,OAAO,IAAI,CAAC;AAAA,IACzF;AAAA,EACF,UAAE;AACA;AAAA,EACF;AACF;AAEA,eAAsB,WAAgC;AACpD,QAAM,MAAM,MAAM,gBAAgB;AAClC,MAAI,CAAC,IAAK,QAAO,EAAE,SAAS,MAAM;AAClC,QAAM,YAAY,KAAK,IAAI;AAC3B,QAAM,WAAW,GAA8B;AAC/C,SAAO,EAAE,SAAS,MAAM,OAAQ,IAAuB,IAAI,WAAW,MAAM,YAAY,KAAK,IAAI,IAAI,UAAU;AACjH;AAEA,eAAsB,WAAW,QAA4C;AAC3E,QAAM,UAAwB,CAAC;AAC/B,WAAS,IAAI,GAAG,IAAI,OAAO,SAAS,KAAK;AACvC,QAAI,KAAK,IAAI,KAAK,OAAO,WAAY;AACrC,UAAM,IAAI,MAAM,SAAS;AACzB,YAAQ,KAAK,CAAC;AACd,QAAI,CAAC,EAAE,QAAS;AAAA,EAClB;AACA,SAAO;AACT;AAEO,SAAS,cAAoB;AAClC,cAAY,YAAY;AACtB,QAAI,WAAW,eAAgB;AAC/B,UAAM,MAAM,MAAM,gBAAgB;AAClC,QAAI,IAAK,MAAK,WAAW,GAA8B;AAAA,EACzD,GAAG,GAAI;AACP,UAAQ,IAAI,iDAA4C,cAAc,aAAa;AACrF;","names":[]}
package/package.json ADDED
@@ -0,0 +1,76 @@
1
+ {
2
+ "name": "mcp-scraper",
3
+ "version": "0.1.0",
4
+ "description": "MCP server for MCP Scraper web intelligence tools",
5
+ "type": "module",
6
+ "main": "./dist/index.cjs",
7
+ "module": "./dist/index.js",
8
+ "types": "./dist/index.d.ts",
9
+ "exports": {
10
+ ".": {
11
+ "types": "./dist/index.d.ts",
12
+ "import": "./dist/index.js",
13
+ "require": "./dist/index.cjs"
14
+ }
15
+ },
16
+ "bin": {
17
+ "paa-harvest": "dist/bin/paa-harvest.js",
18
+ "paa-api": "dist/bin/api-server.js",
19
+ "mcp-scraper": "dist/bin/mcp-stdio-server.js"
20
+ },
21
+ "files": [
22
+ "dist",
23
+ "README.md"
24
+ ],
25
+ "scripts": {
26
+ "build": "tsup && npm run build:ui",
27
+ "build:ui": "esbuild public/app.jsx --bundle --loader:.jsx=jsx --outfile=public/app.js --target=es2020",
28
+ "api": "tsx bin/api-server.ts",
29
+ "dev": "tsx src/cli.ts",
30
+ "test": "vitest run tests/unit",
31
+ "test:integration": "vitest run tests/integration",
32
+ "test:smoke": "vitest run tests/smoke",
33
+ "typecheck": "tsc --noEmit",
34
+ "test:kernel": "vitest run tests/integration/kernel.smoke.test.ts"
35
+ },
36
+ "dependencies": {
37
+ "@anthropic-ai/sdk": "^0.96.0",
38
+ "@distube/ytdl-core": "^4.16.12",
39
+ "@fal-ai/client": "^1.10.1",
40
+ "@hono/node-server": "^2.0.2",
41
+ "@libsql/client": "^0.17.3",
42
+ "@modelcontextprotocol/sdk": "^1.29.0",
43
+ "@onkernel/sdk": "^0.52.0",
44
+ "@types/turndown": "^5.0.6",
45
+ "better-sqlite3": "^12.9.0",
46
+ "bgutils-js": "^3.2.0",
47
+ "commander": "^12.0.0",
48
+ "hono": "^4.12.18",
49
+ "inngest": "^4.4.0",
50
+ "p-limit": "^7.3.0",
51
+ "papaparse": "^5.4.0",
52
+ "playwright": "^1.44.0",
53
+ "playwright-extra": "^4.3.6",
54
+ "puppeteer-extra-plugin-stealth": "^2.11.2",
55
+ "stripe": "^22.1.1",
56
+ "turndown": "^7.2.4",
57
+ "youtube-transcript": "^1.3.1",
58
+ "youtubei.js": "^17.0.1",
59
+ "zod": "^3.23.0"
60
+ },
61
+ "devDependencies": {
62
+ "@onkernel/cli": "^0.19.2",
63
+ "@types/better-sqlite3": "^7.6.13",
64
+ "@types/jsdom": "^28.0.3",
65
+ "@types/node": "^20.0.0",
66
+ "@types/papaparse": "^5.3.0",
67
+ "jsdom": "^29.1.1",
68
+ "tsup": "^8.0.0",
69
+ "tsx": "^4.0.0",
70
+ "typescript": "^5.4.0",
71
+ "vitest": "^1.6.0"
72
+ },
73
+ "engines": {
74
+ "node": ">=20"
75
+ }
76
+ }