mcp-scraper 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/dist/bin/api-server.cjs +957 -243
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +2 -2
- package/dist/bin/mcp-stdio-server.cjs +540 -158
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +2 -1
- package/dist/bin/mcp-stdio-server.js.map +1 -1
- package/dist/bin/paa-harvest.cjs +36 -5
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +5 -3
- package/dist/bin/paa-harvest.js.map +1 -1
- package/dist/{chunk-6TWZS2FQ.js → chunk-RE6HCRYC.js} +543 -159
- package/dist/chunk-RE6HCRYC.js.map +1 -0
- package/dist/{chunk-W4P2U5VF.js → chunk-TM22BLWP.js} +46 -34
- package/dist/chunk-TM22BLWP.js.map +1 -0
- package/dist/{chunk-7HB7NDOY.js → chunk-ZK456YXN.js} +12 -2
- package/dist/chunk-ZK456YXN.js.map +1 -0
- package/dist/chunk-ZMOWIBMK.js +36 -0
- package/dist/chunk-ZMOWIBMK.js.map +1 -0
- package/dist/index.cjs +34 -3
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +2 -1
- package/dist/index.js.map +1 -1
- package/dist/{server-2Y27U4TO.js → server-QXVVTKJP.js} +311 -48
- package/dist/server-QXVVTKJP.js.map +1 -0
- package/dist/{worker-UT4ZQU2T.js → worker-AUCXFHEL.js} +6 -4
- package/dist/worker-AUCXFHEL.js.map +1 -0
- package/docs/adr/0001-in-page-graphql-interception-for-anti-bot-scraping.md +58 -0
- package/docs/adr/README.md +11 -0
- package/docs/mcp-tool-quality-spec.md +238 -0
- package/package.json +5 -4
- package/dist/chunk-6TWZS2FQ.js.map +0 -1
- package/dist/chunk-7HB7NDOY.js.map +0 -1
- package/dist/chunk-W4P2U5VF.js.map +0 -1
- package/dist/server-2Y27U4TO.js.map +0 -1
- package/dist/worker-UT4ZQU2T.js.map +0 -1
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/api/webhook.ts","../src/api/worker.ts"],"sourcesContent":["export async function deliverWebhook(url: string, payload: object, retries = 3): Promise<void> {\n for (let attempt = 1; attempt <= retries; attempt++) {\n try {\n const res = await fetch(url, {\n method: 'POST',\n headers: { 'content-type': 'application/json' },\n body: JSON.stringify(payload),\n signal: AbortSignal.timeout(10_000),\n })\n if (res.ok) return\n console.warn(`[webhook] attempt ${attempt} → ${res.status} from ${url}`)\n } catch (err) {\n console.warn(`[webhook] attempt ${attempt} failed:`, err instanceof Error ? err.message : err)\n }\n if (attempt < retries) await new Promise((r) => setTimeout(r, 1000 * attempt * 2))\n }\n console.error(`[webhook] gave up after ${retries} attempts for ${url}`)\n}\n","import { claimPendingJob, completeJob, failJob, creditMc, debitMc, listHarvestAttempts } from './db.js'\nimport { harvest } from '../harvest.js'\nimport { deliverWebhook } from './webhook.js'\nimport type { HarvestOptions } from '../types.js'\nimport { MC_COSTS } from './rates.js'\nimport { classifyHarvestProblem, harvestProblemResponse, serializeHarvestProblem } from './harvest-problems.js'\nimport { createHarvestAttemptRecorder } from './harvest-attempt-events.js'\n\nexport type TickResult = {\n claimed: boolean\n jobId?: string\n completed?: boolean\n durationMs?: number\n}\nexport type DrainBudget = {\n maxJobs: number\n deadlineMs: number\n}\n\nconst MAX_CONCURRENT = 2\nlet running = 0\n\nfunction countPaaQuestions(result: unknown): number {\n if (!result || typeof result !== 'object') return 0\n const value = result as { totalQuestions?: unknown; flat?: unknown }\n if (typeof value.totalQuestions === 'number') return value.totalQuestions\n return Array.isArray(value.flat) ? value.flat.length : 0\n}\n\nfunction paaCostForQuestionCount(questionCount: number): number {\n return Math.max(1, questionCount) * MC_COSTS.paa\n}\n\nasync function processJob(job: Awaited<ReturnType<typeof claimPendingJob>> & object) {\n running++\n try {\n const opts = typeof job.options === 'string' ? JSON.parse(job.options) as Partial<HarvestOptions> & { billingHoldMc?: number } : job.options as Partial<HarvestOptions> & { billingHoldMc?: number }\n const result = await harvest({\n ...opts,\n kernelApiKey: process.env.KERNEL_API_KEY,\n headless: true,\n format: 'json',\n outputDir: '/tmp/paa-output-api',\n onAttemptEvent: createHarvestAttemptRecorder(job.id, job.user_id),\n })\n await completeJob(job.id, result)\n const attempts = await listHarvestAttempts(job.id, job.user_id)\n if (!opts.serpOnly && typeof opts.billingHoldMc === 'number') {\n const actualCost = paaCostForQuestionCount(countPaaQuestions(result))\n const diff = opts.billingHoldMc - actualCost\n if (diff > 0) await creditMc(job.user_id, diff, 'paa_refund', 'overestimate refund')\n else if (diff < 0) await debitMc(job.user_id, -diff, 'paa', opts.query ?? job.query)\n }\n if (job.callback_url) {\n await deliverWebhook(job.callback_url, { job_id: job.id, status: 'done', result, attempts })\n }\n } catch (err) {\n const problem = classifyHarvestProblem(err)\n await failJob(job.id, serializeHarvestProblem(problem))\n const attempts = await listHarvestAttempts(job.id, job.user_id)\n try {\n const opts = typeof job.options === 'string' ? JSON.parse(job.options) as { billingHoldMc?: number } : job.options as { billingHoldMc?: number }\n if (typeof opts.billingHoldMc === 'number' && opts.billingHoldMc > 0) {\n await creditMc(job.user_id, opts.billingHoldMc, 'refund', 'failed call')\n }\n } catch {}\n if (job.callback_url) {\n await deliverWebhook(job.callback_url, { job_id: job.id, status: 'failed', ...harvestProblemResponse(problem), attempts })\n }\n } finally {\n running--\n }\n}\n\nexport async function tickOnce(): Promise<TickResult> {\n const job = await claimPendingJob()\n if (!job) return { claimed: false }\n const startedAt = Date.now()\n await processJob(job as NonNullable<typeof job>)\n return { claimed: true, jobId: (job as { id: string }).id, completed: true, durationMs: Date.now() - startedAt }\n}\n\nexport async function drainQueue(budget: DrainBudget): Promise<TickResult[]> {\n const results: TickResult[] = []\n for (let i = 0; i < budget.maxJobs; i++) {\n if (Date.now() >= budget.deadlineMs) break\n const r = await tickOnce()\n results.push(r)\n if (!r.claimed) break\n }\n return results\n}\n\nexport function startWorker(): void {\n setInterval(async () => {\n if (running >= MAX_CONCURRENT) return\n const job = await claimPendingJob()\n if (job) void processJob(job as NonNullable<typeof job>)\n }, 2000)\n console.log(`[worker] started — polling every 2s, max ${MAX_CONCURRENT} concurrent`)\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA,eAAsB,eAAe,KAAa,SAAiB,UAAU,GAAkB;AAC7F,WAAS,UAAU,GAAG,WAAW,SAAS,WAAW;AACnD,QAAI;AACF,YAAM,MAAM,MAAM,MAAM,KAAK;AAAA,QAC3B,QAAQ;AAAA,QACR,SAAS,EAAE,gBAAgB,mBAAmB;AAAA,QAC9C,MAAM,KAAK,UAAU,OAAO;AAAA,QAC5B,QAAQ,YAAY,QAAQ,GAAM;AAAA,MACpC,CAAC;AACD,UAAI,IAAI,GAAI;AACZ,cAAQ,KAAK,qBAAqB,OAAO,WAAM,IAAI,MAAM,SAAS,GAAG,EAAE;AAAA,IACzE,SAAS,KAAK;AACZ,cAAQ,KAAK,qBAAqB,OAAO,YAAY,eAAe,QAAQ,IAAI,UAAU,GAAG;AAAA,IAC/F;AACA,QAAI,UAAU,QAAS,OAAM,IAAI,QAAQ,CAAC,MAAM,WAAW,GAAG,MAAO,UAAU,CAAC,CAAC;AAAA,EACnF;AACA,UAAQ,MAAM,2BAA2B,OAAO,iBAAiB,GAAG,EAAE;AACxE;;;ACEA,IAAM,iBAAiB;AACvB,IAAI,UAAU;AAEd,SAAS,kBAAkB,QAAyB;AAClD,MAAI,CAAC,UAAU,OAAO,WAAW,SAAU,QAAO;AAClD,QAAM,QAAQ;AACd,MAAI,OAAO,MAAM,mBAAmB,SAAU,QAAO,MAAM;AAC3D,SAAO,MAAM,QAAQ,MAAM,IAAI,IAAI,MAAM,KAAK,SAAS;AACzD;AAEA,SAAS,wBAAwB,eAA+B;AAC9D,SAAO,KAAK,IAAI,GAAG,aAAa,IAAI,SAAS;AAC/C;AAEA,eAAe,WAAW,KAA2D;AACnF;AACA,MAAI;AACF,UAAM,OAAO,OAAO,IAAI,YAAY,WAAW,KAAK,MAAM,IAAI,OAAO,IAA4D,IAAI;AACrI,UAAM,SAAS,MAAM,QAAQ;AAAA,MAC3B,GAAG;AAAA,MACH,cAAc,QAAQ,IAAI;AAAA,MAC1B,UAAU;AAAA,MACV,QAAQ;AAAA,MACR,WAAW;AAAA,MACX,gBAAgB,6BAA6B,IAAI,IAAI,IAAI,OAAO;AAAA,IAClE,CAAC;AACD,UAAM,YAAY,IAAI,IAAI,MAAM;AAChC,UAAM,WAAW,MAAM,oBAAoB,IAAI,IAAI,IAAI,OAAO;AAC9D,QAAI,CAAC,KAAK,YAAY,OAAO,KAAK,kBAAkB,UAAU;AAC5D,YAAM,aAAa,wBAAwB,kBAAkB,MAAM,CAAC;AACpE,YAAM,OAAO,KAAK,gBAAgB;AAClC,UAAI,OAAO,EAAG,OAAM,SAAS,IAAI,SAAS,MAAM,cAAc,qBAAqB;AAAA,eAC1E,OAAO,EAAG,OAAM,QAAQ,IAAI,SAAS,CAAC,MAAM,OAAO,KAAK,SAAS,IAAI,KAAK;AAAA,IACrF;AACA,QAAI,IAAI,cAAc;AACpB,YAAM,eAAe,IAAI,cAAc,EAAE,QAAQ,IAAI,IAAI,QAAQ,QAAQ,QAAQ,SAAS,CAAC;AAAA,IAC7F;AAAA,EACF,SAAS,KAAK;AACZ,UAAM,UAAU,uBAAuB,GAAG;AAC1C,UAAM,QAAQ,IAAI,IAAI,wBAAwB,OAAO,CAAC;AACtD,UAAM,WAAW,MAAM,oBAAoB,IAAI,IAAI,IAAI,OAAO;AAC9D,QAAI;AACF,YAAM,OAAO,OAAO,IAAI,YAAY,WAAW,KAAK,MAAM,IAAI,OAAO,IAAkC,IAAI;AAC3G,UAAI,OAAO,KAAK,kBAAkB,YAAY,KAAK,gBAAgB,GAAG;AACpE,cAAM,SAAS,IAAI,SAAS,KAAK,eAAe,UAAU,aAAa;AAAA,MACzE;AAAA,IACF,QAAQ;AAAA,IAAC;AACT,QAAI,IAAI,cAAc;AACpB,YAAM,eAAe,IAAI,cAAc,EAAE,QAAQ,IAAI,IAAI,QAAQ,UAAU,GAAG,uBAAuB,OAAO,GAAG,SAAS,CAAC;AAAA,IAC3H;AAAA,EACF,UAAE;AACA;AAAA,EACF;AACF;AAEA,eAAsB,WAAgC;AACpD,QAAM,MAAM,MAAM,gBAAgB;AAClC,MAAI,CAAC,IAAK,QAAO,EAAE,SAAS,MAAM;AAClC,QAAM,YAAY,KAAK,IAAI;AAC3B,QAAM,WAAW,GAA8B;AAC/C,SAAO,EAAE,SAAS,MAAM,OAAQ,IAAuB,IAAI,WAAW,MAAM,YAAY,KAAK,IAAI,IAAI,UAAU;AACjH;AAEA,eAAsB,WAAW,QAA4C;AAC3E,QAAM,UAAwB,CAAC;AAC/B,WAAS,IAAI,GAAG,IAAI,OAAO,SAAS,KAAK;AACvC,QAAI,KAAK,IAAI,KAAK,OAAO,WAAY;AACrC,UAAM,IAAI,MAAM,SAAS;AACzB,YAAQ,KAAK,CAAC;AACd,QAAI,CAAC,EAAE,QAAS;AAAA,EAClB;AACA,SAAO;AACT;AAEO,SAAS,cAAoB;AAClC,cAAY,YAAY;AACtB,QAAI,WAAW,eAAgB;AAC/B,UAAM,MAAM,MAAM,gBAAgB;AAClC,QAAI,IAAK,MAAK,WAAW,GAA8B;AAAA,EACzD,GAAG,GAAI;AACP,UAAQ,IAAI,iDAA4C,cAAc,aAAa;AACrF;","names":[]}
|