mcp-scraper 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/README.md +13 -2
  2. package/dist/bin/api-server.cjs +572 -171
  3. package/dist/bin/api-server.cjs.map +1 -1
  4. package/dist/bin/api-server.js +2 -2
  5. package/dist/bin/mcp-stdio-server.cjs +299 -149
  6. package/dist/bin/mcp-stdio-server.cjs.map +1 -1
  7. package/dist/bin/mcp-stdio-server.js +2 -1
  8. package/dist/bin/mcp-stdio-server.js.map +1 -1
  9. package/dist/bin/paa-harvest.cjs +22 -1
  10. package/dist/bin/paa-harvest.cjs.map +1 -1
  11. package/dist/bin/paa-harvest.js +2 -1
  12. package/dist/bin/paa-harvest.js.map +1 -1
  13. package/dist/{chunk-6TWZS2FQ.js → chunk-3OIRNUF5.js} +302 -150
  14. package/dist/chunk-3OIRNUF5.js.map +1 -0
  15. package/dist/{chunk-W4P2U5VF.js → chunk-LUBDFS67.js} +32 -32
  16. package/dist/chunk-LUBDFS67.js.map +1 -0
  17. package/dist/{chunk-7HB7NDOY.js → chunk-ZK456YXN.js} +12 -2
  18. package/dist/chunk-ZK456YXN.js.map +1 -0
  19. package/dist/chunk-ZMOWIBMK.js +36 -0
  20. package/dist/chunk-ZMOWIBMK.js.map +1 -0
  21. package/dist/index.cjs +22 -1
  22. package/dist/index.cjs.map +1 -1
  23. package/dist/index.js +2 -1
  24. package/dist/index.js.map +1 -1
  25. package/dist/{server-2Y27U4TO.js → server-YNJHP5PU.js} +235 -22
  26. package/dist/server-YNJHP5PU.js.map +1 -0
  27. package/dist/{worker-UT4ZQU2T.js → worker-PBG6LGET.js} +4 -3
  28. package/dist/{worker-UT4ZQU2T.js.map → worker-PBG6LGET.js.map} +1 -1
  29. package/docs/adr/0001-in-page-graphql-interception-for-anti-bot-scraping.md +58 -0
  30. package/docs/adr/README.md +11 -0
  31. package/docs/mcp-tool-quality-spec.md +238 -0
  32. package/package.json +5 -4
  33. package/dist/chunk-6TWZS2FQ.js.map +0 -1
  34. package/dist/chunk-7HB7NDOY.js.map +0 -1
  35. package/dist/chunk-W4P2U5VF.js.map +0 -1
  36. package/dist/server-2Y27U4TO.js.map +0 -1
@@ -4,10 +4,11 @@ import {
4
4
  createHarvestAttemptRecorder,
5
5
  harvestProblemResponse,
6
6
  serializeHarvestProblem
7
- } from "./chunk-7HB7NDOY.js";
7
+ } from "./chunk-ZK456YXN.js";
8
8
  import {
9
9
  harvest
10
- } from "./chunk-W4P2U5VF.js";
10
+ } from "./chunk-LUBDFS67.js";
11
+ import "./chunk-ZMOWIBMK.js";
11
12
  import {
12
13
  claimPendingJob,
13
14
  completeJob,
@@ -120,4 +121,4 @@ export {
120
121
  startWorker,
121
122
  tickOnce
122
123
  };
123
- //# sourceMappingURL=worker-UT4ZQU2T.js.map
124
+ //# sourceMappingURL=worker-PBG6LGET.js.map
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/api/webhook.ts","../src/api/worker.ts"],"sourcesContent":["export async function deliverWebhook(url: string, payload: object, retries = 3): Promise<void> {\n for (let attempt = 1; attempt <= retries; attempt++) {\n try {\n const res = await fetch(url, {\n method: 'POST',\n headers: { 'content-type': 'application/json' },\n body: JSON.stringify(payload),\n signal: AbortSignal.timeout(10_000),\n })\n if (res.ok) return\n console.warn(`[webhook] attempt ${attempt} → ${res.status} from ${url}`)\n } catch (err) {\n console.warn(`[webhook] attempt ${attempt} failed:`, err instanceof Error ? err.message : err)\n }\n if (attempt < retries) await new Promise((r) => setTimeout(r, 1000 * attempt * 2))\n }\n console.error(`[webhook] gave up after ${retries} attempts for ${url}`)\n}\n","import { claimPendingJob, completeJob, failJob, creditMc, debitMc, listHarvestAttempts } from './db.js'\nimport { harvest } from '../harvest.js'\nimport { deliverWebhook } from './webhook.js'\nimport type { HarvestOptions } from '../types.js'\nimport { MC_COSTS } from './rates.js'\nimport { classifyHarvestProblem, harvestProblemResponse, serializeHarvestProblem } from './harvest-problems.js'\nimport { createHarvestAttemptRecorder } from './harvest-attempt-events.js'\n\nexport type TickResult = {\n claimed: boolean\n jobId?: string\n completed?: boolean\n durationMs?: number\n}\nexport type DrainBudget = {\n maxJobs: number\n deadlineMs: number\n}\n\nconst MAX_CONCURRENT = 2\nlet running = 0\n\nfunction countPaaQuestions(result: unknown): number {\n if (!result || typeof result !== 'object') return 0\n const value = result as { totalQuestions?: unknown; flat?: unknown }\n if (typeof value.totalQuestions === 'number') return value.totalQuestions\n return Array.isArray(value.flat) ? 
value.flat.length : 0\n}\n\nfunction paaCostForQuestionCount(questionCount: number): number {\n return Math.max(1, questionCount) * MC_COSTS.paa\n}\n\nasync function processJob(job: Awaited<ReturnType<typeof claimPendingJob>> & object) {\n running++\n try {\n const opts = typeof job.options === 'string' ? JSON.parse(job.options) as Partial<HarvestOptions> & { billingHoldMc?: number } : job.options as Partial<HarvestOptions> & { billingHoldMc?: number }\n const result = await harvest({\n ...opts,\n kernelApiKey: process.env.KERNEL_API_KEY,\n headless: true,\n format: 'json',\n outputDir: '/tmp/paa-output-api',\n onAttemptEvent: createHarvestAttemptRecorder(job.id, job.user_id),\n })\n await completeJob(job.id, result)\n const attempts = await listHarvestAttempts(job.id, job.user_id)\n if (!opts.serpOnly && typeof opts.billingHoldMc === 'number') {\n const actualCost = paaCostForQuestionCount(countPaaQuestions(result))\n const diff = opts.billingHoldMc - actualCost\n if (diff > 0) await creditMc(job.user_id, diff, 'paa_refund', 'overestimate refund')\n else if (diff < 0) await debitMc(job.user_id, -diff, 'paa', opts.query ?? job.query)\n }\n if (job.callback_url) {\n await deliverWebhook(job.callback_url, { job_id: job.id, status: 'done', result, attempts })\n }\n } catch (err) {\n const problem = classifyHarvestProblem(err)\n await failJob(job.id, serializeHarvestProblem(problem))\n const attempts = await listHarvestAttempts(job.id, job.user_id)\n try {\n const opts = typeof job.options === 'string' ? 
JSON.parse(job.options) as { billingHoldMc?: number } : job.options as { billingHoldMc?: number }\n if (typeof opts.billingHoldMc === 'number' && opts.billingHoldMc > 0) {\n await creditMc(job.user_id, opts.billingHoldMc, 'refund', 'failed call')\n }\n } catch {}\n if (job.callback_url) {\n await deliverWebhook(job.callback_url, { job_id: job.id, status: 'failed', ...harvestProblemResponse(problem), attempts })\n }\n } finally {\n running--\n }\n}\n\nexport async function tickOnce(): Promise<TickResult> {\n const job = await claimPendingJob()\n if (!job) return { claimed: false }\n const startedAt = Date.now()\n await processJob(job as NonNullable<typeof job>)\n return { claimed: true, jobId: (job as { id: string }).id, completed: true, durationMs: Date.now() - startedAt }\n}\n\nexport async function drainQueue(budget: DrainBudget): Promise<TickResult[]> {\n const results: TickResult[] = []\n for (let i = 0; i < budget.maxJobs; i++) {\n if (Date.now() >= budget.deadlineMs) break\n const r = await tickOnce()\n results.push(r)\n if (!r.claimed) break\n }\n return results\n}\n\nexport function startWorker(): void {\n setInterval(async () => {\n if (running >= MAX_CONCURRENT) return\n const job = await claimPendingJob()\n if (job) void processJob(job as NonNullable<typeof job>)\n }, 2000)\n console.log(`[worker] started — polling every 2s, max ${MAX_CONCURRENT} 
concurrent`)\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA,eAAsB,eAAe,KAAa,SAAiB,UAAU,GAAkB;AAC7F,WAAS,UAAU,GAAG,WAAW,SAAS,WAAW;AACnD,QAAI;AACF,YAAM,MAAM,MAAM,MAAM,KAAK;AAAA,QAC3B,QAAQ;AAAA,QACR,SAAS,EAAE,gBAAgB,mBAAmB;AAAA,QAC9C,MAAM,KAAK,UAAU,OAAO;AAAA,QAC5B,QAAQ,YAAY,QAAQ,GAAM;AAAA,MACpC,CAAC;AACD,UAAI,IAAI,GAAI;AACZ,cAAQ,KAAK,qBAAqB,OAAO,WAAM,IAAI,MAAM,SAAS,GAAG,EAAE;AAAA,IACzE,SAAS,KAAK;AACZ,cAAQ,KAAK,qBAAqB,OAAO,YAAY,eAAe,QAAQ,IAAI,UAAU,GAAG;AAAA,IAC/F;AACA,QAAI,UAAU,QAAS,OAAM,IAAI,QAAQ,CAAC,MAAM,WAAW,GAAG,MAAO,UAAU,CAAC,CAAC;AAAA,EACnF;AACA,UAAQ,MAAM,2BAA2B,OAAO,iBAAiB,GAAG,EAAE;AACxE;;;ACEA,IAAM,iBAAiB;AACvB,IAAI,UAAU;AAEd,SAAS,kBAAkB,QAAyB;AAClD,MAAI,CAAC,UAAU,OAAO,WAAW,SAAU,QAAO;AAClD,QAAM,QAAQ;AACd,MAAI,OAAO,MAAM,mBAAmB,SAAU,QAAO,MAAM;AAC3D,SAAO,MAAM,QAAQ,MAAM,IAAI,IAAI,MAAM,KAAK,SAAS;AACzD;AAEA,SAAS,wBAAwB,eAA+B;AAC9D,SAAO,KAAK,IAAI,GAAG,aAAa,IAAI,SAAS;AAC/C;AAEA,eAAe,WAAW,KAA2D;AACnF;AACA,MAAI;AACF,UAAM,OAAO,OAAO,IAAI,YAAY,WAAW,KAAK,MAAM,IAAI,OAAO,IAA4D,IAAI;AACrI,UAAM,SAAS,MAAM,QAAQ;AAAA,MAC3B,GAAG;AAAA,MACH,cAAc,QAAQ,IAAI;AAAA,MAC1B,UAAU;AAAA,MACV,QAAQ;AAAA,MACR,WAAW;AAAA,MACX,gBAAgB,6BAA6B,IAAI,IAAI,IAAI,OAAO;AAAA,IAClE,CAAC;AACD,UAAM,YAAY,IAAI,IAAI,MAAM;AAChC,UAAM,WAAW,MAAM,oBAAoB,IAAI,IAAI,IAAI,OAAO;AAC9D,QAAI,CAAC,KAAK,YAAY,OAAO,KAAK,kBAAkB,UAAU;AAC5D,YAAM,aAAa,wBAAwB,kBAAkB,MAAM,CAAC;AACpE,YAAM,OAAO,KAAK,gBAAgB;AAClC,UAAI,OAAO,EAAG,OAAM,SAAS,IAAI,SAAS,MAAM,cAAc,qBAAqB;AAAA,eAC1E,OAAO,EAAG,OAAM,QAAQ,IAAI,SAAS,CAAC,MAAM,OAAO,KAAK,SAAS,IAAI,KAAK;AAAA,IACrF;AACA,QAAI,IAAI,cAAc;AACpB,YAAM,eAAe,IAAI,cAAc,EAAE,QAAQ,IAAI,IAAI,QAAQ,QAAQ,QAAQ,SAAS,CAAC;AAAA,IAC7F;AAAA,EACF,SAAS,KAAK;AACZ,UAAM,UAAU,uBAAuB,GAAG;AAC1C,UAAM,QAAQ,IAAI,IAAI,wBAAwB,OAAO,CAAC;AACtD,UAAM,WAAW,MAAM,oBAAoB,IAAI,IAAI,IAAI,OAAO;AAC9D,QAAI;AACF,YAAM,OAAO,OAAO,IAAI,YAAY,WAAW,KAAK,MAAM,IAAI,OAAO,IAAkC,IAAI;AAC3G,UAAI,OAAO,KAAK,kBAAkB,YAAY,KAAK,gBAAgB,GAAG;AACpE,cAAM,SAAS,IAAI,SAAS,KAAK,eAAe,UAAU,aAAa;AAAA,MACzE;AAAA,IACF,QAAQ;AAAA,IAAC;AACT,QAAI,IAAI,cAAc;AACpB,YAAM,eAAe,IAAI,cAAc,EAAE,QAAQ
,IAAI,IAAI,QAAQ,UAAU,GAAG,uBAAuB,OAAO,GAAG,SAAS,CAAC;AAAA,IAC3H;AAAA,EACF,UAAE;AACA;AAAA,EACF;AACF;AAEA,eAAsB,WAAgC;AACpD,QAAM,MAAM,MAAM,gBAAgB;AAClC,MAAI,CAAC,IAAK,QAAO,EAAE,SAAS,MAAM;AAClC,QAAM,YAAY,KAAK,IAAI;AAC3B,QAAM,WAAW,GAA8B;AAC/C,SAAO,EAAE,SAAS,MAAM,OAAQ,IAAuB,IAAI,WAAW,MAAM,YAAY,KAAK,IAAI,IAAI,UAAU;AACjH;AAEA,eAAsB,WAAW,QAA4C;AAC3E,QAAM,UAAwB,CAAC;AAC/B,WAAS,IAAI,GAAG,IAAI,OAAO,SAAS,KAAK;AACvC,QAAI,KAAK,IAAI,KAAK,OAAO,WAAY;AACrC,UAAM,IAAI,MAAM,SAAS;AACzB,YAAQ,KAAK,CAAC;AACd,QAAI,CAAC,EAAE,QAAS;AAAA,EAClB;AACA,SAAO;AACT;AAEO,SAAS,cAAoB;AAClC,cAAY,YAAY;AACtB,QAAI,WAAW,eAAgB;AAC/B,UAAM,MAAM,MAAM,gBAAgB;AAClC,QAAI,IAAK,MAAK,WAAW,GAA8B;AAAA,EACzD,GAAG,GAAI;AACP,UAAQ,IAAI,iDAA4C,cAAc,aAAa;AACrF;","names":[]}
1
+ {"version":3,"sources":["../src/api/webhook.ts","../src/api/worker.ts"],"sourcesContent":["export async function deliverWebhook(url: string, payload: object, retries = 3): Promise<void> {\n for (let attempt = 1; attempt <= retries; attempt++) {\n try {\n const res = await fetch(url, {\n method: 'POST',\n headers: { 'content-type': 'application/json' },\n body: JSON.stringify(payload),\n signal: AbortSignal.timeout(10_000),\n })\n if (res.ok) return\n console.warn(`[webhook] attempt ${attempt} → ${res.status} from ${url}`)\n } catch (err) {\n console.warn(`[webhook] attempt ${attempt} failed:`, err instanceof Error ? err.message : err)\n }\n if (attempt < retries) await new Promise((r) => setTimeout(r, 1000 * attempt * 2))\n }\n console.error(`[webhook] gave up after ${retries} attempts for ${url}`)\n}\n","import { claimPendingJob, completeJob, failJob, creditMc, debitMc, listHarvestAttempts } from './db.js'\nimport { harvest } from '../harvest.js'\nimport { deliverWebhook } from './webhook.js'\nimport type { HarvestOptions } from '../types.js'\nimport { MC_COSTS } from './rates.js'\nimport { classifyHarvestProblem, harvestProblemResponse, serializeHarvestProblem } from './harvest-problems.js'\nimport { createHarvestAttemptRecorder } from './harvest-attempt-events.js'\n\nexport type TickResult = {\n claimed: boolean\n jobId?: string\n completed?: boolean\n durationMs?: number\n}\nexport type DrainBudget = {\n maxJobs: number\n deadlineMs: number\n}\n\nconst MAX_CONCURRENT = 2\nlet running = 0\n\nfunction countPaaQuestions(result: unknown): number {\n if (!result || typeof result !== 'object') return 0\n const value = result as { totalQuestions?: unknown; flat?: unknown }\n if (typeof value.totalQuestions === 'number') return value.totalQuestions\n return Array.isArray(value.flat) ? 
value.flat.length : 0\n}\n\nfunction paaCostForQuestionCount(questionCount: number): number {\n return Math.max(1, questionCount) * MC_COSTS.paa\n}\n\nasync function processJob(job: Awaited<ReturnType<typeof claimPendingJob>> & object) {\n running++\n try {\n const opts = typeof job.options === 'string' ? JSON.parse(job.options) as Partial<HarvestOptions> & { billingHoldMc?: number } : job.options as Partial<HarvestOptions> & { billingHoldMc?: number }\n const result = await harvest({\n ...opts,\n kernelApiKey: process.env.KERNEL_API_KEY,\n headless: true,\n format: 'json',\n outputDir: '/tmp/paa-output-api',\n onAttemptEvent: createHarvestAttemptRecorder(job.id, job.user_id),\n })\n await completeJob(job.id, result)\n const attempts = await listHarvestAttempts(job.id, job.user_id)\n if (!opts.serpOnly && typeof opts.billingHoldMc === 'number') {\n const actualCost = paaCostForQuestionCount(countPaaQuestions(result))\n const diff = opts.billingHoldMc - actualCost\n if (diff > 0) await creditMc(job.user_id, diff, 'paa_refund', 'overestimate refund')\n else if (diff < 0) await debitMc(job.user_id, -diff, 'paa', opts.query ?? job.query)\n }\n if (job.callback_url) {\n await deliverWebhook(job.callback_url, { job_id: job.id, status: 'done', result, attempts })\n }\n } catch (err) {\n const problem = classifyHarvestProblem(err)\n await failJob(job.id, serializeHarvestProblem(problem))\n const attempts = await listHarvestAttempts(job.id, job.user_id)\n try {\n const opts = typeof job.options === 'string' ? 
JSON.parse(job.options) as { billingHoldMc?: number } : job.options as { billingHoldMc?: number }\n if (typeof opts.billingHoldMc === 'number' && opts.billingHoldMc > 0) {\n await creditMc(job.user_id, opts.billingHoldMc, 'refund', 'failed call')\n }\n } catch {}\n if (job.callback_url) {\n await deliverWebhook(job.callback_url, { job_id: job.id, status: 'failed', ...harvestProblemResponse(problem), attempts })\n }\n } finally {\n running--\n }\n}\n\nexport async function tickOnce(): Promise<TickResult> {\n const job = await claimPendingJob()\n if (!job) return { claimed: false }\n const startedAt = Date.now()\n await processJob(job as NonNullable<typeof job>)\n return { claimed: true, jobId: (job as { id: string }).id, completed: true, durationMs: Date.now() - startedAt }\n}\n\nexport async function drainQueue(budget: DrainBudget): Promise<TickResult[]> {\n const results: TickResult[] = []\n for (let i = 0; i < budget.maxJobs; i++) {\n if (Date.now() >= budget.deadlineMs) break\n const r = await tickOnce()\n results.push(r)\n if (!r.claimed) break\n }\n return results\n}\n\nexport function startWorker(): void {\n setInterval(async () => {\n if (running >= MAX_CONCURRENT) return\n const job = await claimPendingJob()\n if (job) void processJob(job as NonNullable<typeof job>)\n }, 2000)\n console.log(`[worker] started — polling every 2s, max ${MAX_CONCURRENT} 
concurrent`)\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;AAAA,eAAsB,eAAe,KAAa,SAAiB,UAAU,GAAkB;AAC7F,WAAS,UAAU,GAAG,WAAW,SAAS,WAAW;AACnD,QAAI;AACF,YAAM,MAAM,MAAM,MAAM,KAAK;AAAA,QAC3B,QAAQ;AAAA,QACR,SAAS,EAAE,gBAAgB,mBAAmB;AAAA,QAC9C,MAAM,KAAK,UAAU,OAAO;AAAA,QAC5B,QAAQ,YAAY,QAAQ,GAAM;AAAA,MACpC,CAAC;AACD,UAAI,IAAI,GAAI;AACZ,cAAQ,KAAK,qBAAqB,OAAO,WAAM,IAAI,MAAM,SAAS,GAAG,EAAE;AAAA,IACzE,SAAS,KAAK;AACZ,cAAQ,KAAK,qBAAqB,OAAO,YAAY,eAAe,QAAQ,IAAI,UAAU,GAAG;AAAA,IAC/F;AACA,QAAI,UAAU,QAAS,OAAM,IAAI,QAAQ,CAAC,MAAM,WAAW,GAAG,MAAO,UAAU,CAAC,CAAC;AAAA,EACnF;AACA,UAAQ,MAAM,2BAA2B,OAAO,iBAAiB,GAAG,EAAE;AACxE;;;ACEA,IAAM,iBAAiB;AACvB,IAAI,UAAU;AAEd,SAAS,kBAAkB,QAAyB;AAClD,MAAI,CAAC,UAAU,OAAO,WAAW,SAAU,QAAO;AAClD,QAAM,QAAQ;AACd,MAAI,OAAO,MAAM,mBAAmB,SAAU,QAAO,MAAM;AAC3D,SAAO,MAAM,QAAQ,MAAM,IAAI,IAAI,MAAM,KAAK,SAAS;AACzD;AAEA,SAAS,wBAAwB,eAA+B;AAC9D,SAAO,KAAK,IAAI,GAAG,aAAa,IAAI,SAAS;AAC/C;AAEA,eAAe,WAAW,KAA2D;AACnF;AACA,MAAI;AACF,UAAM,OAAO,OAAO,IAAI,YAAY,WAAW,KAAK,MAAM,IAAI,OAAO,IAA4D,IAAI;AACrI,UAAM,SAAS,MAAM,QAAQ;AAAA,MAC3B,GAAG;AAAA,MACH,cAAc,QAAQ,IAAI;AAAA,MAC1B,UAAU;AAAA,MACV,QAAQ;AAAA,MACR,WAAW;AAAA,MACX,gBAAgB,6BAA6B,IAAI,IAAI,IAAI,OAAO;AAAA,IAClE,CAAC;AACD,UAAM,YAAY,IAAI,IAAI,MAAM;AAChC,UAAM,WAAW,MAAM,oBAAoB,IAAI,IAAI,IAAI,OAAO;AAC9D,QAAI,CAAC,KAAK,YAAY,OAAO,KAAK,kBAAkB,UAAU;AAC5D,YAAM,aAAa,wBAAwB,kBAAkB,MAAM,CAAC;AACpE,YAAM,OAAO,KAAK,gBAAgB;AAClC,UAAI,OAAO,EAAG,OAAM,SAAS,IAAI,SAAS,MAAM,cAAc,qBAAqB;AAAA,eAC1E,OAAO,EAAG,OAAM,QAAQ,IAAI,SAAS,CAAC,MAAM,OAAO,KAAK,SAAS,IAAI,KAAK;AAAA,IACrF;AACA,QAAI,IAAI,cAAc;AACpB,YAAM,eAAe,IAAI,cAAc,EAAE,QAAQ,IAAI,IAAI,QAAQ,QAAQ,QAAQ,SAAS,CAAC;AAAA,IAC7F;AAAA,EACF,SAAS,KAAK;AACZ,UAAM,UAAU,uBAAuB,GAAG;AAC1C,UAAM,QAAQ,IAAI,IAAI,wBAAwB,OAAO,CAAC;AACtD,UAAM,WAAW,MAAM,oBAAoB,IAAI,IAAI,IAAI,OAAO;AAC9D,QAAI;AACF,YAAM,OAAO,OAAO,IAAI,YAAY,WAAW,KAAK,MAAM,IAAI,OAAO,IAAkC,IAAI;AAC3G,UAAI,OAAO,KAAK,kBAAkB,YAAY,KAAK,gBAAgB,GAAG;AACpE,cAAM,SAAS,IAAI,SAAS,KAAK,eAAe,UAAU,aAAa;AAAA,MACzE;AAAA,IACF,QAAQ;AAAA,IAAC;AACT,QAAI,IAAI,cAAc;AACpB,YAAM,eAAe,IAAI,cAAc,EAAE,QAA
Q,IAAI,IAAI,QAAQ,UAAU,GAAG,uBAAuB,OAAO,GAAG,SAAS,CAAC;AAAA,IAC3H;AAAA,EACF,UAAE;AACA;AAAA,EACF;AACF;AAEA,eAAsB,WAAgC;AACpD,QAAM,MAAM,MAAM,gBAAgB;AAClC,MAAI,CAAC,IAAK,QAAO,EAAE,SAAS,MAAM;AAClC,QAAM,YAAY,KAAK,IAAI;AAC3B,QAAM,WAAW,GAA8B;AAC/C,SAAO,EAAE,SAAS,MAAM,OAAQ,IAAuB,IAAI,WAAW,MAAM,YAAY,KAAK,IAAI,IAAI,UAAU;AACjH;AAEA,eAAsB,WAAW,QAA4C;AAC3E,QAAM,UAAwB,CAAC;AAC/B,WAAS,IAAI,GAAG,IAAI,OAAO,SAAS,KAAK;AACvC,QAAI,KAAK,IAAI,KAAK,OAAO,WAAY;AACrC,UAAM,IAAI,MAAM,SAAS;AACzB,YAAQ,KAAK,CAAC;AACd,QAAI,CAAC,EAAE,QAAS;AAAA,EAClB;AACA,SAAO;AACT;AAEO,SAAS,cAAoB;AAClC,cAAY,YAAY;AACtB,QAAI,WAAW,eAAgB;AAC/B,UAAM,MAAM,MAAM,gBAAgB;AAClC,QAAI,IAAK,MAAK,WAAW,GAA8B;AAAA,EACzD,GAAG,GAAI;AACP,UAAQ,IAAI,iDAA4C,cAAc,aAAa;AACrF;","names":[]}
@@ -0,0 +1,58 @@
1
+ # ADR 0001: In-page API interception for anti-bot scraping (Facebook Ad Library)
2
+
3
+ - **Status:** Accepted
4
+ - **Date:** 2026-06-03
5
+ - **Deciders:** Andrew (operator), engineering
6
+ - **Applies to:** `facebook_ad_search`, `facebook_page_intel`, and future scrapers of JS-heavy, anti-bot-protected sites
7
+
8
+ ## Context
9
+
10
+ `facebook_ad_search` was unreliable — it usually returned `soft-block: no results (refunded)`, and when it did return results, advertiser names came back `undefined` with `—` for library IDs. There were two independent root causes for the unreliability, plus a smaller formatting bug described after them:
11
+
12
+ 1. **DOM scraping a hostile SPA.** The Facebook Ad Library is a React app. The ads are *not* in the initial HTML — the page fires its own background GraphQL request and renders the JSON into DOM cards asynchronously. Our scraper waited for those cards to render, then re-parsed the rendered text with regexes (`See ad details … Sponsored`). That is fragile (breaks on any UI change) and slow, and it conflated "page still loading" with "no results."
13
+
14
+ 2. **A single static proxy.** Every Facebook scrape launched with the static `KERNEL_PROXY_ID`. Facebook flags/rate-limits a repeatedly-used IP and serves a near-empty / login-gated page to it. A real residential browser loaded the same search fine — confirming the block was IP-reputation, not the URL or a login requirement. `facebook_page_intel` worked intermittently for the same reason (same proxy, sometimes flagged).
15
+
16
+ The "undefined names / — library IDs" symptom was a third, smaller bug: a field-name mismatch between the API response (`pageName`, `sampleLibraryId`) and the MCP formatter (`name`, `libraryId`).
17
+
18
+ ## Decision
19
+
20
+ **Stop scraping the rendered page. Intercept the JSON the page already fetches, on a fresh residential proxy, with the DOM scrape kept only as a fallback.**
21
+
22
+ Concretely, for the Ad Library:
23
+
24
+ 1. **Intercept the in-page GraphQL response.** The SPA issues `AdLibrarySearchPaginationQuery` (a `doc_id`-based POST to `/api/graphql/`) and Facebook returns the ads as JSON. We attach a Playwright `page.on('response')` listener before navigation and parse the JSON directly (`data.ad_library_main.search_results_connection.edges[].node.collated_results[]`). No request is forged — the page makes the request itself, so it carries Facebook's own session/cookies and looks like a legitimate first-party call. Implemented in `src/extractor/FacebookAdGraphql.ts`.
25
+ 2. **Fresh residential proxy per request.** Both FB routes now launch via `kernelLaunchOptsResidential()`, which uses `resolveKernelProxyId` (the same residential rotation that fixed the PAA CAPTCHAs) instead of the flagged static `KERNEL_PROXY_ID`.
26
+ 3. **DOM scrape as fallback.** If no GraphQL response is captured (e.g., the query shape drifts), the existing regex DOM parse still runs on the already-loaded page.
27
+ 4. **Field aliasing for client compatibility.** The API now returns both `pageName`/`name` and `sampleLibraryId`/`libraryId`, so even already-installed MCP clients render correctly without a package update; the formatter was also fixed to read the canonical fields.
28
+
29
+ This is the **same pattern already proven by the YouTube transcription InnerTube tier** (`CaptionFetcher.ts` does `page.evaluate(fetch('/youtubei/v1/player'…))` from inside the loaded page). This ADR names it as the preferred default for this class of target.
30
+
31
+ ### The reusable pattern
32
+
33
+ > For a JS-heavy, anti-bot-protected site whose data arrives via an internal API call: load the page on a clean (residential) IP, capture or replay that internal request **from within the page's own session**, parse the JSON, and keep DOM scraping only as a fallback.
34
+
35
+ Prefer **intercepting the response** (`page.on('response')`) over **replaying the request** when the page fires the call itself — it needs zero token/`doc_id`/`fb_dtsg` reconstruction and is the most robust form.
36
+
37
+ ## Consequences
38
+
39
+ **Positive**
40
+ - Parses structured JSON, not DOM → immune to CSS/markup changes (the usual scraper breakage).
41
+ - Faster — no waiting for the grid to render/scroll before reading.
42
+ - Unambiguous — an explicit ad list or an error, killing the "0 results = blocked?" guesswork.
43
+ - Lower block rate — an in-session first-party request plus a clean residential IP looks legitimate.
44
+ - Verified live: `Nike` returned 10 distinct, correctly-named advertisers with library IDs (Nike, Jordan, Nordstrom Rack, eBay, Whatnot…), end-to-end through the MCP, no soft-block.
45
+
46
+ **Negative / risks**
47
+ - The GraphQL `doc_id` and response shape are Facebook-internal and **drift every few months**. Mitigation: the DOM fallback, plus the capture probe technique below to re-derive the shape in minutes.
48
+ - Still not 100% reliable — the Ad Library is adversarial; residential IPs can occasionally be challenged.
49
+ - The captured `doc_id` (`24922295957467452` as of 2026-06-03) is intentionally *not* hardcoded — we match on the `fb_api_req_friendly_name` (`AdLibrarySearchPaginationQuery`) so a `doc_id` rotation alone doesn't break us.
50
+
51
+ **Maintenance — re-capturing the request shape when it drifts**
52
+ Run a Kernel session on a residential proxy, attach `page.on('request')` and `page.on('response')` listeners, navigate to the Ad Library search URL, and log each GraphQL call's `fb_api_req_friendly_name`, `doc_id`, `variables`, and response keys. (A throwaway `fb-capture.ts` probe was used during this work; recreate it from this description.)
53
+
54
+ ## Alternatives considered
55
+
56
+ - **Official Graph Ad Library API (`/ads_archive`).** Rejected: for the US it only returns political/issue ads; commercial ads (the use case) aren't exposed outside the EU.
57
+ - **Replay the GraphQL request manually** (rebuild `lsd`/`jazoest`/`__dyn`/`doc_id`/variables). Rejected as the default: more brittle than intercepting the page's own response, though viable for pagination beyond what scrolling triggers.
58
+ - **Just rotate proxies, keep DOM scraping.** Rejected: fixes the block but leaves the fragile, slow DOM parse and the "0 results" ambiguity.
@@ -0,0 +1,11 @@
1
+ # Architecture Decision Records
2
+
3
+ Short, durable records of *why* a non-obvious technical decision was made — the context, the choice, and the consequences — so future readers (and future us) don't re-litigate or accidentally undo it.
4
+
5
+ Write one when a decision is hard to reverse, surprising, or encodes a constraint that isn't visible in the code (an anti-bot workaround, a vendor limitation, a deliberate fallback). Don't write one for routine changes.
6
+
7
+ ## Format
8
+ `NNNN-kebab-title.md`, Nygard-style: **Status · Context · Decision · Consequences**. Status is `Proposed` → `Accepted` → (later) `Superseded by ADR-XXXX`. Number sequentially; never renumber.
9
+
10
+ ## Index
11
+ - [0001 — In-page API interception for anti-bot scraping (Facebook Ad Library)](./0001-in-page-graphql-interception-for-anti-bot-scraping.md)
@@ -0,0 +1,238 @@
1
+ # MCP Tool Quality Spec
2
+
3
+ This spec defines the shipping bar for MCP Scraper tools. It exists because MCP behavior is model-facing: a tool can be technically callable and still fail if the AI cannot infer when to use it, how to fill inputs, or how to chain the result.
4
+
5
+ ## What Actually Steers The AI
6
+
7
+ The model is primarily affected by what the MCP client receives from `tools/list` and `tools/call`.
8
+
9
+ 1. Tool name
10
+ 2. Tool title and annotations
11
+ 3. Tool description
12
+ 4. Input schema field names, descriptions, defaults, enums, and limits
13
+ 5. Output schema and `structuredContent`
14
+ 6. Tool result text and next-step guidance
15
+ 7. Error shape and retry guidance
16
+
17
+ README files, website copy, and public skill markdown help humans and skill loaders, but they do not reliably affect runtime AI behavior unless the client explicitly injects those files into context.
18
+
19
+ ## Tool Boundary Rules
20
+
21
+ Each tool must have one primary job.
22
+
23
+ - Split tools when the user intent, cost model, result shape, or follow-up workflow differs.
24
+ - Do not overload one tool with unrelated modes if a model could choose the wrong path.
25
+ - Descriptions for adjacent tools must explicitly say when not to use each tool.
26
+ - If a workflow has a natural sequence, encode it in the description and result guidance.
27
+
28
+ Example:
29
+
30
+ - `maps_search`: find multiple Google Maps candidates for a category, market, lead list, or "more than the 3-pack".
31
+ - `maps_place_intel`: hydrate one known business or one selected candidate with full profile details and optional reviews.
32
+
33
+ ## Tool Naming
34
+
35
+ Names must be short, stable, and action-oriented.
36
+
37
+ - Use domain + action: `maps_search`, `maps_place_intel`, `facebook_ad_search`.
38
+ - Avoid generic names such as `search`, `lookup`, `get_data`, or `run`.
39
+ - Avoid names that imply broader capability than the tool has.
40
+ - Do not rename a public tool without a compatibility plan.
41
+
42
+ ## Tool Descriptions
43
+
44
+ Descriptions are model instructions. They must be concise but operational.
45
+
46
+ Every description must include:
47
+
48
+ - What the tool does.
49
+ - When to use it.
50
+ - When not to use it if there is an adjacent tool.
51
+ - Important defaults and hard caps.
52
+ - Cost-sensitive behavior when relevant.
53
+ - The expected next tool when chaining is common.
54
+ - Whether reports are saved locally.
55
+
56
+ Bad:
57
+
58
+ ```text
59
+ Search Google Maps.
60
+ ```
61
+
62
+ Good:
63
+
64
+ ```text
65
+ Search Google Maps for multiple businesses/profiles by category, niche, keyword, or local market. Use this when the user asks for several Google Business Profiles, GMBs, GBPs, leads, prospects, competitors, or "more than the 3-pack." Returns up to 50 candidates. Default maxResults is 10; maximum is 50. Use maps_place_intel afterward only when a selected business needs full details and reviews.
66
+ ```
67
+
68
+ ## Input Schema
69
+
70
+ Each input field must have a clear description.
71
+
72
+ Required:
73
+
74
+ - Required fields must be actually required in the schema.
75
+ - Defaults must be encoded in the schema, not only in prose.
76
+ - Hard caps must be encoded in the schema.
77
+ - Fields that are often confused must say what not to put there.
78
+ - Location and query fields must tell the model to split location from the topic when possible.
79
+ - Enum fields must explain when to choose each option.
80
+
81
+ For numeric limits:
82
+
83
+ - Normal/default value belongs in `.default(...)`.
84
+ - Maximum value belongs in `.max(...)`.
85
+ - Description must say when to use higher values.
86
+
87
+ ## Output Schema And Structured Content
88
+
89
+ Any tool whose output may be consumed by another tool should have `outputSchema` and return `structuredContent`.
90
+
91
+ Required for chaining tools:
92
+
93
+ - Return arrays/objects in `structuredContent`; do not force the model to parse Markdown.
94
+ - Keep `content` as a human-readable report.
95
+ - Ensure `structuredContent` validates against `outputSchema`.
96
+ - Include IDs, URLs, names, positions, counts, and any fields needed for the next tool.
97
+
98
+ Example:
99
+
100
+ `maps_search` must return `structuredContent.results[]` with at least:
101
+
102
+ - `position`
103
+ - `name`
104
+ - `placeUrl`
105
+ - `cid`
106
+ - `cidDecimal`
107
+ - `rating`
108
+ - `reviewCount`
109
+ - `category`
110
+ - `address`
111
+ - `websiteUrl`
112
+ - `directionsUrl`
113
+ - `metadata`
114
+
115
+ ## Tool Annotations
116
+
117
+ Every public tool should define annotations.
118
+
119
+ Use:
120
+
121
+ - `readOnlyHint: true` for research, scrape, search, transcript, and inspect tools.
122
+ - `destructiveHint: false` unless the tool mutates or deletes user data.
123
+ - `idempotentHint: false` for live web searches because results, billing, and anti-bot state can change.
124
+ - `openWorldHint: true` for tools that access public web or external live systems.
125
+ - A human-readable `title`.
126
+
127
+ Annotations are hints, not a replacement for descriptions.
128
+
129
+ ## Result Text
130
+
131
+ Human-readable text still matters. It is what users see and what some clients preserve in context.
132
+
133
+ Every successful result should include:
134
+
135
+ - Clear title.
136
+ - Returned count versus requested count when relevant.
137
+ - Key result table or summary.
138
+ - Saved report path when report saving is enabled.
139
+ - Next-step guidance for common follow-up actions.
140
+
141
+ For chained tools, the result text should name the next tool explicitly.
142
+
143
+ ## Error Format
144
+
145
+ Errors must help the model choose the next action.
146
+
147
+ Every API error should include:
148
+
149
+ - `error` or `error_code`
150
+ - Human-readable message
151
+ - Whether retry is reasonable when known
152
+ - Enough context to avoid repeating the same bad call
153
+
154
+ Common cases:
155
+
156
+ - Auth failure: tell the user the API key is invalid or missing. Do not retry.
157
+ - Insufficient balance: return balance, required credits, and top-up URL.
158
+ - CAPTCHA/block: say it is temporary and retryable later.
159
+ - Validation error: identify the bad or missing field.
160
+ - Timeout/cancel: say whether the server attempted cleanup.
161
+
162
+ ## Cost And Concurrency
163
+
164
+ Tools that cost credits or hold jobs must expose that in metadata or result text.
165
+
166
+ Required:
167
+
168
+ - Add cost entry to `CREDIT_COST_CATALOG`.
169
+ - Add ledger operation for billable work.
170
+ - Include refund path on failure where applicable.
171
+ - Respect the account concurrency model.
172
+ - Tool descriptions should warn when a tool is expensive or long-running.
173
+
174
+ ## Docs Surfaces
175
+
176
+ For a new or changed public tool, update all applicable surfaces:
177
+
178
+ - `src/mcp/paa-mcp-server.ts`
179
+ - `src/mcp/mcp-tool-schemas.ts`
180
+ - `src/mcp/mcp-response-formatter.ts`
181
+ - API route and API schema
182
+ - `README.md`
183
+ - `public/skill.md`
184
+ - `public/codex-skill.md`
185
+ - `public/skills/mcp-scraper/skill.md`
186
+ - Dashboard UI when users can trigger the workflow there
187
+ - Live protocol tool list tests
188
+
189
+ ## Packaging And Deployment
190
+
191
+ MCP changes often have two release surfaces.
192
+
193
+ NPX package:
194
+
195
+ - Bump `package.json` and `package-lock.json` when publishing to npm.
196
+ - Rebuild `dist` with `npx tsup`.
197
+ - Verify `npm pack --dry-run` includes the rebuilt stdio binary.
198
+ - Verify built output contains the new tool name, description, schema, and formatter behavior.
199
+
200
+ Hosted API:
201
+
202
+ - Deploy API routes before or with the MCP package.
203
+ - A new MCP tool that calls a new hosted endpoint is broken until production has that endpoint.
204
+ - When asked "was this prod?", answer separately for the hosted API deployment and the npm package publication; one can be live without the other.
205
+
206
+ ## Required Tests
207
+
208
+ Every new public MCP tool needs tests at the right level for risk.
209
+
210
+ Minimum:
211
+
212
+ - Schema default and hard cap tests.
213
+ - Formatter test when result text or `structuredContent` matters.
214
+ - Tool list/protocol test updated with the new tool.
215
+ - Billing/ledger count tests updated for new billable operations.
216
+ - Typecheck.
217
+ - Unit/contract suite.
218
+
219
+ For live-web tools:
220
+
221
+ - One live smoke test or saved live evidence for the core workflow.
222
+ - If anti-bot behavior is likely, capture failure mode and retry guidance.
223
+
224
+ ## Definition Of Done
225
+
226
+ A public MCP tool change is not done until all are true:
227
+
228
+ - Tool name and boundary are clear.
229
+ - Tool description tells the model when to use it and when not to.
230
+ - Input schema encodes defaults, limits, and field-level instructions.
231
+ - Output is structured when the result will be chained.
232
+ - Result text is useful to humans and names the next tool when appropriate.
233
+ - Errors are actionable.
234
+ - Billing and refunds are correct.
235
+ - Dashboard, docs, skill text, and README are updated where relevant.
236
+ - `dist` is rebuilt for NPX package changes.
237
+ - Production API deployment is accounted for separately from npm publication.
238
+ - Tests and smoke evidence prove the workflow.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mcp-scraper",
3
- "version": "0.1.6",
3
+ "version": "0.1.7",
4
4
  "description": "MCP server for MCP Scraper web intelligence tools",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -20,14 +20,15 @@
20
20
  },
21
21
  "files": [
22
22
  "dist",
23
+ "docs",
23
24
  "README.md"
24
25
  ],
25
26
  "scripts": {
26
27
  "build": "tsup && npm run build:ui && npm run build:blog",
27
28
  "build:ui": "esbuild public/app.jsx --bundle --loader:.jsx=jsx --outfile=public/app.js --target=es2020",
28
- "build:blog": "tsx scripts/build-blog.ts",
29
- "api": "tsx bin/api-server.ts",
30
- "dev": "tsx src/cli.ts",
29
+ "build:blog": "node scripts/run-ts-sandbox-safe.mjs scripts/build-blog.ts",
30
+ "api": "node scripts/run-ts-sandbox-safe.mjs bin/api-server.ts",
31
+ "dev": "node scripts/run-ts-sandbox-safe.mjs src/cli.ts",
31
32
  "test": "vitest run tests/unit tests/contract",
32
33
  "test:live": "vitest run --config vitest.live.config.ts",
33
34
  "test:integration": "node scripts/run-integration-tests.mjs",