mcp-scraper 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/README.md +13 -2
  2. package/dist/bin/api-server.cjs +957 -243
  3. package/dist/bin/api-server.cjs.map +1 -1
  4. package/dist/bin/api-server.js +2 -2
  5. package/dist/bin/mcp-stdio-server.cjs +540 -158
  6. package/dist/bin/mcp-stdio-server.cjs.map +1 -1
  7. package/dist/bin/mcp-stdio-server.js +2 -1
  8. package/dist/bin/mcp-stdio-server.js.map +1 -1
  9. package/dist/bin/paa-harvest.cjs +36 -5
  10. package/dist/bin/paa-harvest.cjs.map +1 -1
  11. package/dist/bin/paa-harvest.js +5 -3
  12. package/dist/bin/paa-harvest.js.map +1 -1
  13. package/dist/{chunk-6TWZS2FQ.js → chunk-RE6HCRYC.js} +543 -159
  14. package/dist/chunk-RE6HCRYC.js.map +1 -0
  15. package/dist/{chunk-W4P2U5VF.js → chunk-TM22BLWP.js} +46 -34
  16. package/dist/chunk-TM22BLWP.js.map +1 -0
  17. package/dist/{chunk-7HB7NDOY.js → chunk-ZK456YXN.js} +12 -2
  18. package/dist/chunk-ZK456YXN.js.map +1 -0
  19. package/dist/chunk-ZMOWIBMK.js +36 -0
  20. package/dist/chunk-ZMOWIBMK.js.map +1 -0
  21. package/dist/index.cjs +34 -3
  22. package/dist/index.cjs.map +1 -1
  23. package/dist/index.js +2 -1
  24. package/dist/index.js.map +1 -1
  25. package/dist/{server-2Y27U4TO.js → server-QXVVTKJP.js} +311 -48
  26. package/dist/server-QXVVTKJP.js.map +1 -0
  27. package/dist/{worker-UT4ZQU2T.js → worker-AUCXFHEL.js} +6 -4
  28. package/dist/worker-AUCXFHEL.js.map +1 -0
  29. package/docs/adr/0001-in-page-graphql-interception-for-anti-bot-scraping.md +58 -0
  30. package/docs/adr/README.md +11 -0
  31. package/docs/mcp-tool-quality-spec.md +238 -0
  32. package/package.json +5 -4
  33. package/dist/chunk-6TWZS2FQ.js.map +0 -1
  34. package/dist/chunk-7HB7NDOY.js.map +0 -1
  35. package/dist/chunk-W4P2U5VF.js.map +0 -1
  36. package/dist/server-2Y27U4TO.js.map +0 -1
  37. package/dist/worker-UT4ZQU2T.js.map +0 -1
@@ -1 +1 @@
1
- {"version":3,"sources":["../../bin/mcp-stdio-server.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { readFileSync } from 'node:fs'\nimport { homedir } from 'node:os'\nimport { join } from 'node:path'\nimport { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'\nimport { HttpMcpToolExecutor } from '../src/mcp/http-mcp-tool-executor.js'\nimport { buildPaaExtractorMcpServer } from '../src/mcp/paa-mcp-server.js'\n\nfunction readApiKeyFile(): string | undefined {\n const explicitPath = process.env.MCP_SCRAPER_KEY_PATH?.trim()\n const paths = [explicitPath, join(homedir(), '.mcp-scraper-key')].filter(Boolean) as string[]\n for (const path of paths) {\n try {\n const value = readFileSync(path, 'utf8').trim()\n if (value) return value\n } catch {}\n }\n return undefined\n}\n\nconst apiKey = (\n process.env.MCP_SCRAPER_API_KEY ??\n process.env.MCP_SCRAPER_KEY ??\n process.env.MCP_API_KEY ??\n readApiKeyFile()\n)?.trim()\nif (!apiKey) {\n process.stderr.write('MCP_SCRAPER_API_KEY env var or ~/.mcp-scraper-key is required\\n')\n process.exit(1)\n}\n\nconst baseUrl = process.env.MCP_SCRAPER_BASE_URL?.trim() ?? process.env.MCP_BASE_URL?.trim() ?? 'https://mcpscraper.dev'\nconst executor = new HttpMcpToolExecutor(baseUrl, apiKey)\nconst server = buildPaaExtractorMcpServer(executor)\nconst transport = new StdioServerTransport()\n\nasync function main() {\n await server.connect(transport)\n}\n\nmain().catch((err) => {\n process.stderr.write(`${err instanceof Error ? err.message : String(err)}\\n`)\n process.exit(1)\n})\n"],"mappings":";;;;;;;AACA,SAAS,oBAAoB;AAC7B,SAAS,eAAe;AACxB,SAAS,YAAY;AACrB,SAAS,4BAA4B;AAIrC,SAAS,iBAAqC;AAC5C,QAAM,eAAe,QAAQ,IAAI,sBAAsB,KAAK;AAC5D,QAAM,QAAQ,CAAC,cAAc,KAAK,QAAQ,GAAG,kBAAkB,CAAC,EAAE,OAAO,OAAO;AAChF,aAAW,QAAQ,OAAO;AACxB,QAAI;AACF,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,KAAK;AAC9C,UAAI,MAAO,QAAO;AAAA,IACpB,QAAQ;AAAA,IAAC;AAAA,EACX;AACA,SAAO;AACT;AAEA,IAAM,UACJ,QAAQ,IAAI,uBACZ,QAAQ,IAAI,mBACZ,QAAQ,IAAI,eACZ,eAAe,IACd,KAAK;AACR,IAAI,CAAC,QAAQ;AACX,UAAQ,OAAO,MAAM,iEAAiE;AACtF,UAAQ,KAAK,CAAC;AAChB;AAEA,IAAM,UAAU,QAAQ,IAAI,sBAAsB,KAAK,KAAK,QAAQ,IAAI,cAAc,KAAK,KAAK;AAChG,IAAM,WAAW,IAAI,oBAAoB,SAAS,MAAM;AACxD,IAAM,SAAS,2BAA2B,QAAQ;AAClD,IAAM,YAAY,IAAI,qBAAqB;AAE3C,eAAe,OAAO;AACpB,QAAM,OAAO,QAAQ,SAAS;AAChC;AAEA,KAAK,EAAE,MAAM,CAAC,QAAQ;AACpB,UAAQ,OAAO,MAAM,GAAG,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,CAAI;AAC5E,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
1
+ {"version":3,"sources":["../../bin/mcp-stdio-server.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { readFileSync } from 'node:fs'\nimport { homedir } from 'node:os'\nimport { join } from 'node:path'\nimport { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'\nimport { HttpMcpToolExecutor } from '../src/mcp/http-mcp-tool-executor.js'\nimport { buildPaaExtractorMcpServer } from '../src/mcp/paa-mcp-server.js'\n\nfunction readApiKeyFile(): string | undefined {\n const explicitPath = process.env.MCP_SCRAPER_KEY_PATH?.trim()\n const paths = [explicitPath, join(homedir(), '.mcp-scraper-key')].filter(Boolean) as string[]\n for (const path of paths) {\n try {\n const value = readFileSync(path, 'utf8').trim()\n if (value) return value\n } catch {}\n }\n return undefined\n}\n\nconst apiKey = (\n process.env.MCP_SCRAPER_API_KEY ??\n process.env.MCP_SCRAPER_KEY ??\n process.env.MCP_API_KEY ??\n readApiKeyFile()\n)?.trim()\nif (!apiKey) {\n process.stderr.write('MCP_SCRAPER_API_KEY env var or ~/.mcp-scraper-key is required\\n')\n process.exit(1)\n}\n\nconst baseUrl = process.env.MCP_SCRAPER_BASE_URL?.trim() ?? process.env.MCP_BASE_URL?.trim() ?? 'https://mcpscraper.dev'\nconst executor = new HttpMcpToolExecutor(baseUrl, apiKey)\nconst server = buildPaaExtractorMcpServer(executor)\nconst transport = new StdioServerTransport()\n\nasync function main() {\n await server.connect(transport)\n}\n\nmain().catch((err) => {\n process.stderr.write(`${err instanceof Error ? err.message : String(err)}\\n`)\n process.exit(1)\n})\n"],"mappings":";;;;;;;;AACA,SAAS,oBAAoB;AAC7B,SAAS,eAAe;AACxB,SAAS,YAAY;AACrB,SAAS,4BAA4B;AAIrC,SAAS,iBAAqC;AAC5C,QAAM,eAAe,QAAQ,IAAI,sBAAsB,KAAK;AAC5D,QAAM,QAAQ,CAAC,cAAc,KAAK,QAAQ,GAAG,kBAAkB,CAAC,EAAE,OAAO,OAAO;AAChF,aAAW,QAAQ,OAAO;AACxB,QAAI;AACF,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,KAAK;AAC9C,UAAI,MAAO,QAAO;AAAA,IACpB,QAAQ;AAAA,IAAC;AAAA,EACX;AACA,SAAO;AACT;AAEA,IAAM,UACJ,QAAQ,IAAI,uBACZ,QAAQ,IAAI,mBACZ,QAAQ,IAAI,eACZ,eAAe,IACd,KAAK;AACR,IAAI,CAAC,QAAQ;AACX,UAAQ,OAAO,MAAM,iEAAiE;AACtF,UAAQ,KAAK,CAAC;AAChB;AAEA,IAAM,UAAU,QAAQ,IAAI,sBAAsB,KAAK,KAAK,QAAQ,IAAI,cAAc,KAAK,KAAK;AAChG,IAAM,WAAW,IAAI,oBAAoB,SAAS,MAAM;AACxD,IAAM,SAAS,2BAA2B,QAAQ;AAClD,IAAM,YAAY,IAAI,qBAAqB;AAE3C,eAAe,OAAO;AACpB,QAAM,OAAO,QAAQ,SAAS;AAChC;AAEA,KAAK,EAAE,MAAM,CAAC,QAAQ;AACpB,UAAQ,OAAO,MAAM,GAAG,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,CAAI;AAC5E,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
@@ -26,6 +26,16 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
26
26
  // src/cli.ts
27
27
  var import_commander = require("commander");
28
28
 
29
+ // src/lib/browser-service-env.ts
30
+ function browserServiceApiKey() {
31
+ const value = (process.env.BROWSER_SERVICE_API_KEY ?? process.env.KERNEL_API_KEY)?.trim();
32
+ return value || void 0;
33
+ }
34
+ function browserServiceProxyId() {
35
+ const value = (process.env.BROWSER_SERVICE_PROXY_ID ?? process.env.KERNEL_PROXY_ID)?.trim();
36
+ return value || void 0;
37
+ }
38
+
29
39
  // src/schemas.ts
30
40
  var import_zod = require("zod");
31
41
  var HarvestOptionsSchema = import_zod.z.object({
@@ -61,6 +71,16 @@ var MapsPlaceOptionsSchema = import_zod.z.object({
61
71
  kernelProxyId: import_zod.z.string().optional(),
62
72
  headless: import_zod.z.boolean().default(true)
63
73
  });
74
+ var MapsSearchOptionsSchema = import_zod.z.object({
75
+ query: import_zod.z.string().min(1),
76
+ location: import_zod.z.string().optional(),
77
+ gl: import_zod.z.string().length(2).default("us"),
78
+ hl: import_zod.z.string().length(2).default("en"),
79
+ maxResults: import_zod.z.number().int().min(1).max(50).default(10),
80
+ kernelApiKey: import_zod.z.string().optional(),
81
+ kernelProxyId: import_zod.z.string().optional(),
82
+ headless: import_zod.z.boolean().default(true)
83
+ });
64
84
  var RawPAAItemSchema = import_zod.z.object({
65
85
  question: import_zod.z.string().min(1),
66
86
  answer: import_zod.z.string().optional(),
@@ -924,8 +944,19 @@ function addCandidate(candidates, city, region, example) {
924
944
  }
925
945
  candidates.set(key, { city: normalizedCity, regionCode, count: 1, examples: [example] });
926
946
  }
947
+ function decodeSerpText(text) {
948
+ try {
949
+ return decodeURIComponent(text);
950
+ } catch {
951
+ }
952
+ try {
953
+ return decodeURIComponent(text.replace(/%(?![0-9a-fA-F]{2})/g, "%25"));
954
+ } catch {
955
+ return text;
956
+ }
957
+ }
927
958
  function scanText(candidates, text) {
928
- const normalized = decodeURIComponent(text).replace(/[+/|_-]+/g, " ");
959
+ const normalized = decodeSerpText(text).replace(/[+/|_-]+/g, " ");
929
960
  for (const match of normalized.matchAll(CITY_STATE_RE)) {
930
961
  addCandidate(candidates, match[1] ?? "", match[2] ?? "", normalized.slice(0, 180));
931
962
  }
@@ -2593,8 +2624,8 @@ async function harvest(rawOptions) {
2593
2624
  const onAttemptEvent = getAttemptLogSink(rawOptions);
2594
2625
  const requestedProxyMode = raw.proxyMode;
2595
2626
  const proxyMode = requestedProxyMode === "none" ? "none" : requestedProxyMode === "configured" ? "configured" : "location";
2596
- const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() : process.env.KERNEL_API_KEY?.trim();
2597
- const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() : process.env.KERNEL_PROXY_ID?.trim();
2627
+ const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() : browserServiceApiKey();
2628
+ const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() : browserServiceProxyId();
2598
2629
  const proxyOpts = {
2599
2630
  kernelApiKey,
2600
2631
  proxyMode,
@@ -2779,7 +2810,7 @@ async function harvest(rawOptions) {
2779
2810
 
2780
2811
  // src/cli.ts
2781
2812
  var program = new import_commander.Command();
2782
- program.name("paa-harvest").description("Recursively extract Google People Also Ask questions").requiredOption("-q, --query <query>", "Seed query").option("-l, --location <location>", 'Location name (e.g. "austin" or "Austin,Texas,United States")').option("--gl <gl>", "Google country code", "us").option("--hl <hl>", "Google language code", "en").option("-d, --depth <depth>", "BFS depth (1-30)", "3").option("-m, --max-questions <n>", "Max questions to harvest", "100").option("-o, --output <dir>", "Output directory", "./paa-output").option("-f, --format <format>", "Output format: json, csv, or both", "both").option("--headless", "Run browser in headless mode", false).option("--profile <dir>", "Persistent browser profile directory").option("--proxy <url>", "Proxy server URL").option("--kernel-api-key <key>", "Kernel.sh API key (or set KERNEL_API_KEY env var)").action(async (opts) => {
2813
+ program.name("paa-harvest").description("Recursively extract Google People Also Ask questions").requiredOption("-q, --query <query>", "Seed query").option("-l, --location <location>", 'Location name (e.g. "austin" or "Austin,Texas,United States")').option("--gl <gl>", "Google country code", "us").option("--hl <hl>", "Google language code", "en").option("-d, --depth <depth>", "BFS depth (1-30)", "3").option("-m, --max-questions <n>", "Max questions to harvest", "100").option("-o, --output <dir>", "Output directory", "./paa-output").option("-f, --format <format>", "Output format: json, csv, or both", "both").option("--headless", "Run browser in headless mode", false).option("--profile <dir>", "Persistent browser profile directory").option("--proxy <url>", "Proxy server URL").option("--kernel-api-key <key>", "Browser service API key (or set BROWSER_SERVICE_API_KEY env var)").action(async (opts) => {
2783
2814
  try {
2784
2815
  const result = await harvest({
2785
2816
  query: opts.query,
@@ -2793,7 +2824,7 @@ program.name("paa-harvest").description("Recursively extract Google People Also
2793
2824
  headless: opts.headless,
2794
2825
  profileDir: opts.profile,
2795
2826
  proxy: opts.proxy,
2796
- kernelApiKey: opts.kernelApiKey ?? process.env.KERNEL_API_KEY
2827
+ kernelApiKey: opts.kernelApiKey ?? browserServiceApiKey()
2797
2828
  });
2798
2829
  console.log(JSON.stringify({ totalQuestions: result.totalQuestions, outputDir: result.stats.seed }));
2799
2830
  } catch (err) {