mcp-scraper 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/dist/bin/api-server.cjs +957 -243
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +2 -2
- package/dist/bin/mcp-stdio-server.cjs +540 -158
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +2 -1
- package/dist/bin/mcp-stdio-server.js.map +1 -1
- package/dist/bin/paa-harvest.cjs +36 -5
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +5 -3
- package/dist/bin/paa-harvest.js.map +1 -1
- package/dist/{chunk-6TWZS2FQ.js → chunk-RE6HCRYC.js} +543 -159
- package/dist/chunk-RE6HCRYC.js.map +1 -0
- package/dist/{chunk-W4P2U5VF.js → chunk-TM22BLWP.js} +46 -34
- package/dist/chunk-TM22BLWP.js.map +1 -0
- package/dist/{chunk-7HB7NDOY.js → chunk-ZK456YXN.js} +12 -2
- package/dist/chunk-ZK456YXN.js.map +1 -0
- package/dist/chunk-ZMOWIBMK.js +36 -0
- package/dist/chunk-ZMOWIBMK.js.map +1 -0
- package/dist/index.cjs +34 -3
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +2 -1
- package/dist/index.js.map +1 -1
- package/dist/{server-2Y27U4TO.js → server-QXVVTKJP.js} +311 -48
- package/dist/server-QXVVTKJP.js.map +1 -0
- package/dist/{worker-UT4ZQU2T.js → worker-AUCXFHEL.js} +6 -4
- package/dist/worker-AUCXFHEL.js.map +1 -0
- package/docs/adr/0001-in-page-graphql-interception-for-anti-bot-scraping.md +58 -0
- package/docs/adr/README.md +11 -0
- package/docs/mcp-tool-quality-spec.md +238 -0
- package/package.json +5 -4
- package/dist/chunk-6TWZS2FQ.js.map +0 -1
- package/dist/chunk-7HB7NDOY.js.map +0 -1
- package/dist/chunk-W4P2U5VF.js.map +0 -1
- package/dist/server-2Y27U4TO.js.map +0 -1
- package/dist/worker-UT4ZQU2T.js.map +0 -1
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../bin/mcp-stdio-server.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { readFileSync } from 'node:fs'\nimport { homedir } from 'node:os'\nimport { join } from 'node:path'\nimport { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'\nimport { HttpMcpToolExecutor } from '../src/mcp/http-mcp-tool-executor.js'\nimport { buildPaaExtractorMcpServer } from '../src/mcp/paa-mcp-server.js'\n\nfunction readApiKeyFile(): string | undefined {\n const explicitPath = process.env.MCP_SCRAPER_KEY_PATH?.trim()\n const paths = [explicitPath, join(homedir(), '.mcp-scraper-key')].filter(Boolean) as string[]\n for (const path of paths) {\n try {\n const value = readFileSync(path, 'utf8').trim()\n if (value) return value\n } catch {}\n }\n return undefined\n}\n\nconst apiKey = (\n process.env.MCP_SCRAPER_API_KEY ??\n process.env.MCP_SCRAPER_KEY ??\n process.env.MCP_API_KEY ??\n readApiKeyFile()\n)?.trim()\nif (!apiKey) {\n process.stderr.write('MCP_SCRAPER_API_KEY env var or ~/.mcp-scraper-key is required\\n')\n process.exit(1)\n}\n\nconst baseUrl = process.env.MCP_SCRAPER_BASE_URL?.trim() ?? process.env.MCP_BASE_URL?.trim() ?? 'https://mcpscraper.dev'\nconst executor = new HttpMcpToolExecutor(baseUrl, apiKey)\nconst server = buildPaaExtractorMcpServer(executor)\nconst transport = new StdioServerTransport()\n\nasync function main() {\n await server.connect(transport)\n}\n\nmain().catch((err) => {\n process.stderr.write(`${err instanceof Error ? err.message : String(err)}\\n`)\n process.exit(1)\n})\n"],"mappings":"
|
|
1
|
+
{"version":3,"sources":["../../bin/mcp-stdio-server.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { readFileSync } from 'node:fs'\nimport { homedir } from 'node:os'\nimport { join } from 'node:path'\nimport { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'\nimport { HttpMcpToolExecutor } from '../src/mcp/http-mcp-tool-executor.js'\nimport { buildPaaExtractorMcpServer } from '../src/mcp/paa-mcp-server.js'\n\nfunction readApiKeyFile(): string | undefined {\n const explicitPath = process.env.MCP_SCRAPER_KEY_PATH?.trim()\n const paths = [explicitPath, join(homedir(), '.mcp-scraper-key')].filter(Boolean) as string[]\n for (const path of paths) {\n try {\n const value = readFileSync(path, 'utf8').trim()\n if (value) return value\n } catch {}\n }\n return undefined\n}\n\nconst apiKey = (\n process.env.MCP_SCRAPER_API_KEY ??\n process.env.MCP_SCRAPER_KEY ??\n process.env.MCP_API_KEY ??\n readApiKeyFile()\n)?.trim()\nif (!apiKey) {\n process.stderr.write('MCP_SCRAPER_API_KEY env var or ~/.mcp-scraper-key is required\\n')\n process.exit(1)\n}\n\nconst baseUrl = process.env.MCP_SCRAPER_BASE_URL?.trim() ?? process.env.MCP_BASE_URL?.trim() ?? 'https://mcpscraper.dev'\nconst executor = new HttpMcpToolExecutor(baseUrl, apiKey)\nconst server = buildPaaExtractorMcpServer(executor)\nconst transport = new StdioServerTransport()\n\nasync function main() {\n await server.connect(transport)\n}\n\nmain().catch((err) => {\n process.stderr.write(`${err instanceof Error ? err.message : String(err)}\\n`)\n process.exit(1)\n})\n"],"mappings":";;;;;;;;AACA,SAAS,oBAAoB;AAC7B,SAAS,eAAe;AACxB,SAAS,YAAY;AACrB,SAAS,4BAA4B;AAIrC,SAAS,iBAAqC;AAC5C,QAAM,eAAe,QAAQ,IAAI,sBAAsB,KAAK;AAC5D,QAAM,QAAQ,CAAC,cAAc,KAAK,QAAQ,GAAG,kBAAkB,CAAC,EAAE,OAAO,OAAO;AAChF,aAAW,QAAQ,OAAO;AACxB,QAAI;AACF,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,KAAK;AAC9C,UAAI,MAAO,QAAO;AAAA,IACpB,QAAQ;AAAA,IAAC;AAAA,EACX;AACA,SAAO;AACT;AAEA,IAAM,UACJ,QAAQ,IAAI,uBACZ,QAAQ,IAAI,mBACZ,QAAQ,IAAI,eACZ,eAAe,IACd,KAAK;AACR,IAAI,CAAC,QAAQ;AACX,UAAQ,OAAO,MAAM,iEAAiE;AACtF,UAAQ,KAAK,CAAC;AAChB;AAEA,IAAM,UAAU,QAAQ,IAAI,sBAAsB,KAAK,KAAK,QAAQ,IAAI,cAAc,KAAK,KAAK;AAChG,IAAM,WAAW,IAAI,oBAAoB,SAAS,MAAM;AACxD,IAAM,SAAS,2BAA2B,QAAQ;AAClD,IAAM,YAAY,IAAI,qBAAqB;AAE3C,eAAe,OAAO;AACpB,QAAM,OAAO,QAAQ,SAAS;AAChC;AAEA,KAAK,EAAE,MAAM,CAAC,QAAQ;AACpB,UAAQ,OAAO,MAAM,GAAG,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,CAAI;AAC5E,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
|
package/dist/bin/paa-harvest.cjs
CHANGED
|
@@ -26,6 +26,16 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
26
26
|
// src/cli.ts
|
|
27
27
|
var import_commander = require("commander");
|
|
28
28
|
|
|
29
|
+
// src/lib/browser-service-env.ts
|
|
30
|
+
function browserServiceApiKey() {
|
|
31
|
+
const value = (process.env.BROWSER_SERVICE_API_KEY ?? process.env.KERNEL_API_KEY)?.trim();
|
|
32
|
+
return value || void 0;
|
|
33
|
+
}
|
|
34
|
+
function browserServiceProxyId() {
|
|
35
|
+
const value = (process.env.BROWSER_SERVICE_PROXY_ID ?? process.env.KERNEL_PROXY_ID)?.trim();
|
|
36
|
+
return value || void 0;
|
|
37
|
+
}
|
|
38
|
+
|
|
29
39
|
// src/schemas.ts
|
|
30
40
|
var import_zod = require("zod");
|
|
31
41
|
var HarvestOptionsSchema = import_zod.z.object({
|
|
@@ -61,6 +71,16 @@ var MapsPlaceOptionsSchema = import_zod.z.object({
|
|
|
61
71
|
kernelProxyId: import_zod.z.string().optional(),
|
|
62
72
|
headless: import_zod.z.boolean().default(true)
|
|
63
73
|
});
|
|
74
|
+
var MapsSearchOptionsSchema = import_zod.z.object({
|
|
75
|
+
query: import_zod.z.string().min(1),
|
|
76
|
+
location: import_zod.z.string().optional(),
|
|
77
|
+
gl: import_zod.z.string().length(2).default("us"),
|
|
78
|
+
hl: import_zod.z.string().length(2).default("en"),
|
|
79
|
+
maxResults: import_zod.z.number().int().min(1).max(50).default(10),
|
|
80
|
+
kernelApiKey: import_zod.z.string().optional(),
|
|
81
|
+
kernelProxyId: import_zod.z.string().optional(),
|
|
82
|
+
headless: import_zod.z.boolean().default(true)
|
|
83
|
+
});
|
|
64
84
|
var RawPAAItemSchema = import_zod.z.object({
|
|
65
85
|
question: import_zod.z.string().min(1),
|
|
66
86
|
answer: import_zod.z.string().optional(),
|
|
@@ -924,8 +944,19 @@ function addCandidate(candidates, city, region, example) {
|
|
|
924
944
|
}
|
|
925
945
|
candidates.set(key, { city: normalizedCity, regionCode, count: 1, examples: [example] });
|
|
926
946
|
}
|
|
947
|
+
function decodeSerpText(text) {
|
|
948
|
+
try {
|
|
949
|
+
return decodeURIComponent(text);
|
|
950
|
+
} catch {
|
|
951
|
+
}
|
|
952
|
+
try {
|
|
953
|
+
return decodeURIComponent(text.replace(/%(?![0-9a-fA-F]{2})/g, "%25"));
|
|
954
|
+
} catch {
|
|
955
|
+
return text;
|
|
956
|
+
}
|
|
957
|
+
}
|
|
927
958
|
function scanText(candidates, text) {
|
|
928
|
-
const normalized =
|
|
959
|
+
const normalized = decodeSerpText(text).replace(/[+/|_-]+/g, " ");
|
|
929
960
|
for (const match of normalized.matchAll(CITY_STATE_RE)) {
|
|
930
961
|
addCandidate(candidates, match[1] ?? "", match[2] ?? "", normalized.slice(0, 180));
|
|
931
962
|
}
|
|
@@ -2593,8 +2624,8 @@ async function harvest(rawOptions) {
|
|
|
2593
2624
|
const onAttemptEvent = getAttemptLogSink(rawOptions);
|
|
2594
2625
|
const requestedProxyMode = raw.proxyMode;
|
|
2595
2626
|
const proxyMode = requestedProxyMode === "none" ? "none" : requestedProxyMode === "configured" ? "configured" : "location";
|
|
2596
|
-
const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() :
|
|
2597
|
-
const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() :
|
|
2627
|
+
const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() : browserServiceApiKey();
|
|
2628
|
+
const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() : browserServiceProxyId();
|
|
2598
2629
|
const proxyOpts = {
|
|
2599
2630
|
kernelApiKey,
|
|
2600
2631
|
proxyMode,
|
|
@@ -2779,7 +2810,7 @@ async function harvest(rawOptions) {
|
|
|
2779
2810
|
|
|
2780
2811
|
// src/cli.ts
|
|
2781
2812
|
var program = new import_commander.Command();
|
|
2782
|
-
program.name("paa-harvest").description("Recursively extract Google People Also Ask questions").requiredOption("-q, --query <query>", "Seed query").option("-l, --location <location>", 'Location name (e.g. "austin" or "Austin,Texas,United States")').option("--gl <gl>", "Google country code", "us").option("--hl <hl>", "Google language code", "en").option("-d, --depth <depth>", "BFS depth (1-30)", "3").option("-m, --max-questions <n>", "Max questions to harvest", "100").option("-o, --output <dir>", "Output directory", "./paa-output").option("-f, --format <format>", "Output format: json, csv, or both", "both").option("--headless", "Run browser in headless mode", false).option("--profile <dir>", "Persistent browser profile directory").option("--proxy <url>", "Proxy server URL").option("--kernel-api-key <key>", "
|
|
2813
|
+
program.name("paa-harvest").description("Recursively extract Google People Also Ask questions").requiredOption("-q, --query <query>", "Seed query").option("-l, --location <location>", 'Location name (e.g. "austin" or "Austin,Texas,United States")').option("--gl <gl>", "Google country code", "us").option("--hl <hl>", "Google language code", "en").option("-d, --depth <depth>", "BFS depth (1-30)", "3").option("-m, --max-questions <n>", "Max questions to harvest", "100").option("-o, --output <dir>", "Output directory", "./paa-output").option("-f, --format <format>", "Output format: json, csv, or both", "both").option("--headless", "Run browser in headless mode", false).option("--profile <dir>", "Persistent browser profile directory").option("--proxy <url>", "Proxy server URL").option("--kernel-api-key <key>", "Browser service API key (or set BROWSER_SERVICE_API_KEY env var)").action(async (opts) => {
|
|
2783
2814
|
try {
|
|
2784
2815
|
const result = await harvest({
|
|
2785
2816
|
query: opts.query,
|
|
@@ -2793,7 +2824,7 @@ program.name("paa-harvest").description("Recursively extract Google People Also
|
|
|
2793
2824
|
headless: opts.headless,
|
|
2794
2825
|
profileDir: opts.profile,
|
|
2795
2826
|
proxy: opts.proxy,
|
|
2796
|
-
kernelApiKey: opts.kernelApiKey ??
|
|
2827
|
+
kernelApiKey: opts.kernelApiKey ?? browserServiceApiKey()
|
|
2797
2828
|
});
|
|
2798
2829
|
console.log(JSON.stringify({ totalQuestions: result.totalQuestions, outputDir: result.stats.seed }));
|
|
2799
2830
|
} catch (err) {
|