mcp-scraper 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/api-server.cjs +136 -46
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +1 -1
- package/dist/bin/mcp-stdio-server.cjs +51 -18
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +1 -1
- package/dist/{chunk-DZY3XO3M.js → chunk-6TWZS2FQ.js} +54 -20
- package/dist/chunk-6TWZS2FQ.js.map +1 -0
- package/dist/{server-KUF3QJC7.js → server-2Y27U4TO.js} +78 -30
- package/dist/server-2Y27U4TO.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-DZY3XO3M.js.map +0 -1
- package/dist/server-KUF3QJC7.js.map +0 -1
package/dist/bin/api-server.js
CHANGED
|
@@ -17,7 +17,7 @@ loadDotEnv();
|
|
|
17
17
|
async function main() {
|
|
18
18
|
const [{ serve }, { app }, { startWorker }, { migrate }] = await Promise.all([
|
|
19
19
|
import("@hono/node-server"),
|
|
20
|
-
import("../server-KUF3QJC7.js"),
|
|
20
|
+
import("../server-2Y27U4TO.js"),
|
|
21
21
|
import("../worker-UT4ZQU2T.js"),
|
|
22
22
|
import("../db-YWCNHBLH.js")
|
|
23
23
|
]);
|
|
package/dist/bin/mcp-stdio-server.cjs
CHANGED
|
@@ -7,17 +7,34 @@ var import_node_os2 = require("os");
|
|
|
7
7
|
var import_node_path2 = require("path");
|
|
8
8
|
var import_stdio = require("@modelcontextprotocol/sdk/server/stdio.js");
|
|
9
9
|
|
|
10
|
+
// src/harvest-timeout.ts
|
|
11
|
+
var VERCEL_FUNCTION_MAX_MS = 3e5;
|
|
12
|
+
var CLIENT_OVER_SERVER_MARGIN_MS = 15e3;
|
|
13
|
+
/**
 * Picks the server-side and client-side time budgets (in milliseconds) for a harvest request.
 *
 * @param {number} maxQuestions - Requested question count. Anything that is not a finite
 *   number greater than 0 is treated as 30; fractional values are truncated.
 * @param {boolean} [serpOnly=false] - When true the smallest tier is always used,
 *   regardless of maxQuestions.
 * @returns {{serverMs: number, clientMs: number}} serverMs is the tier value chosen below;
 *   clientMs is serverMs plus CLIENT_OVER_SERVER_MARGIN_MS, capped at
 *   VERCEL_FUNCTION_MAX_MS - 5000.
 */
function harvestTimeoutBudget(maxQuestions, serpOnly = false) {
|
|
14
|
+
// Number.isFinite does not coerce, so non-number inputs (e.g. numeric strings) also fall back to 30.
const requested = Number.isFinite(maxQuestions) && maxQuestions > 0 ? Math.trunc(maxQuestions) : 30;
|
|
15
|
+
let serverMs;
|
|
16
|
+
// Tiers: <=50 (or serpOnly) -> 110000, <=100 -> 180000, <=150 -> 240000, otherwise 280000.
if (serpOnly || requested <= 50) serverMs = 11e4;
|
|
17
|
+
else if (requested <= 100) serverMs = 18e4;
|
|
18
|
+
else if (requested <= 150) serverMs = 24e4;
|
|
19
|
+
else serverMs = 28e4;
|
|
20
|
+
// The client budget sits above the server budget by the margin, but never exceeds the cap.
const clientMs = Math.min(serverMs + CLIENT_OVER_SERVER_MARGIN_MS, VERCEL_FUNCTION_MAX_MS - 5e3);
|
|
21
|
+
return { serverMs, clientMs };
|
|
22
|
+
}
|
|
23
|
+
|
|
10
24
|
// src/mcp/http-mcp-tool-executor.ts
|
|
11
25
|
var HttpMcpToolExecutor = class {
|
|
12
26
|
baseUrl;
|
|
13
27
|
apiKey;
|
|
14
28
|
timeoutMs;
|
|
29
|
+
httpTimeoutOverrideMs;
|
|
15
30
|
serpIntelligenceTimeoutMs;
|
|
16
31
|
/**
 * Stores the API base URL and key and resolves the HTTP timeouts from the environment.
 *
 * @param {string} baseUrl2 - Base URL; one trailing "/" is removed before it is stored.
 * @param apiKey2 - Stored unchanged on this.apiKey.
 */
constructor(baseUrl2, apiKey2) {
|
|
17
32
|
this.baseUrl = baseUrl2.replace(/\/$/, "");
|
|
18
33
|
this.apiKey = apiKey2;
|
|
19
|
-
const
|
|
20
|
-
|
|
34
|
+
// MCP_SCRAPER_HTTP_TIMEOUT_MS is honoured only when it parses to a finite number > 0;
// an unset, empty, or invalid value leaves the override as null.
const rawOverride = process.env.MCP_SCRAPER_HTTP_TIMEOUT_MS;
|
|
35
|
+
const parsedOverride = rawOverride === void 0 ? NaN : Number(rawOverride);
|
|
36
|
+
this.httpTimeoutOverrideMs = Number.isFinite(parsedOverride) && parsedOverride > 0 ? parsedOverride : null;
|
|
37
|
+
// Default timeout is 110000 ms when no override is configured.
this.timeoutMs = this.httpTimeoutOverrideMs ?? 11e4;
|
|
21
38
|
// MCP_SCRAPER_SERP_INTELLIGENCE_HTTP_TIMEOUT_MS falls back to this.timeoutMs when unset or not a positive finite number.
const configuredSerpIntelligenceTimeoutMs = Number(process.env.MCP_SCRAPER_SERP_INTELLIGENCE_HTTP_TIMEOUT_MS ?? this.timeoutMs);
|
|
22
39
|
this.serpIntelligenceTimeoutMs = Number.isFinite(configuredSerpIntelligenceTimeoutMs) && configuredSerpIntelligenceTimeoutMs > 0 ? configuredSerpIntelligenceTimeoutMs : this.timeoutMs;
|
|
23
40
|
}
|
|
@@ -59,10 +76,12 @@ var HttpMcpToolExecutor = class {
|
|
|
59
76
|
}
|
|
60
77
|
}
|
|
61
78
|
/**
 * Sends the input to "/harvest/sync" via this.call with a timeout sized to the request.
 *
 * The timeout is this.httpTimeoutOverrideMs when that is set; otherwise it is the clientMs
 * from harvestTimeoutBudget for input.maxQuestions (30 when maxQuestions is null/undefined).
 *
 * @param {object} input - Request payload; only input.maxQuestions is read here.
 * @returns Whatever this.call returns (its definition is outside this hunk).
 */
harvestPaa(input) {
|
|
62
|
-
|
|
79
|
+
const timeoutMs = this.httpTimeoutOverrideMs ?? harvestTimeoutBudget(input.maxQuestions ?? 30).clientMs;
|
|
80
|
+
// NOTE(review): the third argument is presumably a per-request timeout in ms; confirm against call().
return this.call("/harvest/sync", input, timeoutMs);
|
|
63
81
|
}
|
|
64
82
|
/**
 * Sends the input to "/harvest/sync" via this.call with serpOnly forced to true.
 *
 * The timeout is this.httpTimeoutOverrideMs when that is set; otherwise it is the clientMs
 * of the serpOnly tier from harvestTimeoutBudget.
 *
 * @param {object} input - Request payload; it is shallow-copied and not mutated.
 * @returns Whatever this.call returns (its definition is outside this hunk).
 */
searchSerp(input) {
|
|
65
|
-
|
|
83
|
+
// serpOnly=true selects the smallest tier, so the 0 passed for maxQuestions does not affect the result.
const timeoutMs = this.httpTimeoutOverrideMs ?? harvestTimeoutBudget(0, true).clientMs;
|
|
84
|
+
// serpOnly is spread last, so it overrides any serpOnly value already present on input.
return this.call("/harvest/sync", { ...input, serpOnly: true }, timeoutMs);
|
|
66
85
|
}
|
|
67
86
|
extractUrl(input) {
|
|
68
87
|
return this.call("/extract-url", input);
|
|
@@ -110,7 +129,7 @@ var import_zod = require("zod");
|
|
|
110
129
|
var HarvestPaaInputSchema = {
|
|
111
130
|
query: import_zod.z.string().min(1).describe('Core search topic only. If the user says "best hvac company in Denver CO", use query="best hvac company" and location="Denver, CO". Do not include the location in query when it can be separated.'),
|
|
112
131
|
location: import_zod.z.string().optional().describe('City, region, or country for geo-targeted results, inferred from the user request when present, e.g. "Denver, CO", "Tokyo, Japan", "London, UK".'),
|
|
113
|
-
maxQuestions: import_zod.z.number().int().min(1).max(
|
|
132
|
+
maxQuestions: import_zod.z.number().int().min(1).max(200).default(30).describe("Number of PAA questions to extract. Default 30. Maximum 200. Use 10 for quick probes, 30 for normal research, 100-200 when the user asks for everything/full/deep research. Larger harvests get a longer server time budget (151-200 questions \u2192 up to 280s). Credits are charged by extracted question; unused request hold is refunded."),
|
|
114
133
|
gl: import_zod.z.string().length(2).default("us").describe("Google country code inferred from location or user language. Examples: United States us, United Kingdom gb, Japan jp, Canada ca, Australia au."),
|
|
115
134
|
hl: import_zod.z.string().default("en").describe("Google interface/content language inferred from the user request. Use en unless the user asks for another language or locale."),
|
|
116
135
|
device: import_zod.z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use desktop by default; use mobile only when the user asks for mobile rankings."),
|
|
@@ -223,9 +242,12 @@ function reportTitle(full) {
|
|
|
223
242
|
const title = full.split("\n").find((line) => line.startsWith("# "));
|
|
224
243
|
return title?.replace(/^#\s+/, "").trim() || "MCP Scraper Report";
|
|
225
244
|
}
|
|
245
|
+
/**
 * Resolves the base directory for files this tool writes to disk.
 *
 * @returns {string} The trimmed MCP_SCRAPER_OUTPUT_DIR when it is set and non-blank;
 *   otherwise "<home directory>/Downloads/mcp-scraper".
 */
function outputBaseDir() {
|
|
246
|
+
// "||" (not "??") is deliberate here: an empty or whitespace-only value also falls back to the default.
return process.env.MCP_SCRAPER_OUTPUT_DIR?.trim() || (0, import_node_path.join)((0, import_node_os.homedir)(), "Downloads", "mcp-scraper");
|
|
247
|
+
}
|
|
226
248
|
function saveFullReport(full) {
|
|
227
249
|
if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
|
|
228
|
-
const outDir =
|
|
250
|
+
const outDir = outputBaseDir();
|
|
229
251
|
try {
|
|
230
252
|
(0, import_node_fs.mkdirSync)(outDir, { recursive: true });
|
|
231
253
|
const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
@@ -236,6 +258,20 @@ function saveFullReport(full) {
|
|
|
236
258
|
return null;
|
|
237
259
|
}
|
|
238
260
|
}
|
|
261
|
+
/**
 * Decodes a base64 screenshot and writes it as a .png under "<outputBaseDir()>/screenshots".
 *
 * @param {string} base64 - Base64-encoded image data.
 * @param {string} url - Page URL, used only to derive a readable file-name slug.
 * @returns {string|null} Path of the written file, or null when
 *   MCP_SCRAPER_SAVE_REPORTS is "false" or any step throws.
 */
function persistScreenshotLocally(base64, url) {
|
|
262
|
+
if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
|
|
263
|
+
try {
|
|
264
|
+
const dir = (0, import_node_path.join)(outputBaseDir(), "screenshots");
|
|
265
|
+
(0, import_node_fs.mkdirSync)(dir, { recursive: true });
|
|
266
|
+
// ISO timestamp with ":" and "." replaced by "-" so it can be used in a file name.
const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
267
|
+
// Slug: scheme stripped, runs of non-alphanumerics collapsed to "-", edge dashes trimmed, max 60 chars.
// NOTE(review): a URL with no alphanumerics after the scheme gives an empty slug, i.e. "<stamp>-.png".
const slug = url.replace(/^https?:\/\//, "").replace(/[^a-z0-9]+/gi, "-").replace(/^-+|-+$/g, "").slice(0, 60);
|
|
268
|
+
const filePath = (0, import_node_path.join)(dir, `${stamp}-${slug}.png`);
|
|
269
|
+
(0, import_node_fs.writeFileSync)(filePath, Buffer.from(base64, "base64"));
|
|
270
|
+
return filePath;
|
|
271
|
+
// Any failure (directory creation, write, a non-string url) is swallowed and reported as null.
} catch {
|
|
272
|
+
return null;
|
|
273
|
+
}
|
|
274
|
+
}
|
|
239
275
|
function oneBlock(content) {
|
|
240
276
|
const filePath = saveFullReport(content);
|
|
241
277
|
const text = filePath ? `${content}
|
|
@@ -456,6 +492,7 @@ function formatExtractUrl(raw, input) {
|
|
|
456
492
|
const bodyMd = d.bodyMarkdown ?? "";
|
|
457
493
|
const schema = d.schema;
|
|
458
494
|
const screenshotMeta = d.screenshot;
|
|
495
|
+
const screenshotPath = screenshotMeta?.base64 ? persistScreenshotLocally(screenshotMeta.base64, url) : null;
|
|
459
496
|
const branding = d.branding;
|
|
460
497
|
const media = d.media;
|
|
461
498
|
const h1Lines = headings.filter((h) => h.level === 1).map((h) => `- ${h.text}`).join("\n");
|
|
@@ -482,7 +519,7 @@ ${[h1Lines, h2Lines].filter(Boolean).join("\n")}` : "";
|
|
|
482
519
|
${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
|
|
483
520
|
const screenshotSection = screenshotMeta ? `
|
|
484
521
|
## Screenshot
|
|
485
|
-
- **File:** ${
|
|
522
|
+
- **File:** ${screenshotPath ?? "(returned inline only \u2014 disk write unavailable in this environment)"}
|
|
486
523
|
- **Size:** ${(screenshotMeta.sizeBytes / 1024).toFixed(1)} KB
|
|
487
524
|
- **Device:** ${screenshotMeta.device}` : "";
|
|
488
525
|
const brandingSection = branding ? [
|
|
@@ -511,17 +548,13 @@ ${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
|
|
|
511
548
|
**${title}**
|
|
512
549
|
${headingSection}${kpoSection}${brandingSection}${bodySection}${screenshotSection}${mediaSection}${tips}`;
|
|
513
550
|
const textResult = oneBlock(full);
|
|
514
|
-
if (screenshotMeta?.
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
]
|
|
522
|
-
};
|
|
523
|
-
} catch {
|
|
524
|
-
}
|
|
551
|
+
if (screenshotMeta?.base64) {
|
|
552
|
+
return {
|
|
553
|
+
content: [
|
|
554
|
+
...textResult.content,
|
|
555
|
+
{ type: "image", data: screenshotMeta.base64, mimeType: "image/png" }
|
|
556
|
+
]
|
|
557
|
+
};
|
|
525
558
|
}
|
|
526
559
|
return textResult;
|
|
527
560
|
}
|