mcp-scraper 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/api-server.cjs +1040 -829
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +2 -2
- package/dist/bin/mcp-stdio-server.cjs +51 -18
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +1 -1
- package/dist/bin/paa-harvest.js +1 -1
- package/dist/{chunk-JQKZWEON.js → chunk-4OHPDEZM.js} +54 -20
- package/dist/chunk-4OHPDEZM.js.map +1 -0
- package/dist/{chunk-Y74EXABN.js → chunk-7HB7NDOY.js} +2 -2
- package/dist/{chunk-HERFK7W6.js → chunk-W4P2U5VF.js} +2 -1
- package/dist/index.js +1 -1
- package/dist/{server-6CHHLOII.js → server-V5XMVRYE.js} +209 -47
- package/dist/server-V5XMVRYE.js.map +1 -0
- package/dist/{worker-D4D2YQTA.js → worker-UT4ZQU2T.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-JQKZWEON.js.map +0 -1
- package/dist/server-6CHHLOII.js.map +0 -1
- /package/dist/{chunk-Y74EXABN.js.map → chunk-7HB7NDOY.js.map} +0 -0
- /package/dist/{chunk-HERFK7W6.js.map → chunk-W4P2U5VF.js.map} +0 -0
- /package/dist/{worker-D4D2YQTA.js.map → worker-UT4ZQU2T.js.map} +0 -0
package/dist/bin/api-server.js
CHANGED
|
@@ -17,8 +17,8 @@ loadDotEnv();
|
|
|
17
17
|
async function main() {
|
|
18
18
|
const [{ serve }, { app }, { startWorker }, { migrate }] = await Promise.all([
|
|
19
19
|
import("@hono/node-server"),
|
|
20
|
-
import("../server-
|
|
21
|
-
import("../worker-
|
|
20
|
+
import("../server-V5XMVRYE.js"),
|
|
21
|
+
import("../worker-UT4ZQU2T.js"),
|
|
22
22
|
import("../db-YWCNHBLH.js")
|
|
23
23
|
]);
|
|
24
24
|
const PORT = parseInt(process.env.PORT ?? "3001");
|
|
@@ -7,17 +7,34 @@ var import_node_os2 = require("os");
|
|
|
7
7
|
var import_node_path2 = require("path");
|
|
8
8
|
var import_stdio = require("@modelcontextprotocol/sdk/server/stdio.js");
|
|
9
9
|
|
|
10
|
+
// src/harvest-timeout.ts
// Upper bound (ms) used when capping the client-side timeout; named after the
// Vercel function limit it mirrors.
var VERCEL_FUNCTION_MAX_MS = 3e5;
// Extra time (ms) the client waits beyond the server budget.
var CLIENT_OVER_SERVER_MARGIN_MS = 15e3;
/**
 * Derive the server-side and client-side timeout budgets for a harvest call.
 *
 * @param {number} maxQuestions - Requested question count. Anything that is
 *   not a finite number greater than zero is treated as 30; fractional values
 *   are truncated toward zero.
 * @param {boolean} [serpOnly=false] - When true the smallest server budget is
 *   used regardless of `maxQuestions`.
 * @returns {{serverMs: number, clientMs: number}} `serverMs` is the tiered
 *   server budget; `clientMs` is `serverMs` plus the client margin, capped at
 *   5 seconds below `VERCEL_FUNCTION_MAX_MS`.
 */
function harvestTimeoutBudget(maxQuestions, serpOnly = false) {
  const isUsableCount = Number.isFinite(maxQuestions) && maxQuestions > 0;
  const questionCount = isUsableCount ? Math.trunc(maxQuestions) : 30;

  // Tiered server budget: larger harvests get more time.
  const pickServerMs = () => {
    if (serpOnly) return 11e4;
    if (questionCount <= 50) return 11e4;
    if (questionCount <= 100) return 18e4;
    if (questionCount <= 150) return 24e4;
    return 28e4;
  };

  const serverMs = pickServerMs();
  const clientCeilingMs = VERCEL_FUNCTION_MAX_MS - 5e3;
  const clientMs = Math.min(serverMs + CLIENT_OVER_SERVER_MARGIN_MS, clientCeilingMs);
  return { serverMs, clientMs };
}
|
|
23
|
+
|
|
10
24
|
// src/mcp/http-mcp-tool-executor.ts
|
|
11
25
|
var HttpMcpToolExecutor = class {
|
|
12
26
|
baseUrl;
|
|
13
27
|
apiKey;
|
|
14
28
|
timeoutMs;
|
|
29
|
+
httpTimeoutOverrideMs;
|
|
15
30
|
serpIntelligenceTimeoutMs;
|
|
16
31
|
constructor(baseUrl2, apiKey2) {
|
|
17
32
|
this.baseUrl = baseUrl2.replace(/\/$/, "");
|
|
18
33
|
this.apiKey = apiKey2;
|
|
19
|
-
const
|
|
20
|
-
|
|
34
|
+
const rawOverride = process.env.MCP_SCRAPER_HTTP_TIMEOUT_MS;
|
|
35
|
+
const parsedOverride = rawOverride === void 0 ? NaN : Number(rawOverride);
|
|
36
|
+
this.httpTimeoutOverrideMs = Number.isFinite(parsedOverride) && parsedOverride > 0 ? parsedOverride : null;
|
|
37
|
+
this.timeoutMs = this.httpTimeoutOverrideMs ?? 11e4;
|
|
21
38
|
const configuredSerpIntelligenceTimeoutMs = Number(process.env.MCP_SCRAPER_SERP_INTELLIGENCE_HTTP_TIMEOUT_MS ?? this.timeoutMs);
|
|
22
39
|
this.serpIntelligenceTimeoutMs = Number.isFinite(configuredSerpIntelligenceTimeoutMs) && configuredSerpIntelligenceTimeoutMs > 0 ? configuredSerpIntelligenceTimeoutMs : this.timeoutMs;
|
|
23
40
|
}
|
|
@@ -59,10 +76,12 @@ var HttpMcpToolExecutor = class {
|
|
|
59
76
|
}
|
|
60
77
|
}
|
|
61
78
|
harvestPaa(input) {
|
|
62
|
-
|
|
79
|
+
const timeoutMs = this.httpTimeoutOverrideMs ?? harvestTimeoutBudget(input.maxQuestions ?? 30).clientMs;
|
|
80
|
+
return this.call("/harvest/sync", input, timeoutMs);
|
|
63
81
|
}
|
|
64
82
|
searchSerp(input) {
|
|
65
|
-
|
|
83
|
+
const timeoutMs = this.httpTimeoutOverrideMs ?? harvestTimeoutBudget(0, true).clientMs;
|
|
84
|
+
return this.call("/harvest/sync", { ...input, serpOnly: true }, timeoutMs);
|
|
66
85
|
}
|
|
67
86
|
extractUrl(input) {
|
|
68
87
|
return this.call("/extract-url", input);
|
|
@@ -223,9 +242,12 @@ function reportTitle(full) {
|
|
|
223
242
|
const title = full.split("\n").find((line) => line.startsWith("# "));
|
|
224
243
|
return title?.replace(/^#\s+/, "").trim() || "MCP Scraper Report";
|
|
225
244
|
}
|
|
245
|
+
/**
 * Resolve the directory under which reports and screenshots are written.
 *
 * @returns {string} The trimmed MCP_SCRAPER_OUTPUT_DIR value when it is set
 *   and non-blank; otherwise `<home>/Downloads/mcp-scraper`.
 */
function outputBaseDir() {
  const configuredDir = process.env.MCP_SCRAPER_OUTPUT_DIR?.trim();
  if (configuredDir) {
    return configuredDir;
  }
  const home = (0, import_node_os.homedir)();
  return (0, import_node_path.join)(home, "Downloads", "mcp-scraper");
}
|
|
226
248
|
function saveFullReport(full) {
|
|
227
249
|
if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
|
|
228
|
-
const outDir =
|
|
250
|
+
const outDir = outputBaseDir();
|
|
229
251
|
try {
|
|
230
252
|
(0, import_node_fs.mkdirSync)(outDir, { recursive: true });
|
|
231
253
|
const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
@@ -236,6 +258,20 @@ function saveFullReport(full) {
|
|
|
236
258
|
return null;
|
|
237
259
|
}
|
|
238
260
|
}
|
|
261
|
+
/**
 * Best-effort write of a base64-encoded screenshot to
 * `<outputBaseDir()>/screenshots/<ISO-timestamp>-<url-slug>.png`.
 *
 * @param {string} base64 - Base64-encoded image payload.
 * @param {string} url - Page URL; used only to build a readable file-name slug.
 * @returns {string|null} Path of the written file, or null when saving is
 *   disabled (MCP_SCRAPER_SAVE_REPORTS === "false"), the payload decodes to
 *   zero bytes, or any step throws.
 */
function persistScreenshotLocally(base64, url) {
  if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
  try {
    // Decode before touching the filesystem so an empty or undecodable payload
    // never leaves a zero-byte .png behind that is then reported as saved.
    const imageBytes = Buffer.from(base64, "base64");
    if (imageBytes.length === 0) return null;

    const dir = (0, import_node_path.join)(outputBaseDir(), "screenshots");
    (0, import_node_fs.mkdirSync)(dir, { recursive: true });
    const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
    // Slug: scheme removed, non-alphanumerics collapsed to "-", capped at 60
    // characters. Trim again after the cut (it can end on a "-") and fall
    // back to a fixed name when the URL has no usable characters.
    const slug = url
      .replace(/^https?:\/\//, "")
      .replace(/[^a-z0-9]+/gi, "-")
      .replace(/^-+|-+$/g, "")
      .slice(0, 60)
      .replace(/-+$/, "") || "screenshot";
    const filePath = (0, import_node_path.join)(dir, `${stamp}-${slug}.png`);
    (0, import_node_fs.writeFileSync)(filePath, imageBytes);
    return filePath;
  } catch {
    // Deliberate best-effort: the caller reports "disk write unavailable"
    // when this returns null.
    return null;
  }
}
|
|
239
275
|
function oneBlock(content) {
|
|
240
276
|
const filePath = saveFullReport(content);
|
|
241
277
|
const text = filePath ? `${content}
|
|
@@ -456,6 +492,7 @@ function formatExtractUrl(raw, input) {
|
|
|
456
492
|
const bodyMd = d.bodyMarkdown ?? "";
|
|
457
493
|
const schema = d.schema;
|
|
458
494
|
const screenshotMeta = d.screenshot;
|
|
495
|
+
const screenshotPath = screenshotMeta?.base64 ? persistScreenshotLocally(screenshotMeta.base64, url) : null;
|
|
459
496
|
const branding = d.branding;
|
|
460
497
|
const media = d.media;
|
|
461
498
|
const h1Lines = headings.filter((h) => h.level === 1).map((h) => `- ${h.text}`).join("\n");
|
|
@@ -482,7 +519,7 @@ ${[h1Lines, h2Lines].filter(Boolean).join("\n")}` : "";
|
|
|
482
519
|
${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
|
|
483
520
|
const screenshotSection = screenshotMeta ? `
|
|
484
521
|
## Screenshot
|
|
485
|
-
- **File:** ${
|
|
522
|
+
- **File:** ${screenshotPath ?? "(returned inline only \u2014 disk write unavailable in this environment)"}
|
|
486
523
|
- **Size:** ${(screenshotMeta.sizeBytes / 1024).toFixed(1)} KB
|
|
487
524
|
- **Device:** ${screenshotMeta.device}` : "";
|
|
488
525
|
const brandingSection = branding ? [
|
|
@@ -511,17 +548,13 @@ ${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
|
|
|
511
548
|
**${title}**
|
|
512
549
|
${headingSection}${kpoSection}${brandingSection}${bodySection}${screenshotSection}${mediaSection}${tips}`;
|
|
513
550
|
const textResult = oneBlock(full);
|
|
514
|
-
if (screenshotMeta?.
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
]
|
|
522
|
-
};
|
|
523
|
-
} catch {
|
|
524
|
-
}
|
|
551
|
+
if (screenshotMeta?.base64) {
|
|
552
|
+
return {
|
|
553
|
+
content: [
|
|
554
|
+
...textResult.content,
|
|
555
|
+
{ type: "image", data: screenshotMeta.base64, mimeType: "image/png" }
|
|
556
|
+
]
|
|
557
|
+
};
|
|
525
558
|
}
|
|
526
559
|
return textResult;
|
|
527
560
|
}
|
|
@@ -721,7 +754,7 @@ function formatFacebookAdSearch(raw, input) {
|
|
|
721
754
|
const d = parsed.data;
|
|
722
755
|
const advertisers = d.results ?? d.advertisers ?? [];
|
|
723
756
|
const rows = advertisers.map(
|
|
724
|
-
(a, i) => `| ${i + 1} | ${cell(a.name)} | ${a.adCount ?? "\u2014"} | \`${a.libraryId ?? "\u2014"}\` |`
|
|
757
|
+
(a, i) => `| ${i + 1} | ${cell(a.pageName ?? a.name)} | ${a.adCount ?? "\u2014"} | \`${a.sampleLibraryId ?? a.libraryId ?? "\u2014"}\` |`
|
|
725
758
|
).join("\n");
|
|
726
759
|
const full = [
|
|
727
760
|
`# Facebook Ad Library Search: "${input.query}"`,
|