mcp-scraper 0.1.3 → 0.1.5

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -17,8 +17,8 @@ loadDotEnv();
17
17
  async function main() {
18
18
  const [{ serve }, { app }, { startWorker }, { migrate }] = await Promise.all([
19
19
  import("@hono/node-server"),
20
- import("../server-6CHHLOII.js"),
21
- import("../worker-D4D2YQTA.js"),
20
+ import("../server-V5XMVRYE.js"),
21
+ import("../worker-UT4ZQU2T.js"),
22
22
  import("../db-YWCNHBLH.js")
23
23
  ]);
24
24
  const PORT = parseInt(process.env.PORT ?? "3001");
@@ -7,17 +7,34 @@ var import_node_os2 = require("os");
7
7
  var import_node_path2 = require("path");
8
8
  var import_stdio = require("@modelcontextprotocol/sdk/server/stdio.js");
9
9
 
10
+ // src/harvest-timeout.ts
11
+ var VERCEL_FUNCTION_MAX_MS = 3e5;
12
+ var CLIENT_OVER_SERVER_MARGIN_MS = 15e3;
13
+ function harvestTimeoutBudget(maxQuestions, serpOnly = false) {
14
+ const requested = Number.isFinite(maxQuestions) && maxQuestions > 0 ? Math.trunc(maxQuestions) : 30;
15
+ let serverMs;
16
+ if (serpOnly || requested <= 50) serverMs = 11e4;
17
+ else if (requested <= 100) serverMs = 18e4;
18
+ else if (requested <= 150) serverMs = 24e4;
19
+ else serverMs = 28e4;
20
+ const clientMs = Math.min(serverMs + CLIENT_OVER_SERVER_MARGIN_MS, VERCEL_FUNCTION_MAX_MS - 5e3);
21
+ return { serverMs, clientMs };
22
+ }
23
+
10
24
  // src/mcp/http-mcp-tool-executor.ts
11
25
  var HttpMcpToolExecutor = class {
12
26
  baseUrl;
13
27
  apiKey;
14
28
  timeoutMs;
29
+ httpTimeoutOverrideMs;
15
30
  serpIntelligenceTimeoutMs;
16
31
  constructor(baseUrl2, apiKey2) {
17
32
  this.baseUrl = baseUrl2.replace(/\/$/, "");
18
33
  this.apiKey = apiKey2;
19
- const configuredTimeoutMs = Number(process.env.MCP_SCRAPER_HTTP_TIMEOUT_MS ?? 11e4);
20
- this.timeoutMs = Number.isFinite(configuredTimeoutMs) && configuredTimeoutMs > 0 ? configuredTimeoutMs : 11e4;
34
+ const rawOverride = process.env.MCP_SCRAPER_HTTP_TIMEOUT_MS;
35
+ const parsedOverride = rawOverride === void 0 ? NaN : Number(rawOverride);
36
+ this.httpTimeoutOverrideMs = Number.isFinite(parsedOverride) && parsedOverride > 0 ? parsedOverride : null;
37
+ this.timeoutMs = this.httpTimeoutOverrideMs ?? 11e4;
21
38
  const configuredSerpIntelligenceTimeoutMs = Number(process.env.MCP_SCRAPER_SERP_INTELLIGENCE_HTTP_TIMEOUT_MS ?? this.timeoutMs);
22
39
  this.serpIntelligenceTimeoutMs = Number.isFinite(configuredSerpIntelligenceTimeoutMs) && configuredSerpIntelligenceTimeoutMs > 0 ? configuredSerpIntelligenceTimeoutMs : this.timeoutMs;
23
40
  }
@@ -59,10 +76,12 @@ var HttpMcpToolExecutor = class {
59
76
  }
60
77
  }
61
78
  harvestPaa(input) {
62
- return this.call("/harvest/sync", input);
79
+ const timeoutMs = this.httpTimeoutOverrideMs ?? harvestTimeoutBudget(input.maxQuestions ?? 30).clientMs;
80
+ return this.call("/harvest/sync", input, timeoutMs);
63
81
  }
64
82
  searchSerp(input) {
65
- return this.call("/harvest/sync", { ...input, serpOnly: true });
83
+ const timeoutMs = this.httpTimeoutOverrideMs ?? harvestTimeoutBudget(0, true).clientMs;
84
+ return this.call("/harvest/sync", { ...input, serpOnly: true }, timeoutMs);
66
85
  }
67
86
  extractUrl(input) {
68
87
  return this.call("/extract-url", input);
@@ -223,9 +242,12 @@ function reportTitle(full) {
223
242
  const title = full.split("\n").find((line) => line.startsWith("# "));
224
243
  return title?.replace(/^#\s+/, "").trim() || "MCP Scraper Report";
225
244
  }
245
+ function outputBaseDir() {
246
+ return process.env.MCP_SCRAPER_OUTPUT_DIR?.trim() || (0, import_node_path.join)((0, import_node_os.homedir)(), "Downloads", "mcp-scraper");
247
+ }
226
248
  function saveFullReport(full) {
227
249
  if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
228
- const outDir = process.env.MCP_SCRAPER_OUTPUT_DIR?.trim() || (0, import_node_path.join)((0, import_node_os.homedir)(), "Downloads", "mcp-scraper");
250
+ const outDir = outputBaseDir();
229
251
  try {
230
252
  (0, import_node_fs.mkdirSync)(outDir, { recursive: true });
231
253
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
@@ -236,6 +258,20 @@ function saveFullReport(full) {
236
258
  return null;
237
259
  }
238
260
  }
261
+ function persistScreenshotLocally(base64, url) {
262
+ if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
263
+ try {
264
+ const dir = (0, import_node_path.join)(outputBaseDir(), "screenshots");
265
+ (0, import_node_fs.mkdirSync)(dir, { recursive: true });
266
+ const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
267
+ const slug = url.replace(/^https?:\/\//, "").replace(/[^a-z0-9]+/gi, "-").replace(/^-+|-+$/g, "").slice(0, 60);
268
+ const filePath = (0, import_node_path.join)(dir, `${stamp}-${slug}.png`);
269
+ (0, import_node_fs.writeFileSync)(filePath, Buffer.from(base64, "base64"));
270
+ return filePath;
271
+ } catch {
272
+ return null;
273
+ }
274
+ }
239
275
  function oneBlock(content) {
240
276
  const filePath = saveFullReport(content);
241
277
  const text = filePath ? `${content}
@@ -456,6 +492,7 @@ function formatExtractUrl(raw, input) {
456
492
  const bodyMd = d.bodyMarkdown ?? "";
457
493
  const schema = d.schema;
458
494
  const screenshotMeta = d.screenshot;
495
+ const screenshotPath = screenshotMeta?.base64 ? persistScreenshotLocally(screenshotMeta.base64, url) : null;
459
496
  const branding = d.branding;
460
497
  const media = d.media;
461
498
  const h1Lines = headings.filter((h) => h.level === 1).map((h) => `- ${h.text}`).join("\n");
@@ -482,7 +519,7 @@ ${[h1Lines, h2Lines].filter(Boolean).join("\n")}` : "";
482
519
  ${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
483
520
  const screenshotSection = screenshotMeta ? `
484
521
  ## Screenshot
485
- - **File:** ${screenshotMeta.savedPath}
522
+ - **File:** ${screenshotPath ?? "(returned inline only \u2014 disk write unavailable in this environment)"}
486
523
  - **Size:** ${(screenshotMeta.sizeBytes / 1024).toFixed(1)} KB
487
524
  - **Device:** ${screenshotMeta.device}` : "";
488
525
  const brandingSection = branding ? [
@@ -511,17 +548,13 @@ ${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
511
548
  **${title}**
512
549
  ${headingSection}${kpoSection}${brandingSection}${bodySection}${screenshotSection}${mediaSection}${tips}`;
513
550
  const textResult = oneBlock(full);
514
- if (screenshotMeta?.savedPath) {
515
- try {
516
- const imgBuf = (0, import_node_fs.readFileSync)(screenshotMeta.savedPath);
517
- return {
518
- content: [
519
- ...textResult.content,
520
- { type: "image", data: imgBuf.toString("base64"), mimeType: "image/png" }
521
- ]
522
- };
523
- } catch {
524
- }
551
+ if (screenshotMeta?.base64) {
552
+ return {
553
+ content: [
554
+ ...textResult.content,
555
+ { type: "image", data: screenshotMeta.base64, mimeType: "image/png" }
556
+ ]
557
+ };
525
558
  }
526
559
  return textResult;
527
560
  }
@@ -721,7 +754,7 @@ function formatFacebookAdSearch(raw, input) {
721
754
  const d = parsed.data;
722
755
  const advertisers = d.results ?? d.advertisers ?? [];
723
756
  const rows = advertisers.map(
724
- (a, i) => `| ${i + 1} | ${cell(a.name)} | ${a.adCount ?? "\u2014"} | \`${a.libraryId ?? "\u2014"}\` |`
757
+ (a, i) => `| ${i + 1} | ${cell(a.pageName ?? a.name)} | ${a.adCount ?? "\u2014"} | \`${a.sampleLibraryId ?? a.libraryId ?? "\u2014"}\` |`
725
758
  ).join("\n");
726
759
  const full = [
727
760
  `# Facebook Ad Library Search: "${input.query}"`,