mcp-scraper 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,7 +17,7 @@ loadDotEnv();
17
17
  async function main() {
18
18
  const [{ serve }, { app }, { startWorker }, { migrate }] = await Promise.all([
19
19
  import("@hono/node-server"),
20
- import("../server-KUF3QJC7.js"),
20
+ import("../server-2Y27U4TO.js"),
21
21
  import("../worker-UT4ZQU2T.js"),
22
22
  import("../db-YWCNHBLH.js")
23
23
  ]);
@@ -7,17 +7,34 @@ var import_node_os2 = require("os");
7
7
  var import_node_path2 = require("path");
8
8
  var import_stdio = require("@modelcontextprotocol/sdk/server/stdio.js");
9
9
 
10
// src/harvest-timeout.ts

// Ceiling (ms) against which the client timeout is capped. Presumably the
// hosting platform's per-invocation limit, going by the name — confirm
// against the deployment configuration.
var VERCEL_FUNCTION_MAX_MS = 3e5;
// Extra time (ms) the HTTP client waits beyond the server-side budget.
var CLIENT_OVER_SERVER_MARGIN_MS = 15e3;
// Headroom (ms) kept below VERCEL_FUNCTION_MAX_MS when capping the client timeout.
var CLIENT_UNDER_CEILING_HEADROOM_MS = 5e3;
// Question count assumed when the caller supplies no usable number.
var DEFAULT_HARVEST_QUESTIONS = 30;
// Server budget (ms) applied to every SERP-only request, regardless of size.
var SERP_ONLY_SERVER_MS = 11e4;
// Server budget tiers, ordered by ascending question count. The final tier
// is open-ended so every finite request size matches exactly one entry.
var HARVEST_SERVER_BUDGET_TIERS = [
  { upTo: 50, serverMs: 11e4 },
  { upTo: 100, serverMs: 18e4 },
  { upTo: 150, serverMs: 24e4 },
  { upTo: Infinity, serverMs: 28e4 }
];

/**
 * Computes the time budgets for one harvest request.
 *
 * @param {number} maxQuestions - Requested number of questions. Values that
 *   are not finite numbers greater than zero fall back to the default (30);
 *   fractional values are truncated.
 * @param {boolean} [serpOnly=false] - When true the smallest server budget is
 *   used no matter how many questions were requested.
 * @returns {{ serverMs: number, clientMs: number }} `serverMs` is the
 *   server-side budget; `clientMs` is that budget plus the client margin,
 *   capped so it never exceeds the ceiling minus the headroom.
 */
function harvestTimeoutBudget(maxQuestions, serpOnly = false) {
  const requested = Number.isFinite(maxQuestions) && maxQuestions > 0
    ? Math.trunc(maxQuestions)
    : DEFAULT_HARVEST_QUESTIONS;
  // `requested` is always finite here, so the open-ended last tier guarantees a match.
  const serverMs = serpOnly
    ? SERP_ONLY_SERVER_MS
    : HARVEST_SERVER_BUDGET_TIERS.find((tier) => requested <= tier.upTo).serverMs;
  const clientMs = Math.min(
    serverMs + CLIENT_OVER_SERVER_MARGIN_MS,
    VERCEL_FUNCTION_MAX_MS - CLIENT_UNDER_CEILING_HEADROOM_MS
  );
  return { serverMs, clientMs };
}
23
+
10
24
  // src/mcp/http-mcp-tool-executor.ts
11
25
  var HttpMcpToolExecutor = class {
12
26
  baseUrl;
13
27
  apiKey;
14
28
  timeoutMs;
29
+ httpTimeoutOverrideMs;
15
30
  serpIntelligenceTimeoutMs;
16
31
  constructor(baseUrl2, apiKey2) {
17
32
  this.baseUrl = baseUrl2.replace(/\/$/, "");
18
33
  this.apiKey = apiKey2;
19
- const configuredTimeoutMs = Number(process.env.MCP_SCRAPER_HTTP_TIMEOUT_MS ?? 11e4);
20
- this.timeoutMs = Number.isFinite(configuredTimeoutMs) && configuredTimeoutMs > 0 ? configuredTimeoutMs : 11e4;
34
+ const rawOverride = process.env.MCP_SCRAPER_HTTP_TIMEOUT_MS;
35
+ const parsedOverride = rawOverride === void 0 ? NaN : Number(rawOverride);
36
+ this.httpTimeoutOverrideMs = Number.isFinite(parsedOverride) && parsedOverride > 0 ? parsedOverride : null;
37
+ this.timeoutMs = this.httpTimeoutOverrideMs ?? 11e4;
21
38
  const configuredSerpIntelligenceTimeoutMs = Number(process.env.MCP_SCRAPER_SERP_INTELLIGENCE_HTTP_TIMEOUT_MS ?? this.timeoutMs);
22
39
  this.serpIntelligenceTimeoutMs = Number.isFinite(configuredSerpIntelligenceTimeoutMs) && configuredSerpIntelligenceTimeoutMs > 0 ? configuredSerpIntelligenceTimeoutMs : this.timeoutMs;
23
40
  }
@@ -59,10 +76,12 @@ var HttpMcpToolExecutor = class {
59
76
  }
60
77
  }
61
78
  harvestPaa(input) {
62
- return this.call("/harvest/sync", input);
79
+ const timeoutMs = this.httpTimeoutOverrideMs ?? harvestTimeoutBudget(input.maxQuestions ?? 30).clientMs;
80
+ return this.call("/harvest/sync", input, timeoutMs);
63
81
  }
64
82
  searchSerp(input) {
65
- return this.call("/harvest/sync", { ...input, serpOnly: true });
83
+ const timeoutMs = this.httpTimeoutOverrideMs ?? harvestTimeoutBudget(0, true).clientMs;
84
+ return this.call("/harvest/sync", { ...input, serpOnly: true }, timeoutMs);
66
85
  }
67
86
  extractUrl(input) {
68
87
  return this.call("/extract-url", input);
@@ -110,7 +129,7 @@ var import_zod = require("zod");
110
129
  var HarvestPaaInputSchema = {
111
130
  query: import_zod.z.string().min(1).describe('Core search topic only. If the user says "best hvac company in Denver CO", use query="best hvac company" and location="Denver, CO". Do not include the location in query when it can be separated.'),
112
131
  location: import_zod.z.string().optional().describe('City, region, or country for geo-targeted results, inferred from the user request when present, e.g. "Denver, CO", "Tokyo, Japan", "London, UK".'),
113
- maxQuestions: import_zod.z.number().int().min(1).max(150).default(30).describe("Number of PAA questions to extract. Default 30. Maximum 150. Use 10 for quick probes, 30 for normal research, 100-150 when the user asks for everything/full/deep research. Credits are charged by extracted question; unused request hold is refunded."),
132
+ maxQuestions: import_zod.z.number().int().min(1).max(200).default(30).describe("Number of PAA questions to extract. Default 30. Maximum 200. Use 10 for quick probes, 30 for normal research, 100-200 when the user asks for everything/full/deep research. Larger harvests get a longer server time budget (151-200 questions \u2192 up to 280s). Credits are charged by extracted question; unused request hold is refunded."),
114
133
  gl: import_zod.z.string().length(2).default("us").describe("Google country code inferred from location or user language. Examples: United States us, United Kingdom gb, Japan jp, Canada ca, Australia au."),
115
134
  hl: import_zod.z.string().default("en").describe("Google interface/content language inferred from the user request. Use en unless the user asks for another language or locale."),
116
135
  device: import_zod.z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use desktop by default; use mobile only when the user asks for mobile rankings."),
@@ -223,9 +242,12 @@ function reportTitle(full) {
223
242
  const title = full.split("\n").find((line) => line.startsWith("# "));
224
243
  return title?.replace(/^#\s+/, "").trim() || "MCP Scraper Report";
225
244
  }
245
/**
 * Resolves the base directory used for files this module writes to disk
 * (saved reports and the "screenshots" subfolder).
 *
 * @returns {string} The trimmed MCP_SCRAPER_OUTPUT_DIR environment value when
 *   it is set and non-empty; otherwise "<home>/Downloads/mcp-scraper".
 */
function outputBaseDir() {
  const configuredDir = process.env.MCP_SCRAPER_OUTPUT_DIR?.trim();
  if (configuredDir) {
    return configuredDir;
  }
  const homeDir = (0, import_node_os.homedir)();
  return (0, import_node_path.join)(homeDir, "Downloads", "mcp-scraper");
}
226
248
  function saveFullReport(full) {
227
249
  if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
228
- const outDir = process.env.MCP_SCRAPER_OUTPUT_DIR?.trim() || (0, import_node_path.join)((0, import_node_os.homedir)(), "Downloads", "mcp-scraper");
250
+ const outDir = outputBaseDir();
229
251
  try {
230
252
  (0, import_node_fs.mkdirSync)(outDir, { recursive: true });
231
253
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
@@ -236,6 +258,20 @@ function saveFullReport(full) {
236
258
  return null;
237
259
  }
238
260
  }
261
/**
 * Best-effort write of a base64-encoded PNG screenshot into the
 * "screenshots" folder under the output base directory.
 *
 * @param {string} base64 - Base64-encoded image data.
 * @param {string} url - Page URL used to derive a readable file name. When it
 *   is not a string, or contains no alphanumeric characters, the generic slug
 *   "screenshot" is used so the image is still saved.
 * @returns {string|null} Path of the written file, or null when saving is
 *   disabled (MCP_SCRAPER_SAVE_REPORTS === "false") or the write fails.
 */
function persistScreenshotLocally(base64, url) {
  if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
  try {
    const dir = (0, import_node_path.join)(outputBaseDir(), "screenshots");
    (0, import_node_fs.mkdirSync)(dir, { recursive: true });
    // ":" and "." are replaced so the timestamp is safe to use in a file name.
    const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
    // Drop the scheme, collapse non-alphanumeric runs to "-", trim edge dashes,
    // and cap the length at 60 characters.
    const urlSlug = typeof url === "string"
      ? url.replace(/^https?:\/\//, "").replace(/[^a-z0-9]+/gi, "-").replace(/^-+|-+$/g, "").slice(0, 60)
      : "";
    // Fall back to a generic slug rather than throwing on a non-string url
    // (which would discard the screenshot) or emitting "<stamp>-.png".
    const slug = urlSlug || "screenshot";
    const filePath = (0, import_node_path.join)(dir, `${stamp}-${slug}.png`);
    (0, import_node_fs.writeFileSync)(filePath, Buffer.from(base64, "base64"));
    return filePath;
  } catch {
    // Deliberate best-effort: the caller reports a null path as
    // "disk write unavailable" and still returns the image inline.
    return null;
  }
}
239
275
  function oneBlock(content) {
240
276
  const filePath = saveFullReport(content);
241
277
  const text = filePath ? `${content}
@@ -456,6 +492,7 @@ function formatExtractUrl(raw, input) {
456
492
  const bodyMd = d.bodyMarkdown ?? "";
457
493
  const schema = d.schema;
458
494
  const screenshotMeta = d.screenshot;
495
+ const screenshotPath = screenshotMeta?.base64 ? persistScreenshotLocally(screenshotMeta.base64, url) : null;
459
496
  const branding = d.branding;
460
497
  const media = d.media;
461
498
  const h1Lines = headings.filter((h) => h.level === 1).map((h) => `- ${h.text}`).join("\n");
@@ -482,7 +519,7 @@ ${[h1Lines, h2Lines].filter(Boolean).join("\n")}` : "";
482
519
  ${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
483
520
  const screenshotSection = screenshotMeta ? `
484
521
  ## Screenshot
485
- - **File:** ${screenshotMeta.savedPath}
522
+ - **File:** ${screenshotPath ?? "(returned inline only \u2014 disk write unavailable in this environment)"}
486
523
  - **Size:** ${(screenshotMeta.sizeBytes / 1024).toFixed(1)} KB
487
524
  - **Device:** ${screenshotMeta.device}` : "";
488
525
  const brandingSection = branding ? [
@@ -511,17 +548,13 @@ ${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
511
548
  **${title}**
512
549
  ${headingSection}${kpoSection}${brandingSection}${bodySection}${screenshotSection}${mediaSection}${tips}`;
513
550
  const textResult = oneBlock(full);
514
- if (screenshotMeta?.savedPath) {
515
- try {
516
- const imgBuf = (0, import_node_fs.readFileSync)(screenshotMeta.savedPath);
517
- return {
518
- content: [
519
- ...textResult.content,
520
- { type: "image", data: imgBuf.toString("base64"), mimeType: "image/png" }
521
- ]
522
- };
523
- } catch {
524
- }
551
+ if (screenshotMeta?.base64) {
552
+ return {
553
+ content: [
554
+ ...textResult.content,
555
+ { type: "image", data: screenshotMeta.base64, mimeType: "image/png" }
556
+ ]
557
+ };
525
558
  }
526
559
  return textResult;
527
560
  }