mcp-scraper 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,6 +30,26 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
30
30
  mod
31
31
  ));
32
32
 
33
+ // src/harvest-timeout.ts
34
+ function harvestTimeoutBudget(maxQuestions, serpOnly = false) {
35
+ const requested = Number.isFinite(maxQuestions) && maxQuestions > 0 ? Math.trunc(maxQuestions) : 30;
36
+ let serverMs;
37
+ if (serpOnly || requested <= 50) serverMs = 11e4;
38
+ else if (requested <= 100) serverMs = 18e4;
39
+ else if (requested <= 150) serverMs = 24e4;
40
+ else serverMs = 28e4;
41
+ const clientMs = Math.min(serverMs + CLIENT_OVER_SERVER_MARGIN_MS, VERCEL_FUNCTION_MAX_MS - 5e3);
42
+ return { serverMs, clientMs };
43
+ }
44
+ var VERCEL_FUNCTION_MAX_MS, CLIENT_OVER_SERVER_MARGIN_MS;
45
+ var init_harvest_timeout = __esm({
46
+ "src/harvest-timeout.ts"() {
47
+ "use strict";
48
+ VERCEL_FUNCTION_MAX_MS = 3e5;
49
+ CLIENT_OVER_SERVER_MARGIN_MS = 15e3;
50
+ }
51
+ });
52
+
33
53
  // src/blog/registry.ts
34
54
  var posts;
35
55
  var init_registry = __esm({
@@ -3825,25 +3845,73 @@ function firstFont(fontFamily) {
3825
3845
  const first = fontFamily.split(",")[0].trim().replace(/['"]/g, "");
3826
3846
  return first || null;
3827
3847
  }
3848
+ function dominantColor(freq) {
3849
+ return Object.entries(freq).filter(([hex]) => !isTransparentOrWhite(hex) && hex !== "#000000" && hex !== "#020101").sort((a, b) => b[1] - a[1])[0]?.[0] ?? null;
3850
+ }
3828
3851
  async function extractBrandingFromPage(page) {
3829
3852
  const evalScript = `
3830
3853
  (function() {
3831
3854
  function cs(el) { return el ? window.getComputedStyle(el) : null; }
3855
+ function toHex(rgb) {
3856
+ var m = rgb && rgb.match(/rgba?\\((\\d+),\\s*(\\d+),\\s*(\\d+)/);
3857
+ if (!m) return null;
3858
+ return '#' + [m[1],m[2],m[3]].map(function(v){ return ('0'+parseInt(v).toString(16)).slice(-2); }).join('');
3859
+ }
3860
+ function isUsable(hex) {
3861
+ if (!hex) return false;
3862
+ if (hex === '#000000' || hex === '#020101' || hex === '#ffffff' || hex === '#fffffe') return false;
3863
+ var r=parseInt(hex.slice(1,3),16), g=parseInt(hex.slice(3,5),16), b=parseInt(hex.slice(5,7),16);
3864
+ return (0.2126*r + 0.7152*g + 0.0722*b) <= 230;
3865
+ }
3866
+
3832
3867
  var navEl = document.querySelector('nav, header, [role="banner"]');
3833
3868
  var bodyEl = document.body;
3834
3869
  var h1El = document.querySelector('h1');
3835
3870
  var btnEl = document.querySelector(
3836
3871
  'a.btn-primary, button.btn-primary, .btn-primary, .cta-btn,' +
3837
3872
  'a.button--primary, button.button--primary, [class*="btn-cta"],' +
3838
- '[class*="cta-button"], .wp-block-button__link, [class*="hero"] a'
3873
+ '[class*="cta-button"], .wp-block-button__link, [class*="hero"] a,' +
3874
+ '.elementor-button, .elementor-button-link,' +
3875
+ '.et_pb_button,' +
3876
+ '.fl-button,' +
3877
+ '.vc_btn,' +
3878
+ '[class*="cta"][href], [class*="get-started"], [class*="contact-btn"]'
3839
3879
  );
3840
3880
  var navStyle = cs(navEl);
3841
3881
  var bodyStyle = cs(bodyEl);
3842
3882
  var h1Style = cs(h1El);
3843
3883
  var btnStyle = cs(btnEl);
3844
- var pageHost = window.location.hostname.replace(/^www./, '');
3884
+
3885
+ var svgFreq = {};
3886
+ var svgScope = navEl || document.querySelector('header, [role="banner"]');
3887
+ if (svgScope) {
3888
+ var svgEls = svgScope.querySelectorAll('svg *, [fill], path, circle, rect, polygon, polyline');
3889
+ for (var si = 0; si < svgEls.length; si++) {
3890
+ var svgEl = svgEls[si];
3891
+ var fillComp = cs(svgEl) ? cs(svgEl).fill : null;
3892
+ var fillAttr = svgEl.getAttribute('fill');
3893
+ var fillHex = null;
3894
+ if (fillComp && fillComp !== 'none') { fillHex = toHex(fillComp); }
3895
+ else if (fillAttr && fillAttr !== 'none' && fillAttr.startsWith('#')) { fillHex = fillAttr; }
3896
+ if (fillHex && isUsable(fillHex)) { svgFreq[fillHex] = (svgFreq[fillHex] || 0) + 1; }
3897
+ }
3898
+ }
3899
+
3900
+ var navChildBgFreq = {};
3901
+ if (navEl) {
3902
+ var navChildren = navEl.querySelectorAll('li, a, button, [class*="menu-item"]');
3903
+ for (var ni = 0; ni < navChildren.length; ni++) {
3904
+ var nbg = cs(navChildren[ni]);
3905
+ if (nbg) {
3906
+ var bghex = toHex(nbg.backgroundColor);
3907
+ if (bghex && isUsable(bghex)) { navChildBgFreq[bghex] = (navChildBgFreq[bghex] || 0) + 1; }
3908
+ }
3909
+ }
3910
+ }
3911
+
3912
+ var pageHost = window.location.hostname.replace(/^www\\./, '');
3845
3913
  function isSameDomain(src) {
3846
- try { return new URL(src).hostname.replace(/^www./, '').endsWith(pageHost); } catch { return false; }
3914
+ try { return new URL(src).hostname.replace(/^www\\./, '').endsWith(pageHost); } catch { return false; }
3847
3915
  }
3848
3916
  var logoSelectors = [
3849
3917
  'header img[class*="logo"]', 'nav img[class*="logo"]',
@@ -3866,22 +3934,27 @@ async function extractBrandingFromPage(page) {
3866
3934
  'link[rel~="icon"], link[rel="shortcut icon"], link[rel="apple-touch-icon"]'
3867
3935
  );
3868
3936
  return {
3869
- navBg: navStyle ? navStyle.backgroundColor : null,
3870
- bodyBg: bodyStyle ? bodyStyle.backgroundColor : null,
3871
- bodyColor: bodyStyle ? bodyStyle.color : null,
3872
- h1Color: h1Style ? h1Style.color : null,
3873
- btnBg: btnStyle ? btnStyle.backgroundColor : null,
3874
- bodyFont: bodyStyle ? bodyStyle.fontFamily : null,
3875
- h1Font: h1Style ? h1Style.fontFamily : null,
3876
- logoSrc: logoSrc,
3877
- faviconHref: faviconEl ? faviconEl.href : null,
3937
+ navBg: navStyle ? navStyle.backgroundColor : null,
3938
+ bodyBg: bodyStyle ? bodyStyle.backgroundColor : null,
3939
+ bodyColor: bodyStyle ? bodyStyle.color : null,
3940
+ h1Color: h1Style ? h1Style.color : null,
3941
+ btnBg: btnStyle ? btnStyle.backgroundColor : null,
3942
+ bodyFont: bodyStyle ? bodyStyle.fontFamily : null,
3943
+ h1Font: h1Style ? h1Style.fontFamily : null,
3944
+ logoSrc: logoSrc,
3945
+ faviconHref: faviconEl ? faviconEl.href : null,
3946
+ svgFreq: svgFreq,
3947
+ navChildBgFreq: navChildBgFreq,
3878
3948
  };
3879
3949
  })()
3880
3950
  `;
3881
3951
  const raw = await page.evaluate(evalScript);
3882
3952
  const navBgHex = rgbToHex(raw.navBg ?? "");
3883
3953
  const bodyBgHex = rgbToHex(raw.bodyBg ?? "");
3884
- const primary = !isTransparentOrWhite(navBgHex) ? navBgHex : bodyBgHex;
3954
+ const navBgUsable = navBgHex && !isTransparentOrWhite(navBgHex) && navBgHex !== "#000000" && navBgHex !== "#020101" ? navBgHex : null;
3955
+ const svgPrimary = dominantColor(raw.svgFreq ?? {});
3956
+ const navChildBg = dominantColor(raw.navChildBgFreq ?? {});
3957
+ const primary = navBgUsable ?? svgPrimary ?? navChildBg ?? bodyBgHex;
3885
3958
  const accent = rgbToHex(raw.btnBg ?? "");
3886
3959
  const text = rgbToHex(raw.bodyColor ?? "");
3887
3960
  const heading = rgbToHex(raw.h1Color ?? "");
@@ -15000,9 +15073,12 @@ function reportTitle(full) {
15000
15073
  const title = full.split("\n").find((line) => line.startsWith("# "));
15001
15074
  return title?.replace(/^#\s+/, "").trim() || "MCP Scraper Report";
15002
15075
  }
15076
+ function outputBaseDir() {
15077
+ return process.env.MCP_SCRAPER_OUTPUT_DIR?.trim() || (0, import_node_path6.join)((0, import_node_os3.homedir)(), "Downloads", "mcp-scraper");
15078
+ }
15003
15079
  function saveFullReport(full) {
15004
15080
  if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
15005
- const outDir = process.env.MCP_SCRAPER_OUTPUT_DIR?.trim() || (0, import_node_path6.join)((0, import_node_os3.homedir)(), "Downloads", "mcp-scraper");
15081
+ const outDir = outputBaseDir();
15006
15082
  try {
15007
15083
  (0, import_node_fs4.mkdirSync)(outDir, { recursive: true });
15008
15084
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
@@ -15013,6 +15089,20 @@ function saveFullReport(full) {
15013
15089
  return null;
15014
15090
  }
15015
15091
  }
15092
+ function persistScreenshotLocally(base64, url) {
15093
+ if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
15094
+ try {
15095
+ const dir = (0, import_node_path6.join)(outputBaseDir(), "screenshots");
15096
+ (0, import_node_fs4.mkdirSync)(dir, { recursive: true });
15097
+ const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
15098
+ const slug = url.replace(/^https?:\/\//, "").replace(/[^a-z0-9]+/gi, "-").replace(/^-+|-+$/g, "").slice(0, 60);
15099
+ const filePath = (0, import_node_path6.join)(dir, `${stamp}-${slug}.png`);
15100
+ (0, import_node_fs4.writeFileSync)(filePath, Buffer.from(base64, "base64"));
15101
+ return filePath;
15102
+ } catch {
15103
+ return null;
15104
+ }
15105
+ }
15016
15106
  function oneBlock(content) {
15017
15107
  const filePath = saveFullReport(content);
15018
15108
  const text = filePath ? `${content}
@@ -15233,6 +15323,7 @@ function formatExtractUrl(raw, input) {
15233
15323
  const bodyMd = d.bodyMarkdown ?? "";
15234
15324
  const schema = d.schema;
15235
15325
  const screenshotMeta = d.screenshot;
15326
+ const screenshotPath = screenshotMeta?.base64 ? persistScreenshotLocally(screenshotMeta.base64, url) : null;
15236
15327
  const branding = d.branding;
15237
15328
  const media = d.media;
15238
15329
  const h1Lines = headings.filter((h) => h.level === 1).map((h) => `- ${h.text}`).join("\n");
@@ -15259,7 +15350,7 @@ ${[h1Lines, h2Lines].filter(Boolean).join("\n")}` : "";
15259
15350
  ${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
15260
15351
  const screenshotSection = screenshotMeta ? `
15261
15352
  ## Screenshot
15262
- - **File:** ${screenshotMeta.savedPath}
15353
+ - **File:** ${screenshotPath ?? "(returned inline only \u2014 disk write unavailable in this environment)"}
15263
15354
  - **Size:** ${(screenshotMeta.sizeBytes / 1024).toFixed(1)} KB
15264
15355
  - **Device:** ${screenshotMeta.device}` : "";
15265
15356
  const brandingSection = branding ? [
@@ -15288,17 +15379,13 @@ ${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
15288
15379
  **${title}**
15289
15380
  ${headingSection}${kpoSection}${brandingSection}${bodySection}${screenshotSection}${mediaSection}${tips}`;
15290
15381
  const textResult = oneBlock(full);
15291
- if (screenshotMeta?.savedPath) {
15292
- try {
15293
- const imgBuf = (0, import_node_fs4.readFileSync)(screenshotMeta.savedPath);
15294
- return {
15295
- content: [
15296
- ...textResult.content,
15297
- { type: "image", data: imgBuf.toString("base64"), mimeType: "image/png" }
15298
- ]
15299
- };
15300
- } catch {
15301
- }
15382
+ if (screenshotMeta?.base64) {
15383
+ return {
15384
+ content: [
15385
+ ...textResult.content,
15386
+ { type: "image", data: screenshotMeta.base64, mimeType: "image/png" }
15387
+ ]
15388
+ };
15302
15389
  }
15303
15390
  return textResult;
15304
15391
  }
@@ -15800,16 +15887,20 @@ var HttpMcpToolExecutor;
15800
15887
  var init_http_mcp_tool_executor = __esm({
15801
15888
  "src/mcp/http-mcp-tool-executor.ts"() {
15802
15889
  "use strict";
15890
+ init_harvest_timeout();
15803
15891
  HttpMcpToolExecutor = class {
15804
15892
  baseUrl;
15805
15893
  apiKey;
15806
15894
  timeoutMs;
15895
+ httpTimeoutOverrideMs;
15807
15896
  serpIntelligenceTimeoutMs;
15808
15897
  constructor(baseUrl, apiKey) {
15809
15898
  this.baseUrl = baseUrl.replace(/\/$/, "");
15810
15899
  this.apiKey = apiKey;
15811
- const configuredTimeoutMs = Number(process.env.MCP_SCRAPER_HTTP_TIMEOUT_MS ?? 11e4);
15812
- this.timeoutMs = Number.isFinite(configuredTimeoutMs) && configuredTimeoutMs > 0 ? configuredTimeoutMs : 11e4;
15900
+ const rawOverride = process.env.MCP_SCRAPER_HTTP_TIMEOUT_MS;
15901
+ const parsedOverride = rawOverride === void 0 ? NaN : Number(rawOverride);
15902
+ this.httpTimeoutOverrideMs = Number.isFinite(parsedOverride) && parsedOverride > 0 ? parsedOverride : null;
15903
+ this.timeoutMs = this.httpTimeoutOverrideMs ?? 11e4;
15813
15904
  const configuredSerpIntelligenceTimeoutMs = Number(process.env.MCP_SCRAPER_SERP_INTELLIGENCE_HTTP_TIMEOUT_MS ?? this.timeoutMs);
15814
15905
  this.serpIntelligenceTimeoutMs = Number.isFinite(configuredSerpIntelligenceTimeoutMs) && configuredSerpIntelligenceTimeoutMs > 0 ? configuredSerpIntelligenceTimeoutMs : this.timeoutMs;
15815
15906
  }
@@ -15851,10 +15942,12 @@ var init_http_mcp_tool_executor = __esm({
15851
15942
  }
15852
15943
  }
15853
15944
  harvestPaa(input) {
15854
- return this.call("/harvest/sync", input);
15945
+ const timeoutMs = this.httpTimeoutOverrideMs ?? harvestTimeoutBudget(input.maxQuestions ?? 30).clientMs;
15946
+ return this.call("/harvest/sync", input, timeoutMs);
15855
15947
  }
15856
15948
  searchSerp(input) {
15857
- return this.call("/harvest/sync", { ...input, serpOnly: true });
15949
+ const timeoutMs = this.httpTimeoutOverrideMs ?? harvestTimeoutBudget(0, true).clientMs;
15950
+ return this.call("/harvest/sync", { ...input, serpOnly: true }, timeoutMs);
15858
15951
  }
15859
15952
  extractUrl(input) {
15860
15953
  return this.call("/extract-url", input);
@@ -16501,18 +16594,16 @@ async function checkHarvestLimits(userId, email, extraSlots = 0) {
16501
16594
  if (active >= limit) return { error: `You have ${active} job${active !== 1 ? "s" : ""} running. Your account allows ${limit} concurrent job${limit !== 1 ? "s" : ""}. Wait for one to finish or add a concurrency slot at mcpscraper.dev/billing.` };
16502
16595
  return null;
16503
16596
  }
16504
- var import_resend, import_node_fs5, import_node_os4, import_node_path7, import_hono9, import_hono10, import_factory6, import_cookie, import_stripe2, secureCookies, isProduction2, sessionCookieOptions, requireAllowedOrigin, auth, adminAuth, sessionAuth, app, STRIPE_API_VERSION, BYPASS_EMAILS, SYNC_HARVEST_TIMEOUT_MS;
16597
+ var import_resend, import_hono9, import_hono10, import_factory6, import_cookie, import_stripe2, secureCookies, isProduction2, sessionCookieOptions, requireAllowedOrigin, auth, adminAuth, sessionAuth, app, STRIPE_API_VERSION, BYPASS_EMAILS, SYNC_HARVEST_TIMEOUT_OVERRIDE_MS;
16505
16598
  var init_server = __esm({
16506
16599
  "src/api/server.ts"() {
16507
16600
  "use strict";
16601
+ init_harvest_timeout();
16508
16602
  init_registry();
16509
16603
  init_template();
16510
16604
  init_og();
16511
16605
  import_resend = require("resend");
16512
16606
  init_url_utils();
16513
- import_node_fs5 = require("fs");
16514
- import_node_os4 = require("os");
16515
- import_node_path7 = require("path");
16516
16607
  init_kpo_extractor();
16517
16608
  init_screenshot();
16518
16609
  init_media_extractor();
@@ -16742,7 +16833,11 @@ var init_server = __esm({
16742
16833
  BYPASS_EMAILS = new Set(
16743
16834
  (process.env.HARVEST_LIMIT_BYPASS_EMAILS ?? "").split(",").map((e) => e.trim()).filter(Boolean)
16744
16835
  );
16745
- SYNC_HARVEST_TIMEOUT_MS = Number(process.env.SYNC_HARVEST_TIMEOUT_MS ?? 105e3);
16836
+ SYNC_HARVEST_TIMEOUT_OVERRIDE_MS = (() => {
16837
+ const raw = process.env.SYNC_HARVEST_TIMEOUT_MS;
16838
+ const parsed = raw === void 0 ? NaN : Number(raw);
16839
+ return Number.isFinite(parsed) && parsed > 0 ? parsed : null;
16840
+ })();
16746
16841
  app.post("/harvest", auth, async (c) => {
16747
16842
  const user = c.get("user");
16748
16843
  const raw = await c.req.json().catch(() => ({}));
@@ -16812,9 +16907,10 @@ var init_server = __esm({
16812
16907
  if (!syncOk) return c.json(insufficientBalanceResponse(syncBal, syncCost), 402);
16813
16908
  const jobId = await createRunningJob(user.id, options.query, options);
16814
16909
  const recordAttempt = createHarvestAttemptRecorder(jobId, user.id);
16910
+ const syncTimeoutMs = SYNC_HARVEST_TIMEOUT_OVERRIDE_MS ?? harvestTimeoutBudget(options.maxQuestions, options.serpOnly).serverMs;
16815
16911
  const syncSignal = combineAbortSignals([
16816
16912
  c.req.raw.signal,
16817
- AbortSignal.timeout(Number.isFinite(SYNC_HARVEST_TIMEOUT_MS) && SYNC_HARVEST_TIMEOUT_MS > 0 ? SYNC_HARVEST_TIMEOUT_MS : 105e3)
16913
+ AbortSignal.timeout(syncTimeoutMs)
16818
16914
  ]);
16819
16915
  try {
16820
16916
  const result = await harvest({
@@ -16962,13 +17058,7 @@ var init_server = __esm({
16962
17058
  const brandingData = pageData?.branding ?? null;
16963
17059
  let screenshotMeta = null;
16964
17060
  if (screenshotBuf) {
16965
- const outDir = (0, import_node_path7.join)((0, import_node_os4.homedir)(), "Downloads", "mcp-scraper", "screenshots");
16966
- (0, import_node_fs5.mkdirSync)(outDir, { recursive: true });
16967
- const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
16968
- const slug = canonicalUrl.replace(/^https?:\/\//, "").replace(/[^a-z0-9]+/gi, "-").replace(/^-+|-+$/g, "").slice(0, 60);
16969
- const filePath = (0, import_node_path7.join)(outDir, `${stamp}-${slug}.png`);
16970
- (0, import_node_fs5.writeFileSync)(filePath, screenshotBuf);
16971
- screenshotMeta = { savedPath: filePath, sizeBytes: screenshotBuf.length, device };
17061
+ screenshotMeta = { base64: screenshotBuf.toString("base64"), sizeBytes: screenshotBuf.length, device };
16972
17062
  }
16973
17063
  const mediaMeta = downloadMedia ? await harvestPageMedia(result.bodyHtml, canonicalUrl, { types: mediaTypes ?? ["image", "video", "audio"] }) : null;
16974
17064
  await logRequestEvent({ userId: user.id, source: "extract_url", status: "done", query: canonicalUrl, resultCount: result.headings.length, result });
@@ -17306,10 +17396,10 @@ var init_server = __esm({
17306
17396
  });
17307
17397
 
17308
17398
  // bin/api-server.ts
17309
- var import_node_fs6 = require("fs");
17399
+ var import_node_fs5 = require("fs");
17310
17400
  function loadDotEnv() {
17311
17401
  try {
17312
- for (const line of (0, import_node_fs6.readFileSync)(".env", "utf8").split("\n")) {
17402
+ for (const line of (0, import_node_fs5.readFileSync)(".env", "utf8").split("\n")) {
17313
17403
  const eq = line.indexOf("=");
17314
17404
  if (eq < 1 || line.trimStart().startsWith("#")) continue;
17315
17405
  const k = line.slice(0, eq).trim();