mcp-scraper 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/README.md +13 -2
  2. package/dist/bin/api-server.cjs +573 -172
  3. package/dist/bin/api-server.cjs.map +1 -1
  4. package/dist/bin/api-server.js +2 -2
  5. package/dist/bin/mcp-stdio-server.cjs +300 -150
  6. package/dist/bin/mcp-stdio-server.cjs.map +1 -1
  7. package/dist/bin/mcp-stdio-server.js +2 -1
  8. package/dist/bin/mcp-stdio-server.js.map +1 -1
  9. package/dist/bin/paa-harvest.cjs +22 -1
  10. package/dist/bin/paa-harvest.cjs.map +1 -1
  11. package/dist/bin/paa-harvest.js +2 -1
  12. package/dist/bin/paa-harvest.js.map +1 -1
  13. package/dist/{chunk-4OHPDEZM.js → chunk-3OIRNUF5.js} +303 -151
  14. package/dist/chunk-3OIRNUF5.js.map +1 -0
  15. package/dist/{chunk-W4P2U5VF.js → chunk-LUBDFS67.js} +32 -32
  16. package/dist/chunk-LUBDFS67.js.map +1 -0
  17. package/dist/{chunk-7HB7NDOY.js → chunk-ZK456YXN.js} +12 -2
  18. package/dist/chunk-ZK456YXN.js.map +1 -0
  19. package/dist/chunk-ZMOWIBMK.js +36 -0
  20. package/dist/chunk-ZMOWIBMK.js.map +1 -0
  21. package/dist/index.cjs +22 -1
  22. package/dist/index.cjs.map +1 -1
  23. package/dist/index.js +2 -1
  24. package/dist/index.js.map +1 -1
  25. package/dist/{server-V5XMVRYE.js → server-YNJHP5PU.js} +235 -22
  26. package/dist/server-YNJHP5PU.js.map +1 -0
  27. package/dist/{worker-UT4ZQU2T.js → worker-PBG6LGET.js} +4 -3
  28. package/dist/{worker-UT4ZQU2T.js.map → worker-PBG6LGET.js.map} +1 -1
  29. package/docs/adr/0001-in-page-graphql-interception-for-anti-bot-scraping.md +58 -0
  30. package/docs/adr/README.md +11 -0
  31. package/docs/mcp-tool-quality-spec.md +238 -0
  32. package/package.json +5 -4
  33. package/dist/chunk-4OHPDEZM.js.map +0 -1
  34. package/dist/chunk-7HB7NDOY.js.map +0 -1
  35. package/dist/chunk-W4P2U5VF.js.map +0 -1
  36. package/dist/server-V5XMVRYE.js.map +0 -1
@@ -3468,9 +3468,9 @@ async function extractKpo(opts) {
3468
3468
  redirect: "manual"
3469
3469
  });
3470
3470
  if (res.status >= 300 && res.status < 400) {
3471
- const location = res.headers.get("location");
3472
- if (!location) return null;
3473
- const next = new URL(location, target).href;
3471
+ const location2 = res.headers.get("location");
3472
+ if (!location2) return null;
3473
+ const next = new URL(location2, target).href;
3474
3474
  const checkedRedirect = await validatePublicHttpUrl(next, { field: "redirect URL" });
3475
3475
  if (checkedRedirect.error || !checkedRedirect.parsed) return null;
3476
3476
  target = checkedRedirect.parsed.href;
@@ -8216,6 +8216,7 @@ var init_rates = __esm({
8216
8216
  yt_channel: 50,
8217
8217
  yt_transcription: 200,
8218
8218
  fb_ad: 50,
8219
+ maps_search: 2e3,
8219
8220
  maps_place: 2e3,
8220
8221
  maps_review: 50,
8221
8222
  fb_search: 50,
@@ -8277,6 +8278,14 @@ var init_rates = __esm({
8277
8278
  credits: mcToCredits(MC_COSTS.fb_ad),
8278
8279
  unit: "per call"
8279
8280
  },
8281
+ {
8282
+ key: "maps_search",
8283
+ label: "Maps business search",
8284
+ aliases: ["maps_search", "google maps search", "gmb search", "gbp search", "business profiles"],
8285
+ credits: mcToCredits(MC_COSTS.maps_search),
8286
+ unit: "per search",
8287
+ notes: "Returns up to 50 Google Maps business/profile candidates. Use maps_place_intel to hydrate selected businesses."
8288
+ },
8280
8289
  {
8281
8290
  key: "maps_place",
8282
8291
  label: "Maps business lookup",
@@ -8338,6 +8347,7 @@ var init_rates = __esm({
8338
8347
  TRANSCRIPTION_REFUND: "transcription_refund",
8339
8348
  YT_CHANNEL: "yt_channel",
8340
8349
  FB_AD: "fb_ad",
8350
+ MAPS_SEARCH: "maps_search",
8341
8351
  MAPS_PLACE: "maps_place",
8342
8352
  MAPS_REVIEW: "maps_review",
8343
8353
  MAPS_REVIEW_REFUND: "maps_review_refund",
@@ -11034,9 +11044,9 @@ function proxyName(country, state, city) {
11034
11044
  function zipProxyName(zip) {
11035
11045
  return `mcp-serp-residential-us-zip-${zip}`;
11036
11046
  }
11037
- function parseKernelLocationProxyTarget(location, gl) {
11038
- if (!location || gl.toLowerCase() !== "us") return null;
11039
- const canonicalLocation = normalizeLocation(location);
11047
+ function parseKernelLocationProxyTarget(location2, gl) {
11048
+ if (!location2 || gl.toLowerCase() !== "us") return null;
11049
+ const canonicalLocation = normalizeLocation(location2);
11040
11050
  let parts = canonicalLocation.split(",").map((part) => part.trim()).filter(Boolean);
11041
11051
  if (parts.length > 1 && isUnitedStates(parts[parts.length - 1])) {
11042
11052
  parts = parts.slice(0, -1);
@@ -11687,7 +11697,7 @@ var init_facebook_ad_routes = __esm({
11687
11697
  });
11688
11698
 
11689
11699
  // src/schemas.ts
11690
- var import_zod16, HarvestOptionsSchema, MapsPlaceOptionsSchema, RawPAAItemSchema, RawMapsOverviewSchema, RawMapsHoursRowSchema, RawMapsReviewStatsSchema, RawMapsReviewCardSchema, RawMapsAboutAttributeSchema;
11700
+ var import_zod16, HarvestOptionsSchema, MapsPlaceOptionsSchema, MapsSearchOptionsSchema, RawPAAItemSchema, RawMapsOverviewSchema, RawMapsHoursRowSchema, RawMapsReviewStatsSchema, RawMapsReviewCardSchema, RawMapsAboutAttributeSchema;
11691
11701
  var init_schemas3 = __esm({
11692
11702
  "src/schemas.ts"() {
11693
11703
  "use strict";
@@ -11725,6 +11735,16 @@ var init_schemas3 = __esm({
11725
11735
  kernelProxyId: import_zod16.z.string().optional(),
11726
11736
  headless: import_zod16.z.boolean().default(true)
11727
11737
  });
11738
+ MapsSearchOptionsSchema = import_zod16.z.object({
11739
+ query: import_zod16.z.string().min(1),
11740
+ location: import_zod16.z.string().optional(),
11741
+ gl: import_zod16.z.string().length(2).default("us"),
11742
+ hl: import_zod16.z.string().length(2).default("en"),
11743
+ maxResults: import_zod16.z.number().int().min(1).max(50).default(10),
11744
+ kernelApiKey: import_zod16.z.string().optional(),
11745
+ kernelProxyId: import_zod16.z.string().optional(),
11746
+ headless: import_zod16.z.boolean().default(true)
11747
+ });
11728
11748
  RawPAAItemSchema = import_zod16.z.object({
11729
11749
  question: import_zod16.z.string().min(1),
11730
11750
  answer: import_zod16.z.string().optional(),
@@ -11785,8 +11805,8 @@ var init_MapsNavigator = __esm({
11785
11805
  this.page = page;
11786
11806
  }
11787
11807
  page;
11788
- async navigateToPlacePage(businessName, location) {
11789
- const query = `${businessName} ${location}`;
11808
+ async navigateToPlacePage(businessName, location2) {
11809
+ const query = `${businessName} ${location2}`;
11790
11810
  const searchUrl = `https://www.google.com/maps/search/${encodeURIComponent(query)}`;
11791
11811
  await this.page.goto(searchUrl, { waitUntil: "domcontentloaded", timeout: 45e3 });
11792
11812
  const onPlacePage = await this.page.evaluate(() => /\/maps\/place\//.test(window.location.href));
@@ -12230,7 +12250,172 @@ var init_MapsExtractor = __esm({
12230
12250
  }
12231
12251
  });
12232
12252
 
12253
+ // src/extractor/MapsSearchExtractor.ts
12254
// Bindings for src/extractor/MapsSearchExtractor.ts; they are assigned inside
// the module initializer below.
var MAPS_SEARCH_SCROLL_BUDGET_MS, MAPS_SEARCH_SCROLL_STEP_MS, MAPS_SEARCH_MAX_NO_GROWTH_ROUNDS, MapsSearchExtractor;
var init_MapsSearchExtractor = __esm({
  "src/extractor/MapsSearchExtractor.ts"() {
    "use strict";
    init_errors();
    // Hard cap on the total time spent in the scroll/collect loop.
    MAPS_SEARCH_SCROLL_BUDGET_MS = 6e4;
    // Pause after each scroll before the page is read again.
    MAPS_SEARCH_SCROLL_STEP_MS = 1200;
    // Consecutive rounds without any new result before the loop gives up.
    MAPS_SEARCH_MAX_NO_GROWTH_ROUNDS = 4;
    /**
     * Runs a Google Maps search in a browser and scrapes the business
     * candidates listed in the results feed.
     *
     * NOTE(review): `driver` is expected to expose `launch(config)`,
     * `getPage()` and `close()`, and the page to expose `goto`,
     * `waitForTimeout` and `evaluate` — presumably a Playwright-style page;
     * confirm against BrowserDriver.
     */
    MapsSearchExtractor = class {
      constructor(driver) {
        this.driver = driver;
      }
      driver;
      /**
       * Launches the browser, performs the search and returns the results.
       * The driver is always closed, whether extraction succeeds or throws.
       *
       * @param {object} options - Reads `query`, `location`, `hl`, `gl`,
       *   `maxResults`, `headless`, `kernelApiKey` and `kernelProxyId`.
       * @returns {Promise<object>} Search metadata plus the `results` array.
       * @throws {CaptchaError} When the loaded page looks like a block page.
       */
      async extract(options) {
        const startMs = Date.now();
        const searchQuery = [options.query, options.location].filter(Boolean).join(" ");
        const searchUrl = `https://www.google.com/maps/search/${encodeURIComponent(searchQuery)}?hl=${encodeURIComponent(options.hl)}`;
        const config = {
          headless: options.headless,
          kernelApiKey: options.kernelApiKey,
          kernelProxyId: options.kernelProxyId,
          viewport: { width: 1280, height: 900 },
          locale: `${options.hl}-${options.gl.toUpperCase()}`
        };
        try {
          await this.driver.launch(config);
          const page = this.driver.getPage();
          await page.goto(searchUrl, { waitUntil: "domcontentloaded", timeout: 6e4 });
          // Fixed settle delay before the page is inspected.
          await page.waitForTimeout(3e3);
          const blocked = await this.detectBlock(page);
          if (blocked) throw new CaptchaError(RECAPTCHA_INSTRUCTIONS);
          const results = await this.collectResults(page, options.maxResults);
          return {
            query: options.query,
            location: options.location ?? null,
            searchQuery,
            searchUrl,
            extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
            requestedMaxResults: options.maxResults,
            resultCount: results.length,
            results,
            durationMs: Date.now() - startMs
          };
        } finally {
          await this.driver.close();
        }
      }
      /**
       * Reports whether the current page looks like an anti-bot interstitial,
       * judged by the first 2000 characters of body text or a `/sorry/` URL.
       *
       * @returns {Promise<boolean>}
       */
      async detectBlock(page) {
        return page.evaluate(() => {
          // Guard against a document whose <body> is not available.
          const text = (document.body?.innerText ?? "").slice(0, 2e3);
          return /unusual traffic|captcha|recaptcha|about this page/i.test(text) || /\/sorry\//.test(location.href);
        });
      }
      /**
       * Repeatedly reads the visible result cards and scrolls the feed until
       * `maxResults` unique results are collected, the time budget is spent,
       * or several consecutive rounds add nothing new.
       *
       * @param {object} page - Browser page to read and scroll.
       * @param {number} maxResults - Maximum number of results to return.
       * @returns {Promise<object[]>} Unique results, numbered in discovery order.
       */
      async collectResults(page, maxResults) {
        const seen = /* @__PURE__ */ new Map();
        const started = Date.now();
        let noGrowthRounds = 0;
        while (Date.now() - started < MAPS_SEARCH_SCROLL_BUDGET_MS) {
          const before = seen.size;
          const batch = await this.extractVisibleResults(page);
          for (const result of batch) {
            const key = this.resultKey(result);
            // Positions are re-assigned here so they stay contiguous across batches.
            if (!seen.has(key)) seen.set(key, { ...result, position: seen.size + 1 });
            if (seen.size >= maxResults) break;
          }
          if (seen.size >= maxResults) break;
          if (seen.size === before) noGrowthRounds += 1;
          else noGrowthRounds = 0;
          if (noGrowthRounds >= MAPS_SEARCH_MAX_NO_GROWTH_ROUNDS) break;
          await page.evaluate(() => {
            const feed = document.querySelector('[role="feed"]');
            if (feed) {
              feed.scrollTop = feed.scrollHeight;
            } else {
              window.scrollTo(0, document.body.scrollHeight);
            }
          });
          await page.waitForTimeout(MAPS_SEARCH_SCROLL_STEP_MS);
        }
        return [...seen.values()].slice(0, maxResults);
      }
      /**
       * Deduplication key for a result: the decimal CID when present,
       * otherwise the place URL without its query string, otherwise the name.
       *
       * @param {object} result - Reads `cidDecimal`, `placeUrl` and `name`.
       * @returns {string}
       */
      resultKey(result) {
        if (result.cidDecimal != null) return result.cidDecimal;
        // String#replace never yields null/undefined, so the name fallback must
        // be driven by an empty or missing URL rather than by `??`.
        const stableUrl = result.placeUrl?.replace(/[?&].*$/, "");
        return stableUrl || result.name;
      }
      /**
       * Scrapes every place card currently present in the DOM.
       * The callback runs inside the page, so all helpers live within it.
       *
       * @returns {Promise<object[]>} One entry per unique place URL.
       */
      async extractVisibleResults(page) {
        return page.evaluate(() => {
          // Collapses whitespace; returns null for empty or missing text.
          function normalizeText(value) {
            const text = value?.replace(/\s+/g, " ").trim() ?? "";
            return text || null;
          }
          // Pulls the `0x…:0x…` feature id out of a place URL; the second half
          // is also returned in decimal form.
          function cidFromUrl(url) {
            const fid = url.match(/!1s(0x[0-9a-f]+):(0x[0-9a-f]+)/i);
            if (!fid) return { cid: null, cidDecimal: null };
            let cidDecimal = null;
            try {
              cidDecimal = BigInt(fid[2]).toString();
            } catch {
              // Best effort: keep cidDecimal null when the conversion fails.
            }
            return { cid: `${fid[1]}:${fid[2]}`, cidDecimal };
          }
          // Unique direct-text snippets (2-139 chars each) of the card's
          // div/span elements, in document order.
          function textParts(card) {
            if (!card) return [];
            const parts = [];
            card.querySelectorAll("div, span").forEach((el2) => {
              const text = Array.from(el2.childNodes).filter((node) => node.nodeType === 3).map((node) => node.textContent?.trim() ?? "").filter((text2) => text2.length > 1 && text2.length < 140).join(" ");
              if (text && !parts.includes(text)) parts.push(text);
            });
            return parts;
          }
          function firstMatching(parts, pattern) {
            const value = parts.find((part) => pattern.test(part));
            return value ?? null;
          }
          const out = [];
          const seen = /* @__PURE__ */ new Set();
          const anchors = Array.from(document.querySelectorAll('a[href*="/maps/place/"]'));
          for (const anchor of anchors) {
            const placeUrl = anchor.href;
            const stableUrl = placeUrl.replace(/[?&].*$/, "");
            if (seen.has(stableUrl)) continue;
            seen.add(stableUrl);
            const card = anchor.closest('.Nv2PK, [role="article"], .bfdHYd') ?? anchor.parentElement;
            const parts = textParts(card);
            const aria = normalizeText(anchor.getAttribute("aria-label"));
            const heading = normalizeText(card?.querySelector('.qBF1Pd, .fontHeadlineSmall, [role="heading"]')?.textContent);
            const name = aria ?? heading ?? parts[0] ?? stableUrl;
            // The card text normally contains the business name itself; leave it
            // out so it cannot be reported as the category or as the address.
            const detailParts = parts.filter((part) => part !== name);
            const links = Array.from(card?.querySelectorAll("a[href]") ?? []);
            const websiteUrl = links.find((link) => link.href.startsWith("http") && !link.href.includes("google."))?.href ?? null;
            const directionsUrl = links.find((link) => /google\.[^/]+\/maps\/dir|\/dir\//i.test(link.href))?.href ?? null;
            const rating = firstMatching(parts, /^\d(?:\.\d)?$/);
            const reviewCountRaw = firstMatching(parts, /^\(?[\d,]+\)?$/);
            const category = detailParts.find((part) => !/^\d(?:\.\d)?$|^\(?[\d,]+\)?$|open|closed|directions|website/i.test(part)) ?? null;
            const address = detailParts.find((part) => /\b[A-Z]{2}\s+\d{5}\b|\b(?:St|Street|Ave|Avenue|Rd|Road|Blvd|Drive|Dr)\b/i.test(part)) ?? null;
            const { cid, cidDecimal } = cidFromUrl(placeUrl);
            out.push({
              position: out.length + 1,
              name,
              placeUrl,
              cid,
              cidDecimal,
              rating,
              reviewCount: reviewCountRaw ? reviewCountRaw.replace(/[()]/g, "") : null,
              category,
              address,
              websiteUrl,
              directionsUrl,
              metadata: parts.slice(0, 20)
            });
          }
          return out;
        });
      }
    };
  }
});
12409
+
12233
12410
  // src/api/maps-routes.ts
12411
/**
 * Builds the JSON error response shared by the Maps routes.
 *
 * A message containing "CAPTCHA" or "blocked" is reported as a retryable
 * `captcha_or_blocked` error with HTTP 503; anything else uses the caller's
 * `errorCode` with HTTP 500. The message text is passed through
 * `sanitizeVendorName` before being returned to the client.
 *
 * @param {object} c - Request context; only `c.json(body, status)` is used.
 * @param {string} msg - Raw error message.
 * @param {string} errorCode - Error code used when the failure is not a block.
 * @returns {*} Whatever `c.json` returns.
 */
function mapsErrorResponse(c, msg, errorCode) {
  const isBlocked = ["CAPTCHA", "blocked"].some((marker) => msg.includes(marker));
  const payload = {
    error: sanitizeVendorName(msg),
    error_code: isBlocked ? "captcha_or_blocked" : errorCode,
    retryable: isBlocked
  };
  const status = isBlocked ? 503 : 500;
  return c.json(payload, status);
}
12234
12419
  var import_hono5, mapsApp;
12235
12420
  var init_maps_routes = __esm({
12236
12421
  "src/api/maps-routes.ts"() {
@@ -12239,10 +12424,59 @@ var init_maps_routes = __esm({
12239
12424
  init_db();
12240
12425
  init_rates();
12241
12426
  init_MapsExtractor();
12427
+ init_MapsSearchExtractor();
12242
12428
  init_BrowserDriver();
12243
12429
  init_schemas3();
12244
12430
  init_api_auth();
12431
+ init_errors();
12245
12432
  mapsApp = new import_hono5.Hono();
12433
+ mapsApp.post("/search", createApiKeyAuth(), async (c) => {
12434
+ const user = c.get("user");
12435
+ const body = await c.req.json().catch(() => ({}));
12436
+ const parsed = MapsSearchOptionsSchema.safeParse({
12437
+ kernelApiKey: process.env.KERNEL_API_KEY,
12438
+ ...body
12439
+ });
12440
+ if (!parsed.success) {
12441
+ return c.json({ error: parsed.error.issues[0]?.message ?? "Invalid request" }, 400);
12442
+ }
12443
+ const { ok, balance_mc } = await debitMc(
12444
+ user.id,
12445
+ MC_COSTS.maps_search,
12446
+ LedgerOperation.MAPS_SEARCH,
12447
+ [parsed.data.query, parsed.data.location].filter(Boolean).join(" ")
12448
+ );
12449
+ if (!ok) return c.json(insufficientBalanceResponse(balance_mc, MC_COSTS.maps_search), 402);
12450
+ const driver = new BrowserDriver();
12451
+ const extractor = new MapsSearchExtractor(driver);
12452
+ try {
12453
+ const result = await extractor.extract(parsed.data);
12454
+ await logRequestEvent({
12455
+ userId: user.id,
12456
+ source: "maps_search",
12457
+ status: "done",
12458
+ query: result.searchQuery,
12459
+ location: parsed.data.location,
12460
+ resultCount: result.resultCount,
12461
+ result
12462
+ });
12463
+ return c.json(result);
12464
+ } catch (err) {
12465
+ await creditMc(user.id, MC_COSTS.maps_search, LedgerOperation.REFUND, "failed maps_search call");
12466
+ const msg = err instanceof Error ? err.message : String(err);
12467
+ await logRequestEvent({
12468
+ userId: user.id,
12469
+ source: "maps_search",
12470
+ status: "failed",
12471
+ query: [parsed.data.query, parsed.data.location].filter(Boolean).join(" "),
12472
+ location: parsed.data.location,
12473
+ error: msg
12474
+ });
12475
+ return mapsErrorResponse(c, msg, "maps_search_failed");
12476
+ } finally {
12477
+ await driver.close();
12478
+ }
12479
+ });
12246
12480
  mapsApp.post("/place", createApiKeyAuth(), async (c) => {
12247
12481
  const user = c.get("user");
12248
12482
  const body = await c.req.json().catch(() => ({}));
@@ -12309,10 +12543,7 @@ var init_maps_routes = __esm({
12309
12543
  location: parsed.data.location,
12310
12544
  error: msg
12311
12545
  });
12312
- if (msg.includes("CAPTCHA") || msg.includes("blocked")) {
12313
- return c.json({ error: msg }, 503);
12314
- }
12315
- return c.json({ error: msg }, 500);
12546
+ return mapsErrorResponse(c, msg, "maps_place_failed");
12316
12547
  } finally {
12317
12548
  await driver.close();
12318
12549
  }
@@ -12670,8 +12901,19 @@ function addCandidate(candidates, city, region, example) {
12670
12901
  }
12671
12902
  candidates.set(key, { city: normalizedCity, regionCode, count: 1, examples: [example] });
12672
12903
  }
12904
/**
 * Percent-decodes SERP text as far as possible without ever throwing.
 *
 * Well-formed input is decoded in one pass. Otherwise each run of `%XX`
 * escapes is decoded independently, so a stray `%` or an escape sequence that
 * is not valid UTF-8 stays as written while the rest of the text is still
 * decoded (instead of the whole string being returned undecoded).
 *
 * @param {string} text - Possibly percent-encoded text.
 * @returns {string} The decoded text, with undecodable pieces left verbatim.
 */
function decodeSerpText(text) {
  try {
    return decodeURIComponent(text);
  } catch {
    // Malformed somewhere: fall through to run-by-run decoding.
  }
  return text.replace(/(?:%[0-9a-fA-F]{2})+/g, (run) => {
    try {
      return decodeURIComponent(run);
    } catch {
      // Not valid UTF-8 (e.g. "%FF"): keep this run unchanged.
      return run;
    }
  });
}
12673
12915
  function scanText(candidates, text) {
12674
- const normalized = decodeURIComponent(text).replace(/[+/|_-]+/g, " ");
12916
+ const normalized = decodeSerpText(text).replace(/[+/|_-]+/g, " ");
12675
12917
  for (const match of normalized.matchAll(CITY_STATE_RE)) {
12676
12918
  addCandidate(candidates, match[1] ?? "", match[2] ?? "", normalized.slice(0, 180));
12677
12919
  }
@@ -14952,8 +15194,17 @@ var init_serp_intelligence_routes = __esm({
14952
15194
  }
14953
15195
  });
14954
15196
 
15197
+ // src/version.ts
15198
+ var PACKAGE_VERSION;
15199
+ var init_version = __esm({
15200
+ "src/version.ts"() {
15201
+ "use strict";
15202
+ PACKAGE_VERSION = "0.1.7";
15203
+ }
15204
+ });
15205
+
14955
15206
  // src/mcp/mcp-tool-schemas.ts
14956
- var import_zod19, HarvestPaaInputSchema, ExtractUrlInputSchema, MapSiteUrlsInputSchema, ExtractSiteInputSchema, YoutubeHarvestInputSchema, YoutubeTranscribeInputSchema, FacebookPageIntelInputSchema, FacebookAdSearchInputSchema, FacebookAdTranscribeInputSchema, MapsPlaceIntelInputSchema, CreditsInfoInputSchema, SearchSerpInputSchema, CaptureSerpSnapshotInputSchema, ScreenshotInputSchema, CaptureSerpPageSnapshotsInputSchema;
15207
+ var import_zod19, HarvestPaaInputSchema, ExtractUrlInputSchema, MapSiteUrlsInputSchema, ExtractSiteInputSchema, YoutubeHarvestInputSchema, YoutubeTranscribeInputSchema, FacebookPageIntelInputSchema, FacebookAdSearchInputSchema, FacebookAdTranscribeInputSchema, MapsPlaceIntelInputSchema, MapsSearchInputSchema, NullableString, MapsSearchOutputSchema, MapSiteUrlsOutputSchema, YoutubeHarvestOutputSchema, FacebookAdSearchOutputSchema, FacebookPageIntelOutputSchema, CreditsInfoInputSchema, SearchSerpInputSchema, CaptureSerpSnapshotInputSchema, ScreenshotInputSchema, CaptureSerpPageSnapshotsInputSchema;
14957
15208
  var init_mcp_tool_schemas = __esm({
14958
15209
  "src/mcp/mcp-tool-schemas.ts"() {
14959
15210
  "use strict";
@@ -14961,7 +15212,7 @@ var init_mcp_tool_schemas = __esm({
14961
15212
  HarvestPaaInputSchema = {
14962
15213
  query: import_zod19.z.string().min(1).describe('Core search topic only. If the user says "best hvac company in Denver CO", use query="best hvac company" and location="Denver, CO". Do not include the location in query when it can be separated.'),
14963
15214
  location: import_zod19.z.string().optional().describe('City, region, or country for geo-targeted results, inferred from the user request when present, e.g. "Denver, CO", "Tokyo, Japan", "London, UK".'),
14964
- maxQuestions: import_zod19.z.number().int().min(1).max(150).default(30).describe("Number of PAA questions to extract. Default 30. Maximum 150. Use 10 for quick probes, 30 for normal research, 100-150 when the user asks for everything/full/deep research. Credits are charged by extracted question; unused request hold is refunded."),
15215
+ maxQuestions: import_zod19.z.number().int().min(1).max(200).default(30).describe("Number of PAA questions to extract. Default 30. Maximum 200. Use 10 for quick probes, 30 for normal research, 100-200 when the user asks for everything/full/deep research. Larger harvests get a longer server time budget (151-200 questions \u2192 up to 280s). Credits are charged by extracted question; unused request hold is refunded."),
14965
15216
  gl: import_zod19.z.string().length(2).default("us").describe("Google country code inferred from location or user language. Examples: United States us, United Kingdom gb, Japan jp, Canada ca, Australia au."),
14966
15217
  hl: import_zod19.z.string().default("en").describe("Google interface/content language inferred from the user request. Use en unless the user asks for another language or locale."),
14967
15218
  device: import_zod19.z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use desktop by default; use mobile only when the user asks for mobile rankings."),
@@ -15018,6 +15269,93 @@ var init_mcp_tool_schemas = __esm({
15018
15269
  includeReviews: import_zod19.z.boolean().default(false).describe("Whether to fetch individual review cards"),
15019
15270
  maxReviews: import_zod19.z.number().int().min(1).max(500).default(50).describe("Max review cards to return (requires includeReviews: true)")
15020
15271
  };
15272
+ MapsSearchInputSchema = {
15273
+ query: import_zod19.z.string().min(1).describe('Business category, niche, keyword, or search term. If the user says "roofers in Denver CO", use query="roofers" and location="Denver, CO". Do not put the location here when it can be separated.'),
15274
+ location: import_zod19.z.string().optional().describe('City, region, country, or service area for the Maps search, e.g. "Denver, CO". Infer from the user request when present.'),
15275
+ gl: import_zod19.z.string().length(2).default("us").describe("Google country code inferred from location."),
15276
+ hl: import_zod19.z.string().length(2).default("en").describe("Language inferred from user request."),
15277
+ maxResults: import_zod19.z.number().int().min(1).max(50).default(10).describe("Number of Google Maps business/profile candidates to return. Default 10. Maximum 50. Use 10 unless the user asks for more.")
15278
+ };
15279
+ NullableString = import_zod19.z.string().nullable();
15280
+ MapsSearchOutputSchema = {
15281
+ query: import_zod19.z.string(),
15282
+ location: import_zod19.z.string().nullable(),
15283
+ searchQuery: import_zod19.z.string(),
15284
+ searchUrl: import_zod19.z.string().url(),
15285
+ extractedAt: import_zod19.z.string(),
15286
+ requestedMaxResults: import_zod19.z.number().int().min(1).max(50),
15287
+ resultCount: import_zod19.z.number().int().min(0).max(50),
15288
+ results: import_zod19.z.array(import_zod19.z.object({
15289
+ position: import_zod19.z.number().int().min(1),
15290
+ name: import_zod19.z.string(),
15291
+ placeUrl: import_zod19.z.string().url(),
15292
+ cid: NullableString,
15293
+ cidDecimal: NullableString,
15294
+ rating: NullableString,
15295
+ reviewCount: NullableString,
15296
+ category: NullableString,
15297
+ address: NullableString,
15298
+ websiteUrl: NullableString,
15299
+ directionsUrl: NullableString,
15300
+ metadata: import_zod19.z.array(import_zod19.z.string())
15301
+ })),
15302
+ durationMs: import_zod19.z.number().int().min(0)
15303
+ };
15304
+ MapSiteUrlsOutputSchema = {
15305
+ startUrl: import_zod19.z.string(),
15306
+ totalFound: import_zod19.z.number().int().min(0),
15307
+ truncated: import_zod19.z.boolean(),
15308
+ okCount: import_zod19.z.number().int().min(0),
15309
+ redirectCount: import_zod19.z.number().int().min(0),
15310
+ brokenCount: import_zod19.z.number().int().min(0),
15311
+ urls: import_zod19.z.array(import_zod19.z.object({
15312
+ url: import_zod19.z.string(),
15313
+ status: import_zod19.z.number().int().nullable()
15314
+ })),
15315
+ durationMs: import_zod19.z.number().min(0)
15316
+ };
15317
+ YoutubeHarvestOutputSchema = {
15318
+ mode: import_zod19.z.string(),
15319
+ videoCount: import_zod19.z.number().int().min(0),
15320
+ channel: import_zod19.z.object({
15321
+ title: NullableString,
15322
+ subscriberCount: NullableString
15323
+ }).nullable(),
15324
+ videos: import_zod19.z.array(import_zod19.z.object({
15325
+ videoId: import_zod19.z.string(),
15326
+ title: import_zod19.z.string(),
15327
+ channelName: NullableString,
15328
+ views: NullableString,
15329
+ duration: NullableString,
15330
+ url: NullableString
15331
+ }))
15332
+ };
15333
+ FacebookAdSearchOutputSchema = {
15334
+ query: import_zod19.z.string(),
15335
+ advertiserCount: import_zod19.z.number().int().min(0),
15336
+ advertisers: import_zod19.z.array(import_zod19.z.object({
15337
+ name: NullableString,
15338
+ adCount: import_zod19.z.number().int().nullable(),
15339
+ libraryId: NullableString
15340
+ }))
15341
+ };
15342
+ FacebookPageIntelOutputSchema = {
15343
+ advertiserName: NullableString,
15344
+ totalAds: import_zod19.z.number().int().min(0),
15345
+ activeCount: import_zod19.z.number().int().min(0),
15346
+ videoCount: import_zod19.z.number().int().min(0),
15347
+ imageCount: import_zod19.z.number().int().min(0),
15348
+ ads: import_zod19.z.array(import_zod19.z.object({
15349
+ libraryId: NullableString,
15350
+ status: NullableString,
15351
+ creativeType: NullableString,
15352
+ headline: NullableString,
15353
+ cta: NullableString,
15354
+ startDate: NullableString,
15355
+ videoUrl: NullableString,
15356
+ variations: import_zod19.z.number().int().nullable()
15357
+ }))
15358
+ };
15021
15359
  CreditsInfoInputSchema = {
15022
15360
  item: import_zod19.z.string().optional().describe('Optional tool, action, or feature to look up, e.g. "maps reviews", "extract_url", or "YouTube transcription"'),
15023
15361
  includeLedger: import_zod19.z.boolean().default(false).describe("Whether to include recent credit ledger entries")
@@ -15066,6 +15404,14 @@ var init_mcp_tool_schemas = __esm({
15066
15404
  });
15067
15405
 
15068
15406
  // src/mcp/mcp-response-formatter.ts
15407
/**
 * Sets the module-level switch that `saveFullReport` and
 * `persistScreenshotLocally` check before writing to disk; both return null
 * while the switch is falsy.
 *
 * @param {boolean} enabled - Whether saving reports and screenshots is allowed.
 * @returns {void}
 */
function configureReportSaving(enabled) {
  reportSavingEnabled = enabled;
}
15410
/**
 * Rewrites browser-vendor-specific identifiers in `text` to neutral names and
 * then passes the result through `sanitizeVendorName`.
 *
 * The rewrites are applied one after another, in the order listed.
 *
 * @param {string} text - Text that may mention vendor-specific field names.
 * @returns {*} Whatever `sanitizeVendorName` returns for the rewritten text.
 */
function sanitizeVendorText(text) {
  const rewrites = [
    [/kernel_session_id/gi, "browser_session_id"],
    [/kernel_delete_succeeded/gi, "session_cleanup_succeeded"],
    [/kernel_delete_started/gi, "session_cleanup_started"],
    [/kernel_delete_error/gi, "session_cleanup_error"],
    [/kernelSessionId/g, "browserSessionId"],
    [/kernelProxyId/g, "proxyId"],
    [/KERNEL_API_KEY/g, "BROWSER_SERVICE_API_KEY"],
    [/"kernel"\s*:/gi, '"browserRuntime":']
  ];
  let neutral = text;
  for (const [pattern, replacement] of rewrites) {
    neutral = neutral.replace(pattern, replacement);
  }
  return sanitizeVendorName(neutral);
}
15069
15415
  function slugifyReportName(input) {
15070
15416
  return input.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 80) || "mcp-scraper-report";
15071
15417
  }
@@ -15077,7 +15423,7 @@ function outputBaseDir() {
15077
15423
  return process.env.MCP_SCRAPER_OUTPUT_DIR?.trim() || (0, import_node_path6.join)((0, import_node_os3.homedir)(), "Downloads", "mcp-scraper");
15078
15424
  }
15079
15425
  function saveFullReport(full) {
15080
- if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
15426
+ if (!reportSavingEnabled || process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
15081
15427
  const outDir = outputBaseDir();
15082
15428
  try {
15083
15429
  (0, import_node_fs4.mkdirSync)(outDir, { recursive: true });
@@ -15090,7 +15436,7 @@ function saveFullReport(full) {
15090
15436
  }
15091
15437
  }
15092
15438
  function persistScreenshotLocally(base64, url) {
15093
- if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
15439
+ if (!reportSavingEnabled || process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
15094
15440
  try {
15095
15441
  const dir = (0, import_node_path6.join)(outputBaseDir(), "screenshots");
15096
15442
  (0, import_node_fs4.mkdirSync)(dir, { recursive: true });
@@ -15130,11 +15476,11 @@ function parseData(raw) {
15130
15476
  const text = first?.type === "text" ? first.text : "";
15131
15477
  try {
15132
15478
  const parsed = JSON.parse(text || "{}");
15133
- if (raw.isError || parsed.error || parsed.error_code) return { error: formatStructuredError(parsed, text) };
15479
+ if (raw.isError || parsed.error || parsed.error_code) return { error: sanitizeVendorText(formatStructuredError(parsed, text)) };
15134
15480
  const data = parsed.result ?? parsed;
15135
15481
  return { data };
15136
15482
  } catch {
15137
- if (raw.isError) return { error: text || "Tool error" };
15483
+ if (raw.isError) return { error: sanitizeVendorText(text || "Tool error") };
15138
15484
  return { error: "Failed to parse tool response" };
15139
15485
  }
15140
15486
  }
@@ -15148,15 +15494,6 @@ function entityIdsSection(ids) {
15148
15494
  ## Entity IDs
15149
15495
  ${lines.join("\n")}` : "";
15150
15496
  }
15151
- function entityIdsSummaryLine(ids) {
15152
- if (!ids) return "";
15153
- const parts = [];
15154
- if (ids.kgIds?.length) parts.push(`KG MID: ${ids.kgIds[0]}`);
15155
- if (ids.cids?.length) parts.push(`CID: ${ids.cids[0]}`);
15156
- if (ids.gcids?.length) parts.push(`GCID: ${ids.gcids[0]}`);
15157
- return parts.length ? `
15158
- **Entity IDs:** ${parts.join(" \xB7 ")}` : "";
15159
- }
15160
15497
  function truncate(s, max) {
15161
15498
  if (!s) return "";
15162
15499
  return s.length > max ? s.slice(0, max) + "\u2026" : s;
@@ -15186,7 +15523,7 @@ function debugSection(debug) {
15186
15523
  if (locationEvidence) {
15187
15524
  lines.push(`- Location evidence: ${locationEvidence.status}${locationEvidence.expected ? ` \xB7 expected ${locationEvidence.expected.city}${locationEvidence.expected.regionCode ? `, ${locationEvidence.expected.regionCode}` : ""}` : ""}${candidates ? ` \xB7 candidates ${candidates}` : ""}`);
15188
15525
  }
15189
- return lines.join("\n");
15526
+ return sanitizeVendorText(lines.join("\n"));
15190
15527
  }
15191
15528
  function errorAttemptsSection(body) {
15192
15529
  const attempts = Array.isArray(body.attempts) ? body.attempts : [];
@@ -15240,26 +15577,12 @@ ${serpRows}` : "";
15240
15577
  const tips = `
15241
15578
  ---
15242
15579
  \u{1F4A1} **Tips**
15243
- - Max questions: \`maxQuestions: 150\` (current: ${input.maxQuestions ?? 30})
15580
+ - Max questions: \`maxQuestions: 200\` (current: ${input.maxQuestions ?? 30})
15244
15581
  - Organic results only: use \`search_serp\`
15245
15582
  - Dig into a result: use \`extract_url\` on any organic URL`;
15246
15583
  const full = `# PAA Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
15247
15584
 
15248
15585
  ${paaTable}${serpTable}${entityIdsSection(entityIds)}${aiSection}${statsLine}${debugSection(diagnostics?.debug)}${tips}`;
15249
- const topQ = flat.slice(0, 10).map((r, i) => `${i + 1}. ${r.question}`).join("\n");
15250
- const topO = organic.slice(0, 5).map((r) => `${r.position}. [${r.title}](${r.url}) \u2014 ${r.domain}`).join("\n");
15251
- const summary = [
15252
- `**PAA: "${input.query}"** \u2014 ${flat.length} questions extracted`,
15253
- topQ ? `
15254
- **Top questions:**
15255
- ${topQ}` : "",
15256
- organic.length ? `
15257
- **Top organic results:**
15258
- ${topO}` : "",
15259
- entityIdsSummaryLine(entityIds),
15260
- `
15261
- \u{1F4A1} \`maxQuestions\` up to 150 \xB7Use \`extract_url\` to dig into any result`
15262
- ].filter(Boolean).join("\n");
15263
15586
  return oneBlock(full);
15264
15587
  }
15265
15588
  function formatSearchSerp(raw, input) {
@@ -15298,18 +15621,6 @@ ${localRows}` : "";
15298
15621
  const full = `# SERP Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
15299
15622
 
15300
15623
  ${serpTable}${localSection}${entityIdsSection(entityIds)}${aiSection}${debugSection(diagnostics?.debug)}${tips}`;
15301
- const topO = organic.slice(0, 5).map((r) => `${r.position}. [${r.title}](${r.url}) \u2014 ${r.domain}`).join("\n");
15302
- const summary = [
15303
- `**SERP: "${input.query}"** \u2014 ${organic.length} organic results`,
15304
- topO ? `
15305
- **Top results:**
15306
- ${topO}` : "",
15307
- localPack.length ? `
15308
- **Local Pack:** ${localPack.map((b) => b.name).join(", ")}` : "",
15309
- entityIdsSummaryLine(entityIds),
15310
- `
15311
- \u{1F4A1} Use \`harvest_paa\` for questions \xB7 \`extract_url\` to scrape any result`
15312
- ].filter(Boolean).join("\n");
15313
15624
  return oneBlock(full);
15314
15625
  }
15315
15626
  function formatExtractUrl(raw, input) {
@@ -15420,15 +15731,19 @@ ${broken.map((u) => `- ${u.url} (${u.status})`).join("\n")}` : "",
15420
15731
  - Extract content from all pages: use \`extract_site\`
15421
15732
  - Scrape a single page: use \`extract_url\``
15422
15733
  ].filter(Boolean).join("\n");
15423
- const summary = [
15424
- `**URL Map: ${input.url}**`,
15425
- `${d.totalFound} URLs \u2014 ${ok.length} OK \xB7 ${broken.length} broken \xB7 ${redirects.length} redirects`,
15426
- broken.length ? `
15427
- **Broken URLs:** ${broken.slice(0, 3).map((u) => u.url).join(", ")}` : "",
15428
- `
15429
- \u{1F4A1} Use \`extract_site\` to extract content from all pages`
15430
- ].filter(Boolean).join("\n");
15431
- return oneBlock(full);
15734
+ return {
15735
+ ...oneBlock(full),
15736
+ structuredContent: {
15737
+ startUrl: d.startUrl ?? input.url,
15738
+ totalFound: d.totalFound ?? urls.length,
15739
+ truncated: d.truncated === true,
15740
+ okCount: ok.length,
15741
+ redirectCount: redirects.length,
15742
+ brokenCount: broken.length,
15743
+ urls: urls.map((u) => ({ url: u.url, status: u.status ?? null })),
15744
+ durationMs: d.durationMs ?? 0
15745
+ }
15746
+ };
15432
15747
  }
15433
15748
  function formatExtractSite(raw, input) {
15434
15749
  const parsed = parseData(raw);
@@ -15453,13 +15768,6 @@ ${pageRows}`,
15453
15768
  - Map URLs first: use \`map_site_urls\`
15454
15769
  - Inspect a single page: use \`extract_url\``
15455
15770
  ].join("\n");
15456
- const summary = [
15457
- `**Site Extract: ${input.url}** \u2014 ${pages.length} pages`,
15458
- pages.slice(0, 5).map((p) => `- ${p.title ?? p.url}`).join("\n"),
15459
- pages.length > 5 ? `- \u2026 and ${pages.length - 5} more` : "",
15460
- `
15461
- \u{1F4A1} Use \`extract_url\` to inspect any individual page`
15462
- ].filter(Boolean).join("\n");
15463
15771
  return oneBlock(full);
15464
15772
  }
15465
15773
  function formatYoutubeHarvest(raw, input) {
@@ -15490,16 +15798,22 @@ ${videoRows}`,
15490
15798
  - Transcribe a video: use \`youtube_transcribe\` with the \`videoId\` above
15491
15799
  - Switch mode: \`mode: "channel"\` with \`channelHandle\` or \`mode: "search"\` with \`query\``
15492
15800
  ].filter(Boolean).join("\n");
15493
- const top5 = videos.slice(0, 5).map((v, i) => `${i + 1}. ${v.title} (\`${v.videoId}\`)`).join("\n");
15494
- const summary = [
15495
- `**YouTube: ${label}** \u2014 ${videos.length} videos`,
15496
- `
15497
- **Top videos:**
15498
- ${top5}`,
15499
- `
15500
- \u{1F4A1} Transcribe any video: \`youtube_transcribe\` with its videoId`
15501
- ].join("\n");
15502
- return oneBlock(full);
15801
+ return {
15802
+ ...oneBlock(full),
15803
+ structuredContent: {
15804
+ mode: input.mode,
15805
+ videoCount: videos.length,
15806
+ channel: d.channelMeta ? { title: d.channelMeta.title ?? null, subscriberCount: d.channelMeta.subscriberCount ?? null } : null,
15807
+ videos: videos.map((v) => ({
15808
+ videoId: String(v.videoId ?? ""),
15809
+ title: String(v.title ?? ""),
15810
+ channelName: v.channelName ?? null,
15811
+ views: v.views ?? null,
15812
+ duration: v.duration ?? null,
15813
+ url: v.url ?? null
15814
+ }))
15815
+ }
15816
+ };
15503
15817
  }
15504
15818
  function formatYoutubeTranscribe(raw, input) {
15505
15819
  const parsed = parseData(raw);
@@ -15529,14 +15843,6 @@ ${chunkRows}` : "",
15529
15843
  ---
15530
15844
  \u{1F4A1} Harvest more from this channel: use \`youtube_harvest\` with \`mode: "channel"\``
15531
15845
  ].filter(Boolean).join("\n");
15532
- const summary = [
15533
- `**YouTube Transcript: \`${input.videoId}\`** \u2014 ${text.split(" ").length} words \xB7 ${durSec}s`,
15534
- `
15535
- **Preview:**
15536
- > ${truncate(text, 300)}`,
15537
- `
15538
- \u{1F4A1} Full transcript in artifact above`
15539
- ].join("\n");
15540
15846
  return oneBlock(full);
15541
15847
  }
15542
15848
  function formatFacebookPageIntel(raw, input) {
@@ -15565,19 +15871,26 @@ ${adBlocks}`,
15565
15871
  - Transcribe video ads: use \`facebook_ad_transcribe\` with the \`videoUrl\` above
15566
15872
  - Find other advertisers: use \`facebook_ad_search\``
15567
15873
  ].filter(Boolean).join("\n");
15568
- const activeAds = ads.filter((a) => a.status?.toLowerCase() === "active").slice(0, 5);
15569
- const adSummary = activeAds.map((a, i) => `${i + 1}. ${truncate(a.headline ?? a.primaryText, 80)} (${a.creativeType ?? "\u2014"})`).join("\n");
15570
- const videoCount = ads.filter((a) => a.videoUrl).length;
15571
- const summary = [
15572
- `**Facebook Ads: ${advertiser}** \u2014 ${s.totalAds} ads (${s.activeCount} active)`,
15573
- adSummary ? `
15574
- **Active ads:**
15575
- ${adSummary}` : "",
15576
- `**Creative mix:** ${s.videoCount} video \xB7 ${s.imageCount} image`,
15577
- videoCount ? `
15578
- \u{1F4A1} ${videoCount} video ads \u2014 transcribe with \`facebook_ad_transcribe\` using the videoUrl` : ""
15579
- ].filter(Boolean).join("\n");
15580
- return oneBlock(full);
15874
+ return {
15875
+ ...oneBlock(full),
15876
+ structuredContent: {
15877
+ advertiserName: d.advertiserName ?? null,
15878
+ totalAds: s.totalAds ?? 0,
15879
+ activeCount: s.activeCount ?? 0,
15880
+ videoCount: s.videoCount ?? 0,
15881
+ imageCount: s.imageCount ?? 0,
15882
+ ads: ads.map((ad) => ({
15883
+ libraryId: ad.libraryId ?? null,
15884
+ status: ad.status ?? null,
15885
+ creativeType: ad.creativeType ?? null,
15886
+ headline: ad.headline ?? null,
15887
+ cta: ad.cta ?? null,
15888
+ startDate: ad.startDate ?? null,
15889
+ videoUrl: ad.videoUrl ?? null,
15890
+ variations: typeof ad.variations === "number" ? ad.variations : null
15891
+ }))
15892
+ }
15893
+ };
15581
15894
  }
15582
15895
  function formatFacebookAdSearch(raw, input) {
15583
15896
  const parsed = parseData(raw);
@@ -15601,15 +15914,18 @@ ${rows}`,
15601
15914
  - Scan all ads: use \`facebook_page_intel\` with \`libraryId\`
15602
15915
  - Or pass the advertiser name as \`query\` in \`facebook_page_intel\``
15603
15916
  ].join("\n");
15604
- const summary = [
15605
- `**Facebook Ad Search: "${input.query}"** \u2014 ${advertisers.length} advertisers`,
15606
- advertisers.slice(0, 5).map(
15607
- (a, i) => `${i + 1}. ${a.name}${a.adCount ? ` (${a.adCount} ads)` : ""} \u2014 \`${a.libraryId ?? "\u2014"}\``
15608
- ).join("\n"),
15609
- `
15610
- \u{1F4A1} Scan ads with \`facebook_page_intel\` using \`libraryId\``
15611
- ].filter(Boolean).join("\n");
15612
- return oneBlock(full);
15917
+ return {
15918
+ ...oneBlock(full),
15919
+ structuredContent: {
15920
+ query: input.query,
15921
+ advertiserCount: advertisers.length,
15922
+ advertisers: advertisers.map((a) => ({
15923
+ name: a.pageName ?? a.name ?? null,
15924
+ adCount: typeof a.adCount === "number" ? a.adCount : null,
15925
+ libraryId: a.sampleLibraryId ?? a.libraryId ?? null
15926
+ }))
15927
+ }
15928
+ };
15613
15929
  }
15614
15930
  function formatCreditsInfo(raw, input) {
15615
15931
  const parsed = parseData(raw);
@@ -15648,16 +15964,58 @@ ${costRows}` : "",
15648
15964
  | Date | Operation | Credits | Description |
15649
15965
  |------|-----------|---------|-------------|
15650
15966
  ${ledgerRows}` : ""
15651
- ].filter(Boolean).join("\n");
15652
- const summary = [
15653
- `**Credit balance:** ${balance ?? "unknown"} credits`,
15654
- matched ? `
15655
- **${matched.label}:** ${matched.credits} credits ${matched.unit}` : null,
15656
- input.includeLedger && ledger.length ? `
15657
- Recent ledger entries included in the full report.` : null
15658
15967
  ].filter(Boolean).join("\n");
15659
15968
  return oneBlock(full);
15660
15969
  }
15970
/**
 * Format a raw `maps_search` executor response as an MCP tool result.
 *
 * Builds a Markdown report (results table, per-candidate metadata, a
 * next-step tip and the extraction time) and attaches a `structuredContent`
 * object describing the same search.
 *
 * @param {*} raw - Executor response; passed to `parseData`, whose result
 *   carries either an `error` or a `data` property.
 * @param {{ query?: string, location?: string, maxResults?: number }} input -
 *   Original tool arguments, used as fallbacks when the response omits the
 *   corresponding field.
 * @returns {object} `{ content, isError: true }` when parsing reported an
 *   error; otherwise the `oneBlock(full)` result extended with
 *   `structuredContent`.
 */
function formatMapsSearch(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
  const d = parsed.data;
  const results = d.results ?? [];
  // Resolve fallbacks once so the Markdown report and structuredContent
  // always describe the same query instead of diverging (or being undefined)
  // when the response omits these fields.
  const query = d.query ?? input.query;
  const searchQuery = d.searchQuery ?? [input.query, input.location].filter(Boolean).join(" ");
  const requestedMax = d.requestedMaxResults ?? input.maxResults ?? 10;
  const durationMs = d.durationMs;
  const rows = results.map((r) => {
    const rating = [r.rating, r.reviewCount ? `(${r.reviewCount})` : null].filter(Boolean).join(" ");
    const cidCell = r.cidDecimal ? `\`${r.cidDecimal}\`` : "\u2014";
    const siteCell = r.websiteUrl ? `[site](${r.websiteUrl})` : "\u2014";
    // Mirror the website/CID columns: never emit a `[maps](undefined)` link.
    const mapsCell = r.placeUrl ? `[maps](${r.placeUrl})` : "\u2014";
    return `| ${r.position} | ${cell(r.name)} | ${cell(r.category)} | ${cell(rating)} | ${cell(r.address)} | ${cidCell} | ${siteCell} | ${mapsCell} |`;
  }).join("\n");
  // At most 8 metadata lines per candidate keep the report compact.
  const metadataSection = results.length ? `
## Candidate Metadata
${results.map((r) => {
    const meta = r.metadata?.length ? r.metadata.slice(0, 8).map((m) => ` - ${m}`).join("\n") : " - none";
    return `### ${r.position}. ${r.name}
${meta}`;
  }).join("\n\n")}` : "";
  const full = [
    `# Google Maps Search: "${searchQuery}"`,
    `**Returned:** ${results.length} profile candidate${results.length === 1 ? "" : "s"} \xB7 **Requested max:** ${requestedMax} \xB7 **Limit:** 50`,
    `
## Results
| # | Name | Category | Rating | Address | CID | Website | Maps |
|---|------|----------|--------|---------|-----|---------|------|
${rows}`,
    metadataSection,
    `
---
\u{1F4A1} **Next step:** use \`maps_place_intel\` with a selected business name and location to hydrate full hours, phone, review topics, and optional review cards.`,
    durationMs != null ? `
*Extracted in ${(durationMs / 1e3).toFixed(1)}s*` : null
  ].filter(Boolean).join("\n");
  return {
    ...oneBlock(full),
    structuredContent: {
      query,
      location: d.location ?? null,
      searchQuery,
      searchUrl: d.searchUrl,
      extractedAt: d.extractedAt,
      requestedMaxResults: requestedMax,
      resultCount: results.length,
      results,
      durationMs: durationMs ?? 0
    }
  };
}
15661
16019
  function formatMapsPlaceIntel(raw, input) {
15662
16020
  const parsed = parseData(raw);
15663
16021
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
@@ -15755,19 +16113,6 @@ ${entitySection}` : null,
15755
16113
  durationMs != null ? `
15756
16114
  ---
15757
16115
  *Extracted in ${(durationMs / 1e3).toFixed(1)}s*` : null
15758
- ].filter(Boolean).join("\n");
15759
- const summary = [
15760
- `**${name}** \u2014 ${category ?? "Business"} \xB7 ${ratingLine || "No rating"}`,
15761
- address ? `\u{1F4CD} ${address}` : null,
15762
- phone ? `\u{1F4DE} ${phone}` : null,
15763
- hoursSummary ? `\u{1F550} ${hoursSummary}` : null,
15764
- website ? `\u{1F310} ${website}` : null,
15765
- reviewsStatus === "collected" && reviews.length ? `
15766
- \u{1F4AC} ${reviews.length} reviews fetched \u2014 full list in artifact above` : null,
15767
- reviewsStatus === "unavailable" ? `
15768
- \u26A0\uFE0F Reviews could not be retrieved this run` : null,
15769
- reviewsStatus === "none_exist" ? `
15770
- \u{1F4AC} No reviews on Google Maps` : null
15771
16116
  ].filter(Boolean).join("\n");
15772
16117
  return oneBlock(full);
15773
16118
  }
@@ -15799,76 +16144,123 @@ ${chunkRows}` : "",
15799
16144
  ---
15800
16145
  \u{1F4A1} Get more ads from this advertiser: use \`facebook_page_intel\``
15801
16146
  ].filter(Boolean).join("\n");
15802
- const summary = [
15803
- `**Facebook Ad Transcript** \u2014 ${text.split(" ").length} words \xB7 ${durSec}s`,
15804
- `
15805
- **Preview:**
15806
- > ${truncate(text, 300)}`,
15807
- `
15808
- \u{1F4A1} Full transcript in artifact above`
15809
- ].join("\n");
15810
16147
  return oneBlock(full);
15811
16148
  }
15812
- var import_node_fs4, import_node_os3, import_node_path6;
16149
+ var import_node_fs4, import_node_os3, import_node_path6, reportSavingEnabled;
15813
16150
  var init_mcp_response_formatter = __esm({
15814
16151
  "src/mcp/mcp-response-formatter.ts"() {
15815
16152
  "use strict";
15816
16153
  import_node_fs4 = require("fs");
15817
16154
  import_node_os3 = require("os");
15818
16155
  import_node_path6 = require("path");
16156
+ init_errors();
16157
+ reportSavingEnabled = true;
15819
16158
  }
15820
16159
  });
15821
16160
 
15822
16161
  // src/mcp/paa-mcp-server.ts
15823
- function buildPaaExtractorMcpServer(executor) {
15824
- const server = new import_mcp.McpServer({ name: "mcp-scraper", version: "1.0.0" });
16162
/**
 * Build the MCP tool annotations shared by tools that query the live web.
 *
 * Defaults: read-only, non-destructive, not idempotent, open-world.
 *
 * @param {string} title - Human-readable tool title.
 * @param {object} [overrides] - Optional annotation fields that replace the
 *   defaults (e.g. `{ idempotentHint: true, openWorldHint: false }`), so tools
 *   with different hints can share this builder instead of hand-writing the
 *   object. Omitting it yields exactly the default annotations.
 * @returns {object} A fresh annotations object on every call.
 */
function liveWebToolAnnotations(title, overrides = {}) {
  return {
    title,
    readOnlyHint: true,
    destructiveHint: false,
    idempotentHint: false,
    openWorldHint: true,
    ...overrides
  };
}
16171
// buildPaaExtractorMcpServer(executor, options = {})
// Creates an McpServer named "mcp-scraper" (version taken from PACKAGE_VERSION),
// registers every scraper tool on it, and returns the server.
//   executor - object whose methods (harvestPaa, searchSerp, extractUrl,
//              mapSiteUrls, extractSite, youtubeHarvest, youtubeTranscribe,
//              facebookPageIntel, facebookAdSearch, facebookAdTranscribe,
//              mapsPlaceIntel, mapsSearch, creditsInfo) are awaited by the tool
//              handlers; each result is passed to the matching format* function
//              together with the tool input.
//   options.savesReportsLocally - unless this is exactly `false`, descriptions
//              wrapped in withReportNote end with " Saves a full Markdown report
//              locally."; when `false` they end with the hosted-endpoint note.
// facebook_ad_transcribe and credits_info descriptions are not wrapped in
// withReportNote, so they carry no report note either way.
// Tools registered with an outputSchema: map_site_urls, youtube_harvest,
// facebook_page_intel, facebook_ad_search, maps_search.
// credits_info spells out its own annotations (idempotentHint true,
// openWorldHint false) instead of calling liveWebToolAnnotations.
+ function buildPaaExtractorMcpServer(executor, options = {}) {
16172
+ const savesReports = options.savesReportsLocally !== false;
16173
+ const reportNote = savesReports ? " Saves a full Markdown report locally." : " Reports are returned inline; no files are saved on this hosted endpoint.";
16174
+ const withReportNote = (description) => `${description}${reportNote}`;
16175
+ const server = new import_mcp.McpServer({ name: "mcp-scraper", version: PACKAGE_VERSION });
15825
16176
  server.registerTool("harvest_paa", {
15826
- description: 'Best default tool for Google search research. Extracts People Also Ask questions plus answers/source URLs, organic SERP, local pack when present, entity IDs (CID/GCID/KG MID), and AI Overview. Infer the user language: split topic from location (e.g. "best hvac company in Denver CO" => query "best hvac company", location "Denver, CO", gl "us", hl "en"). Use maxQuestions 30 normally, 100-150 for "full", "deep", "all", or comprehensive research. Credits are charged by extracted question; unused request hold is refunded. Saves a full Markdown report locally.',
15827
- inputSchema: HarvestPaaInputSchema
16177
+ title: "Google PAA + SERP Harvest",
16178
+ description: withReportNote('Best default tool for Google search research. Extracts People Also Ask questions plus answers/source URLs, organic SERP, local pack when present, entity IDs (CID/GCID/KG MID), and AI Overview. Infer the user language: split topic from location (e.g. "best hvac company in Denver CO" => query "best hvac company", location "Denver, CO", gl "us", hl "en"). Use maxQuestions 30 normally, 100-150 for "full", "deep", "all", or comprehensive research. Credits are charged by extracted question; unused request hold is refunded.'),
16179
+ inputSchema: HarvestPaaInputSchema,
16180
+ annotations: liveWebToolAnnotations("Google PAA + SERP Harvest")
15828
16181
  }, async (input) => formatHarvestPaa(await executor.harvestPaa(input), input));
15829
16182
  server.registerTool("search_serp", {
15830
- description: "Fast Google SERP lookup without PAA expansion. Use when the user asks for rankings, organic results, local pack, quick SERP, or positions. Split topic from location and infer gl/hl from the user request. Saves a full Markdown report locally.",
15831
- inputSchema: SearchSerpInputSchema
16183
+ title: "Google SERP Lookup",
16184
+ description: withReportNote("Fast Google SERP lookup without PAA expansion. Use when the user asks for rankings, organic results, local pack, quick SERP, or positions. Split topic from location and infer gl/hl from the user request."),
16185
+ inputSchema: SearchSerpInputSchema,
16186
+ annotations: liveWebToolAnnotations("Google SERP Lookup")
15832
16187
  }, async (input) => formatSearchSerp(await executor.searchSerp(input), input));
15833
16188
  server.registerTool("extract_url", {
15834
- description: "Extract structured data from one public URL: page content as Markdown, heading structure, JSON-LD schema, entity details, NAP score, metadata, and missing schema fields. Use when the user provides a single URL or asks to inspect/scrape one page. Saves a full Markdown report locally.",
15835
- inputSchema: ExtractUrlInputSchema
16189
+ title: "Single URL Extract",
16190
+ description: withReportNote("Extract structured data from one public URL: page content as Markdown, heading structure, JSON-LD schema, entity details, NAP score, metadata, and missing schema fields. Use when the user provides a single URL or asks to inspect/scrape one page."),
16191
+ inputSchema: ExtractUrlInputSchema,
16192
+ annotations: liveWebToolAnnotations("Single URL Extract")
15836
16193
  }, async (input) => formatExtractUrl(await executor.extractUrl(input), input));
15837
16194
  server.registerTool("map_site_urls", {
15838
- description: "Map/crawl a public website to build a URL inventory with HTTP status codes, broken links, redirects, and site scope. Use before extract_site for audits or when the user asks for a sitemap/URL inventory. Saves a full Markdown report locally.",
15839
- inputSchema: MapSiteUrlsInputSchema
16195
+ title: "Site URL Map",
16196
+ description: withReportNote("Map/crawl a public website to build a URL inventory with HTTP status codes, broken links, redirects, and site scope. Use before extract_site for audits or when the user asks for a sitemap/URL inventory."),
16197
+ inputSchema: MapSiteUrlsInputSchema,
16198
+ outputSchema: MapSiteUrlsOutputSchema,
16199
+ annotations: liveWebToolAnnotations("Site URL Map")
15840
16200
  }, async (input) => formatMapSiteUrls(await executor.mapSiteUrls(input), input));
15841
16201
  server.registerTool("extract_site", {
15842
- description: "Run multi-page extraction across a public website. Returns per-page titles, H1s, metadata, headings, schema/entity data, canonical URLs, and content. Use for website audits, competitor audits, and full-site extraction. Saves a full Markdown report locally.",
15843
- inputSchema: ExtractSiteInputSchema
16202
+ title: "Multi-Page Site Extract",
16203
+ description: withReportNote("Run multi-page extraction across a public website. Returns per-page titles, H1s, metadata, headings, schema/entity data, canonical URLs, and content. Use for website audits, competitor audits, and full-site extraction."),
16204
+ inputSchema: ExtractSiteInputSchema,
16205
+ annotations: liveWebToolAnnotations("Multi-Page Site Extract")
15844
16206
  }, async (input) => formatExtractSite(await executor.extractSite(input), input));
15845
16207
  server.registerTool("youtube_harvest", {
15846
- description: 'Harvest YouTube video metadata by search query or channel handle/ID/URL. Use mode "search" for keyword/topic requests and mode "channel" for @handles, channel IDs, or channel URLs. Returns titles, views, dates, durations, URLs, thumbnails, and videoIds for follow-up transcription. Saves a full Markdown report locally.',
15847
- inputSchema: YoutubeHarvestInputSchema
16208
+ title: "YouTube Video Harvest",
16209
+ description: withReportNote('Harvest YouTube video metadata by search query or channel handle/ID/URL. Use mode "search" for keyword/topic requests and mode "channel" for @handles, channel IDs, or channel URLs. Returns titles, views, dates, durations, URLs, thumbnails, and videoIds for follow-up transcription.'),
16210
+ inputSchema: YoutubeHarvestInputSchema,
16211
+ outputSchema: YoutubeHarvestOutputSchema,
16212
+ annotations: liveWebToolAnnotations("YouTube Video Harvest")
15848
16213
  }, async (input) => formatYoutubeHarvest(await executor.youtubeHarvest(input), input));
15849
16214
  server.registerTool("youtube_transcribe", {
15850
- description: "Fetch and transcribe captions from a YouTube video. Returns full transcript, timestamped chunks, and word count. Pass a videoId from youtube_harvest results or infer it from a YouTube URL if the user provided one. Saves a full Markdown report locally.",
15851
- inputSchema: YoutubeTranscribeInputSchema
16215
+ title: "YouTube Transcription",
16216
+ description: withReportNote("Fetch and transcribe captions from a YouTube video. Returns full transcript, timestamped chunks, and word count. Pass a videoId from youtube_harvest results or infer it from a YouTube URL if the user provided one."),
16217
+ inputSchema: YoutubeTranscribeInputSchema,
16218
+ annotations: liveWebToolAnnotations("YouTube Transcription")
15852
16219
  }, async (input) => formatYoutubeTranscribe(await executor.youtubeTranscribe(input), input));
15853
16220
  server.registerTool("facebook_page_intel", {
15854
- description: "Harvest ads from a Facebook advertiser. Returns ad copy, headlines, CTAs, creative type, status, landing URLs, and video URLs ready for transcription. Accepts pageId, libraryId, or a brand/advertiser name as query. Use after facebook_ad_search when possible. Saves a full Markdown report locally.",
15855
- inputSchema: FacebookPageIntelInputSchema
16221
+ title: "Facebook Advertiser Ad Intel",
16222
+ description: withReportNote("Harvest ads from a Facebook advertiser. Returns ad copy, headlines, CTAs, creative type, status, landing URLs, and video URLs ready for transcription. Accepts pageId, libraryId, or a brand/advertiser name as query. Use after facebook_ad_search when possible."),
16223
+ inputSchema: FacebookPageIntelInputSchema,
16224
+ outputSchema: FacebookPageIntelOutputSchema,
16225
+ annotations: liveWebToolAnnotations("Facebook Advertiser Ad Intel")
15856
16226
  }, async (input) => formatFacebookPageIntel(await executor.facebookPageIntel(input), input));
15857
16227
  server.registerTool("facebook_ad_search", {
15858
- description: "Search Facebook Ad Library by brand, advertiser, competitor, niche, or keyword. Returns advertisers with ad counts and library IDs. Use to discover competitors, then pass libraryId to facebook_page_intel. Saves a full Markdown report locally.",
15859
- inputSchema: FacebookAdSearchInputSchema
16228
+ title: "Facebook Ad Library Search",
16229
+ description: withReportNote("Search Facebook Ad Library by brand, advertiser, competitor, niche, or keyword. Returns advertisers with ad counts and library IDs. Use to discover competitors, then pass libraryId to facebook_page_intel."),
16230
+ inputSchema: FacebookAdSearchInputSchema,
16231
+ outputSchema: FacebookAdSearchOutputSchema,
16232
+ annotations: liveWebToolAnnotations("Facebook Ad Library Search")
15860
16233
  }, async (input) => formatFacebookAdSearch(await executor.facebookAdSearch(input), input));
15861
16234
  server.registerTool("facebook_ad_transcribe", {
16235
+ title: "Facebook Ad Transcription",
15862
16236
  description: "Transcribe audio from a Facebook ad video. Returns full transcript and timestamped chunks. Use the videoUrl value from facebook_page_intel results.",
15863
- inputSchema: FacebookAdTranscribeInputSchema
16237
+ inputSchema: FacebookAdTranscribeInputSchema,
16238
+ annotations: liveWebToolAnnotations("Facebook Ad Transcription")
15864
16239
  }, async (input) => formatFacebookAdTranscribe(await executor.facebookAdTranscribe(input), input));
15865
16240
  server.registerTool("maps_place_intel", {
15866
- description: 'Extract Google Maps business intelligence for a named business: rating, review count, category, address, phone, website, hours, booking URL, review histogram, review topics, about attributes, entity IDs, and optional review cards. Split business name from location (e.g. "Elite Roofing Denver CO" => businessName "Elite Roofing", location "Denver, CO"). Pass includeReviews true when the user asks for reviews/customer pain. Saves a full Markdown report locally.',
15867
- inputSchema: MapsPlaceIntelInputSchema
16241
+ title: "Google Maps Business Profile Details",
16242
+ description: withReportNote('Extract Google Maps business intelligence for one known/named business: rating, review count, category, address, phone, website, hours, booking URL, review histogram, review topics, about attributes, entity IDs, and optional review cards. Do not use this for category searches, local market prospect lists, or requests for multiple GMB/GBP profiles; use maps_search first for those. Split business name from location (e.g. "Elite Roofing Denver CO" => businessName "Elite Roofing", location "Denver, CO"). Pass includeReviews true when the user asks for reviews/customer pain.'),
16243
+ inputSchema: MapsPlaceIntelInputSchema,
16244
+ annotations: liveWebToolAnnotations("Google Maps Business Profile Details")
15868
16245
  }, async (input) => formatMapsPlaceIntel(await executor.mapsPlaceIntel(input), input));
16246
+ server.registerTool("maps_search", {
16247
+ title: "Google Maps Business Search",
16248
+ description: withReportNote('Search Google Maps for multiple businesses/profiles by category, niche, keyword, or local market. Use this when the user asks for several Google Business Profiles, GMBs, GBPs, leads, prospects, competitors, or "more than the 3-pack." Returns up to 50 candidates with names, place URLs, CIDs when available, ratings, review counts, and profile metadata. Default maxResults is 10; maximum is 50. Use maps_place_intel afterward only when a selected business needs full details and reviews.'),
16249
+ inputSchema: MapsSearchInputSchema,
16250
+ outputSchema: MapsSearchOutputSchema,
16251
+ annotations: liveWebToolAnnotations("Google Maps Business Search")
16252
+ }, async (input) => formatMapsSearch(await executor.mapsSearch(input), input));
15869
16253
  server.registerTool("credits_info", {
16254
+ title: "MCP Scraper Credits & Costs",
15870
16255
  description: "Answer questions about MCP Scraper credits: current credit balance, what a specific tool/action costs, the full cost table, and optionally recent credit ledger entries. Does not expose payment methods or credit card information.",
15871
- inputSchema: CreditsInfoInputSchema
16256
+ inputSchema: CreditsInfoInputSchema,
16257
+ annotations: {
16258
+ title: "MCP Scraper Credits & Costs",
16259
+ readOnlyHint: true,
16260
+ destructiveHint: false,
16261
+ idempotentHint: true,
16262
+ openWorldHint: false
16263
+ }
15872
16264
  }, async (input) => formatCreditsInfo(await executor.creditsInfo(input), input));
15873
16265
  return server;
15874
16266
  }
@@ -15877,6 +16269,7 @@ var init_paa_mcp_server = __esm({
15877
16269
  "src/mcp/paa-mcp-server.ts"() {
15878
16270
  "use strict";
15879
16271
  import_mcp = require("@modelcontextprotocol/sdk/server/mcp.js");
16272
+ init_version();
15880
16273
  init_mcp_tool_schemas();
15881
16274
  init_mcp_response_formatter();
15882
16275
  }
@@ -15976,6 +16369,9 @@ var init_http_mcp_tool_executor = __esm({
15976
16369
  // Forwards the maps_place_intel tool input to the "/maps/place" route via
  // this.call and returns whatever this.call returns.
  mapsPlaceIntel(input) {
15977
16370
  return this.call("/maps/place", input);
15978
16371
  }
16372
// Forwards the maps_search tool input to the "/maps/search" route via
// this.call and returns whatever this.call returns.
+ mapsSearch(input) {
16373
+ return this.call("/maps/search", input);
16374
+ }
15979
16375
  // Forwards the credits_info tool input to the "/billing/credits" route via
  // this.call and returns whatever this.call returns.
  creditsInfo(input) {
15980
16376
  return this.call("/billing/credits", input);
15981
16377
  }
@@ -16015,15 +16411,18 @@ async function requireMcpCallerKey(c) {
16015
16411
  return callerKey;
16016
16412
  }
16017
16413
  // registerSerpIntelligenceCaptureTools(server, executor)
  // Registers two extra tools on an existing MCP server:
  //   capture_serp_snapshot       -> executor.captureSerpSnapshot(input)
  //   capture_serp_page_snapshots -> executor.captureSerpPageSnapshots(input)
  // The handlers return the executor result as-is (no format* step).
  // The added (+) lines call the executor methods directly; the removed (-)
  // lines returned an isError result with text "{}" when a method was missing.
  // NOTE(review): callers must now pass an executor that implements both
  // methods, otherwise the handler throws — confirm against the executor class.
  function registerSerpIntelligenceCaptureTools(server, executor) {
16018
- const serpExecutor = executor;
16019
16414
  server.registerTool("capture_serp_snapshot", {
16415
+ title: "SERP Intelligence Snapshot",
16020
16416
  description: "Capture a structured SERP Intelligence Google snapshot through POST /serp-intelligence/capture, the same product capture path used by Phoenix. Split query from location, infer gl/hl, use proxyMode location for localized residential proxy evidence, configured for the static residential proxy, and none only for direct-network debugging. Set debug true when investigating location evidence, proxy behavior, CAPTCHA, or capture reliability.",
16021
- inputSchema: CaptureSerpSnapshotInputSchema
16022
- }, async (input) => serpExecutor.captureSerpSnapshot ? serpExecutor.captureSerpSnapshot(input) : Promise.resolve({ content: [{ type: "text", text: "{}" }], isError: true }));
16417
+ inputSchema: CaptureSerpSnapshotInputSchema,
16418
+ annotations: liveWebToolAnnotations("SERP Intelligence Snapshot")
16419
+ }, async (input) => executor.captureSerpSnapshot(input));
16023
16420
  server.registerTool("capture_serp_page_snapshots", {
16421
+ title: "SERP Intelligence Page Snapshots",
16024
16422
  description: "Capture public ranking-page evidence through POST /serp-intelligence/page-snapshots, the same product page snapshot path used by Phoenix. Provide urls for simple captures or targets when preserving organic, AI citation, local-pack, configured target, or site-subject source metadata. Private IPs, localhost, file URLs, and internal URLs are rejected by the service. Use timeoutMs for slow pages and debug true for sanitized proxy/browser diagnostics.",
16025
- inputSchema: CaptureSerpPageSnapshotsInputSchema
16026
- }, async (input) => serpExecutor.captureSerpPageSnapshots ? serpExecutor.captureSerpPageSnapshots(input) : Promise.resolve({ content: [{ type: "text", text: "{}" }], isError: true }));
16423
+ inputSchema: CaptureSerpPageSnapshotsInputSchema,
16424
+ annotations: liveWebToolAnnotations("SERP Intelligence Page Snapshots")
16425
+ }, async (input) => executor.captureSerpPageSnapshots(input));
16027
16426
  }
16028
16427
  var import_hono7, import_webStandardStreamableHttp, mcpApp;
16029
16428
  var init_mcp_routes = __esm({
@@ -16033,8 +16432,10 @@ var init_mcp_routes = __esm({
16033
16432
  import_webStandardStreamableHttp = require("@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js");
16034
16433
  init_paa_mcp_server();
16035
16434
  init_http_mcp_tool_executor();
16435
+ init_mcp_response_formatter();
16036
16436
  init_db();
16037
16437
  init_mcp_tool_schemas();
16438
+ configureReportSaving(false);
16038
16439
  mcpApp = new import_hono7.Hono();
16039
16440
  mcpApp.all("/", async (c) => {
16040
16441
  try {
@@ -16047,7 +16448,7 @@ var init_mcp_routes = __esm({
16047
16448
  sessionIdGenerator: void 0,
16048
16449
  enableJsonResponse: true
16049
16450
  });
16050
- const server = buildPaaExtractorMcpServer(executor);
16451
+ const server = buildPaaExtractorMcpServer(executor, { savesReportsLocally: false });
16051
16452
  registerSerpIntelligenceCaptureTools(server, executor);
16052
16453
  await server.connect(transport);
16053
16454
  return transport.handleRequest(c.req.raw);