mcp-scraper 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/README.md +13 -2
  2. package/dist/bin/api-server.cjs +957 -243
  3. package/dist/bin/api-server.cjs.map +1 -1
  4. package/dist/bin/api-server.js +2 -2
  5. package/dist/bin/mcp-stdio-server.cjs +540 -158
  6. package/dist/bin/mcp-stdio-server.cjs.map +1 -1
  7. package/dist/bin/mcp-stdio-server.js +2 -1
  8. package/dist/bin/mcp-stdio-server.js.map +1 -1
  9. package/dist/bin/paa-harvest.cjs +36 -5
  10. package/dist/bin/paa-harvest.cjs.map +1 -1
  11. package/dist/bin/paa-harvest.js +5 -3
  12. package/dist/bin/paa-harvest.js.map +1 -1
  13. package/dist/{chunk-6TWZS2FQ.js → chunk-RE6HCRYC.js} +543 -159
  14. package/dist/chunk-RE6HCRYC.js.map +1 -0
  15. package/dist/{chunk-W4P2U5VF.js → chunk-TM22BLWP.js} +46 -34
  16. package/dist/chunk-TM22BLWP.js.map +1 -0
  17. package/dist/{chunk-7HB7NDOY.js → chunk-ZK456YXN.js} +12 -2
  18. package/dist/chunk-ZK456YXN.js.map +1 -0
  19. package/dist/chunk-ZMOWIBMK.js +36 -0
  20. package/dist/chunk-ZMOWIBMK.js.map +1 -0
  21. package/dist/index.cjs +34 -3
  22. package/dist/index.cjs.map +1 -1
  23. package/dist/index.js +2 -1
  24. package/dist/index.js.map +1 -1
  25. package/dist/{server-2Y27U4TO.js → server-QXVVTKJP.js} +311 -48
  26. package/dist/server-QXVVTKJP.js.map +1 -0
  27. package/dist/{worker-UT4ZQU2T.js → worker-AUCXFHEL.js} +6 -4
  28. package/dist/worker-AUCXFHEL.js.map +1 -0
  29. package/docs/adr/0001-in-page-graphql-interception-for-anti-bot-scraping.md +58 -0
  30. package/docs/adr/README.md +11 -0
  31. package/docs/mcp-tool-quality-spec.md +238 -0
  32. package/package.json +5 -4
  33. package/dist/chunk-6TWZS2FQ.js.map +0 -1
  34. package/dist/chunk-7HB7NDOY.js.map +0 -1
  35. package/dist/chunk-W4P2U5VF.js.map +0 -1
  36. package/dist/server-2Y27U4TO.js.map +0 -1
  37. package/dist/worker-UT4ZQU2T.js.map +0 -1
@@ -110,6 +110,9 @@ var HttpMcpToolExecutor = class {
110
110
  mapsPlaceIntel(input) {
111
111
  return this.call("/maps/place", input);
112
112
  }
113
// Forwards `input` to the "/maps/search" endpoint via this.call and returns whatever that call returns.
+ mapsSearch(input) {
114
+ return this.call("/maps/search", input);
115
+ }
113
116
  creditsInfo(input) {
114
117
  return this.call("/billing/credits", input);
115
118
  }
@@ -124,6 +127,9 @@ var HttpMcpToolExecutor = class {
124
127
  // src/mcp/paa-mcp-server.ts
125
128
  var import_mcp = require("@modelcontextprotocol/sdk/server/mcp.js");
126
129
 
130
+ // src/version.ts
131
+ var PACKAGE_VERSION = "0.1.8";
132
+
127
133
  // src/mcp/mcp-tool-schemas.ts
128
134
  var import_zod = require("zod");
129
135
  var HarvestPaaInputSchema = {
@@ -186,6 +192,207 @@ var MapsPlaceIntelInputSchema = {
186
192
  includeReviews: import_zod.z.boolean().default(false).describe("Whether to fetch individual review cards"),
187
193
  maxReviews: import_zod.z.number().int().min(1).max(500).default(50).describe("Max review cards to return (requires includeReviews: true)")
188
194
  };
195
+ var MapsSearchInputSchema = {
196
+ query: import_zod.z.string().min(1).describe('Business category, niche, keyword, or search term. If the user says "roofers in Denver CO", use query="roofers" and location="Denver, CO". Do not put the location here when it can be separated.'),
197
+ location: import_zod.z.string().optional().describe('City, region, country, or service area for the Maps search, e.g. "Denver, CO". Infer from the user request when present.'),
198
+ gl: import_zod.z.string().length(2).default("us").describe("Google country code inferred from location."),
199
+ hl: import_zod.z.string().length(2).default("en").describe("Language inferred from user request."),
200
+ maxResults: import_zod.z.number().int().min(1).max(50).default(10).describe("Number of Google Maps business/profile candidates to return. Default 10. Maximum 50. Use 10 unless the user asks for more.")
201
+ };
202
+ var NullableString = import_zod.z.string().nullable();
203
+ var MapsSearchOutputSchema = {
204
+ query: import_zod.z.string(),
205
+ location: import_zod.z.string().nullable(),
206
+ searchQuery: import_zod.z.string(),
207
+ searchUrl: import_zod.z.string().url(),
208
+ extractedAt: import_zod.z.string(),
209
+ requestedMaxResults: import_zod.z.number().int().min(1).max(50),
210
+ resultCount: import_zod.z.number().int().min(0).max(50),
211
+ results: import_zod.z.array(import_zod.z.object({
212
+ position: import_zod.z.number().int().min(1),
213
+ name: import_zod.z.string(),
214
+ placeUrl: import_zod.z.string().url(),
215
+ cid: NullableString,
216
+ cidDecimal: NullableString,
217
+ rating: NullableString,
218
+ reviewCount: NullableString,
219
+ category: NullableString,
220
+ address: NullableString,
221
+ websiteUrl: NullableString,
222
+ directionsUrl: NullableString,
223
+ metadata: import_zod.z.array(import_zod.z.string())
224
+ })),
225
+ durationMs: import_zod.z.number().int().min(0)
226
+ };
227
+ var OrganicResultOutput = import_zod.z.object({
228
+ position: import_zod.z.number().int(),
229
+ title: import_zod.z.string(),
230
+ url: import_zod.z.string(),
231
+ domain: import_zod.z.string(),
232
+ snippet: NullableString
233
+ });
234
+ var AiOverviewOutput = import_zod.z.object({
235
+ detected: import_zod.z.boolean(),
236
+ text: NullableString
237
+ }).nullable();
238
+ var EntityIdsOutput = import_zod.z.object({
239
+ kgIds: import_zod.z.array(import_zod.z.string()),
240
+ cids: import_zod.z.array(import_zod.z.string()),
241
+ gcids: import_zod.z.array(import_zod.z.string())
242
+ }).nullable();
243
+ var HarvestPaaOutputSchema = {
244
+ query: import_zod.z.string(),
245
+ location: NullableString,
246
+ questionCount: import_zod.z.number().int().min(0),
247
+ completionStatus: NullableString,
248
+ questions: import_zod.z.array(import_zod.z.object({
249
+ question: import_zod.z.string(),
250
+ answer: NullableString,
251
+ sourceTitle: NullableString,
252
+ sourceSite: NullableString
253
+ })),
254
+ organicResults: import_zod.z.array(OrganicResultOutput),
255
+ aiOverview: AiOverviewOutput,
256
+ entityIds: EntityIdsOutput,
257
+ durationMs: import_zod.z.number().min(0).nullable()
258
+ };
259
+ var SearchSerpOutputSchema = {
260
+ query: import_zod.z.string(),
261
+ location: NullableString,
262
+ organicResults: import_zod.z.array(OrganicResultOutput),
263
+ localPack: import_zod.z.array(import_zod.z.object({
264
+ position: import_zod.z.number().int(),
265
+ name: import_zod.z.string(),
266
+ rating: NullableString,
267
+ reviewCount: NullableString,
268
+ websiteUrl: NullableString
269
+ })),
270
+ aiOverview: AiOverviewOutput,
271
+ entityIds: EntityIdsOutput
272
+ };
273
+ var ExtractUrlOutputSchema = {
274
+ url: import_zod.z.string(),
275
+ title: NullableString,
276
+ headings: import_zod.z.array(import_zod.z.object({
277
+ level: import_zod.z.number().int(),
278
+ text: import_zod.z.string()
279
+ })),
280
+ schemaBlockCount: import_zod.z.number().int().min(0),
281
+ entityName: NullableString,
282
+ entityTypes: import_zod.z.array(import_zod.z.string()),
283
+ napScore: import_zod.z.number().nullable(),
284
+ missingSchemaFields: import_zod.z.array(import_zod.z.string()),
285
+ screenshotSaved: NullableString
286
+ };
287
+ var ExtractSiteOutputSchema = {
288
+ url: import_zod.z.string(),
289
+ pageCount: import_zod.z.number().int().min(0),
290
+ pages: import_zod.z.array(import_zod.z.object({
291
+ url: import_zod.z.string(),
292
+ title: NullableString,
293
+ schemaTypes: import_zod.z.array(import_zod.z.string())
294
+ })),
295
+ durationMs: import_zod.z.number().min(0)
296
+ };
297
+ var MapsPlaceIntelOutputSchema = {
298
+ name: import_zod.z.string(),
299
+ rating: NullableString,
300
+ reviewCount: NullableString,
301
+ category: NullableString,
302
+ address: NullableString,
303
+ phone: NullableString,
304
+ website: NullableString,
305
+ hoursSummary: NullableString,
306
+ bookingUrl: NullableString,
307
+ kgmid: NullableString,
308
+ cidDecimal: NullableString,
309
+ cidUrl: NullableString,
310
+ lat: import_zod.z.number().nullable(),
311
+ lng: import_zod.z.number().nullable(),
312
+ reviewsStatus: import_zod.z.string(),
313
+ reviewsCollected: import_zod.z.number().int().min(0),
314
+ reviewTopics: import_zod.z.array(import_zod.z.object({
315
+ label: import_zod.z.string(),
316
+ count: import_zod.z.string()
317
+ }))
318
+ };
319
+ var CreditsInfoOutputSchema = {
320
+ balanceCredits: import_zod.z.number().nullable(),
321
+ matchedCost: import_zod.z.object({
322
+ label: import_zod.z.string(),
323
+ credits: import_zod.z.number(),
324
+ unit: import_zod.z.string(),
325
+ notes: NullableString
326
+ }).nullable(),
327
+ costs: import_zod.z.array(import_zod.z.object({
328
+ key: import_zod.z.string(),
329
+ label: import_zod.z.string(),
330
+ credits: import_zod.z.number(),
331
+ unit: import_zod.z.string(),
332
+ notes: NullableString
333
+ })),
334
+ ledger: import_zod.z.array(import_zod.z.object({
335
+ createdAt: import_zod.z.string(),
336
+ operation: import_zod.z.string(),
337
+ credits: import_zod.z.number(),
338
+ description: NullableString
339
+ }))
340
+ };
341
+ var MapSiteUrlsOutputSchema = {
342
+ startUrl: import_zod.z.string(),
343
+ totalFound: import_zod.z.number().int().min(0),
344
+ truncated: import_zod.z.boolean(),
345
+ okCount: import_zod.z.number().int().min(0),
346
+ redirectCount: import_zod.z.number().int().min(0),
347
+ brokenCount: import_zod.z.number().int().min(0),
348
+ urls: import_zod.z.array(import_zod.z.object({
349
+ url: import_zod.z.string(),
350
+ status: import_zod.z.number().int().nullable()
351
+ })),
352
+ durationMs: import_zod.z.number().min(0)
353
+ };
354
+ var YoutubeHarvestOutputSchema = {
355
+ mode: import_zod.z.string(),
356
+ videoCount: import_zod.z.number().int().min(0),
357
+ channel: import_zod.z.object({
358
+ title: NullableString,
359
+ subscriberCount: NullableString
360
+ }).nullable(),
361
+ videos: import_zod.z.array(import_zod.z.object({
362
+ videoId: import_zod.z.string(),
363
+ title: import_zod.z.string(),
364
+ channelName: NullableString,
365
+ views: NullableString,
366
+ duration: NullableString,
367
+ url: NullableString
368
+ }))
369
+ };
370
+ var FacebookAdSearchOutputSchema = {
371
+ query: import_zod.z.string(),
372
+ advertiserCount: import_zod.z.number().int().min(0),
373
+ advertisers: import_zod.z.array(import_zod.z.object({
374
+ name: NullableString,
375
+ adCount: import_zod.z.number().int().nullable(),
376
+ libraryId: NullableString
377
+ }))
378
+ };
379
+ var FacebookPageIntelOutputSchema = {
380
+ advertiserName: NullableString,
381
+ totalAds: import_zod.z.number().int().min(0),
382
+ activeCount: import_zod.z.number().int().min(0),
383
+ videoCount: import_zod.z.number().int().min(0),
384
+ imageCount: import_zod.z.number().int().min(0),
385
+ ads: import_zod.z.array(import_zod.z.object({
386
+ libraryId: NullableString,
387
+ status: NullableString,
388
+ creativeType: NullableString,
389
+ headline: NullableString,
390
+ cta: NullableString,
391
+ startDate: NullableString,
392
+ videoUrl: NullableString,
393
+ variations: import_zod.z.number().int().nullable()
394
+ }))
395
+ };
189
396
  var CreditsInfoInputSchema = {
190
397
  item: import_zod.z.string().optional().describe('Optional tool, action, or feature to look up, e.g. "maps reviews", "extract_url", or "YouTube transcription"'),
191
398
  includeLedger: import_zod.z.boolean().default(false).describe("Whether to include recent credit ledger entries")
@@ -235,6 +442,19 @@ var CaptureSerpPageSnapshotsInputSchema = {
235
442
  var import_node_fs = require("fs");
236
443
  var import_node_os = require("os");
237
444
  var import_node_path = require("path");
445
+
446
+ // src/errors.ts
447
/**
 * Removes vendor branding ("kernel" / "kernel.sh") from a message.
 *
 * Replacements run in order, most specific phrase first:
 *   - "kernel.sh sessions" / "kernel sessions" -> "sessions"
 *   - "kernel.sh session"  / "kernel session"  -> "this session"
 *   - any remaining "kernel.sh" or standalone "kernel" -> "the service"
 * Afterwards runs of spaces are collapsed to one space and the result is trimmed.
 *
 * @param {string} message - Text that may mention the vendor.
 * @returns {string} The message with vendor names replaced.
 */
function sanitizeVendorName(message) {
  return message
    // Plural and singular are matched separately. A combined `sessions?`
    // pattern would also swallow the singular form and leave the
    // "this session" replacements below with nothing to match.
    .replace(/kernel\.sh\s+sessions\b/gi, "sessions")
    .replace(/kernel\.sh\s+session\b/gi, "this session")
    .replace(/kernel\.sh/gi, "the service")
    .replace(/kernel\s+sessions\b/gi, "sessions")
    .replace(/kernel\s+session\b/gi, "this session")
    .replace(/\bkernel\b/gi, "the service")
    // Replacements can leave doubled spaces behind; normalize them.
    .replace(/ +/g, " ")
    .trim();
}
450
+
451
+ // src/mcp/mcp-response-formatter.ts
452
+ var reportSavingEnabled = true;
453
/**
 * Rewrites vendor-specific identifiers in `text` to neutral names, then hands
 * the result to sanitizeVendorName for the free-text pass.
 *
 * @param {string} text - Text (for example an error message or debug output) to clean.
 * @returns {string} The text with vendor identifiers and names replaced.
 */
function sanitizeVendorText(text) {
  // Ordered [pattern, replacement] pairs; applied top to bottom.
  const identifierRewrites = [
    [/kernel_session_id/gi, "browser_session_id"],
    [/kernel_delete_succeeded/gi, "session_cleanup_succeeded"],
    [/kernel_delete_started/gi, "session_cleanup_started"],
    [/kernel_delete_error/gi, "session_cleanup_error"],
    [/kernelSessionId/g, "browserSessionId"],
    [/kernelProxyId/g, "proxyId"],
    [/KERNEL_API_KEY/g, "BROWSER_SERVICE_API_KEY"],
    [/"kernel"\s*:/gi, '"browserRuntime":']
  ];
  let renamed = text;
  for (const [pattern, replacement] of identifierRewrites) {
    renamed = renamed.replace(pattern, replacement);
  }
  return sanitizeVendorName(renamed);
}
238
458
  function slugifyReportName(input) {
239
459
  return input.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 80) || "mcp-scraper-report";
240
460
  }
@@ -246,7 +466,7 @@ function outputBaseDir() {
246
466
  return process.env.MCP_SCRAPER_OUTPUT_DIR?.trim() || (0, import_node_path.join)((0, import_node_os.homedir)(), "Downloads", "mcp-scraper");
247
467
  }
248
468
  function saveFullReport(full) {
249
- if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
469
+ if (!reportSavingEnabled || process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
250
470
  const outDir = outputBaseDir();
251
471
  try {
252
472
  (0, import_node_fs.mkdirSync)(outDir, { recursive: true });
@@ -259,7 +479,7 @@ function saveFullReport(full) {
259
479
  }
260
480
  }
261
481
  function persistScreenshotLocally(base64, url) {
262
- if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
482
+ if (!reportSavingEnabled || process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
263
483
  try {
264
484
  const dir = (0, import_node_path.join)(outputBaseDir(), "screenshots");
265
485
  (0, import_node_fs.mkdirSync)(dir, { recursive: true });
@@ -299,11 +519,11 @@ function parseData(raw) {
299
519
  const text = first?.type === "text" ? first.text : "";
300
520
  try {
301
521
  const parsed = JSON.parse(text || "{}");
302
- if (raw.isError || parsed.error || parsed.error_code) return { error: formatStructuredError(parsed, text) };
522
+ if (raw.isError || parsed.error || parsed.error_code) return { error: sanitizeVendorText(formatStructuredError(parsed, text)) };
303
523
  const data = parsed.result ?? parsed;
304
524
  return { data };
305
525
  } catch {
306
- if (raw.isError) return { error: text || "Tool error" };
526
+ if (raw.isError) return { error: sanitizeVendorText(text || "Tool error") };
307
527
  return { error: "Failed to parse tool response" };
308
528
  }
309
529
  }
@@ -317,15 +537,6 @@ function entityIdsSection(ids) {
317
537
  ## Entity IDs
318
538
  ${lines.join("\n")}` : "";
319
539
  }
320
- function entityIdsSummaryLine(ids) {
321
- if (!ids) return "";
322
- const parts = [];
323
- if (ids.kgIds?.length) parts.push(`KG MID: ${ids.kgIds[0]}`);
324
- if (ids.cids?.length) parts.push(`CID: ${ids.cids[0]}`);
325
- if (ids.gcids?.length) parts.push(`GCID: ${ids.gcids[0]}`);
326
- return parts.length ? `
327
- **Entity IDs:** ${parts.join(" \xB7 ")}` : "";
328
- }
329
540
  function truncate(s, max) {
330
541
  if (!s) return "";
331
542
  return s.length > max ? s.slice(0, max) + "\u2026" : s;
@@ -337,7 +548,7 @@ function debugSection(debug) {
337
548
  if (!debug || typeof debug !== "object") return "";
338
549
  const request = debug.request ?? {};
339
550
  const browser = debug.browser ?? {};
340
- const kernel = browser.kernel ?? {};
551
+ const kernel = browser.browserRuntime ?? browser.kernel ?? {};
341
552
  const network = browser.networkLocation ?? {};
342
553
  const nav = browser.serpNavigation ?? {};
343
554
  const proxyResolution = kernel.proxyResolution ?? {};
@@ -355,7 +566,7 @@ function debugSection(debug) {
355
566
  if (locationEvidence) {
356
567
  lines.push(`- Location evidence: ${locationEvidence.status}${locationEvidence.expected ? ` \xB7 expected ${locationEvidence.expected.city}${locationEvidence.expected.regionCode ? `, ${locationEvidence.expected.regionCode}` : ""}` : ""}${candidates ? ` \xB7 candidates ${candidates}` : ""}`);
357
568
  }
358
- return lines.join("\n");
569
+ return sanitizeVendorText(lines.join("\n"));
359
570
  }
360
571
  function errorAttemptsSection(body) {
361
572
  const attempts = Array.isArray(body.attempts) ? body.attempts : [];
@@ -363,12 +574,14 @@ function errorAttemptsSection(body) {
363
574
  const lines = attempts.slice(0, 5).map((attempt) => {
364
575
  const debug = attempt.debug ?? {};
365
576
  const browser = debug.browser ?? {};
366
- const kernel = browser.kernel ?? {};
577
+ const kernel = browser.browserRuntime ?? browser.kernel ?? {};
367
578
  const proxyResolution = kernel.proxyResolution ?? {};
368
579
  const network = browser.networkLocation ?? {};
369
580
  const nav = browser.serpNavigation ?? {};
370
581
  const geo = [network.ip, network.city, network.region].filter(Boolean).join(" / ") || "geo unknown";
371
- return `- Attempt ${attempt.attempt_number ?? "?"}: ${attempt.outcome ?? attempt.status ?? "unknown"} \xB7 session ${attempt.kernel_session_id ?? kernel.sessionId ?? "unknown"} \xB7 proxy ${debug.request?.proxyMode ?? kernel.proxyMode ?? "unknown"}${proxyResolution.source ? `/${proxyResolution.source}` : ""} \xB7 ${geo} \xB7 CAPTCHA ${nav.captchaDetected === true ? "yes" : nav.captchaDetected === false ? "no" : "unknown"} \xB7 deleted ${attempt.kernel_delete_succeeded === true ? "yes" : attempt.kernel_delete_succeeded === false ? "no" : "unknown"}`;
582
+ const sessionId = attempt.browser_session_id ?? attempt.kernel_session_id ?? kernel.sessionId ?? "unknown";
583
+ const cleanupSucceeded = attempt.session_cleanup_succeeded ?? attempt.kernel_delete_succeeded;
584
+ return `- Attempt ${attempt.attempt_number ?? "?"}: ${attempt.outcome ?? attempt.status ?? "unknown"} \xB7 session ${sessionId} \xB7 proxy ${debug.request?.proxyMode ?? kernel.proxyMode ?? "unknown"}${proxyResolution.source ? `/${proxyResolution.source}` : ""} \xB7 ${geo} \xB7 CAPTCHA ${nav.captchaDetected === true ? "yes" : nav.captchaDetected === false ? "no" : "unknown"} \xB7 cleanup ${cleanupSucceeded === true ? "yes" : cleanupSucceeded === false ? "no" : "unknown"}`;
372
585
  });
373
586
  return `
374
587
 
@@ -409,27 +622,37 @@ ${serpRows}` : "";
409
622
  const tips = `
410
623
  ---
411
624
  \u{1F4A1} **Tips**
412
- - Max questions: \`maxQuestions: 150\` (current: ${input.maxQuestions ?? 30})
625
+ - Max questions: \`maxQuestions: 200\` (current: ${input.maxQuestions ?? 30})
413
626
  - Organic results only: use \`search_serp\`
414
627
  - Dig into a result: use \`extract_url\` on any organic URL`;
415
628
  const full = `# PAA Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
416
629
 
417
630
  ${paaTable}${serpTable}${entityIdsSection(entityIds)}${aiSection}${statsLine}${debugSection(diagnostics?.debug)}${tips}`;
418
- const topQ = flat.slice(0, 10).map((r, i) => `${i + 1}. ${r.question}`).join("\n");
419
- const topO = organic.slice(0, 5).map((r) => `${r.position}. [${r.title}](${r.url}) \u2014 ${r.domain}`).join("\n");
420
- const summary = [
421
- `**PAA: "${input.query}"** \u2014 ${flat.length} questions extracted`,
422
- topQ ? `
423
- **Top questions:**
424
- ${topQ}` : "",
425
- organic.length ? `
426
- **Top organic results:**
427
- ${topO}` : "",
428
- entityIdsSummaryLine(entityIds),
429
- `
430
- \u{1F4A1} \`maxQuestions\` up to 150 \xB7Use \`extract_url\` to dig into any result`
431
- ].filter(Boolean).join("\n");
432
- return oneBlock(full);
631
+ return {
632
+ ...oneBlock(full),
633
+ structuredContent: {
634
+ query: input.query,
635
+ location: input.location ?? null,
636
+ questionCount: flat.length,
637
+ completionStatus: diagnostics?.completionStatus ?? null,
638
+ questions: flat.map((r) => ({
639
+ question: String(r.question ?? ""),
640
+ answer: r.answer ?? null,
641
+ sourceTitle: r.source_title ?? null,
642
+ sourceSite: r.source_site ?? null
643
+ })),
644
+ organicResults: organic.map((r) => ({
645
+ position: Number(r.position) || 0,
646
+ title: String(r.title ?? ""),
647
+ url: String(r.url ?? ""),
648
+ domain: String(r.domain ?? ""),
649
+ snippet: r.snippet ?? null
650
+ })),
651
+ aiOverview: aiOvw ? { detected: aiOvw.detected === true, text: aiOvw.text ?? null } : null,
652
+ entityIds: entityIds ? { kgIds: entityIds.kgIds ?? [], cids: entityIds.cids ?? [], gcids: entityIds.gcids ?? [] } : null,
653
+ durationMs: durationMs ?? null
654
+ }
655
+ };
433
656
  }
434
657
  function formatSearchSerp(raw, input) {
435
658
  const parsed = parseData(raw);
@@ -467,19 +690,29 @@ ${localRows}` : "";
467
690
  const full = `# SERP Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
468
691
 
469
692
  ${serpTable}${localSection}${entityIdsSection(entityIds)}${aiSection}${debugSection(diagnostics?.debug)}${tips}`;
470
- const topO = organic.slice(0, 5).map((r) => `${r.position}. [${r.title}](${r.url}) \u2014 ${r.domain}`).join("\n");
471
- const summary = [
472
- `**SERP: "${input.query}"** \u2014 ${organic.length} organic results`,
473
- topO ? `
474
- **Top results:**
475
- ${topO}` : "",
476
- localPack.length ? `
477
- **Local Pack:** ${localPack.map((b) => b.name).join(", ")}` : "",
478
- entityIdsSummaryLine(entityIds),
479
- `
480
- \u{1F4A1} Use \`harvest_paa\` for questions \xB7 \`extract_url\` to scrape any result`
481
- ].filter(Boolean).join("\n");
482
- return oneBlock(full);
693
+ return {
694
+ ...oneBlock(full),
695
+ structuredContent: {
696
+ query: input.query,
697
+ location: input.location ?? null,
698
+ organicResults: organic.map((r) => ({
699
+ position: Number(r.position) || 0,
700
+ title: String(r.title ?? ""),
701
+ url: String(r.url ?? ""),
702
+ domain: String(r.domain ?? ""),
703
+ snippet: r.snippet ?? null
704
+ })),
705
+ localPack: localPack.map((b) => ({
706
+ position: Number(b.position) || 0,
707
+ name: String(b.name ?? ""),
708
+ rating: b.rating ?? null,
709
+ reviewCount: b.reviewCount ?? null,
710
+ websiteUrl: b.websiteUrl ?? null
711
+ })),
712
+ aiOverview: aiOvw ? { detected: aiOvw.detected === true, text: aiOvw.text ?? null } : null,
713
+ entityIds: entityIds ? { kgIds: entityIds.kgIds ?? [], cids: entityIds.cids ?? [], gcids: entityIds.gcids ?? [] } : null
714
+ }
715
+ };
483
716
  }
484
717
  function formatExtractUrl(raw, input) {
485
718
  const parsed = parseData(raw);
@@ -548,15 +781,27 @@ ${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
548
781
  **${title}**
549
782
  ${headingSection}${kpoSection}${brandingSection}${bodySection}${screenshotSection}${mediaSection}${tips}`;
550
783
  const textResult = oneBlock(full);
784
+ const structuredContent = {
785
+ url,
786
+ title: d.title ?? null,
787
+ headings: headings.map((h) => ({ level: Number(h.level) || 0, text: String(h.text ?? "") })),
788
+ schemaBlockCount: schemaCount,
789
+ entityName: kpo?.entityName ?? null,
790
+ entityTypes: kpo?.type ?? [],
791
+ napScore: kpo?.napScore ?? null,
792
+ missingSchemaFields: kpo?.missingFields ?? [],
793
+ screenshotSaved: screenshotPath ?? null
794
+ };
551
795
  if (screenshotMeta?.base64) {
552
796
  return {
553
797
  content: [
554
798
  ...textResult.content,
555
799
  { type: "image", data: screenshotMeta.base64, mimeType: "image/png" }
556
- ]
800
+ ],
801
+ structuredContent
557
802
  };
558
803
  }
559
- return textResult;
804
+ return { ...textResult, structuredContent };
560
805
  }
561
806
  function formatMapSiteUrls(raw, input) {
562
807
  const parsed = parseData(raw);
@@ -589,15 +834,19 @@ ${broken.map((u) => `- ${u.url} (${u.status})`).join("\n")}` : "",
589
834
  - Extract content from all pages: use \`extract_site\`
590
835
  - Scrape a single page: use \`extract_url\``
591
836
  ].filter(Boolean).join("\n");
592
- const summary = [
593
- `**URL Map: ${input.url}**`,
594
- `${d.totalFound} URLs \u2014 ${ok.length} OK \xB7 ${broken.length} broken \xB7 ${redirects.length} redirects`,
595
- broken.length ? `
596
- **Broken URLs:** ${broken.slice(0, 3).map((u) => u.url).join(", ")}` : "",
597
- `
598
- \u{1F4A1} Use \`extract_site\` to extract content from all pages`
599
- ].filter(Boolean).join("\n");
600
- return oneBlock(full);
837
+ return {
838
+ ...oneBlock(full),
839
+ structuredContent: {
840
+ startUrl: d.startUrl ?? input.url,
841
+ totalFound: d.totalFound ?? urls.length,
842
+ truncated: d.truncated === true,
843
+ okCount: ok.length,
844
+ redirectCount: redirects.length,
845
+ brokenCount: broken.length,
846
+ urls: urls.map((u) => ({ url: u.url, status: u.status ?? null })),
847
+ durationMs: d.durationMs ?? 0
848
+ }
849
+ };
601
850
  }
602
851
  function formatExtractSite(raw, input) {
603
852
  const parsed = parseData(raw);
@@ -622,14 +871,19 @@ ${pageRows}`,
622
871
  - Map URLs first: use \`map_site_urls\`
623
872
  - Inspect a single page: use \`extract_url\``
624
873
  ].join("\n");
625
- const summary = [
626
- `**Site Extract: ${input.url}** \u2014 ${pages.length} pages`,
627
- pages.slice(0, 5).map((p) => `- ${p.title ?? p.url}`).join("\n"),
628
- pages.length > 5 ? `- \u2026 and ${pages.length - 5} more` : "",
629
- `
630
- \u{1F4A1} Use \`extract_url\` to inspect any individual page`
631
- ].filter(Boolean).join("\n");
632
- return oneBlock(full);
874
+ return {
875
+ ...oneBlock(full),
876
+ structuredContent: {
877
+ url: input.url,
878
+ pageCount: pages.length,
879
+ pages: pages.map((p) => ({
880
+ url: String(p.url ?? ""),
881
+ title: p.title ?? null,
882
+ schemaTypes: p.kpo?.type ?? []
883
+ })),
884
+ durationMs: d.durationMs ?? 0
885
+ }
886
+ };
633
887
  }
634
888
  function formatYoutubeHarvest(raw, input) {
635
889
  const parsed = parseData(raw);
@@ -659,16 +913,22 @@ ${videoRows}`,
659
913
  - Transcribe a video: use \`youtube_transcribe\` with the \`videoId\` above
660
914
  - Switch mode: \`mode: "channel"\` with \`channelHandle\` or \`mode: "search"\` with \`query\``
661
915
  ].filter(Boolean).join("\n");
662
- const top5 = videos.slice(0, 5).map((v, i) => `${i + 1}. ${v.title} (\`${v.videoId}\`)`).join("\n");
663
- const summary = [
664
- `**YouTube: ${label}** \u2014 ${videos.length} videos`,
665
- `
666
- **Top videos:**
667
- ${top5}`,
668
- `
669
- \u{1F4A1} Transcribe any video: \`youtube_transcribe\` with its videoId`
670
- ].join("\n");
671
- return oneBlock(full);
916
+ return {
917
+ ...oneBlock(full),
918
+ structuredContent: {
919
+ mode: input.mode,
920
+ videoCount: videos.length,
921
+ channel: d.channelMeta ? { title: d.channelMeta.title ?? null, subscriberCount: d.channelMeta.subscriberCount ?? null } : null,
922
+ videos: videos.map((v) => ({
923
+ videoId: String(v.videoId ?? ""),
924
+ title: String(v.title ?? ""),
925
+ channelName: v.channelName ?? null,
926
+ views: v.views ?? null,
927
+ duration: v.duration ?? null,
928
+ url: v.url ?? null
929
+ }))
930
+ }
931
+ };
672
932
  }
673
933
  function formatYoutubeTranscribe(raw, input) {
674
934
  const parsed = parseData(raw);
@@ -698,14 +958,6 @@ ${chunkRows}` : "",
698
958
  ---
699
959
  \u{1F4A1} Harvest more from this channel: use \`youtube_harvest\` with \`mode: "channel"\``
700
960
  ].filter(Boolean).join("\n");
701
- const summary = [
702
- `**YouTube Transcript: \`${input.videoId}\`** \u2014 ${text.split(" ").length} words \xB7 ${durSec}s`,
703
- `
704
- **Preview:**
705
- > ${truncate(text, 300)}`,
706
- `
707
- \u{1F4A1} Full transcript in artifact above`
708
- ].join("\n");
709
961
  return oneBlock(full);
710
962
  }
711
963
  function formatFacebookPageIntel(raw, input) {
@@ -734,19 +986,26 @@ ${adBlocks}`,
734
986
  - Transcribe video ads: use \`facebook_ad_transcribe\` with the \`videoUrl\` above
735
987
  - Find other advertisers: use \`facebook_ad_search\``
736
988
  ].filter(Boolean).join("\n");
737
- const activeAds = ads.filter((a) => a.status?.toLowerCase() === "active").slice(0, 5);
738
- const adSummary = activeAds.map((a, i) => `${i + 1}. ${truncate(a.headline ?? a.primaryText, 80)} (${a.creativeType ?? "\u2014"})`).join("\n");
739
- const videoCount = ads.filter((a) => a.videoUrl).length;
740
- const summary = [
741
- `**Facebook Ads: ${advertiser}** \u2014 ${s.totalAds} ads (${s.activeCount} active)`,
742
- adSummary ? `
743
- **Active ads:**
744
- ${adSummary}` : "",
745
- `**Creative mix:** ${s.videoCount} video \xB7 ${s.imageCount} image`,
746
- videoCount ? `
747
- \u{1F4A1} ${videoCount} video ads \u2014 transcribe with \`facebook_ad_transcribe\` using the videoUrl` : ""
748
- ].filter(Boolean).join("\n");
749
- return oneBlock(full);
989
+ return {
990
+ ...oneBlock(full),
991
+ structuredContent: {
992
+ advertiserName: d.advertiserName ?? null,
993
+ totalAds: s.totalAds ?? 0,
994
+ activeCount: s.activeCount ?? 0,
995
+ videoCount: s.videoCount ?? 0,
996
+ imageCount: s.imageCount ?? 0,
997
+ ads: ads.map((ad) => ({
998
+ libraryId: ad.libraryId ?? null,
999
+ status: ad.status ?? null,
1000
+ creativeType: ad.creativeType ?? null,
1001
+ headline: ad.headline ?? null,
1002
+ cta: ad.cta ?? null,
1003
+ startDate: ad.startDate ?? null,
1004
+ videoUrl: ad.videoUrl ?? null,
1005
+ variations: typeof ad.variations === "number" ? ad.variations : null
1006
+ }))
1007
+ }
1008
+ };
750
1009
  }
751
1010
  function formatFacebookAdSearch(raw, input) {
752
1011
  const parsed = parseData(raw);
@@ -770,15 +1029,18 @@ ${rows}`,
770
1029
  - Scan all ads: use \`facebook_page_intel\` with \`libraryId\`
771
1030
  - Or pass the advertiser name as \`query\` in \`facebook_page_intel\``
772
1031
  ].join("\n");
773
- const summary = [
774
- `**Facebook Ad Search: "${input.query}"** \u2014 ${advertisers.length} advertisers`,
775
- advertisers.slice(0, 5).map(
776
- (a, i) => `${i + 1}. ${a.name}${a.adCount ? ` (${a.adCount} ads)` : ""} \u2014 \`${a.libraryId ?? "\u2014"}\``
777
- ).join("\n"),
778
- `
779
- \u{1F4A1} Scan ads with \`facebook_page_intel\` using \`libraryId\``
780
- ].filter(Boolean).join("\n");
781
- return oneBlock(full);
1032
+ return {
1033
+ ...oneBlock(full),
1034
+ structuredContent: {
1035
+ query: input.query,
1036
+ advertiserCount: advertisers.length,
1037
+ advertisers: advertisers.map((a) => ({
1038
+ name: a.pageName ?? a.name ?? null,
1039
+ adCount: typeof a.adCount === "number" ? a.adCount : null,
1040
+ libraryId: a.sampleLibraryId ?? a.libraryId ?? null
1041
+ }))
1042
+ }
1043
+ };
782
1044
  }
783
1045
  function formatCreditsInfo(raw, input) {
784
1046
  const parsed = parseData(raw);
@@ -818,14 +1080,75 @@ ${costRows}` : "",
818
1080
  |------|-----------|---------|-------------|
819
1081
  ${ledgerRows}` : ""
820
1082
  ].filter(Boolean).join("\n");
821
- const summary = [
822
- `**Credit balance:** ${balance ?? "unknown"} credits`,
823
- matched ? `
824
- **${matched.label}:** ${matched.credits} credits ${matched.unit}` : null,
825
- input.includeLedger && ledger.length ? `
826
- Recent ledger entries included in the full report.` : null
1083
+ return {
1084
+ ...oneBlock(full),
1085
+ structuredContent: {
1086
+ balanceCredits: typeof balance === "number" ? balance : null,
1087
+ matchedCost: matched ? { label: matched.label, credits: matched.credits, unit: matched.unit, notes: matched.notes ?? null } : null,
1088
+ costs: costs.map((c) => ({
1089
+ key: c.key,
1090
+ label: c.label,
1091
+ credits: c.credits,
1092
+ unit: c.unit,
1093
+ notes: c.notes ?? null
1094
+ })),
1095
+ ledger: ledger.map((row) => ({
1096
+ createdAt: String(row.created_at ?? ""),
1097
+ operation: String(row.operation ?? ""),
1098
+ credits: row.amount_mc / 1e3,
1099
+ description: row.description ?? null
1100
+ }))
1101
+ }
1102
+ };
1103
+ }
1104
/**
 * Formats the raw result of a Google Maps search into an MCP tool response.
 *
 * The response carries a Markdown report (results table, per-candidate
 * metadata, a next-step hint and optional timing) together with a
 * `structuredContent` payload describing the same search.
 *
 * @param {*} raw - Executor result; passed to `parseData`, which yields either
 *   `{ data }` or `{ error }`.
 * @param {object} input - Tool input. `query`, `location` and `maxResults` are
 *   used as fallbacks when the parsed data does not carry them.
 * @returns {object} `oneBlock(full)` merged with `structuredContent`, or a
 *   text response flagged `isError` when `parseData` reports an error.
 */
function formatMapsSearch(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
  const d = parsed.data;
  const results = d.results ?? [];
  // Resolve the query fields once so the report heading and the structured
  // payload always agree, even when the backend omits them.
  const query = d.query ?? input.query;
  const searchQuery = d.searchQuery ?? [input.query, input.location].filter(Boolean).join(" ");
  const requestedMax = d.requestedMaxResults ?? input.maxResults ?? 10;
  const durationMs = d.durationMs;
  const dash = "\u2014";
  const rows = results.map((r) => {
    const rating = [r.rating, r.reviewCount ? `(${r.reviewCount})` : null].filter(Boolean).join(" ");
    const cid = r.cidDecimal ? `\`${r.cidDecimal}\`` : dash;
    const website = r.websiteUrl ? `[site](${r.websiteUrl})` : dash;
    // A missing place URL renders as a dash instead of a "[maps](undefined)" link.
    const maps = r.placeUrl ? `[maps](${r.placeUrl})` : dash;
    return `| ${r.position} | ${cell(r.name)} | ${cell(r.category)} | ${cell(rating)} | ${cell(r.address)} | ${cid} | ${website} | ${maps} |`;
  }).join("\n");
  const candidateBlocks = results.map((r) => {
    // At most eight metadata lines are listed per candidate.
    const meta = r.metadata?.length ? r.metadata.slice(0, 8).map((m) => ` - ${m}`).join("\n") : " - none";
    return `### ${r.position}. ${r.name}\n${meta}`;
  }).join("\n\n");
  const metadataSection = results.length ? `\n## Candidate Metadata\n${candidateBlocks}` : "";
  const full = [
    `# Google Maps Search: "${searchQuery}"`,
    `**Returned:** ${results.length} profile candidate${results.length === 1 ? "" : "s"} \xB7 **Requested max:** ${requestedMax} \xB7 **Limit:** 50`,
    `\n## Results\n| # | Name | Category | Rating | Address | CID | Website | Maps |\n|---|------|----------|--------|---------|-----|---------|------|\n${rows}`,
    metadataSection,
    `\n---\n\u{1F4A1} **Next step:** use \`maps_place_intel\` with a selected business name and location to hydrate full hours, phone, review topics, and optional review cards.`,
    durationMs != null ? `\n*Extracted in ${(durationMs / 1e3).toFixed(1)}s*` : null
  ].filter(Boolean).join("\n");
  return {
    ...oneBlock(full),
    structuredContent: {
      query,
      location: d.location ?? null,
      searchQuery,
      searchUrl: d.searchUrl,
      extractedAt: d.extractedAt,
      requestedMaxResults: requestedMax,
      resultCount: results.length,
      results,
      durationMs: durationMs ?? 0
    }
  };
}
830
1153
  function formatMapsPlaceIntel(raw, input) {
831
1154
  const parsed = parseData(raw);
@@ -925,20 +1248,28 @@ ${entitySection}` : null,
925
1248
  ---
926
1249
  *Extracted in ${(durationMs / 1e3).toFixed(1)}s*` : null
927
1250
  ].filter(Boolean).join("\n");
928
- const summary = [
929
- `**${name}** \u2014 ${category ?? "Business"} \xB7 ${ratingLine || "No rating"}`,
930
- address ? `\u{1F4CD} ${address}` : null,
931
- phone ? `\u{1F4DE} ${phone}` : null,
932
- hoursSummary ? `\u{1F550} ${hoursSummary}` : null,
933
- website ? `\u{1F310} ${website}` : null,
934
- reviewsStatus === "collected" && reviews.length ? `
935
- \u{1F4AC} ${reviews.length} reviews fetched \u2014 full list in artifact above` : null,
936
- reviewsStatus === "unavailable" ? `
937
- \u26A0\uFE0F Reviews could not be retrieved this run` : null,
938
- reviewsStatus === "none_exist" ? `
939
- \u{1F4AC} No reviews on Google Maps` : null
940
- ].filter(Boolean).join("\n");
941
- return oneBlock(full);
1251
+ return {
1252
+ ...oneBlock(full),
1253
+ structuredContent: {
1254
+ name,
1255
+ rating: rating ?? null,
1256
+ reviewCount: reviewCount ?? null,
1257
+ category: category ?? null,
1258
+ address: address ?? null,
1259
+ phone: phone ?? null,
1260
+ website: website ?? null,
1261
+ hoursSummary: hoursSummary ?? null,
1262
+ bookingUrl: bookingUrl ?? null,
1263
+ kgmid: kgmid ?? null,
1264
+ cidDecimal: cidDecimal ?? null,
1265
+ cidUrl: cidUrl ?? null,
1266
+ lat: lat ?? null,
1267
+ lng: lng ?? null,
1268
+ reviewsStatus,
1269
+ reviewsCollected: reviews.length,
1270
+ reviewTopics: topics.map((t) => ({ label: String(t.label ?? ""), count: String(t.count ?? "") }))
1271
+ }
1272
+ };
942
1273
  }
943
1274
  function formatFacebookAdTranscribe(raw, input) {
944
1275
  const parsed = parseData(raw);
@@ -968,67 +1299,118 @@ ${chunkRows}` : "",
968
1299
  ---
969
1300
  \u{1F4A1} Get more ads from this advertiser: use \`facebook_page_intel\``
970
1301
  ].filter(Boolean).join("\n");
971
- const summary = [
972
- `**Facebook Ad Transcript** \u2014 ${text.split(" ").length} words \xB7 ${durSec}s`,
973
- `
974
- **Preview:**
975
- > ${truncate(text, 300)}`,
976
- `
977
- \u{1F4A1} Full transcript in artifact above`
978
- ].join("\n");
979
1302
  return oneBlock(full);
980
1303
  }
981
1304
 
982
1305
  // src/mcp/paa-mcp-server.ts
983
- function buildPaaExtractorMcpServer(executor2) {
984
- const server2 = new import_mcp.McpServer({ name: "mcp-scraper", version: "1.0.0" });
1306
/**
 * Builds the annotations object attached to a tool registration.
 *
 * Every call returns a fresh object with the given title and fixed hint
 * flags: read-only, non-destructive, non-idempotent and open-world.
 *
 * @param {string} title - Display title copied into the annotations.
 * @returns {object} The annotations object.
 */
function liveWebToolAnnotations(title) {
  const annotations = { title };
  annotations.readOnlyHint = true;
  annotations.destructiveHint = false;
  annotations.idempotentHint = false;
  annotations.openWorldHint = true;
  return annotations;
}
1315
/**
 * Creates the MCP server and registers every scraper tool on it.
 *
 * @param {object} executor2 - Object exposing one method per tool
 *   (`harvestPaa`, `searchSerp`, ...); each handler awaits its result and
 *   passes it to the matching formatter.
 * @param {object} [options]
 * @param {boolean} [options.savesReportsLocally] - Only an explicit `false`
 *   switches the description suffix to the "returned inline" wording; any
 *   other value keeps the "saved locally" wording.
 * @returns {object} The `McpServer` instance with all tools registered.
 */
function buildPaaExtractorMcpServer(executor2, options = {}) {
  const reportNote = options.savesReportsLocally === false
    ? " Reports are returned inline; no files are saved on this hosted endpoint."
    : " Saves a full Markdown report locally.";
  const withReportNote = (description) => `${description}${reportNote}`;
  const mcpServer = new import_mcp.McpServer({ name: "mcp-scraper", version: PACKAGE_VERSION });

  // Registers a tool that uses the shared live-web annotations. `schemas`
  // supplies `inputSchema` and, where the tool has one, `outputSchema`.
  const registerLiveTool = (name, title, description, schemas, handler) => {
    mcpServer.registerTool(name, {
      title,
      description,
      ...schemas,
      annotations: liveWebToolAnnotations(title)
    }, handler);
  };

  registerLiveTool(
    "harvest_paa",
    "Google PAA + SERP Harvest",
    withReportNote('Best default tool for Google search research. Extracts People Also Ask questions plus answers/source URLs, organic SERP, local pack when present, entity IDs (CID/GCID/KG MID), and AI Overview. Infer the user language: split topic from location (e.g. "best hvac company in Denver CO" => query "best hvac company", location "Denver, CO", gl "us", hl "en"). Use maxQuestions 30 normally, 100-150 for "full", "deep", "all", or comprehensive research. Credits are charged by extracted question; unused request hold is refunded.'),
    { inputSchema: HarvestPaaInputSchema, outputSchema: HarvestPaaOutputSchema },
    async (input) => formatHarvestPaa(await executor2.harvestPaa(input), input)
  );
  registerLiveTool(
    "search_serp",
    "Google SERP Lookup",
    withReportNote("Fast Google SERP lookup without PAA expansion. Use when the user asks for rankings, organic results, local pack, quick SERP, or positions. Split topic from location and infer gl/hl from the user request."),
    { inputSchema: SearchSerpInputSchema, outputSchema: SearchSerpOutputSchema },
    async (input) => formatSearchSerp(await executor2.searchSerp(input), input)
  );
  registerLiveTool(
    "extract_url",
    "Single URL Extract",
    withReportNote("Extract structured data from one public URL: page content as Markdown, heading structure, JSON-LD schema, entity details, NAP score, metadata, and missing schema fields. Use when the user provides a single URL or asks to inspect/scrape one page."),
    { inputSchema: ExtractUrlInputSchema, outputSchema: ExtractUrlOutputSchema },
    async (input) => formatExtractUrl(await executor2.extractUrl(input), input)
  );
  registerLiveTool(
    "map_site_urls",
    "Site URL Map",
    withReportNote("Map/crawl a public website to build a URL inventory with HTTP status codes, broken links, redirects, and site scope. Use before extract_site for audits or when the user asks for a sitemap/URL inventory."),
    { inputSchema: MapSiteUrlsInputSchema, outputSchema: MapSiteUrlsOutputSchema },
    async (input) => formatMapSiteUrls(await executor2.mapSiteUrls(input), input)
  );
  registerLiveTool(
    "extract_site",
    "Multi-Page Site Extract",
    withReportNote("Run multi-page extraction across a public website. Returns per-page titles, H1s, metadata, headings, schema/entity data, canonical URLs, and content. Use for website audits, competitor audits, and full-site extraction."),
    { inputSchema: ExtractSiteInputSchema, outputSchema: ExtractSiteOutputSchema },
    async (input) => formatExtractSite(await executor2.extractSite(input), input)
  );
  registerLiveTool(
    "youtube_harvest",
    "YouTube Video Harvest",
    withReportNote('Harvest YouTube video metadata by search query or channel handle/ID/URL. Use mode "search" for keyword/topic requests and mode "channel" for @handles, channel IDs, or channel URLs. Returns titles, views, dates, durations, URLs, thumbnails, and videoIds for follow-up transcription.'),
    { inputSchema: YoutubeHarvestInputSchema, outputSchema: YoutubeHarvestOutputSchema },
    async (input) => formatYoutubeHarvest(await executor2.youtubeHarvest(input), input)
  );
  // The two transcription tools are registered without an output schema.
  registerLiveTool(
    "youtube_transcribe",
    "YouTube Transcription",
    withReportNote("Fetch and transcribe captions from a YouTube video. Returns full transcript, timestamped chunks, and word count. Pass a videoId from youtube_harvest results or infer it from a YouTube URL if the user provided one."),
    { inputSchema: YoutubeTranscribeInputSchema },
    async (input) => formatYoutubeTranscribe(await executor2.youtubeTranscribe(input), input)
  );
  registerLiveTool(
    "facebook_page_intel",
    "Facebook Advertiser Ad Intel",
    withReportNote("Harvest ads from a Facebook advertiser. Returns ad copy, headlines, CTAs, creative type, status, landing URLs, and video URLs ready for transcription. Accepts pageId, libraryId, or a brand/advertiser name as query. Use after facebook_ad_search when possible."),
    { inputSchema: FacebookPageIntelInputSchema, outputSchema: FacebookPageIntelOutputSchema },
    async (input) => formatFacebookPageIntel(await executor2.facebookPageIntel(input), input)
  );
  registerLiveTool(
    "facebook_ad_search",
    "Facebook Ad Library Search",
    withReportNote("Search Facebook Ad Library by brand, advertiser, competitor, niche, or keyword. Returns advertisers with ad counts and library IDs. Use to discover competitors, then pass libraryId to facebook_page_intel."),
    { inputSchema: FacebookAdSearchInputSchema, outputSchema: FacebookAdSearchOutputSchema },
    async (input) => formatFacebookAdSearch(await executor2.facebookAdSearch(input), input)
  );
  // This description is used as-is: no report note is appended.
  registerLiveTool(
    "facebook_ad_transcribe",
    "Facebook Ad Transcription",
    "Transcribe audio from a Facebook ad video. Returns full transcript and timestamped chunks. Use the videoUrl value from facebook_page_intel results.",
    { inputSchema: FacebookAdTranscribeInputSchema },
    async (input) => formatFacebookAdTranscribe(await executor2.facebookAdTranscribe(input), input)
  );
  registerLiveTool(
    "maps_place_intel",
    "Google Maps Business Profile Details",
    withReportNote('Extract Google Maps business intelligence for one known/named business: rating, review count, category, address, phone, website, hours, booking URL, review histogram, review topics, about attributes, entity IDs, and optional review cards. Do not use this for category searches, local market prospect lists, or requests for multiple GMB/GBP profiles; use maps_search first for those. Split business name from location (e.g. "Elite Roofing Denver CO" => businessName "Elite Roofing", location "Denver, CO"). Pass includeReviews true when the user asks for reviews/customer pain.'),
    { inputSchema: MapsPlaceIntelInputSchema, outputSchema: MapsPlaceIntelOutputSchema },
    async (input) => formatMapsPlaceIntel(await executor2.mapsPlaceIntel(input), input)
  );
  registerLiveTool(
    "maps_search",
    "Google Maps Business Search",
    withReportNote('Search Google Maps for multiple businesses/profiles by category, niche, keyword, or local market. Use this when the user asks for several Google Business Profiles, GMBs, GBPs, leads, prospects, competitors, or "more than the 3-pack." Returns up to 50 candidates with names, place URLs, CIDs when available, ratings, review counts, and profile metadata. Default maxResults is 10; maximum is 50. Use maps_place_intel afterward only when a selected business needs full details and reviews.'),
    { inputSchema: MapsSearchInputSchema, outputSchema: MapsSearchOutputSchema },
    async (input) => formatMapsSearch(await executor2.mapsSearch(input), input)
  );

  // credits_info carries its own annotations (idempotent, closed-world)
  // instead of the shared live-web set, so it is registered directly.
  mcpServer.registerTool("credits_info", {
    title: "MCP Scraper Credits & Costs",
    description: "Answer questions about MCP Scraper credits: current credit balance, what a specific tool/action costs, the full cost table, and optionally recent credit ledger entries. Does not expose payment methods or credit card information.",
    inputSchema: CreditsInfoInputSchema,
    outputSchema: CreditsInfoOutputSchema,
    annotations: {
      title: "MCP Scraper Credits & Costs",
      readOnlyHint: true,
      destructiveHint: false,
      idempotentHint: true,
      openWorldHint: false
    }
  }, async (input) => formatCreditsInfo(await executor2.creditsInfo(input), input));

  return mcpServer;
}