mcp-scraper 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/README.md +5 -0
  2. package/dist/bin/api-server.cjs +15730 -7780
  3. package/dist/bin/api-server.cjs.map +1 -1
  4. package/dist/bin/api-server.js +3 -3
  5. package/dist/bin/mcp-stdio-server.cjs +300 -110
  6. package/dist/bin/mcp-stdio-server.cjs.map +1 -1
  7. package/dist/bin/mcp-stdio-server.js +1 -1
  8. package/dist/bin/paa-harvest.cjs +1537 -165
  9. package/dist/bin/paa-harvest.cjs.map +1 -1
  10. package/dist/bin/paa-harvest.js +1 -1
  11. package/dist/{chunk-ZBP4RHNW.js → chunk-4743MZHT.js} +298 -106
  12. package/dist/chunk-4743MZHT.js.map +1 -0
  13. package/dist/{chunk-LXZDJJXR.js → chunk-D4CJBZBY.js} +426 -29
  14. package/dist/chunk-D4CJBZBY.js.map +1 -0
  15. package/dist/chunk-HERFK7W6.js +2781 -0
  16. package/dist/chunk-HERFK7W6.js.map +1 -0
  17. package/dist/chunk-Y74EXABN.js +295 -0
  18. package/dist/chunk-Y74EXABN.js.map +1 -0
  19. package/dist/{db-IOYMX64U.js → db-YWCNHBLH.js} +36 -4
  20. package/dist/index.cjs +1660 -237
  21. package/dist/index.cjs.map +1 -1
  22. package/dist/index.d.cts +169 -2
  23. package/dist/index.d.ts +169 -2
  24. package/dist/index.js +120 -69
  25. package/dist/index.js.map +1 -1
  26. package/dist/server-N7Q6H4OR.js +11612 -0
  27. package/dist/server-N7Q6H4OR.js.map +1 -0
  28. package/dist/{worker-3ECJHPRE.js → worker-D4D2YQTA.js} +44 -9
  29. package/dist/worker-D4D2YQTA.js.map +1 -0
  30. package/package.json +17 -5
  31. package/dist/chunk-4API3ZCT.js +0 -1387
  32. package/dist/chunk-4API3ZCT.js.map +0 -1
  33. package/dist/chunk-LXZDJJXR.js.map +0 -1
  34. package/dist/chunk-ZBP4RHNW.js.map +0 -1
  35. package/dist/server-63DR2HE5.js +0 -6062
  36. package/dist/server-63DR2HE5.js.map +0 -1
  37. package/dist/worker-3ECJHPRE.js.map +0 -1
  38. package/dist/{db-IOYMX64U.js.map → db-YWCNHBLH.js.map} +0 -0
@@ -2,20 +2,26 @@
2
2
  "use strict";
3
3
 
4
4
  // bin/mcp-stdio-server.ts
5
- var import_node_fs = require("fs");
6
- var import_node_os = require("os");
7
- var import_node_path = require("path");
5
+ var import_node_fs2 = require("fs");
6
+ var import_node_os2 = require("os");
7
+ var import_node_path2 = require("path");
8
8
  var import_stdio = require("@modelcontextprotocol/sdk/server/stdio.js");
9
9
 
10
10
  // src/mcp/http-mcp-tool-executor.ts
11
11
  var HttpMcpToolExecutor = class {
12
12
  baseUrl;
13
13
  apiKey;
14
+ timeoutMs;
15
+ serpIntelligenceTimeoutMs;
14
16
  constructor(baseUrl2, apiKey2) {
15
17
  this.baseUrl = baseUrl2.replace(/\/$/, "");
16
18
  this.apiKey = apiKey2;
19
+ const configuredTimeoutMs = Number(process.env.MCP_SCRAPER_HTTP_TIMEOUT_MS ?? 11e4);
20
+ this.timeoutMs = Number.isFinite(configuredTimeoutMs) && configuredTimeoutMs > 0 ? configuredTimeoutMs : 11e4;
21
+ const configuredSerpIntelligenceTimeoutMs = Number(process.env.MCP_SCRAPER_SERP_INTELLIGENCE_HTTP_TIMEOUT_MS ?? this.timeoutMs);
22
+ this.serpIntelligenceTimeoutMs = Number.isFinite(configuredSerpIntelligenceTimeoutMs) && configuredSerpIntelligenceTimeoutMs > 0 ? configuredSerpIntelligenceTimeoutMs : this.timeoutMs;
17
23
  }
18
- async call(path, body) {
24
+ async call(path, body, timeoutMs = this.timeoutMs) {
19
25
  try {
20
26
  const res = await fetch(`${this.baseUrl}${path}`, {
21
27
  method: "POST",
@@ -24,7 +30,7 @@ var HttpMcpToolExecutor = class {
24
30
  "x-api-key": this.apiKey
25
31
  },
26
32
  body: JSON.stringify(body),
27
- signal: AbortSignal.timeout(29e4)
33
+ signal: AbortSignal.timeout(timeoutMs)
28
34
  });
29
35
  const data = await res.json();
30
36
  if (!res.ok) {
@@ -33,6 +39,22 @@ var HttpMcpToolExecutor = class {
33
39
  return { content: [{ type: "text", text: JSON.stringify(data) }] };
34
40
  } catch (err) {
35
41
  const msg = err instanceof Error ? err.message : String(err);
42
+ if (err instanceof DOMException && err.name === "TimeoutError") {
43
+ return {
44
+ content: [{
45
+ type: "text",
46
+ text: JSON.stringify({
47
+ error: "mcp_request_timeout",
48
+ error_type: "timeout",
49
+ retryable: true,
50
+ path,
51
+ timeoutMs,
52
+ message: `MCP Scraper request exceeded ${Math.round(timeoutMs / 1e3)}s and was cancelled. Retry with fewer results or use the async API for deep harvests.`
53
+ })
54
+ }],
55
+ isError: true
56
+ };
57
+ }
36
58
  return { content: [{ type: "text", text: msg }], isError: true };
37
59
  }
38
60
  }
@@ -72,6 +94,12 @@ var HttpMcpToolExecutor = class {
72
94
  creditsInfo(input) {
73
95
  return this.call("/billing/credits", input);
74
96
  }
97
+ captureSerpSnapshot(input) {
98
+ return this.call("/serp-intelligence/capture", input, this.serpIntelligenceTimeoutMs);
99
+ }
100
+ captureSerpPageSnapshots(input) {
101
+ return this.call("/serp-intelligence/page-snapshots", input, this.serpIntelligenceTimeoutMs);
102
+ }
75
103
  };
76
104
 
77
105
  // src/mcp/paa-mcp-server.ts
@@ -80,28 +108,38 @@ var import_mcp = require("@modelcontextprotocol/sdk/server/mcp.js");
80
108
  // src/mcp/mcp-tool-schemas.ts
81
109
  var import_zod = require("zod");
82
110
  var HarvestPaaInputSchema = {
83
- query: import_zod.z.string().min(1).describe("Search query to harvest PAA questions for"),
84
- location: import_zod.z.string().optional().describe("Location name for geo-targeted results"),
85
- maxQuestions: import_zod.z.number().int().min(1).max(100).default(30).describe("Number of PAA questions to extract (max 100)"),
86
- gl: import_zod.z.string().length(2).default("us"),
87
- hl: import_zod.z.string().default("en")
111
+ query: import_zod.z.string().min(1).describe('Core search topic only. If the user says "best hvac company in Denver CO", use query="best hvac company" and location="Denver, CO". Do not include the location in query when it can be separated.'),
112
+ location: import_zod.z.string().optional().describe('City, region, or country for geo-targeted results, inferred from the user request when present, e.g. "Denver, CO", "Tokyo, Japan", "London, UK".'),
113
+ maxQuestions: import_zod.z.number().int().min(1).max(150).default(30).describe("Number of PAA questions to extract. Default 30. Maximum 150. Use 10 for quick probes, 30 for normal research, 100-150 when the user asks for everything/full/deep research. Credits are charged by extracted question; unused request hold is refunded."),
114
+ gl: import_zod.z.string().length(2).default("us").describe("Google country code inferred from location or user language. Examples: United States us, United Kingdom gb, Japan jp, Canada ca, Australia au."),
115
+ hl: import_zod.z.string().default("en").describe("Google interface/content language inferred from the user request. Use en unless the user asks for another language or locale."),
116
+ device: import_zod.z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use desktop by default; use mobile only when the user asks for mobile rankings."),
117
+ proxyMode: import_zod.z.enum(["location", "configured", "none"]).default("location").describe("Proxy targeting mode. Use location by default so city/state searches create or reuse a matching residential proxy. Use configured for the static configured proxy. Use none only for direct-network debugging."),
118
+ proxyZip: import_zod.z.string().regex(/^\d{5}$/).optional().describe("Optional US ZIP override for residential location proxy targeting. Use only when the user gives a specific ZIP or city-center proxy targeting needs to be forced."),
119
+ debug: import_zod.z.boolean().default(false).describe("Include sanitized browser/session/location diagnostics in the response. Use true when debugging localization, CAPTCHA, or proxy behavior.")
88
120
  };
89
121
  var ExtractUrlInputSchema = {
90
- url: import_zod.z.string().url()
122
+ url: import_zod.z.string().url().describe("Public http/https URL to extract. Use this when the user provides one specific page URL."),
123
+ screenshot: import_zod.z.boolean().default(false).describe("Also capture a full-page screenshot of the URL. Saved to ~/Downloads/mcp-scraper/screenshots/ and returned inline. Use when the user asks to see or capture the page visually."),
124
+ screenshotDevice: import_zod.z.enum(["desktop", "mobile"]).default("desktop").describe("Viewport for screenshot. desktop = 1440\xD7900. mobile = 390\xD7844. Default desktop."),
125
+ extractBranding: import_zod.z.boolean().default(false).describe("Extract brand colors, fonts, logo, and favicon using a rendered browser session. Returns colorScheme (light/dark), colors (primary/accent/background/text/heading as hex), fonts (heading/body family names), and assets (logo URL, favicon URL). Use when the user asks about brand colors, site theme, or brand assets."),
126
+ downloadMedia: import_zod.z.boolean().default(false).describe("Extract and download all page media (images, video, audio) to ~/Downloads/mcp-scraper/media/. Ad networks, tracking pixels, and noise URLs are filtered automatically. Use when the user asks to download or harvest assets from a page."),
127
+ mediaTypes: import_zod.z.array(import_zod.z.enum(["image", "video", "audio"])).default(["image", "video", "audio"]).describe("Which media types to download. Default all three."),
128
+ allowLocal: import_zod.z.boolean().default(false).describe("Allow localhost and private-network URLs. For local development only.")
91
129
  };
92
130
  var MapSiteUrlsInputSchema = {
93
- url: import_zod.z.string().url(),
94
- maxUrls: import_zod.z.number().int().min(1).max(500).optional()
131
+ url: import_zod.z.string().url().describe("Public website URL or domain to crawl for internal URLs. Use before extract_site when the user asks to audit/map/crawl a site."),
132
+ maxUrls: import_zod.z.number().int().min(1).max(500).optional().describe("Maximum URLs to discover. Use 100 for normal maps, higher when the user asks for a full inventory.")
95
133
  };
96
134
  var ExtractSiteInputSchema = {
97
- url: import_zod.z.string().url(),
98
- maxPages: import_zod.z.number().int().min(1).max(50).optional()
135
+ url: import_zod.z.string().url().describe("Public website URL or domain to extract across multiple pages. Use when the user asks for a site audit, website crawl, or full-site content/schema extraction."),
136
+ maxPages: import_zod.z.number().int().min(1).max(50).optional().describe("Maximum pages to extract. Use 50 when the user asks for full results or a complete crawl within MCP limits.")
99
137
  };
100
138
  var YoutubeHarvestInputSchema = {
101
- mode: import_zod.z.enum(["search", "channel"]),
102
- query: import_zod.z.string().optional().describe("Required when mode is search"),
103
- channelHandle: import_zod.z.string().optional().describe("YouTube channel handle, e.g. @mkbhd"),
104
- maxVideos: import_zod.z.number().int().min(1).max(500).default(50)
139
+ mode: import_zod.z.enum(["search", "channel"]).describe("Use search for topic/keyword requests. Use channel when the user provides @handle, channel ID, or channel URL."),
140
+ query: import_zod.z.string().optional().describe("Required when mode is search. The YouTube search topic in the user\u2019s words."),
141
+ channelHandle: import_zod.z.string().optional().describe("YouTube channel handle, channel ID, or URL. Examples: @mkbhd, UC..., https://youtube.com/@mkbhd."),
142
+ maxVideos: import_zod.z.number().int().min(1).max(500).default(50).describe("Number of videos to return. Default 50. Increase when user asks for full channel/history.")
105
143
  };
106
144
  var YoutubeTranscribeInputSchema = {
107
145
  videoId: import_zod.z.string().min(1).describe("YouTube video ID, e.g. dQw4w9WgXcQ")
@@ -109,12 +147,12 @@ var YoutubeTranscribeInputSchema = {
109
147
  var FacebookPageIntelInputSchema = {
110
148
  pageId: import_zod.z.string().optional(),
111
149
  libraryId: import_zod.z.string().optional(),
112
- query: import_zod.z.string().optional().describe("One of pageId, libraryId, or query is required"),
150
+ query: import_zod.z.string().optional().describe("Advertiser or brand name when pageId/libraryId is not known. One of pageId, libraryId, or query is required."),
113
151
  maxAds: import_zod.z.number().int().min(1).max(200).default(50),
114
152
  country: import_zod.z.string().length(2).default("US")
115
153
  };
116
154
  var FacebookAdSearchInputSchema = {
117
- query: import_zod.z.string().min(1),
155
+ query: import_zod.z.string().min(1).describe("Advertiser, brand, competitor, niche, or keyword to search in Facebook Ad Library."),
118
156
  country: import_zod.z.string().length(2).default("US"),
119
157
  maxResults: import_zod.z.number().int().min(1).max(20).default(10)
120
158
  };
@@ -122,10 +160,10 @@ var FacebookAdTranscribeInputSchema = {
122
160
  videoUrl: import_zod.z.string().url().describe("Facebook CDN video URL from a facebook_page_intel result")
123
161
  };
124
162
  var MapsPlaceIntelInputSchema = {
125
- businessName: import_zod.z.string().min(1).describe("Business name to search for on Google Maps"),
126
- location: import_zod.z.string().min(1).describe('City and state, e.g. "Denver, CO"'),
127
- gl: import_zod.z.string().length(2).default("us"),
128
- hl: import_zod.z.string().length(2).default("en"),
163
+ businessName: import_zod.z.string().min(1).describe('Business name only. If user says "Elite Roofing Denver CO", use businessName="Elite Roofing" and location="Denver, CO".'),
164
+ location: import_zod.z.string().min(1).describe('City/region/country where the business should be searched, e.g. "Denver, CO". Infer from the user request when possible.'),
165
+ gl: import_zod.z.string().length(2).default("us").describe("Google country code inferred from location."),
166
+ hl: import_zod.z.string().length(2).default("en").describe("Language inferred from user request."),
129
167
  includeReviews: import_zod.z.boolean().default(false).describe("Whether to fetch individual review cards"),
130
168
  maxReviews: import_zod.z.number().int().min(1).max(500).default(50).describe("Max review cards to return (requires includeReviews: true)")
131
169
  };
@@ -134,26 +172,98 @@ var CreditsInfoInputSchema = {
134
172
  includeLedger: import_zod.z.boolean().default(false).describe("Whether to include recent credit ledger entries")
135
173
  };
136
174
  var SearchSerpInputSchema = {
137
- query: import_zod.z.string().min(1).describe("Search query to retrieve organic Google results for"),
138
- location: import_zod.z.string().optional().describe("Location name for geo-targeted results"),
139
- gl: import_zod.z.string().length(2).default("us"),
140
- hl: import_zod.z.string().default("en"),
175
+ query: import_zod.z.string().min(1).describe('Core search topic only. Separate location when possible. If user says "best dentist in Brooklyn NY serp", use query="best dentist" and location="Brooklyn, NY".'),
176
+ location: import_zod.z.string().optional().describe("City, region, or country for geo-targeted results, inferred from user request when present."),
177
+ gl: import_zod.z.string().length(2).default("us").describe("Google country code inferred from location or user language."),
178
+ hl: import_zod.z.string().default("en").describe("Google interface/content language inferred from user request."),
179
+ device: import_zod.z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use desktop by default; use mobile only when the user asks for mobile rankings."),
180
+ proxyMode: import_zod.z.enum(["location", "configured", "none"]).default("location").describe("Proxy targeting mode. Use location by default so city/state searches create or reuse a matching residential proxy. Use configured for the static configured proxy. Use none only for direct-network debugging."),
181
+ proxyZip: import_zod.z.string().regex(/^\d{5}$/).optional().describe("Optional US ZIP override for residential location proxy targeting. Use only when the user gives a specific ZIP or city-center proxy targeting needs to be forced."),
182
+ debug: import_zod.z.boolean().default(false).describe("Include sanitized browser/session/location diagnostics in the response. Use true when debugging localization, CAPTCHA, or proxy behavior."),
141
183
  pages: import_zod.z.number().int().min(1).max(2).default(1).describe("Number of result pages to fetch (1\u20132)")
142
184
  };
185
+ var CaptureSerpSnapshotInputSchema = {
186
+ query: import_zod.z.string().min(1).describe("Core search query to capture as a structured SERP Intelligence snapshot. Separate the place into location when the user gives a city, region, country, or ZIP."),
187
+ location: import_zod.z.string().optional().describe("City, region, country, or service area used for localized Google results. MCP Scraper records location evidence; UULE alone is not proof of localization."),
188
+ gl: import_zod.z.string().length(2).default("us").describe("Google country code inferred from the requested market, e.g. us, gb, ca, au."),
189
+ hl: import_zod.z.string().default("en").describe("Google interface/content language inferred from the user request."),
190
+ device: import_zod.z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use mobile only when the user asks for mobile rankings or mobile SERP evidence."),
191
+ proxyMode: import_zod.z.enum(["location", "configured", "none"]).default("location").describe("Proxy behavior for capture. Use location for localized residential proxy targeting, configured for the static residential proxy, and none only for direct-network debugging."),
192
+ proxyZip: import_zod.z.string().regex(/^\d{5}$/).optional().describe("Optional US ZIP override for residential location proxy targeting when a precise city-center or ZIP proxy is needed."),
193
+ pages: import_zod.z.number().int().min(1).max(2).default(1).describe("Number of Google result pages to capture. Use 1 normally and 2 only when the user needs deeper ranking evidence."),
194
+ debug: import_zod.z.boolean().default(false).describe("Include sanitized browser, proxy, and location diagnostics. Use true when debugging localization, CAPTCHA, proxy selection, or capture reliability."),
195
+ includePageSnapshots: import_zod.z.boolean().default(false).describe("Also capture ranking-page snapshots for selected SERP URLs through the same product capture path."),
196
+ pageSnapshotLimit: import_zod.z.number().int().min(0).max(10).default(0).describe("Maximum ranking-page snapshots to capture when includePageSnapshots is true. Use 0 when only SERP evidence is needed.")
197
+ };
198
+ var ScreenshotInputSchema = {
199
+ url: import_zod.z.string().url().describe("URL to capture as a full-page screenshot. Use http or https. Pass allowLocal: true to capture localhost or private-network URLs during development."),
200
+ device: import_zod.z.enum(["desktop", "mobile"]).default("desktop").describe("Viewport profile. desktop = 1440\xD7900. mobile = 390\xD7844. Use desktop by default; use mobile when the user asks for a mobile view."),
201
+ allowLocal: import_zod.z.boolean().default(false).describe("Allow localhost and private-network URLs (127.x, 192.168.x, 10.x, etc.). For local development only \u2014 not for production use.")
202
+ };
203
+ var CaptureSerpPageSnapshotsInputSchema = {
204
+ urls: import_zod.z.array(import_zod.z.string().url()).min(1).max(25).describe("Public HTTP/HTTPS URLs to capture as SERP Intelligence page snapshots. Do not pass localhost, private IPs, file URLs, or internal admin URLs."),
205
+ targets: import_zod.z.array(import_zod.z.object({
206
+ url: import_zod.z.string().url().describe("Public HTTP/HTTPS URL to capture."),
207
+ sourceKind: import_zod.z.enum(["organic", "ai_citation", "local_pack_website", "configured_target", "site_subject"]).default("configured_target").describe("Why this page is being captured for SERP Intelligence evidence."),
208
+ sourcePosition: import_zod.z.number().int().min(1).optional().describe("Ranking or citation position when the page came from SERP evidence.")
209
+ }).strict()).min(1).max(25).optional().describe("Structured page snapshot targets. Use this instead of urls when source kind or position should be preserved."),
210
+ maxConcurrency: import_zod.z.number().int().min(1).max(5).default(2).describe("Parallel page captures. Use 2 normally; higher values can increase proxy/browser pressure."),
211
+ timeoutMs: import_zod.z.number().int().min(1e3).max(6e4).default(15e3).describe("Per-page capture timeout in milliseconds. Increase for slow pages; timeout artifacts are returned as structured capture failures."),
212
+ debug: import_zod.z.boolean().default(false).describe("Include sanitized browser/proxy diagnostics for page snapshot debugging. Use true for capture, network, or proxy troubleshooting.")
213
+ };
143
214
 
144
215
  // src/mcp/mcp-response-formatter.ts
145
- function twoBlocks(full, summary) {
146
- return { content: [{ type: "text", text: full }, { type: "text", text: summary }] };
216
+ var import_node_fs = require("fs");
217
+ var import_node_os = require("os");
218
+ var import_node_path = require("path");
219
+ function slugifyReportName(input) {
220
+ return input.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 80) || "mcp-scraper-report";
221
+ }
222
+ function reportTitle(full) {
223
+ const title = full.split("\n").find((line) => line.startsWith("# "));
224
+ return title?.replace(/^#\s+/, "").trim() || "MCP Scraper Report";
225
+ }
226
+ function saveFullReport(full) {
227
+ if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
228
+ const outDir = process.env.MCP_SCRAPER_OUTPUT_DIR?.trim() || (0, import_node_path.join)((0, import_node_os.homedir)(), "Downloads", "mcp-scraper");
229
+ try {
230
+ (0, import_node_fs.mkdirSync)(outDir, { recursive: true });
231
+ const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
232
+ const file = (0, import_node_path.join)(outDir, `${stamp}-${slugifyReportName(reportTitle(full))}.md`);
233
+ (0, import_node_fs.writeFileSync)(file, full, "utf8");
234
+ return file;
235
+ } catch {
236
+ return null;
237
+ }
238
+ }
239
+ function oneBlock(content) {
240
+ const filePath = saveFullReport(content);
241
+ const text = filePath ? `${content}
242
+
243
+ \u{1F4C4} Saved: \`${filePath}\`` : content;
244
+ return { content: [{ type: "text", text }] };
245
+ }
246
+ function formatStructuredError(body, fallback) {
247
+ if (body.error === "insufficient_balance") {
248
+ return `Insufficient credits. Balance: ${body.balance_credits} credits. This call requires ${body.required_credits} credits. Top up at ${body.topup_url}`;
249
+ }
250
+ if (body.error === "mcp_request_timeout") {
251
+ return typeof body.message === "string" ? body.message : "MCP Scraper request timed out and was cancelled.";
252
+ }
253
+ if (typeof body.error_code === "string") {
254
+ const message = typeof body.error === "string" ? body.error : typeof body.message === "string" ? body.message : fallback;
255
+ const retryable = body.retryable === true ? " Retryable: yes." : "";
256
+ return `${body.error_code}: ${message}${retryable}${errorAttemptsSection(body)}`;
257
+ }
258
+ if (typeof body.error === "string") return body.error;
259
+ return fallback || "Tool error";
147
260
  }
148
261
  function parseData(raw) {
149
262
  const first = raw.content.find((b) => b.type === "text");
150
263
  const text = first?.type === "text" ? first.text : "";
151
264
  try {
152
265
  const parsed = JSON.parse(text || "{}");
153
- if (parsed.error === "insufficient_balance") {
154
- return { error: `Insufficient credits. Balance: ${parsed.balance_credits} credits. This call requires ${parsed.required_credits} credits. Top up at ${parsed.topup_url}` };
155
- }
156
- if (raw.isError) return { error: text || "Tool error" };
266
+ if (raw.isError || parsed.error || parsed.error_code) return { error: formatStructuredError(parsed, text) };
157
267
  const data = parsed.result ?? parsed;
158
268
  return { data };
159
269
  } catch {
@@ -184,8 +294,48 @@ function truncate(s, max) {
184
294
  if (!s) return "";
185
295
  return s.length > max ? s.slice(0, max) + "\u2026" : s;
186
296
  }
187
- var DIRECTIVE_FULL = "> \u{1F4C4} **FULL REPORT** \u2014 render as artifact or expandable block\n\n";
188
- var DIRECTIVE_SUMMARY = "> \u{1F4AC} **SUMMARY** \u2014 present this inline; offer the full report above as an artifact\n\n";
297
+ function debugSection(debug) {
298
+ if (!debug || typeof debug !== "object") return "";
299
+ const request = debug.request ?? {};
300
+ const browser = debug.browser ?? {};
301
+ const kernel = browser.kernel ?? {};
302
+ const network = browser.networkLocation ?? {};
303
+ const nav = browser.serpNavigation ?? {};
304
+ const proxyResolution = kernel.proxyResolution ?? {};
305
+ const locationEvidence = debug.locationEvidence;
306
+ const candidates = Array.isArray(locationEvidence?.candidates) ? locationEvidence.candidates.slice(0, 4).map((c) => `${c.city}, ${c.regionCode} (${c.count})`).join(", ") : "";
307
+ const lines = [
308
+ "\n## Debug",
309
+ `- Proxy mode: ${request.proxyMode ?? kernel.proxyMode ?? "unknown"} \xB7 requested proxy: ${kernel.requestedProxyIdPresent === true ? `yes (${kernel.requestedProxyIdSuffix ?? "redacted"})` : "no"}`,
310
+ `- Proxy resolution: ${proxyResolution.source ?? "unknown"}${proxyResolution.target ? ` \xB7 ${proxyResolution.target.level ?? "city"} ${proxyResolution.target.city}, ${proxyResolution.target.state}` : ""}${proxyResolution.error ? ` \xB7 ${truncate(proxyResolution.error, 180)}` : ""}`,
311
+ `- Browser session: ${kernel.sessionId ?? "unknown"} \xB7 retrieved proxy: ${kernel.retrievedProxyIdPresent === true ? `yes (${kernel.retrievedProxyIdSuffix ?? "redacted"})` : kernel.retrievedProxyIdPresent === false ? "no" : "unknown"}`,
312
+ `- Browser IP geo: ${[network.ip, network.city, network.region, network.country].filter(Boolean).join(" \xB7 ") || network.error || "unknown"}`,
313
+ `- Google URL: ${truncate(nav.requestedUrl, 240) || "unknown"}`,
314
+ `- Final URL: ${truncate(nav.finalUrl, 240) || "unknown"} \xB7 CAPTCHA: ${nav.captchaDetected === true ? "yes" : nav.captchaDetected === false ? "no" : "unknown"} \xB7 redirected: ${nav.redirected === true ? "yes" : nav.redirected === false ? "no" : "unknown"}`
315
+ ];
316
+ if (locationEvidence) {
317
+ lines.push(`- Location evidence: ${locationEvidence.status}${locationEvidence.expected ? ` \xB7 expected ${locationEvidence.expected.city}${locationEvidence.expected.regionCode ? `, ${locationEvidence.expected.regionCode}` : ""}` : ""}${candidates ? ` \xB7 candidates ${candidates}` : ""}`);
318
+ }
319
+ return lines.join("\n");
320
+ }
321
+ function errorAttemptsSection(body) {
322
+ const attempts = Array.isArray(body.attempts) ? body.attempts : [];
323
+ if (attempts.length === 0) return "";
324
+ const lines = attempts.slice(0, 5).map((attempt) => {
325
+ const debug = attempt.debug ?? {};
326
+ const browser = debug.browser ?? {};
327
+ const kernel = browser.kernel ?? {};
328
+ const proxyResolution = kernel.proxyResolution ?? {};
329
+ const network = browser.networkLocation ?? {};
330
+ const nav = browser.serpNavigation ?? {};
331
+ const geo = [network.ip, network.city, network.region].filter(Boolean).join(" / ") || "geo unknown";
332
+ return `- Attempt ${attempt.attempt_number ?? "?"}: ${attempt.outcome ?? attempt.status ?? "unknown"} \xB7 session ${attempt.kernel_session_id ?? kernel.sessionId ?? "unknown"} \xB7 proxy ${debug.request?.proxyMode ?? kernel.proxyMode ?? "unknown"}${proxyResolution.source ? `/${proxyResolution.source}` : ""} \xB7 ${geo} \xB7 CAPTCHA ${nav.captchaDetected === true ? "yes" : nav.captchaDetected === false ? "no" : "unknown"} \xB7 deleted ${attempt.kernel_delete_succeeded === true ? "yes" : attempt.kernel_delete_succeeded === false ? "no" : "unknown"}`;
333
+ });
334
+ return `
335
+
336
+ Attempts:
337
+ ${lines.join("\n")}`;
338
+ }
189
339
  function formatHarvestPaa(raw, input) {
190
340
  const parsed = parseData(raw);
191
341
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
@@ -194,6 +344,7 @@ function formatHarvestPaa(raw, input) {
194
344
  const organic = d.organicResults ?? [];
195
345
  const entityIds = d.entityIds;
196
346
  const aiOvw = d.aiOverview;
347
+ const diagnostics = d.diagnostics;
197
348
  const durationMs = d.stats?.durationMs;
198
349
  const paaRows = flat.map(
199
350
  (r, i) => `| ${i + 1} | ${r.question} | ${truncate(r.answer, 120)} | ${r.source_site ?? ""} |`
@@ -201,7 +352,7 @@ function formatHarvestPaa(raw, input) {
201
352
  const paaTable = flat.length ? `## People Also Ask (${flat.length} questions)
202
353
  | # | Question | Answer | Source |
203
354
  |---|----------|--------|--------|
204
- ${paaRows}` : "## People Also Ask\n*No questions extracted*";
355
+ ${paaRows}` : "## People Also Ask\n*Google did not return a People Also Ask block for this query/location. SERP data was extracted successfully when available.*";
205
356
  const serpRows = organic.map(
206
357
  (r) => `| ${r.position} | ${r.title} | [${r.domain}](${r.url}) | ${truncate(r.snippet, 100)} |`
207
358
  ).join("\n");
@@ -215,20 +366,20 @@ ${serpRows}` : "";
215
366
  > ${truncate(aiOvw.text, 600)}` : "";
216
367
  const statsLine = durationMs ? `
217
368
  ## Stats
218
- - Questions: ${flat.length} \xB7 Duration: ${(durationMs / 1e3).toFixed(1)}s` : "";
369
+ - Status: ${diagnostics?.completionStatus ?? (flat.length ? "paa_found" : "no_paa")} \xB7 Questions: ${flat.length} \xB7 Duration: ${(durationMs / 1e3).toFixed(1)}s` : "";
219
370
  const tips = `
220
371
  ---
221
372
  \u{1F4A1} **Tips**
222
- - Max questions: \`maxQuestions: 100\` (current: ${input.maxQuestions ?? 30})
373
+ - Max questions: \`maxQuestions: 150\` (current: ${input.maxQuestions ?? 30})
223
374
  - Organic results only: use \`search_serp\`
224
375
  - Dig into a result: use \`extract_url\` on any organic URL`;
225
- const full = `${DIRECTIVE_FULL}# PAA Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
376
+ const full = `# PAA Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
226
377
 
227
- ${paaTable}${serpTable}${entityIdsSection(entityIds)}${aiSection}${statsLine}${tips}`;
378
+ ${paaTable}${serpTable}${entityIdsSection(entityIds)}${aiSection}${statsLine}${debugSection(diagnostics?.debug)}${tips}`;
228
379
  const topQ = flat.slice(0, 10).map((r, i) => `${i + 1}. ${r.question}`).join("\n");
229
380
  const topO = organic.slice(0, 5).map((r) => `${r.position}. [${r.title}](${r.url}) \u2014 ${r.domain}`).join("\n");
230
381
  const summary = [
231
- `${DIRECTIVE_SUMMARY}**PAA: "${input.query}"** \u2014 ${flat.length} questions extracted`,
382
+ `**PAA: "${input.query}"** \u2014 ${flat.length} questions extracted`,
232
383
  topQ ? `
233
384
  **Top questions:**
234
385
  ${topQ}` : "",
@@ -237,9 +388,9 @@ ${topQ}` : "",
237
388
  ${topO}` : "",
238
389
  entityIdsSummaryLine(entityIds),
239
390
  `
240
- \u{1F4A1} \`maxQuestions\` up to 100 \xB7Use \`extract_url\` to dig into any result`
391
+ \u{1F4A1} \`maxQuestions\` up to 150 \xB7Use \`extract_url\` to dig into any result`
241
392
  ].filter(Boolean).join("\n");
242
- return twoBlocks(full, summary);
393
+ return oneBlock(full);
243
394
  }
244
395
  function formatSearchSerp(raw, input) {
245
396
  const parsed = parseData(raw);
@@ -249,6 +400,7 @@ function formatSearchSerp(raw, input) {
249
400
  const localPack = d.localPack ?? [];
250
401
  const entityIds = d.entityIds;
251
402
  const aiOvw = d.aiOverview;
403
+ const diagnostics = d.diagnostics;
252
404
  const serpRows = organic.map(
253
405
  (r) => `| ${r.position} | ${r.title} | [${r.domain}](${r.url}) | ${truncate(r.snippet, 100)} |`
254
406
  ).join("\n");
@@ -273,12 +425,12 @@ ${localRows}` : "";
273
425
  - Get PAA questions: use \`harvest_paa\` for this query
274
426
  - Scrape any result: use \`extract_url\`
275
427
  - Business entity IDs (CID/GCID/KG MID) shown above if found`;
276
- const full = `${DIRECTIVE_FULL}# SERP Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
428
+ const full = `# SERP Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
277
429
 
278
- ${serpTable}${localSection}${entityIdsSection(entityIds)}${aiSection}${tips}`;
430
+ ${serpTable}${localSection}${entityIdsSection(entityIds)}${aiSection}${debugSection(diagnostics?.debug)}${tips}`;
279
431
  const topO = organic.slice(0, 5).map((r) => `${r.position}. [${r.title}](${r.url}) \u2014 ${r.domain}`).join("\n");
280
432
  const summary = [
281
- `${DIRECTIVE_SUMMARY}**SERP: "${input.query}"** \u2014 ${organic.length} organic results`,
433
+ `**SERP: "${input.query}"** \u2014 ${organic.length} organic results`,
282
434
  topO ? `
283
435
  **Top results:**
284
436
  ${topO}` : "",
@@ -288,7 +440,7 @@ ${topO}` : "",
288
440
  `
289
441
  \u{1F4A1} Use \`harvest_paa\` for questions \xB7 \`extract_url\` to scrape any result`
290
442
  ].filter(Boolean).join("\n");
291
- return twoBlocks(full, summary);
443
+ return oneBlock(full);
292
444
  }
293
445
  function formatExtractUrl(raw, input) {
294
446
  const parsed = parseData(raw);
@@ -300,6 +452,9 @@ function formatExtractUrl(raw, input) {
300
452
  const kpo = d.kpo;
301
453
  const bodyMd = d.bodyMarkdown ?? "";
302
454
  const schema = d.schema;
455
+ const screenshotMeta = d.screenshot;
456
+ const branding = d.branding;
457
+ const media = d.media;
303
458
  const h1Lines = headings.filter((h) => h.level === 1).map((h) => `- ${h.text}`).join("\n");
304
459
  const h2Lines = headings.filter((h) => h.level === 2).map((h) => ` - ${h.text}`).join("\n");
305
460
  const headingSection = h1Lines || h2Lines ? `
@@ -322,6 +477,26 @@ ${[h1Lines, h2Lines].filter(Boolean).join("\n")}` : "";
322
477
  const bodySection = bodyMd ? `
323
478
  ## Page Content
324
479
  ${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
480
+ const screenshotSection = screenshotMeta ? `
481
+ ## Screenshot
482
+ - **File:** ${screenshotMeta.savedPath}
483
+ - **Size:** ${(screenshotMeta.sizeBytes / 1024).toFixed(1)} KB
484
+ - **Device:** ${screenshotMeta.device}` : "";
485
+ const brandingSection = branding ? [
486
+ `
487
+ ## Branding`,
488
+ branding.colorScheme ? `- **Color scheme:** ${branding.colorScheme}` : "",
489
+ `- **Colors:**${Object.entries(branding.colors ?? {}).filter(([, v]) => v).map(([k, v]) => ` ${k}=${v}`).join(",") || " (none extracted)"}`,
490
+ `- **Fonts:**${Object.entries(branding.fonts ?? {}).filter(([, v]) => v).map(([k, v]) => ` ${k}=${v}`).join(",") || " (none extracted)"}`,
491
+ branding.assets?.logo ? `- **Logo:** ${branding.assets.logo}` : "",
492
+ branding.assets?.favicon ? `- **Favicon:** ${branding.assets.favicon}` : ""
493
+ ].filter(Boolean).join("\n") : "";
494
+ const mediaSection = media ? [
495
+ `
496
+ ## Media Assets`,
497
+ `- **Found:** ${media.totalFound} total, ${media.filteredCount} filtered (ads/noise), ${media.assets.length} downloaded`,
498
+ media.outputDir ? `- **Saved to:** ${media.outputDir}` : ""
499
+ ].filter(Boolean).join("\n") : "";
325
500
  const schemaCount = Array.isArray(schema) ? schema.length : 0;
326
501
  const tips = `
327
502
  ---
@@ -329,19 +504,23 @@ ${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
329
504
  - Crawl entire site: use \`extract_site\`
330
505
  - Map all URLs: use \`map_site_urls\`
331
506
  - ${schemaCount} JSON-LD schema block(s) detected`;
332
- const full = `${DIRECTIVE_FULL}# URL Extract: ${url}
507
+ const full = `# URL Extract: ${url}
333
508
  **${title}**
334
- ${headingSection}${kpoSection}${bodySection}${tips}`;
335
- const summary = [
336
- `${DIRECTIVE_SUMMARY}**Extracted:** ${title}`,
337
- `**URL:** ${url}`,
338
- kpo?.entityName ? `**Entity:** ${kpo.entityName} (${kpo.type?.join(", ") ?? "unknown"})` : "",
339
- kpo?.napScore !== void 0 ? `**NAP Score:** ${kpo.napScore}/5` : "",
340
- headings.length ? `**${headings.length} headings**` : "",
341
- `
342
- \u{1F4A1} Use \`extract_site\` to crawl the full domain`
343
- ].filter(Boolean).join("\n");
344
- return twoBlocks(full, summary);
509
+ ${headingSection}${kpoSection}${brandingSection}${bodySection}${screenshotSection}${mediaSection}${tips}`;
510
+ const textResult = oneBlock(full);
511
+ if (screenshotMeta?.savedPath) {
512
+ try {
513
+ const imgBuf = (0, import_node_fs.readFileSync)(screenshotMeta.savedPath);
514
+ return {
515
+ content: [
516
+ ...textResult.content,
517
+ { type: "image", data: imgBuf.toString("base64"), mimeType: "image/png" }
518
+ ]
519
+ };
520
+ } catch {
521
+ }
522
+ }
523
+ return textResult;
345
524
  }
346
525
  function formatMapSiteUrls(raw, input) {
347
526
  const parsed = parseData(raw);
@@ -353,7 +532,7 @@ function formatMapSiteUrls(raw, input) {
353
532
  const redirects = urls.filter((u) => u.status !== null && u.status >= 300 && u.status < 400);
354
533
  const urlRows = urls.slice(0, 200).map((u, i) => `| ${i + 1} | ${u.url} | ${u.status ?? "\u2014"} |`).join("\n");
355
534
  const full = [
356
- `${DIRECTIVE_FULL}# URL Map: ${input.url}`,
535
+ `# URL Map: ${input.url}`,
357
536
  `**${d.totalFound} URLs** \xB7 ${(d.durationMs / 1e3).toFixed(1)}s${d.truncated ? " \xB7 *truncated*" : ""}`,
358
537
  `
359
538
  ## Summary
@@ -375,14 +554,14 @@ ${broken.map((u) => `- ${u.url} (${u.status})`).join("\n")}` : "",
375
554
  - Scrape a single page: use \`extract_url\``
376
555
  ].filter(Boolean).join("\n");
377
556
  const summary = [
378
- `${DIRECTIVE_SUMMARY}**URL Map: ${input.url}**`,
557
+ `**URL Map: ${input.url}**`,
379
558
  `${d.totalFound} URLs \u2014 ${ok.length} OK \xB7 ${broken.length} broken \xB7 ${redirects.length} redirects`,
380
559
  broken.length ? `
381
560
  **Broken URLs:** ${broken.slice(0, 3).map((u) => u.url).join(", ")}` : "",
382
561
  `
383
562
  \u{1F4A1} Use \`extract_site\` to extract content from all pages`
384
563
  ].filter(Boolean).join("\n");
385
- return twoBlocks(full, summary);
564
+ return oneBlock(full);
386
565
  }
387
566
  function formatExtractSite(raw, input) {
388
567
  const parsed = parseData(raw);
@@ -394,7 +573,7 @@ function formatExtractSite(raw, input) {
394
573
  return `| ${i + 1} | ${p.title ?? "Untitled"} | ${p.url} | ${schemaInfo} |`;
395
574
  }).join("\n");
396
575
  const full = [
397
- `${DIRECTIVE_FULL}# Site Extract: ${input.url}`,
576
+ `# Site Extract: ${input.url}`,
398
577
  `**${pages.length} pages** \xB7 ${((d.durationMs ?? 0) / 1e3).toFixed(1)}s`,
399
578
  `
400
579
  ## Pages
@@ -408,13 +587,13 @@ ${pageRows}`,
408
587
  - Inspect a single page: use \`extract_url\``
409
588
  ].join("\n");
410
589
  const summary = [
411
- `${DIRECTIVE_SUMMARY}**Site Extract: ${input.url}** \u2014 ${pages.length} pages`,
590
+ `**Site Extract: ${input.url}** \u2014 ${pages.length} pages`,
412
591
  pages.slice(0, 5).map((p) => `- ${p.title ?? p.url}`).join("\n"),
413
592
  pages.length > 5 ? `- \u2026 and ${pages.length - 5} more` : "",
414
593
  `
415
594
  \u{1F4A1} Use \`extract_url\` to inspect any individual page`
416
595
  ].filter(Boolean).join("\n");
417
- return twoBlocks(full, summary);
596
+ return oneBlock(full);
418
597
  }
419
598
  function formatYoutubeHarvest(raw, input) {
420
599
  const parsed = parseData(raw);
@@ -430,7 +609,7 @@ function formatYoutubeHarvest(raw, input) {
430
609
  - **Name:** ${d.channelMeta.title ?? "\u2014"}
431
610
  - **Subscribers:** ${d.channelMeta.subscriberCount ?? "\u2014"}` : "";
432
611
  const full = [
433
- `${DIRECTIVE_FULL}# YouTube Harvest: ${label}`,
612
+ `# YouTube Harvest: ${label}`,
434
613
  `**${videos.length} videos** \xB7 ${(d.stats.durationMs / 1e3).toFixed(1)}s`,
435
614
  channelSection,
436
615
  `
@@ -446,14 +625,14 @@ ${videoRows}`,
446
625
  ].filter(Boolean).join("\n");
447
626
  const top5 = videos.slice(0, 5).map((v, i) => `${i + 1}. ${v.title} (\`${v.videoId}\`)`).join("\n");
448
627
  const summary = [
449
- `${DIRECTIVE_SUMMARY}**YouTube: ${label}** \u2014 ${videos.length} videos`,
628
+ `**YouTube: ${label}** \u2014 ${videos.length} videos`,
450
629
  `
451
630
  **Top videos:**
452
631
  ${top5}`,
453
632
  `
454
633
  \u{1F4A1} Transcribe any video: \`youtube_transcribe\` with its videoId`
455
634
  ].join("\n");
456
- return twoBlocks(full, summary);
635
+ return oneBlock(full);
457
636
  }
458
637
  function formatYoutubeTranscribe(raw, input) {
459
638
  const parsed = parseData(raw);
@@ -463,13 +642,13 @@ function formatYoutubeTranscribe(raw, input) {
463
642
  const chunks = d.chunks ?? [];
464
643
  const durSec = d.durationMs ? (d.durationMs / 1e3).toFixed(0) : "\u2014";
465
644
  const chunkRows = chunks.slice(0, 50).map((c) => {
466
- const sec = Math.floor(c.startMs / 1e3);
645
+ const sec = Number.isFinite(c.timestamp[0]) ? Math.floor(c.timestamp[0]) : 0;
467
646
  const mm = String(Math.floor(sec / 60)).padStart(2, "0");
468
647
  const ss = String(sec % 60).padStart(2, "0");
469
648
  return `| ${mm}:${ss} | ${truncate(c.text, 120)} |`;
470
649
  }).join("\n");
471
650
  const full = [
472
- `${DIRECTIVE_FULL}# YouTube Transcript: \`${input.videoId}\``,
651
+ `# YouTube Transcript: \`${input.videoId}\``,
473
652
  `**Duration:** ${durSec}s \xB7 **${text.split(" ").length} words**`,
474
653
  `
475
654
  ## Full Transcript
@@ -484,14 +663,14 @@ ${chunkRows}` : "",
484
663
  \u{1F4A1} Harvest more from this channel: use \`youtube_harvest\` with \`mode: "channel"\``
485
664
  ].filter(Boolean).join("\n");
486
665
  const summary = [
487
- `${DIRECTIVE_SUMMARY}**YouTube Transcript: \`${input.videoId}\`** \u2014 ${text.split(" ").length} words \xB7 ${durSec}s`,
666
+ `**YouTube Transcript: \`${input.videoId}\`** \u2014 ${text.split(" ").length} words \xB7 ${durSec}s`,
488
667
  `
489
668
  **Preview:**
490
669
  > ${truncate(text, 300)}`,
491
670
  `
492
671
  \u{1F4A1} Full transcript in artifact above`
493
672
  ].join("\n");
494
- return twoBlocks(full, summary);
673
+ return oneBlock(full);
495
674
  }
496
675
  function formatFacebookPageIntel(raw, input) {
497
676
  const parsed = parseData(raw);
@@ -509,7 +688,7 @@ function formatFacebookPageIntel(raw, input) {
509
688
  ad.variations ? `**Variations:** ${ad.variations}` : ""
510
689
  ].filter(Boolean).join("\n")).join("\n\n---\n\n");
511
690
  const full = [
512
- `${DIRECTIVE_FULL}# Facebook Ad Intel: ${advertiser}`,
691
+ `# Facebook Ad Intel: ${advertiser}`,
513
692
  `**${s.totalAds} ads** \xB7 ${s.activeCount} active \xB7 ${s.videoCount} video \xB7 ${s.imageCount} image`,
514
693
  `
515
694
  ${adBlocks}`,
@@ -523,7 +702,7 @@ ${adBlocks}`,
523
702
  const adSummary = activeAds.map((a, i) => `${i + 1}. ${truncate(a.headline ?? a.primaryText, 80)} (${a.creativeType ?? "\u2014"})`).join("\n");
524
703
  const videoCount = ads.filter((a) => a.videoUrl).length;
525
704
  const summary = [
526
- `${DIRECTIVE_SUMMARY}**Facebook Ads: ${advertiser}** \u2014 ${s.totalAds} ads (${s.activeCount} active)`,
705
+ `**Facebook Ads: ${advertiser}** \u2014 ${s.totalAds} ads (${s.activeCount} active)`,
527
706
  adSummary ? `
528
707
  **Active ads:**
529
708
  ${adSummary}` : "",
@@ -531,7 +710,7 @@ ${adSummary}` : "",
531
710
  videoCount ? `
532
711
  \u{1F4A1} ${videoCount} video ads \u2014 transcribe with \`facebook_ad_transcribe\` using the videoUrl` : ""
533
712
  ].filter(Boolean).join("\n");
534
- return twoBlocks(full, summary);
713
+ return oneBlock(full);
535
714
  }
536
715
  function formatFacebookAdSearch(raw, input) {
537
716
  const parsed = parseData(raw);
@@ -542,7 +721,7 @@ function formatFacebookAdSearch(raw, input) {
542
721
  (a, i) => `| ${i + 1} | ${a.name} | ${a.adCount ?? "\u2014"} | \`${a.libraryId ?? "\u2014"}\` |`
543
722
  ).join("\n");
544
723
  const full = [
545
- `${DIRECTIVE_FULL}# Facebook Ad Library Search: "${input.query}"`,
724
+ `# Facebook Ad Library Search: "${input.query}"`,
546
725
  `**${advertisers.length} advertisers found**`,
547
726
  `
548
727
  ## Advertisers
@@ -556,14 +735,14 @@ ${rows}`,
556
735
  - Or pass the advertiser name as \`query\` in \`facebook_page_intel\``
557
736
  ].join("\n");
558
737
  const summary = [
559
- `${DIRECTIVE_SUMMARY}**Facebook Ad Search: "${input.query}"** \u2014 ${advertisers.length} advertisers`,
738
+ `**Facebook Ad Search: "${input.query}"** \u2014 ${advertisers.length} advertisers`,
560
739
  advertisers.slice(0, 5).map(
561
740
  (a, i) => `${i + 1}. ${a.name}${a.adCount ? ` (${a.adCount} ads)` : ""} \u2014 \`${a.libraryId ?? "\u2014"}\``
562
741
  ).join("\n"),
563
742
  `
564
743
  \u{1F4A1} Scan ads with \`facebook_page_intel\` using \`libraryId\``
565
744
  ].filter(Boolean).join("\n");
566
- return twoBlocks(full, summary);
745
+ return oneBlock(full);
567
746
  }
568
747
  function formatCreditsInfo(raw, input) {
569
748
  const parsed = parseData(raw);
@@ -589,7 +768,7 @@ ${matched.notes}` : ""}` : input.item ? `
589
768
  ## Matched Cost
590
769
  No exact cost match found for "${input.item}". See the full cost table below.` : "";
591
770
  const full = [
592
- `${DIRECTIVE_FULL}# Credits`,
771
+ `# Credits`,
593
772
  `**Balance:** ${balance ?? "unknown"} credits`,
594
773
  matchedSection,
595
774
  costs.length ? `
@@ -604,13 +783,13 @@ ${costRows}` : "",
604
783
  ${ledgerRows}` : ""
605
784
  ].filter(Boolean).join("\n");
606
785
  const summary = [
607
- `${DIRECTIVE_SUMMARY}**Credit balance:** ${balance ?? "unknown"} credits`,
786
+ `**Credit balance:** ${balance ?? "unknown"} credits`,
608
787
  matched ? `
609
788
  **${matched.label}:** ${matched.credits} credits ${matched.unit}` : null,
610
789
  input.includeLedger && ledger.length ? `
611
790
  Recent ledger entries included in the full report.` : null
612
791
  ].filter(Boolean).join("\n");
613
- return twoBlocks(full, summary);
792
+ return oneBlock(full);
614
793
  }
615
794
  function formatMapsPlaceIntel(raw, input) {
616
795
  const parsed = parseData(raw);
@@ -636,6 +815,7 @@ function formatMapsPlaceIntel(raw, input) {
636
815
  const topics = d.reviewTopics ?? [];
637
816
  const about = d.aboutAttributes ?? [];
638
817
  const reviews = d.reviews ?? [];
818
+ const reviewsStatus = d.reviewsStatus ?? "not_requested";
639
819
  const hoursTable = d.hoursTable ?? [];
640
820
  const ratingLine = [rating, reviewCount ? `(${reviewCount} reviews)` : null].filter(Boolean).join(" ");
641
821
  const basicLines = [
@@ -674,18 +854,24 @@ ${attrs.map((a) => `- ${a}`).join("\n")}`).join("\n\n")}` : "";
674
854
  cidUrl ? `- **Maps CID URL:** ${cidUrl}` : null,
675
855
  lat != null && lng != null ? `- **Coordinates:** ${lat}, ${lng}` : null
676
856
  ].filter(Boolean).join("\n");
677
- const reviewsSection = reviews.length ? `
857
+ const reviewsSection = (() => {
858
+ if (reviewsStatus === "not_requested") return "";
859
+ if (reviewsStatus === "unavailable") return "\n## Reviews\n> Reviews could not be retrieved this run \u2014 retry with `includeReviews: true`.";
860
+ if (reviewsStatus === "none_exist") return "\n## Reviews\n*This business has no reviews on Google Maps.*";
861
+ if (reviews.length === 0) return "\n## Reviews\n*0 reviews collected.*";
862
+ return `
678
863
  ## Reviews (${reviews.length})
679
864
  ${reviews.map((r, i) => {
680
- const starsN = parseInt(r.stars ?? "0");
681
- const stars = "\u2605".repeat(starsN) + "\u2606".repeat(5 - starsN);
682
- return `### ${i + 1}. ${r.author ?? "Anonymous"} \u2014 ${stars}
865
+ const starsN = parseInt(r.stars ?? "0");
866
+ const stars = "\u2605".repeat(starsN) + "\u2606".repeat(5 - starsN);
867
+ return `### ${i + 1}. ${r.author ?? "Anonymous"} \u2014 ${stars}
683
868
  *${r.date ?? ""}*
684
869
 
685
870
  ${r.text ?? ""}`;
686
- }).join("\n\n")}` : "";
871
+ }).join("\n\n")}`;
872
+ })();
687
873
  const full = [
688
- `${DIRECTIVE_FULL}# ${name}`,
874
+ `# ${name}`,
689
875
  category ? `*${category}*` : null,
690
876
  ratingLine ? `
691
877
  **Rating:** ${ratingLine}` : null,
@@ -704,15 +890,19 @@ ${entitySection}` : null,
704
890
  *Extracted in ${(durationMs / 1e3).toFixed(1)}s*` : null
705
891
  ].filter(Boolean).join("\n");
706
892
  const summary = [
707
- `${DIRECTIVE_SUMMARY}**${name}** \u2014 ${category ?? "Business"} \xB7 ${ratingLine || "No rating"}`,
893
+ `**${name}** \u2014 ${category ?? "Business"} \xB7 ${ratingLine || "No rating"}`,
708
894
  address ? `\u{1F4CD} ${address}` : null,
709
895
  phone ? `\u{1F4DE} ${phone}` : null,
710
896
  hoursSummary ? `\u{1F550} ${hoursSummary}` : null,
711
897
  website ? `\u{1F310} ${website}` : null,
712
- reviews.length ? `
713
- \u{1F4AC} ${reviews.length} reviews fetched \u2014 full list in artifact above` : null
898
+ reviewsStatus === "collected" && reviews.length ? `
899
+ \u{1F4AC} ${reviews.length} reviews fetched \u2014 full list in artifact above` : null,
900
+ reviewsStatus === "unavailable" ? `
901
+ \u26A0\uFE0F Reviews could not be retrieved this run` : null,
902
+ reviewsStatus === "none_exist" ? `
903
+ \u{1F4AC} No reviews on Google Maps` : null
714
904
  ].filter(Boolean).join("\n");
715
- return twoBlocks(full, summary);
905
+ return oneBlock(full);
716
906
  }
717
907
  function formatFacebookAdTranscribe(raw, input) {
718
908
  const parsed = parseData(raw);
@@ -722,13 +912,13 @@ function formatFacebookAdTranscribe(raw, input) {
722
912
  const chunks = d.chunks ?? [];
723
913
  const durSec = d.durationMs ? (d.durationMs / 1e3).toFixed(0) : "\u2014";
724
914
  const chunkRows = chunks.slice(0, 50).map((c) => {
725
- const sec = Math.floor(c.startMs / 1e3);
915
+ const sec = Number.isFinite(c.timestamp[0]) ? Math.floor(c.timestamp[0]) : 0;
726
916
  const mm = String(Math.floor(sec / 60)).padStart(2, "0");
727
917
  const ss = String(sec % 60).padStart(2, "0");
728
918
  return `| ${mm}:${ss} | ${truncate(c.text, 120)} |`;
729
919
  }).join("\n");
730
920
  const full = [
731
- `${DIRECTIVE_FULL}# Facebook Ad Transcript`,
921
+ `# Facebook Ad Transcript`,
732
922
  `**Duration:** ${durSec}s \xB7 **${text.split(" ").length} words**`,
733
923
  `
734
924
  ## Full Transcript
@@ -743,53 +933,53 @@ ${chunkRows}` : "",
743
933
  \u{1F4A1} Get more ads from this advertiser: use \`facebook_page_intel\``
744
934
  ].filter(Boolean).join("\n");
745
935
  const summary = [
746
- `${DIRECTIVE_SUMMARY}**Facebook Ad Transcript** \u2014 ${text.split(" ").length} words \xB7 ${durSec}s`,
936
+ `**Facebook Ad Transcript** \u2014 ${text.split(" ").length} words \xB7 ${durSec}s`,
747
937
  `
748
938
  **Preview:**
749
939
  > ${truncate(text, 300)}`,
750
940
  `
751
941
  \u{1F4A1} Full transcript in artifact above`
752
942
  ].join("\n");
753
- return twoBlocks(full, summary);
943
+ return oneBlock(full);
754
944
  }
755
945
 
756
946
  // src/mcp/paa-mcp-server.ts
757
947
  function buildPaaExtractorMcpServer(executor2) {
758
- const server2 = new import_mcp.McpServer({ name: "paa-extractor", version: "1.0.0" });
948
+ const server2 = new import_mcp.McpServer({ name: "mcp-scraper", version: "1.0.0" });
759
949
  server2.registerTool("harvest_paa", {
760
- description: "Extract PAA (People Also Ask) questions from Google Search. Returns full question list with answers, organic SERP, entity IDs (CID/GCID/KG MID), and AI Overview. Use maxQuestions to control volume (up to 40).",
950
+ description: 'Best default tool for Google search research. Extracts People Also Ask questions plus answers/source URLs, organic SERP, local pack when present, entity IDs (CID/GCID/KG MID), and AI Overview. Infer the user language: split topic from location (e.g. "best hvac company in Denver CO" => query "best hvac company", location "Denver, CO", gl "us", hl "en"). Use maxQuestions 30 normally, 100-150 for "full", "deep", "all", or comprehensive research. Credits are charged by extracted question; unused request hold is refunded. Saves a full Markdown report locally.',
761
951
  inputSchema: HarvestPaaInputSchema
762
952
  }, async (input) => formatHarvestPaa(await executor2.harvestPaa(input), input));
763
953
  server2.registerTool("search_serp", {
764
- description: "Fetch organic Google search results. Returns ranked URLs, titles, snippets, local pack, entity IDs (CID/GCID/KG MID), and AI Overview. Use when you need SERP positions without PAA expansion.",
954
+ description: "Fast Google SERP lookup without PAA expansion. Use when the user asks for rankings, organic results, local pack, quick SERP, or positions. Split topic from location and infer gl/hl from the user request. Saves a full Markdown report locally.",
765
955
  inputSchema: SearchSerpInputSchema
766
956
  }, async (input) => formatSearchSerp(await executor2.searchSerp(input), input));
767
957
  server2.registerTool("extract_url", {
768
- description: "Extract structured data from a single URL: page content as Markdown, heading structure, JSON-LD schema, entity details, NAP score, and missing schema fields. Use for SEO audits and entity validation.",
958
+ description: "Extract structured data from one public URL: page content as Markdown, heading structure, JSON-LD schema, entity details, NAP score, metadata, and missing schema fields. Use when the user provides a single URL or asks to inspect/scrape one page. Saves a full Markdown report locally.",
769
959
  inputSchema: ExtractUrlInputSchema
770
960
  }, async (input) => formatExtractUrl(await executor2.extractUrl(input), input));
771
961
  server2.registerTool("map_site_urls", {
772
- description: "Spider a website to build a complete URL inventory with HTTP status codes. Identifies broken links and redirect chains. Use before extract_site to understand site scope.",
962
+ description: "Map/crawl a public website to build a URL inventory with HTTP status codes, broken links, redirects, and site scope. Use before extract_site for audits or when the user asks for a sitemap/URL inventory. Saves a full Markdown report locally.",
773
963
  inputSchema: MapSiteUrlsInputSchema
774
964
  }, async (input) => formatMapSiteUrls(await executor2.mapSiteUrls(input), input));
775
965
  server2.registerTool("extract_site", {
776
- description: "Run multi-page extraction across an entire website. Returns schema, entity data, headings, and content from each page. Use map_site_urls first to check scope.",
966
+ description: "Run multi-page extraction across a public website. Returns per-page titles, H1s, metadata, headings, schema/entity data, canonical URLs, and content. Use for website audits, competitor audits, and full-site extraction. Saves a full Markdown report locally.",
777
967
  inputSchema: ExtractSiteInputSchema
778
968
  }, async (input) => formatExtractSite(await executor2.extractSite(input), input));
779
969
  server2.registerTool("youtube_harvest", {
780
- description: 'Harvest YouTube video metadata by search query or channel handle. Returns titles, view counts, durations, and videoIds. Use mode "search" for keyword results or "channel" for a specific creator.',
970
+ description: 'Harvest YouTube video metadata by search query or channel handle/ID/URL. Use mode "search" for keyword/topic requests and mode "channel" for @handles, channel IDs, or channel URLs. Returns titles, views, dates, durations, URLs, thumbnails, and videoIds for follow-up transcription. Saves a full Markdown report locally.',
781
971
  inputSchema: YoutubeHarvestInputSchema
782
972
  }, async (input) => formatYoutubeHarvest(await executor2.youtubeHarvest(input), input));
783
973
  server2.registerTool("youtube_transcribe", {
784
- description: "Fetch and transcribe captions from a YouTube video. Returns full transcript, timestamped chunks, and word count. Pass a videoId from youtube_harvest results.",
974
+ description: "Fetch and transcribe captions from a YouTube video. Returns full transcript, timestamped chunks, and word count. Pass a videoId from youtube_harvest results or infer it from a YouTube URL if the user provided one. Saves a full Markdown report locally.",
785
975
  inputSchema: YoutubeTranscribeInputSchema
786
976
  }, async (input) => formatYoutubeTranscribe(await executor2.youtubeTranscribe(input), input));
787
977
  server2.registerTool("facebook_page_intel", {
788
- description: "Harvest all ads from a Facebook advertiser. Returns ad copy, headlines, CTAs, creative type, status, and video URLs ready for transcription. Accepts pageId, libraryId, or a brand name as query.",
978
+ description: "Harvest ads from a Facebook advertiser. Returns ad copy, headlines, CTAs, creative type, status, landing URLs, and video URLs ready for transcription. Accepts pageId, libraryId, or a brand/advertiser name as query. Use after facebook_ad_search when possible. Saves a full Markdown report locally.",
789
979
  inputSchema: FacebookPageIntelInputSchema
790
980
  }, async (input) => formatFacebookPageIntel(await executor2.facebookPageIntel(input), input));
791
981
  server2.registerTool("facebook_ad_search", {
792
- description: "Search Facebook Ad Library by keyword. Returns advertisers with ad counts and library IDs. Use to discover competitors, then pass libraryId to facebook_page_intel to get their full ad list.",
982
+ description: "Search Facebook Ad Library by brand, advertiser, competitor, niche, or keyword. Returns advertisers with ad counts and library IDs. Use to discover competitors, then pass libraryId to facebook_page_intel. Saves a full Markdown report locally.",
793
983
  inputSchema: FacebookAdSearchInputSchema
794
984
  }, async (input) => formatFacebookAdSearch(await executor2.facebookAdSearch(input), input));
795
985
  server2.registerTool("facebook_ad_transcribe", {
@@ -797,7 +987,7 @@ function buildPaaExtractorMcpServer(executor2) {
797
987
  inputSchema: FacebookAdTranscribeInputSchema
798
988
  }, async (input) => formatFacebookAdTranscribe(await executor2.facebookAdTranscribe(input), input));
799
989
  server2.registerTool("maps_place_intel", {
800
- description: "Extract Google Maps business intelligence for a named business: rating, review count, category, address, phone, website, hours, booking URL, review histogram, review topics, about attributes, and optional review cards. Pass includeReviews: true and maxReviews to fetch individual review text.",
990
+ description: 'Extract Google Maps business intelligence for a named business: rating, review count, category, address, phone, website, hours, booking URL, review histogram, review topics, about attributes, entity IDs, and optional review cards. Split business name from location (e.g. "Elite Roofing Denver CO" => businessName "Elite Roofing", location "Denver, CO"). Pass includeReviews true when the user asks for reviews/customer pain. Saves a full Markdown report locally.',
801
991
  inputSchema: MapsPlaceIntelInputSchema
802
992
  }, async (input) => formatMapsPlaceIntel(await executor2.mapsPlaceIntel(input), input));
803
993
  server2.registerTool("credits_info", {
@@ -810,10 +1000,10 @@ function buildPaaExtractorMcpServer(executor2) {
810
1000
  // bin/mcp-stdio-server.ts
811
1001
  function readApiKeyFile() {
812
1002
  const explicitPath = process.env.MCP_SCRAPER_KEY_PATH?.trim();
813
- const paths = [explicitPath, (0, import_node_path.join)((0, import_node_os.homedir)(), ".mcp-scraper-key")].filter(Boolean);
1003
+ const paths = [explicitPath, (0, import_node_path2.join)((0, import_node_os2.homedir)(), ".mcp-scraper-key")].filter(Boolean);
814
1004
  for (const path of paths) {
815
1005
  try {
816
- const value = (0, import_node_fs.readFileSync)(path, "utf8").trim();
1006
+ const value = (0, import_node_fs2.readFileSync)(path, "utf8").trim();
817
1007
  if (value) return value;
818
1008
  } catch {
819
1009
  }