mcp-scraper 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/README.md +5 -0
  2. package/dist/bin/api-server.cjs +15553 -7587
  3. package/dist/bin/api-server.cjs.map +1 -1
  4. package/dist/bin/api-server.js +3 -3
  5. package/dist/bin/mcp-stdio-server.cjs +312 -119
  6. package/dist/bin/mcp-stdio-server.cjs.map +1 -1
  7. package/dist/bin/mcp-stdio-server.js +1 -1
  8. package/dist/bin/paa-harvest.cjs +1537 -165
  9. package/dist/bin/paa-harvest.cjs.map +1 -1
  10. package/dist/bin/paa-harvest.js +1 -1
  11. package/dist/{chunk-LXZDJJXR.js → chunk-D4CJBZBY.js} +426 -29
  12. package/dist/chunk-D4CJBZBY.js.map +1 -0
  13. package/dist/chunk-HERFK7W6.js +2781 -0
  14. package/dist/chunk-HERFK7W6.js.map +1 -0
  15. package/dist/chunk-JQKZWEON.js +1000 -0
  16. package/dist/chunk-JQKZWEON.js.map +1 -0
  17. package/dist/chunk-Y74EXABN.js +295 -0
  18. package/dist/chunk-Y74EXABN.js.map +1 -0
  19. package/dist/{db-IOYMX64U.js → db-YWCNHBLH.js} +36 -4
  20. package/dist/index.cjs +1660 -237
  21. package/dist/index.cjs.map +1 -1
  22. package/dist/index.d.cts +169 -2
  23. package/dist/index.d.ts +169 -2
  24. package/dist/index.js +120 -69
  25. package/dist/index.js.map +1 -1
  26. package/dist/server-W5NWH5KF.js +11625 -0
  27. package/dist/server-W5NWH5KF.js.map +1 -0
  28. package/dist/{worker-3ECJHPRE.js → worker-D4D2YQTA.js} +44 -9
  29. package/dist/worker-D4D2YQTA.js.map +1 -0
  30. package/package.json +17 -5
  31. package/dist/chunk-4API3ZCT.js +0 -1387
  32. package/dist/chunk-4API3ZCT.js.map +0 -1
  33. package/dist/chunk-LXZDJJXR.js.map +0 -1
  34. package/dist/chunk-ZBP4RHNW.js +0 -805
  35. package/dist/chunk-ZBP4RHNW.js.map +0 -1
  36. package/dist/server-63DR2HE5.js +0 -6062
  37. package/dist/server-63DR2HE5.js.map +0 -1
  38. package/dist/worker-3ECJHPRE.js.map +0 -1
  39. /package/dist/{db-IOYMX64U.js.map → db-YWCNHBLH.js.map} +0 -0
@@ -0,0 +1,1000 @@
1
+ // src/mcp/paa-mcp-server.ts
2
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
3
+
4
+ // src/mcp/mcp-tool-schemas.ts
5
+ import { z } from "zod";
6
/**
 * Zod field map for the People-Also-Ask harvest tool input.
 * Every field carries a `.describe()` hint; defaults are applied by zod
 * when the caller omits the field.
 */
var HarvestPaaInputSchema = {
  query: z
    .string()
    .min(1)
    .describe('Core search topic only. If the user says "best hvac company in Denver CO", use query="best hvac company" and location="Denver, CO". Do not include the location in query when it can be separated.'),
  location: z
    .string()
    .optional()
    .describe('City, region, or country for geo-targeted results, inferred from the user request when present, e.g. "Denver, CO", "Tokyo, Japan", "London, UK".'),
  maxQuestions: z
    .number()
    .int()
    .min(1)
    .max(150)
    .default(30)
    .describe("Number of PAA questions to extract. Default 30. Maximum 150. Use 10 for quick probes, 30 for normal research, 100-150 when the user asks for everything/full/deep research. Credits are charged by extracted question; unused request hold is refunded."),
  gl: z
    .string()
    .length(2)
    .default("us")
    .describe("Google country code inferred from location or user language. Examples: United States us, United Kingdom gb, Japan jp, Canada ca, Australia au."),
  hl: z
    .string()
    .default("en")
    .describe("Google interface/content language inferred from the user request. Use en unless the user asks for another language or locale."),
  device: z
    .enum(["desktop", "mobile"])
    .default("desktop")
    .describe("SERP device context. Use desktop by default; use mobile only when the user asks for mobile rankings."),
  proxyMode: z
    .enum(["location", "configured", "none"])
    .default("location")
    .describe("Proxy targeting mode. Use location by default so city/state searches create or reuse a matching residential proxy. Use configured for the static configured proxy. Use none only for direct-network debugging."),
  proxyZip: z
    .string()
    .regex(/^\d{5}$/)
    .optional()
    .describe("Optional US ZIP override for residential location proxy targeting. Use only when the user gives a specific ZIP or city-center proxy targeting needs to be forced."),
  debug: z
    .boolean()
    .default(false)
    .describe("Include sanitized browser/session/location diagnostics in the response. Use true when debugging localization, CAPTCHA, or proxy behavior."),
};
17
/**
 * Zod field map for the single-URL extraction tool input.
 * Optional extras (screenshot, branding, media download) all default to off.
 */
var ExtractUrlInputSchema = {
  url: z
    .string()
    .url()
    .describe("Public http/https URL to extract. Use this when the user provides one specific page URL."),
  screenshot: z
    .boolean()
    .default(false)
    .describe("Also capture a full-page screenshot of the URL. Saved to ~/Downloads/mcp-scraper/screenshots/ and returned inline. Use when the user asks to see or capture the page visually."),
  screenshotDevice: z
    .enum(["desktop", "mobile"])
    .default("desktop")
    .describe("Viewport for screenshot. desktop = 1440\xD7900. mobile = 390\xD7844. Default desktop."),
  extractBranding: z
    .boolean()
    .default(false)
    .describe("Extract brand colors, fonts, logo, and favicon using a rendered browser session. Returns colorScheme (light/dark), colors (primary/accent/background/text/heading as hex), fonts (heading/body family names), and assets (logo URL, favicon URL). Use when the user asks about brand colors, site theme, or brand assets."),
  downloadMedia: z
    .boolean()
    .default(false)
    .describe("Extract and download all page media (images, video, audio) to ~/Downloads/mcp-scraper/media/. Ad networks, tracking pixels, and noise URLs are filtered automatically. Use when the user asks to download or harvest assets from a page."),
  mediaTypes: z
    .array(z.enum(["image", "video", "audio"]))
    .default(["image", "video", "audio"])
    .describe("Which media types to download. Default all three."),
  allowLocal: z
    .boolean()
    .default(false)
    .describe("Allow localhost and private-network URLs. For local development only."),
};
26
/**
 * Zod field map for the site URL-mapping tool input: a start URL plus an
 * optional cap (1-500) on how many URLs to discover.
 */
var MapSiteUrlsInputSchema = {
  url: z
    .string()
    .url()
    .describe("Public website URL or domain to crawl for internal URLs. Use before extract_site when the user asks to audit/map/crawl a site."),
  maxUrls: z
    .number()
    .int()
    .min(1)
    .max(500)
    .optional()
    .describe("Maximum URLs to discover. Use 100 for normal maps, higher when the user asks for a full inventory."),
};
30
/**
 * Zod field map for the multi-page site extraction tool input: a start URL
 * plus an optional page cap (1-50).
 */
var ExtractSiteInputSchema = {
  url: z
    .string()
    .url()
    .describe("Public website URL or domain to extract across multiple pages. Use when the user asks for a site audit, website crawl, or full-site content/schema extraction."),
  maxPages: z
    .number()
    .int()
    .min(1)
    .max(50)
    .optional()
    .describe("Maximum pages to extract. Use 50 when the user asks for full results or a complete crawl within MCP limits."),
};
34
/**
 * Zod field map for the YouTube harvest tool input.
 *
 * NOTE(review): `query` is described as required when mode is "search", but
 * this field map marks it optional and does not enforce the dependency;
 * presumably the handler validates it - confirm.
 */
var YoutubeHarvestInputSchema = {
  mode: z
    .enum(["search", "channel"])
    .describe("Use search for topic/keyword requests. Use channel when the user provides @handle, channel ID, or channel URL."),
  query: z
    .string()
    .optional()
    .describe("Required when mode is search. The YouTube search topic in the user\u2019s words."),
  channelHandle: z
    .string()
    .optional()
    .describe("YouTube channel handle, channel ID, or URL. Examples: @mkbhd, UC..., https://youtube.com/@mkbhd."),
  maxVideos: z
    .number()
    .int()
    .min(1)
    .max(500)
    .default(50)
    .describe("Number of videos to return. Default 50. Increase when user asks for full channel/history."),
};
40
/** Zod field map for the YouTube transcription tool input: one non-empty video ID. */
var YoutubeTranscribeInputSchema = {
  videoId: z
    .string()
    .min(1)
    .describe("YouTube video ID, e.g. dQw4w9WgXcQ"),
};
43
/**
 * Zod field map for the Facebook page-intel tool input.
 * Per the `query` description, one of pageId, libraryId, or query is
 * required; that rule is not expressed in this field map itself.
 */
var FacebookPageIntelInputSchema = {
  pageId: z
    .string()
    .optional(),
  libraryId: z
    .string()
    .optional(),
  query: z
    .string()
    .optional()
    .describe("Advertiser or brand name when pageId/libraryId is not known. One of pageId, libraryId, or query is required."),
  maxAds: z
    .number()
    .int()
    .min(1)
    .max(200)
    .default(50),
  country: z
    .string()
    .length(2)
    .default("US"),
};
50
/**
 * Zod field map for the Facebook Ad Library search tool input: a non-empty
 * query, a two-letter country (default "US"), and a result cap of 1-20.
 */
var FacebookAdSearchInputSchema = {
  query: z
    .string()
    .min(1)
    .describe("Advertiser, brand, competitor, niche, or keyword to search in Facebook Ad Library."),
  country: z
    .string()
    .length(2)
    .default("US"),
  maxResults: z
    .number()
    .int()
    .min(1)
    .max(20)
    .default(10),
};
55
/** Zod field map for the Facebook ad transcription tool input: one video URL. */
var FacebookAdTranscribeInputSchema = {
  videoUrl: z
    .string()
    .url()
    .describe("Facebook CDN video URL from a facebook_page_intel result"),
};
58
/**
 * Zod field map for the Maps place-intel tool input.
 *
 * `hl` previously required exactly two characters, unlike the `hl` field in
 * every other schema in this file (plain string). That rejected regional
 * language codes such as "pt-BR" or "zh-CN". It now only requires at least
 * two characters, so every previously valid value is still accepted.
 */
var MapsPlaceIntelInputSchema = {
  businessName: z
    .string()
    .min(1)
    .describe('Business name only. If user says "Elite Roofing Denver CO", use businessName="Elite Roofing" and location="Denver, CO".'),
  location: z
    .string()
    .min(1)
    .describe('City/region/country where the business should be searched, e.g. "Denver, CO". Infer from the user request when possible.'),
  gl: z
    .string()
    .length(2)
    .default("us")
    .describe("Google country code inferred from location."),
  hl: z
    .string()
    .min(2)
    .default("en")
    .describe("Language inferred from user request."),
  includeReviews: z
    .boolean()
    .default(false)
    .describe("Whether to fetch individual review cards"),
  maxReviews: z
    .number()
    .int()
    .min(1)
    .max(500)
    .default(50)
    .describe("Max review cards to return (requires includeReviews: true)"),
};
66
/**
 * Zod field map for the credits-info tool input: an optional lookup term and
 * a flag (default false) for including recent ledger entries.
 */
var CreditsInfoInputSchema = {
  item: z
    .string()
    .optional()
    .describe('Optional tool, action, or feature to look up, e.g. "maps reviews", "extract_url", or "YouTube transcription"'),
  includeLedger: z
    .boolean()
    .default(false)
    .describe("Whether to include recent credit ledger entries"),
};
70
/**
 * Zod field map for the SERP search tool input.
 * Shares the locale/device/proxy/debug fields used by the PAA harvest schema
 * and adds `pages` (1-2 result pages, default 1).
 */
var SearchSerpInputSchema = {
  query: z
    .string()
    .min(1)
    .describe('Core search topic only. Separate location when possible. If user says "best dentist in Brooklyn NY serp", use query="best dentist" and location="Brooklyn, NY".'),
  location: z
    .string()
    .optional()
    .describe("City, region, or country for geo-targeted results, inferred from user request when present."),
  gl: z
    .string()
    .length(2)
    .default("us")
    .describe("Google country code inferred from location or user language."),
  hl: z
    .string()
    .default("en")
    .describe("Google interface/content language inferred from user request."),
  device: z
    .enum(["desktop", "mobile"])
    .default("desktop")
    .describe("SERP device context. Use desktop by default; use mobile only when the user asks for mobile rankings."),
  proxyMode: z
    .enum(["location", "configured", "none"])
    .default("location")
    .describe("Proxy targeting mode. Use location by default so city/state searches create or reuse a matching residential proxy. Use configured for the static configured proxy. Use none only for direct-network debugging."),
  proxyZip: z
    .string()
    .regex(/^\d{5}$/)
    .optional()
    .describe("Optional US ZIP override for residential location proxy targeting. Use only when the user gives a specific ZIP or city-center proxy targeting needs to be forced."),
  debug: z
    .boolean()
    .default(false)
    .describe("Include sanitized browser/session/location diagnostics in the response. Use true when debugging localization, CAPTCHA, or proxy behavior."),
  pages: z
    .number()
    .int()
    .min(1)
    .max(2)
    .default(1)
    .describe("Number of result pages to fetch (1\u20132)"),
};
81
/**
 * Zod field map for the SERP snapshot capture tool input.
 * Besides the query/locale/proxy fields it can optionally request
 * ranking-page snapshots (`includePageSnapshots`, `pageSnapshotLimit` 0-10).
 */
var CaptureSerpSnapshotInputSchema = {
  query: z
    .string()
    .min(1)
    .describe("Core search query to capture as a structured SERP Intelligence snapshot. Separate the place into location when the user gives a city, region, country, or ZIP."),
  location: z
    .string()
    .optional()
    .describe("City, region, country, or service area used for localized Google results. MCP Scraper records location evidence; UULE alone is not proof of localization."),
  gl: z
    .string()
    .length(2)
    .default("us")
    .describe("Google country code inferred from the requested market, e.g. us, gb, ca, au."),
  hl: z
    .string()
    .default("en")
    .describe("Google interface/content language inferred from the user request."),
  device: z
    .enum(["desktop", "mobile"])
    .default("desktop")
    .describe("SERP device context. Use mobile only when the user asks for mobile rankings or mobile SERP evidence."),
  proxyMode: z
    .enum(["location", "configured", "none"])
    .default("location")
    .describe("Proxy behavior for capture. Use location for localized residential proxy targeting, configured for the static residential proxy, and none only for direct-network debugging."),
  proxyZip: z
    .string()
    .regex(/^\d{5}$/)
    .optional()
    .describe("Optional US ZIP override for residential location proxy targeting when a precise city-center or ZIP proxy is needed."),
  pages: z
    .number()
    .int()
    .min(1)
    .max(2)
    .default(1)
    .describe("Number of Google result pages to capture. Use 1 normally and 2 only when the user needs deeper ranking evidence."),
  debug: z
    .boolean()
    .default(false)
    .describe("Include sanitized browser, proxy, and location diagnostics. Use true when debugging localization, CAPTCHA, proxy selection, or capture reliability."),
  includePageSnapshots: z
    .boolean()
    .default(false)
    .describe("Also capture ranking-page snapshots for selected SERP URLs through the same product capture path."),
  pageSnapshotLimit: z
    .number()
    .int()
    .min(0)
    .max(10)
    .default(0)
    .describe("Maximum ranking-page snapshots to capture when includePageSnapshots is true. Use 0 when only SERP evidence is needed."),
};
94
/**
 * Zod field map for the screenshot tool input: target URL, viewport profile
 * (default "desktop"), and an opt-in flag for local/private-network URLs.
 */
var ScreenshotInputSchema = {
  url: z
    .string()
    .url()
    .describe("URL to capture as a full-page screenshot. Use http or https. Pass allowLocal: true to capture localhost or private-network URLs during development."),
  device: z
    .enum(["desktop", "mobile"])
    .default("desktop")
    .describe("Viewport profile. desktop = 1440\xD7900. mobile = 390\xD7844. Use desktop by default; use mobile when the user asks for a mobile view."),
  allowLocal: z
    .boolean()
    .default(false)
    .describe("Allow localhost and private-network URLs (127.x, 192.168.x, 10.x, etc.). For local development only \u2014 not for production use."),
};
99
/**
 * Zod field map for the page-snapshot capture tool input.
 *
 * NOTE(review): `urls` is required (1-25 entries) while `targets` is
 * described as something to use "instead of urls"; as written a caller must
 * always send `urls`. Confirm whether `urls` was meant to be optional.
 */
var CaptureSerpPageSnapshotsInputSchema = {
  urls: z
    .array(z.string().url())
    .min(1)
    .max(25)
    .describe("Public HTTP/HTTPS URLs to capture as SERP Intelligence page snapshots. Do not pass localhost, private IPs, file URLs, or internal admin URLs."),
  targets: z
    .array(
      z
        .object({
          url: z
            .string()
            .url()
            .describe("Public HTTP/HTTPS URL to capture."),
          sourceKind: z
            .enum(["organic", "ai_citation", "local_pack_website", "configured_target", "site_subject"])
            .default("configured_target")
            .describe("Why this page is being captured for SERP Intelligence evidence."),
          sourcePosition: z
            .number()
            .int()
            .min(1)
            .optional()
            .describe("Ranking or citation position when the page came from SERP evidence."),
        })
        .strict()
    )
    .min(1)
    .max(25)
    .optional()
    .describe("Structured page snapshot targets. Use this instead of urls when source kind or position should be preserved."),
  maxConcurrency: z
    .number()
    .int()
    .min(1)
    .max(5)
    .default(2)
    .describe("Parallel page captures. Use 2 normally; higher values can increase proxy/browser pressure."),
  timeoutMs: z
    .number()
    .int()
    .min(1e3)
    .max(6e4)
    .default(15e3)
    .describe("Per-page capture timeout in milliseconds. Increase for slow pages; timeout artifacts are returned as structured capture failures."),
  debug: z
    .boolean()
    .default(false)
    .describe("Include sanitized browser/proxy diagnostics for page snapshot debugging. Use true for capture, network, or proxy troubleshooting."),
};
110
+
111
+ // src/mcp/mcp-response-formatter.ts
112
+ import { mkdirSync, writeFileSync, readFileSync } from "fs";
113
+ import { homedir } from "os";
114
+ import { join } from "path";
115
/**
 * Convert a report title into a lowercase, hyphen-separated slug suitable
 * for use in a filename.
 *
 * Runs of characters outside [a-z0-9] collapse to a single hyphen, edge
 * hyphens are stripped, and the result is capped at 80 characters.
 *
 * @param {string} input - Title text to slugify.
 * @returns {string} The slug, or "mcp-scraper-report" when nothing usable remains.
 */
function slugifyReportName(input) {
  const slug = input
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-+|-+$/g, "")
    .slice(0, 80)
    // The 80-char cut can land right after a separator; trim again so the
    // slug never ends with a dangling hyphen.
    .replace(/-+$/, "");
  return slug || "mcp-scraper-report";
}
118
/**
 * Pull the report title out of a markdown document.
 *
 * Only the first line that starts with "# " is considered; its heading
 * marker is removed and the remainder trimmed.
 *
 * @param {string} full - Markdown text of the report.
 * @returns {string} The heading text, or "MCP Scraper Report" when there is
 *   no such line or it is blank after trimming.
 */
function reportTitle(full) {
  const fallbackTitle = "MCP Scraper Report";
  for (const line of full.split("\n")) {
    if (line.startsWith("# ")) {
      const heading = line.replace(/^#\s+/, "").trim();
      return heading || fallbackTitle;
    }
  }
  return fallbackTitle;
}
122
/**
 * Best-effort write of a markdown report to disk.
 *
 * Saving is skipped when MCP_SCRAPER_SAVE_REPORTS is exactly "false". The
 * target directory is MCP_SCRAPER_OUTPUT_DIR (trimmed) when set and
 * non-empty, otherwise ~/Downloads/mcp-scraper. The filename is an ISO
 * timestamp (":" and "." replaced by "-") followed by the slugified title.
 *
 * @param {string} full - Markdown text to persist.
 * @returns {string|null} Path of the written file, or null when saving is
 *   disabled or any step fails (failures are deliberately not raised).
 */
function saveFullReport(full) {
  const { MCP_SCRAPER_SAVE_REPORTS: saveFlag, MCP_SCRAPER_OUTPUT_DIR: configuredDir } = process.env;
  if (saveFlag === "false") {
    return null;
  }
  const outDir = configuredDir?.trim() || join(homedir(), "Downloads", "mcp-scraper");
  try {
    mkdirSync(outDir, { recursive: true });
    const stamp = new Date().toISOString().replace(/[:.]/g, "-");
    const slug = slugifyReportName(reportTitle(full));
    const target = join(outDir, `${stamp}-${slug}.md`);
    writeFileSync(target, full, "utf8");
    return target;
  } catch {
    return null;
  }
}
135
/**
 * Wrap report text in a single-text-block tool result, saving it to disk
 * first via saveFullReport.
 *
 * @param {string} content - Markdown report text.
 * @returns {{content: Array<{type: string, text: string}>}} Result whose text
 *   is the report, followed by a "Saved" footer when a file was written.
 */
function oneBlock(content) {
  const savedTo = saveFullReport(content);
  let text = content;
  if (savedTo) {
    text = `${content}\n\n\u{1F4C4} Saved: \`${savedTo}\``;
  }
  return { content: [{ type: "text", text }] };
}
142
/**
 * Turn a parsed error payload into a one-line human-readable message.
 *
 * Checked in order: insufficient balance, request timeout, any payload with
 * a string `error_code` (which also appends the per-attempt breakdown), a
 * plain string `error`, and finally the fallback.
 *
 * @param {object} body - Parsed error payload.
 * @param {string} fallback - Text used when the payload has no usable message.
 * @returns {string} Message to show to the caller.
 */
function formatStructuredError(body, fallback) {
  const { error, message, error_code: code } = body;
  if (error === "insufficient_balance") {
    return `Insufficient credits. Balance: ${body.balance_credits} credits. This call requires ${body.required_credits} credits. Top up at ${body.topup_url}`;
  }
  if (error === "mcp_request_timeout") {
    if (typeof message === "string") return message;
    return "MCP Scraper request timed out and was cancelled.";
  }
  if (typeof code === "string") {
    let detail = fallback;
    if (typeof error === "string") {
      detail = error;
    } else if (typeof message === "string") {
      detail = message;
    }
    const retryNote = body.retryable === true ? " Retryable: yes." : "";
    return `${code}: ${detail}${retryNote}${errorAttemptsSection(body)}`;
  }
  if (typeof error === "string") {
    return error;
  }
  return fallback || "Tool error";
}
157
/**
 * Decode the first text block of a tool result as JSON.
 *
 * @param {{content: Array<object>, isError?: boolean}} raw - Tool result.
 * @returns {{data: any}|{error: string}} `data` is the payload's `result`
 *   property when present, otherwise the payload itself. `error` is returned
 *   when the result is flagged as an error, the payload carries `error` /
 *   `error_code`, or the text is not valid JSON.
 */
function parseData(raw) {
  const textBlock = raw.content.find((block) => block.type === "text");
  const text = textBlock ? textBlock.text : "";
  try {
    // A missing or empty text block is treated as an empty JSON object.
    const payload = JSON.parse(text || "{}");
    const failed = raw.isError || payload.error || payload.error_code;
    if (failed) {
      return { error: formatStructuredError(payload, text) };
    }
    return { data: payload.result ?? payload };
  } catch {
    if (raw.isError) {
      return { error: text || "Tool error" };
    }
    return { error: "Failed to parse tool response" };
  }
}
170
/**
 * Render the "Entity IDs" markdown section listing every known ID.
 *
 * @param {{kgIds?: string[], cids?: string[], gcids?: string[]}} [ids]
 * @returns {string} Section text starting with a newline, or "" when `ids`
 *   is missing or all three lists are empty.
 */
function entityIdsSection(ids) {
  if (!ids) return "";
  const labelled = [
    ["Knowledge Graph MID", ids.kgIds],
    ["CID", ids.cids],
    ["GCID", ids.gcids],
  ];
  const bullets = labelled
    .filter(([, values]) => values?.length)
    .map(([label, values]) => `- **${label}:** ${values.join(", ")}`);
  if (bullets.length === 0) {
    return "";
  }
  return `\n## Entity IDs\n${bullets.join("\n")}`;
}
180
/**
 * Render a compact one-line summary showing only the first ID of each kind.
 *
 * @param {{kgIds?: string[], cids?: string[], gcids?: string[]}} [ids]
 * @returns {string} Line starting with a newline, or "" when nothing is known.
 */
function entityIdsSummaryLine(ids) {
  if (!ids) return "";
  const labelled = [
    ["KG MID", ids.kgIds],
    ["CID", ids.cids],
    ["GCID", ids.gcids],
  ];
  const parts = labelled
    .filter(([, values]) => values?.length)
    .map(([label, values]) => `${label}: ${values[0]}`);
  if (parts.length === 0) {
    return "";
  }
  return `\n**Entity IDs:** ${parts.join(" \xB7 ")}`;
}
189
/**
 * Shorten a string to at most `max` UTF-16 code units and append an
 * ellipsis when anything was cut.
 *
 * The cut never lands between the two halves of a surrogate pair: if the
 * last kept unit would be a lone high surrogate (e.g. the first half of an
 * emoji), it is dropped too, so the output is always well-formed text.
 *
 * @param {string} [s] - Text to shorten; falsy values yield "".
 * @param {number} max - Maximum number of code units to keep.
 * @returns {string} `s` unchanged when it fits, otherwise the shortened text
 *   followed by "\u2026".
 */
function truncate(s, max) {
  if (!s) return "";
  if (!(s.length > max)) return s;
  let end = max;
  const lastKept = s.charCodeAt(end - 1);
  // 0xD800-0xDBFF are high surrogates; keeping one without its partner would
  // emit an invalid character.
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
    end -= 1;
  }
  return s.slice(0, end) + "\u2026";
}
193
/**
 * Make a value safe to place inside a markdown table cell.
 *
 * Null/undefined become "", line breaks become spaces, pipes are
 * backslash-escaped, whitespace runs collapse to one space, and the result
 * is trimmed.
 *
 * @param {*} s - Value to render.
 * @returns {string} Single-line, pipe-escaped text.
 */
function cell(s) {
  const raw = String(s ?? "");
  const singleLine = raw.replace(/\r?\n+/g, " ");
  const pipeSafe = singleLine.replace(/\|/g, "\\|");
  const collapsed = pipeSafe.replace(/\s+/g, " ");
  return collapsed.trim();
}
196
/**
 * Render the "## Debug" markdown section from a diagnostics object.
 *
 * Reads `request`, `browser.kernel`, `browser.networkLocation`,
 * `browser.serpNavigation`, and `locationEvidence`; every missing piece is
 * shown as "unknown" rather than raising.
 *
 * @param {object} [debug] - Diagnostics object; non-objects yield "".
 * @returns {string} Markdown section starting with a newline, or "".
 */
function debugSection(debug) {
  if (!debug || typeof debug !== "object") return "";

  // Three-state flag rendering shared by several lines below.
  const triState = (flag) => {
    if (flag === true) return "yes";
    if (flag === false) return "no";
    return "unknown";
  };

  const request = debug.request ?? {};
  const browser = debug.browser ?? {};
  const kernel = browser.kernel ?? {};
  const network = browser.networkLocation ?? {};
  const nav = browser.serpNavigation ?? {};
  const proxyResolution = kernel.proxyResolution ?? {};
  const locationEvidence = debug.locationEvidence;

  // At most four location candidates are listed.
  let candidates = "";
  if (Array.isArray(locationEvidence?.candidates)) {
    candidates = locationEvidence.candidates
      .slice(0, 4)
      .map((c) => `${c.city}, ${c.regionCode} (${c.count})`)
      .join(", ");
  }

  const requestedProxy = kernel.requestedProxyIdPresent === true
    ? `yes (${kernel.requestedProxyIdSuffix ?? "redacted"})`
    : "no";
  const retrievedProxy = kernel.retrievedProxyIdPresent === true
    ? `yes (${kernel.retrievedProxyIdSuffix ?? "redacted"})`
    : triState(kernel.retrievedProxyIdPresent);
  const resolutionTarget = proxyResolution.target
    ? ` \xB7 ${proxyResolution.target.level ?? "city"} ${proxyResolution.target.city}, ${proxyResolution.target.state}`
    : "";
  const resolutionError = proxyResolution.error
    ? ` \xB7 ${truncate(proxyResolution.error, 180)}`
    : "";
  const ipGeo = [network.ip, network.city, network.region, network.country].filter(Boolean).join(" \xB7 ")
    || network.error
    || "unknown";

  const lines = [
    "\n## Debug",
    `- Proxy mode: ${request.proxyMode ?? kernel.proxyMode ?? "unknown"} \xB7 requested proxy: ${requestedProxy}`,
    `- Proxy resolution: ${proxyResolution.source ?? "unknown"}${resolutionTarget}${resolutionError}`,
    `- Browser session: ${kernel.sessionId ?? "unknown"} \xB7 retrieved proxy: ${retrievedProxy}`,
    `- Browser IP geo: ${ipGeo}`,
    `- Google URL: ${truncate(nav.requestedUrl, 240) || "unknown"}`,
    `- Final URL: ${truncate(nav.finalUrl, 240) || "unknown"} \xB7 CAPTCHA: ${triState(nav.captchaDetected)} \xB7 redirected: ${triState(nav.redirected)}`,
  ];

  if (locationEvidence) {
    const expected = locationEvidence.expected;
    const expectedPart = expected
      ? ` \xB7 expected ${expected.city}${expected.regionCode ? `, ${expected.regionCode}` : ""}`
      : "";
    const candidatePart = candidates ? ` \xB7 candidates ${candidates}` : "";
    lines.push(`- Location evidence: ${locationEvidence.status}${expectedPart}${candidatePart}`);
  }
  return lines.join("\n");
}
220
/**
 * Render an "Attempts:" list for an error payload, one bullet per attempt.
 *
 * Only the first five attempts are shown. Each bullet reports outcome,
 * session, proxy mode/source, network geo, CAPTCHA state, and whether the
 * session was deleted; missing values appear as "unknown" / "?".
 *
 * @param {{attempts?: Array<object>}} body - Error payload.
 * @returns {string} Text starting with two newlines, or "" when there are no attempts.
 */
function errorAttemptsSection(body) {
  if (!Array.isArray(body.attempts) || body.attempts.length === 0) {
    return "";
  }

  const triState = (flag) => {
    if (flag === true) return "yes";
    if (flag === false) return "no";
    return "unknown";
  };

  const describeAttempt = (attempt) => {
    const debug = attempt.debug ?? {};
    const browser = debug.browser ?? {};
    const kernel = browser.kernel ?? {};
    const proxyResolution = kernel.proxyResolution ?? {};
    const network = browser.networkLocation ?? {};
    const nav = browser.serpNavigation ?? {};

    const geo = [network.ip, network.city, network.region].filter(Boolean).join(" / ") || "geo unknown";
    const number = attempt.attempt_number ?? "?";
    const outcome = attempt.outcome ?? attempt.status ?? "unknown";
    const session = attempt.kernel_session_id ?? kernel.sessionId ?? "unknown";
    const proxyMode = debug.request?.proxyMode ?? kernel.proxyMode ?? "unknown";
    const proxySource = proxyResolution.source ? `/${proxyResolution.source}` : "";

    return `- Attempt ${number}: ${outcome} \xB7 session ${session} \xB7 proxy ${proxyMode}${proxySource} \xB7 ${geo} \xB7 CAPTCHA ${triState(nav.captchaDetected)} \xB7 deleted ${triState(attempt.kernel_delete_succeeded)}`;
  };

  const bullets = body.attempts.slice(0, 5).map(describeAttempt);
  return `\n\nAttempts:\n${bullets.join("\n")}`;
}
238
/**
 * Format a PAA harvest tool result as a markdown report.
 *
 * The report contains the People-Also-Ask table (or a notice when none was
 * returned), the organic results table, entity IDs, the AI overview, a stats
 * line (only when a duration is present), optional debug diagnostics, and
 * usage tips. The finished report is passed through `oneBlock`.
 *
 * The previous version also built `topQ`, `topO`, and a `summary` string on
 * every call and never used them; that dead work has been removed.
 *
 * @param {{content: Array<object>, isError?: boolean}} raw - Tool result to format.
 * @param {{query: string, location?: string, maxQuestions?: number}} input - Original tool arguments.
 * @returns {object} Text result from `oneBlock`, or an `isError` result
 *   carrying the parsed error message.
 */
function formatHarvestPaa(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
  const d = parsed.data;
  const flat = d.flat ?? [];
  const organic = d.organicResults ?? [];
  const entityIds = d.entityIds;
  const aiOvw = d.aiOverview;
  const diagnostics = d.diagnostics;
  const durationMs = d.stats?.durationMs;

  const paaRows = flat.map(
    (r, i) => `| ${i + 1} | ${cell(r.question)} | ${cell(truncate(r.answer, 120))} | ${cell(r.source_title || r.source_site || "")} |`
  ).join("\n");
  const paaTable = flat.length
    ? `## People Also Ask (${flat.length} questions)\n| # | Question | Answer | Source |\n|---|----------|--------|--------|\n${paaRows}`
    : "## People Also Ask\n*Google did not return a People Also Ask block for this query/location. SERP data was extracted successfully when available.*";

  const serpRows = organic.map(
    (r) => `| ${r.position} | ${cell(r.title)} | [${cell(r.domain)}](${r.url}) | ${cell(truncate(r.snippet, 100))} |`
  ).join("\n");
  const serpTable = organic.length
    ? `\n## Organic Results (${organic.length})\n| # | Title | URL | Snippet |\n|---|-------|-----|----------|\n${serpRows}`
    : "";

  const aiSection = aiOvw?.detected && aiOvw.text
    ? `\n## AI Overview\n> ${truncate(aiOvw.text, 600)}`
    : "";

  // The stats section is omitted entirely when no duration was reported.
  const statsLine = durationMs
    ? `\n## Stats\n- Status: ${diagnostics?.completionStatus ?? (flat.length ? "paa_found" : "no_paa")} \xB7 Questions: ${flat.length} \xB7 Duration: ${(durationMs / 1e3).toFixed(1)}s`
    : "";

  const tips = `\n---\n\u{1F4A1} **Tips**\n- Max questions: \`maxQuestions: 150\` (current: ${input.maxQuestions ?? 30})\n- Organic results only: use \`search_serp\`\n- Dig into a result: use \`extract_url\` on any organic URL`;

  const full = `# PAA Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}\n\n${paaTable}${serpTable}${entityIdsSection(entityIds)}${aiSection}${statsLine}${debugSection(diagnostics?.debug)}${tips}`;
  return oneBlock(full);
}
294
/**
 * Format a SERP search tool result as a markdown report.
 *
 * The report contains the organic results table (or a "None found" notice),
 * the local pack table, entity IDs, the AI overview, optional debug
 * diagnostics, and usage tips. The finished report is passed through
 * `oneBlock`.
 *
 * The previous version also built `topO` and a `summary` string that were
 * never used; that dead work has been removed.
 *
 * @param {{content: Array<object>, isError?: boolean}} raw - Tool result to format.
 * @param {{query: string, location?: string}} input - Original tool arguments.
 * @returns {object} Text result from `oneBlock`, or an `isError` result
 *   carrying the parsed error message.
 */
function formatSearchSerp(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
  const d = parsed.data;
  const organic = d.organicResults ?? [];
  const localPack = d.localPack ?? [];
  const entityIds = d.entityIds;
  const aiOvw = d.aiOverview;
  const diagnostics = d.diagnostics;

  const serpRows = organic.map(
    (r) => `| ${r.position} | ${cell(r.title)} | [${cell(r.domain)}](${r.url}) | ${cell(truncate(r.snippet, 100))} |`
  ).join("\n");
  const serpTable = organic.length
    ? `## Organic Results (${organic.length})\n| # | Title | URL | Snippet |\n|---|-------|-----|----------|\n${serpRows}`
    : "## Organic Results\n*None found*";

  const localRows = localPack.map(
    (b) => `| ${b.position} | ${cell(b.name)} | ${b.rating ?? "\u2014"} (${b.reviewCount ?? "0"}) | ${b.websiteUrl ? `[link](${b.websiteUrl})` : "\u2014"} |`
  ).join("\n");
  const localSection = localPack.length
    ? `\n## Local Pack (${localPack.length})\n| # | Name | Rating | Website |\n|---|------|--------|---------|\n${localRows}`
    : "";

  const aiSection = aiOvw?.detected && aiOvw.text
    ? `\n## AI Overview\n> ${truncate(aiOvw.text, 600)}`
    : "";

  const tips = `\n---\n\u{1F4A1} **Tips**\n- Get PAA questions: use \`harvest_paa\` for this query\n- Scrape any result: use \`extract_url\`\n- Business entity IDs (CID/GCID/KG MID) shown above if found`;

  const full = `# SERP Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}\n\n${serpTable}${localSection}${entityIdsSection(entityIds)}${aiSection}${debugSection(diagnostics?.debug)}${tips}`;
  return oneBlock(full);
}
344
/**
 * Format a single-URL extraction result as a markdown report, optionally
 * attaching the captured screenshot as an inline image block.
 *
 * Sections are emitted only when their data is present: heading structure
 * (h1/h2), entity/schema facts, branding, page content (first 3000
 * characters), screenshot metadata, and media download stats.
 *
 * Change from the previous version: a `media` object without an `assets`
 * array is now reported as 0 downloaded instead of throwing, matching how
 * every other optional collection in this formatter is defaulted.
 *
 * @param {{content: Array<object>, isError?: boolean}} raw - Tool result to format.
 * @param {{url: string}} input - Original tool arguments; `url` is used when
 *   the result does not carry its own.
 * @returns {object} Text result (plus an image block when the screenshot
 *   file could be read), or an `isError` result with the parsed error message.
 */
function formatExtractUrl(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
  const d = parsed.data;
  const url = d.url ?? input.url;
  const title = d.title ?? "Untitled";
  const headings = d.headings ?? [];
  const kpo = d.kpo;
  const bodyMd = d.bodyMarkdown ?? "";
  const schema = d.schema;
  const screenshotMeta = d.screenshot;
  const branding = d.branding;
  const media = d.media;

  const h1Lines = headings.filter((h) => h.level === 1).map((h) => `- ${h.text}`).join("\n");
  const h2Lines = headings.filter((h) => h.level === 2).map((h) => ` - ${h.text}`).join("\n");
  const headingSection = h1Lines || h2Lines
    ? `\n## Heading Structure\n${[h1Lines, h2Lines].filter(Boolean).join("\n")}`
    : "";

  const kpoSection = kpo ? [
    `\n## Entity / Schema`,
    kpo.entityName ? `- **Entity:** ${kpo.entityName}` : "",
    kpo.type?.length ? `- **@type:** ${kpo.type.join(", ")}` : "",
    kpo.napScore !== void 0 ? `- **NAP Score:** ${kpo.napScore}/5` : "",
    kpo.address ? `- **Address:** ${kpo.address}` : "",
    kpo.phone ? `- **Phone:** ${kpo.phone}` : "",
    kpo.email ? `- **Email:** ${kpo.email}` : "",
    kpo.faqCount ? `- **FAQ items:** ${kpo.faqCount}` : "",
    kpo.sameAs?.length ? `- **sameAs:** ${kpo.sameAs.slice(0, 5).join(", ")}` : "",
    kpo.missingFields?.length ? `\n**Missing schema fields:** ${kpo.missingFields.slice(0, 5).join(", ")}` : ""
  ].filter(Boolean).join("\n") : "";

  const bodySection = bodyMd
    ? `\n## Page Content\n${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}`
    : "";

  const screenshotSection = screenshotMeta
    ? `\n## Screenshot\n- **File:** ${screenshotMeta.savedPath}\n- **Size:** ${(screenshotMeta.sizeBytes / 1024).toFixed(1)} KB\n- **Device:** ${screenshotMeta.device}`
    : "";

  const brandingSection = branding ? [
    `\n## Branding`,
    branding.colorScheme ? `- **Color scheme:** ${branding.colorScheme}` : "",
    `- **Colors:**${Object.entries(branding.colors ?? {}).filter(([, v]) => v).map(([k, v]) => ` ${k}=${v}`).join(",") || " (none extracted)"}`,
    `- **Fonts:**${Object.entries(branding.fonts ?? {}).filter(([, v]) => v).map(([k, v]) => ` ${k}=${v}`).join(",") || " (none extracted)"}`,
    branding.assets?.logo ? `- **Logo:** ${branding.assets.logo}` : "",
    branding.assets?.favicon ? `- **Favicon:** ${branding.assets.favicon}` : ""
  ].filter(Boolean).join("\n") : "";

  // `assets` may be absent on a media object; count it as zero downloads.
  const downloadedCount = media ? (media.assets ?? []).length : 0;
  const mediaSection = media ? [
    `\n## Media Assets`,
    `- **Found:** ${media.totalFound} total, ${media.filteredCount} filtered (ads/noise), ${downloadedCount} downloaded`,
    media.outputDir ? `- **Saved to:** ${media.outputDir}` : ""
  ].filter(Boolean).join("\n") : "";

  const schemaCount = Array.isArray(schema) ? schema.length : 0;
  const tips = `\n---\n\u{1F4A1} **Tips**\n- Crawl entire site: use \`extract_site\`\n- Map all URLs: use \`map_site_urls\`\n- ${schemaCount} JSON-LD schema block(s) detected`;

  const full = `# URL Extract: ${url}\n**${title}**\n${headingSection}${kpoSection}${brandingSection}${bodySection}${screenshotSection}${mediaSection}${tips}`;
  const textResult = oneBlock(full);

  if (screenshotMeta?.savedPath) {
    try {
      const imgBuf = readFileSync(screenshotMeta.savedPath);
      return {
        content: [
          ...textResult.content,
          { type: "image", data: imgBuf.toString("base64"), mimeType: "image/png" }
        ]
      };
    } catch {
      // Deliberate best-effort: if the screenshot file cannot be read, fall
      // through and return the text report without the inline image.
    }
  }
  return textResult;
}
424
/**
 * Format a site URL-map tool result as a markdown report.
 *
 * The report contains a header line with totals, a 2xx/3xx/4xx+ summary, an
 * inventory table limited to the first 200 URLs, a broken-URL list when any
 * exist, and usage tips. The finished report is passed through `oneBlock`.
 *
 * The previous version also built a `summary` string that was never used;
 * that dead work has been removed.
 *
 * @param {{content: Array<object>, isError?: boolean}} raw - Tool result to format.
 * @param {{url: string}} input - Original tool arguments.
 * @returns {object} Text result from `oneBlock`, or an `isError` result
 *   carrying the parsed error message.
 */
function formatMapSiteUrls(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
  const d = parsed.data;
  const urls = d.urls ?? [];

  // URLs without a status count toward none of the three buckets.
  const ok = urls.filter((u) => (u.status ?? 0) >= 200 && (u.status ?? 0) < 300);
  const broken = urls.filter((u) => u.status !== null && u.status >= 400);
  const redirects = urls.filter((u) => u.status !== null && u.status >= 300 && u.status < 400);

  const urlRows = urls.slice(0, 200).map((u, i) => `| ${i + 1} | ${u.url} | ${u.status ?? "\u2014"} |`).join("\n");

  const full = [
    `# URL Map: ${input.url}`,
    `**${d.totalFound} URLs** \xB7 ${(d.durationMs / 1e3).toFixed(1)}s${d.truncated ? " \xB7 *truncated*" : ""}`,
    `\n## Summary\n- \u2705 2xx: ${ok.length}\n- \u{1F500} 3xx: ${redirects.length}\n- \u274C 4xx+: ${broken.length}`,
    `\n## URL Inventory\n| # | URL | Status |\n|---|-----|--------|\n${urlRows}`,
    broken.length ? `\n## Broken URLs\n${broken.map((u) => `- ${u.url} (${u.status})`).join("\n")}` : "",
    `\n---\n\u{1F4A1} **Tips**\n- Extract content from all pages: use \`extract_site\`\n- Scrape a single page: use \`extract_url\``
  ].filter(Boolean).join("\n");
  return oneBlock(full);
}
465
/**
 * Render the raw `extract_site` result as a Markdown table of crawled pages.
 *
 * @param {*} raw - Raw executor result; passed to `parseData`, which yields
 *   either an object with `error` or an object with `data`.
 * @param {{ url: string }} input - Original tool input; `url` is echoed in the title.
 * @returns {object} `{ content, isError: true }` when parsing failed, otherwise
 *   the value `oneBlock` returns for the rendered report.
 */
function formatExtractSite(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };

  const d = parsed.data;
  const pages = d.pages ?? [];

  // Schema column: entity type(s) when known, else the number of schema blocks, else a dash.
  const describeSchema = (p) => {
    const rawType = p.kpo?.type;
    // `type` may arrive as a single string or as an array; normalise before joining.
    const types = (Array.isArray(rawType) ? rawType : [rawType]).filter(Boolean);
    if (types.length) return types.join(", ");
    if (Array.isArray(p.schema) && p.schema.length) return `${p.schema.length} block(s)`;
    return "\u2014";
  };

  const pageRows = pages
    .map((p, i) => `| ${i + 1} | ${cell(p.title ?? "Untitled")} | ${p.url} | ${describeSchema(p)} |`)
    .join("\n");

  const full = [
    `# Site Extract: ${input.url}`,
    `**${pages.length} pages** \xB7 ${((d.durationMs ?? 0) / 1e3).toFixed(1)}s`,
    [
      "",
      "## Pages",
      "| # | Title | URL | Schema |",
      "|---|-------|-----|--------|",
      pageRows,
    ].join("\n"),
    [
      "",
      "---",
      "\u{1F4A1} **Tips**",
      "- Map URLs first: use `map_site_urls`",
      "- Inspect a single page: use `extract_url`",
    ].join("\n"),
  ].join("\n");

  return oneBlock(full);
}
497
/**
 * Render the raw `youtube_harvest` result as a Markdown video table.
 *
 * @param {*} raw - Raw executor result; passed to `parseData`, which yields
 *   either an object with `error` or an object with `data`.
 * @param {{ mode?: string, channelHandle?: string, query?: string }} input -
 *   Original tool input; used to build the report title.
 * @returns {object} `{ content, isError: true }` when parsing failed, otherwise
 *   the value `oneBlock` returns for the rendered report.
 */
function formatYoutubeHarvest(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };

  const d = parsed.data;
  const videos = d.videos ?? [];
  // Channel mode is titled by handle; any other mode by the quoted search query.
  const label = input.mode === "channel"
    ? input.channelHandle ?? "channel"
    : `"${input.query ?? ""}"`;

  const videoRows = videos
    .map((v, i) => `| ${i + 1} | ${cell(truncate(v.title, 70))} | ${cell(v.channelName)} | ${v.views ?? "\u2014"} | ${v.duration ?? "\u2014"} | \`${v.videoId}\` |`)
    .join("\n");

  const channelSection = d.channelMeta
    ? [
        "",
        "## Channel",
        `- **Name:** ${d.channelMeta.title ?? "\u2014"}`,
        `- **Subscribers:** ${d.channelMeta.subscriberCount ?? "\u2014"}`,
      ].join("\n")
    : "";

  // `stats` may be absent from the payload; report 0.0s rather than throwing.
  const seconds = ((d.stats?.durationMs ?? 0) / 1e3).toFixed(1);

  const full = [
    `# YouTube Harvest: ${label}`,
    `**${videos.length} videos** \xB7 ${seconds}s`,
    channelSection,
    [
      "",
      "## Videos",
      "| # | Title | Channel | Views | Duration | Video ID |",
      "|---|-------|---------|-------|----------|----------|",
      videoRows,
    ].join("\n"),
    [
      "",
      "---",
      "\u{1F4A1} **Tips**",
      "- Transcribe a video: use `youtube_transcribe` with the `videoId` above",
      "- Switch mode: `mode: \"channel\"` with `channelHandle` or `mode: \"search\"` with `query`",
    ].join("\n"),
  ].filter(Boolean).join("\n");

  return oneBlock(full);
}
536
/**
 * Render the raw `youtube_transcribe` result as a Markdown transcript report.
 *
 * @param {*} raw - Raw executor result; passed to `parseData`, which yields
 *   either an object with `error` or an object with `data`.
 * @param {{ videoId: string }} input - Original tool input; `videoId` is echoed in the title.
 * @returns {object} `{ content, isError: true }` when parsing failed, otherwise
 *   the value `oneBlock` returns for the rendered report.
 */
function formatYoutubeTranscribe(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };

  const d = parsed.data;
  const text = d.text ?? "";
  const chunks = d.chunks ?? [];
  const durSec = d.durationMs ? (d.durationMs / 1e3).toFixed(0) : "\u2014";

  // Count whitespace-separated tokens so an empty transcript is 0 words and
  // newlines or repeated spaces do not inflate the count.
  const trimmed = text.trim();
  const wordCount = trimmed ? trimmed.split(/\s+/).length : 0;

  // Only the first 50 chunks are tabulated.
  const chunkRows = chunks.slice(0, 50).map((c) => {
    // A chunk without a usable start time is shown at 00:00.
    const start = c.timestamp?.[0];
    const sec = Number.isFinite(start) ? Math.max(0, Math.floor(start)) : 0;
    const mm = String(Math.floor(sec / 60)).padStart(2, "0");
    const ss = String(sec % 60).padStart(2, "0");
    return `| ${mm}:${ss} | ${cell(truncate(c.text, 120))} |`;
  }).join("\n");

  const full = [
    `# YouTube Transcript: \`${input.videoId}\``,
    `**Duration:** ${durSec}s \xB7 **${wordCount} words**`,
    ["", "## Full Transcript", text].join("\n"),
    chunks.length
      ? ["", "## Timestamped Chunks", "| Time | Text |", "|------|------|", chunkRows].join("\n")
      : "",
    [
      "",
      "---",
      "\u{1F4A1} Harvest more from this channel: use `youtube_harvest` with `mode: \"channel\"`",
    ].join("\n"),
  ].filter(Boolean).join("\n");

  return oneBlock(full);
}
574
/**
 * Render the raw `facebook_page_intel` result as a Markdown list of ads.
 *
 * @param {*} raw - Raw executor result; passed to `parseData`, which yields
 *   either an object with `error` or an object with `data`.
 * @param {{ query?: string, pageId?: string, libraryId?: string }} input -
 *   Original tool input; used as a fallback for the advertiser name.
 * @returns {object} `{ content, isError: true }` when parsing failed, otherwise
 *   the value `oneBlock` returns for the rendered report.
 */
function formatFacebookPageIntel(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };

  const d = parsed.data;
  const advertiser = d.advertiserName ?? input.query ?? input.pageId ?? input.libraryId ?? "Advertiser";
  const ads = d.ads ?? [];

  // Fill each counter individually so a partial summary never prints "undefined";
  // the total falls back to the number of ads actually returned.
  const summary = d.summary ?? {};
  const s = {
    totalAds: summary.totalAds ?? ads.length,
    activeCount: summary.activeCount ?? 0,
    videoCount: summary.videoCount ?? 0,
    imageCount: summary.imageCount ?? 0,
  };

  const adBlocks = ads.map((ad, i) => {
    const idPart = ad.libraryId ? ` \xB7 \`${ad.libraryId}\`` : "";
    return [
      `### Ad ${i + 1}${idPart} \u2014 ${ad.status ?? "\u2014"} \xB7 ${ad.creativeType ?? "\u2014"} \xB7 ${ad.startDate ?? "\u2014"}`,
      ad.headline ? `**Headline:** ${ad.headline}` : "",
      ad.primaryText ? `**Copy:** ${truncate(ad.primaryText, 200)}` : "",
      ad.cta ? `**CTA:** ${ad.cta}` : "",
      ad.videoUrl ? `**Video URL:** \`${ad.videoUrl}\`` : "",
      ad.variations ? `**Variations:** ${ad.variations}` : "",
    ].filter(Boolean).join("\n");
  }).join("\n\n---\n\n");

  const full = [
    `# Facebook Ad Intel: ${advertiser}`,
    `**${s.totalAds} ads** \xB7 ${s.activeCount} active \xB7 ${s.videoCount} video \xB7 ${s.imageCount} image`,
    `\n${adBlocks}`,
    [
      "",
      "---",
      "\u{1F4A1} **Tips**",
      "- Transcribe video ads: use `facebook_ad_transcribe` with the `videoUrl` above",
      "- Find other advertisers: use `facebook_ad_search`",
    ].join("\n"),
  ].filter(Boolean).join("\n");

  return oneBlock(full);
}
614
/**
 * Render the raw `facebook_ad_search` result as a Markdown advertiser table.
 *
 * @param {*} raw - Raw executor result; passed to `parseData`, which yields
 *   either an object with `error` or an object with `data`.
 * @param {{ query: string }} input - Original tool input; `query` is echoed in the title.
 * @returns {object} `{ content, isError: true }` when parsing failed, otherwise
 *   the value `oneBlock` returns for the rendered report.
 */
function formatFacebookAdSearch(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };

  const d = parsed.data;
  // The advertiser list is read from `results`, falling back to `advertisers`.
  const advertisers = d.results ?? d.advertisers ?? [];

  const rows = advertisers
    .map((a, i) => `| ${i + 1} | ${cell(a.name)} | ${a.adCount ?? "\u2014"} | \`${a.libraryId ?? "\u2014"}\` |`)
    .join("\n");

  // With nothing to list, say so instead of emitting a header-only table.
  const advertiserSection = advertisers.length
    ? [
        "",
        "## Advertisers",
        "| # | Name | Ad Count | Library ID |",
        "|---|------|----------|------------|",
        rows,
      ].join("\n")
    : ["", "## Advertisers", "*No advertisers matched this query.*"].join("\n");

  const full = [
    `# Facebook Ad Library Search: "${input.query}"`,
    `**${advertisers.length} advertisers found**`,
    advertiserSection,
    [
      "",
      "---",
      "\u{1F4A1} **Tips**",
      "- Scan all ads: use `facebook_page_intel` with `libraryId`",
      "- Or pass the advertiser name as `query` in `facebook_page_intel`",
    ].join("\n"),
  ].join("\n");

  return oneBlock(full);
}
646
/**
 * Render the raw `credits_info` result as a Markdown credits report: balance,
 * optional matched cost, cost table and recent ledger.
 *
 * @param {*} raw - Raw executor result; passed to `parseData`, which yields
 *   either an object with `error` or an object with `data`.
 * @param {{ item?: string }} input - Original tool input; `item` is quoted in
 *   the "no match" note when no matched cost came back.
 * @returns {object} `{ content, isError: true }` when parsing failed, otherwise
 *   the value `oneBlock` returns for the rendered report.
 */
function formatCreditsInfo(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };

  const d = parsed.data;
  const balance = d.balance_credits;
  const costs = d.costs ?? [];
  const matched = d.matched_cost;
  const ledger = d.ledger ?? [];

  const costRows = costs.map((c) => {
    const notes = c.notes ? ` ${c.notes}` : "";
    return `| ${c.label} | ${c.credits} | ${c.unit}${notes} |`;
  }).join("\n");

  const ledgerRows = ledger.map((row) => {
    // `amount_mc` is divided by 1000 for display; a missing or non-numeric
    // amount is shown as a dash rather than "NaN".
    const amount = Number(row.amount_mc);
    const credits = Number.isFinite(amount) ? amount / 1e3 : "\u2014";
    return `| ${row.created_at} | ${row.operation} | ${credits} | ${row.description ?? ""} |`;
  }).join("\n");

  let matchedSection = "";
  if (matched) {
    const notes = matched.notes ? `\n\n${matched.notes}` : "";
    matchedSection = `\n## Matched Cost\n**${matched.label}:** ${matched.credits} credits ${matched.unit}${notes}`;
  } else if (input.item) {
    matchedSection = `\n## Matched Cost\nNo exact cost match found for "${input.item}". See the full cost table below.`;
  }

  const full = [
    "# Credits",
    `**Balance:** ${balance ?? "unknown"} credits`,
    matchedSection,
    costs.length
      ? ["", "## Cost Table", "| Item | Credits | Unit |", "|------|---------|------|", costRows].join("\n")
      : "",
    ledger.length
      ? [
          "",
          "## Recent Ledger",
          "| Date | Operation | Credits | Description |",
          "|------|-----------|---------|-------------|",
          ledgerRows,
        ].join("\n")
      : "",
  ].filter(Boolean).join("\n");

  return oneBlock(full);
}
693
/**
 * Render the raw `maps_place_intel` result as a Markdown business profile:
 * contact details, hours, rating distribution, review topics, "about"
 * attributes, entity IDs and (depending on `reviewsStatus`) review cards.
 *
 * @param {*} raw - Raw executor result; passed to `parseData`, which yields
 *   either an object with `error` or an object with `data`.
 * @param {{ businessName: string }} input - Original tool input; `businessName`
 *   is the title fallback when the payload has no `name`.
 * @returns {object} `{ content, isError: true }` when parsing failed, otherwise
 *   the value `oneBlock` returns for the rendered report.
 */
function formatMapsPlaceIntel(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };

  const d = parsed.data;
  const name = d.name ?? input.businessName;
  const {
    rating, reviewCount, category, address, website, hoursSummary, plusCode,
    bookingUrl, kgmid, cidDecimal, cidUrl, lat, lng, durationMs,
  } = d;
  const phone = d.phoneDisplay;
  const histogram = d.reviewHistogram ?? [];
  const topics = d.reviewTopics ?? [];
  const about = d.aboutAttributes ?? [];
  const reviews = d.reviews ?? [];
  const reviewsStatus = d.reviewsStatus ?? "not_requested";
  const hoursTable = d.hoursTable ?? [];

  // Five-slot star bar. The count is clamped to 0..5 because String#repeat
  // throws a RangeError on a negative count (e.g. 5 - 7). A non-numeric count
  // yields an empty bar.
  const starBar = (count) => {
    if (!Number.isFinite(count)) return "";
    const filled = Math.min(5, Math.max(0, Math.trunc(count)));
    return "\u2605".repeat(filled) + "\u2606".repeat(5 - filled);
  };

  const ratingLine = [rating, reviewCount ? `(${reviewCount} reviews)` : null].filter(Boolean).join(" ");

  const basicLines = [
    address ? `- **Address:** ${address}` : null,
    phone ? `- **Phone:** ${phone}` : null,
    website ? `- **Website:** ${website}` : null,
    hoursSummary ? `- **Hours:** ${hoursSummary}` : null,
    plusCode ? `- **Plus Code:** ${plusCode}` : null,
    bookingUrl ? `- **Book:** ${bookingUrl}` : null,
  ].filter(Boolean).join("\n");

  const hoursSection = hoursTable.length
    ? [
        "",
        "## Hours",
        "| Day | Hours |",
        "|-----|-------|",
        hoursTable.map((r) => `| ${r.day} | ${r.hours} |`).join("\n"),
      ].join("\n")
    : "";

  const histSection = histogram.length
    ? [
        "",
        "## Rating Distribution",
        "| Stars | Count |",
        "|-------|-------|",
        histogram.map((r) => `| ${starBar(r.stars)} | ${r.count} |`).join("\n"),
      ].join("\n")
    : "";

  const topicsSection = topics.length
    ? ["", "## Review Topics", topics.map((t) => `- **${t.label}:** ${t.count} mentions`).join("\n")].join("\n")
    : "";

  // Group attributes by section in a Map: section names come from the payload,
  // and a plain object would break on names like "constructor".
  const aboutBySection = new Map();
  for (const a of about) {
    if (!aboutBySection.has(a.section)) aboutBySection.set(a.section, []);
    aboutBySection.get(a.section).push(a.attribute);
  }
  const aboutSection = aboutBySection.size
    ? [
        "",
        "## About",
        [...aboutBySection]
          .map(([section, attrs]) => `**${section}**\n${attrs.map((attr) => `- ${attr}`).join("\n")}`)
          .join("\n\n"),
      ].join("\n")
    : "";

  const entitySection = [
    kgmid ? `- **KGMID:** \`${kgmid}\`` : null,
    cidDecimal ? `- **CID:** \`${cidDecimal}\`` : null,
    cidUrl ? `- **Maps CID URL:** ${cidUrl}` : null,
    lat != null && lng != null ? `- **Coordinates:** ${lat}, ${lng}` : null,
  ].filter(Boolean).join("\n");

  const renderReviews = () => {
    if (reviewsStatus === "not_requested") return "";
    if (reviewsStatus === "unavailable") return "\n## Reviews\n> Reviews could not be retrieved this run \u2014 retry with `includeReviews: true`.";
    if (reviewsStatus === "none_exist") return "\n## Reviews\n*This business has no reviews on Google Maps.*";
    if (reviews.length === 0) return "\n## Reviews\n*0 reviews collected.*";
    const cards = reviews.map((r, i) => {
      // Explicit radix; parseInt reads a leading number out of the stars value.
      const stars = starBar(Number.parseInt(r.stars ?? "0", 10));
      return `### ${i + 1}. ${r.author ?? "Anonymous"} \u2014 ${stars}\n*${r.date ?? ""}*\n\n${r.text ?? ""}`;
    }).join("\n\n");
    return `\n## Reviews (${reviews.length})\n${cards}`;
  };

  const full = [
    `# ${name}`,
    category ? `*${category}*` : null,
    ratingLine ? `\n**Rating:** ${ratingLine}` : null,
    basicLines ? `\n${basicLines}` : null,
    hoursSection,
    histSection,
    topicsSection,
    aboutSection,
    entitySection ? `\n## Entity IDs\n${entitySection}` : null,
    renderReviews(),
    durationMs != null ? `\n---\n*Extracted in ${(durationMs / 1e3).toFixed(1)}s*` : null,
  ].filter(Boolean).join("\n");

  return oneBlock(full);
}
806
/**
 * Render the raw `facebook_ad_transcribe` result as a Markdown transcript report.
 *
 * @param {*} raw - Raw executor result; passed to `parseData`, which yields
 *   either an object with `error` or an object with `data`.
 * @param {object} input - Original tool input (not used in the output).
 * @returns {object} `{ content, isError: true }` when parsing failed, otherwise
 *   the value `oneBlock` returns for the rendered report.
 */
function formatFacebookAdTranscribe(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };

  const d = parsed.data;
  const text = d.text ?? "";
  const chunks = d.chunks ?? [];
  const durSec = d.durationMs ? (d.durationMs / 1e3).toFixed(0) : "\u2014";

  // Count whitespace-separated tokens so an empty transcript is 0 words and
  // newlines or repeated spaces do not inflate the count.
  const trimmed = text.trim();
  const wordCount = trimmed ? trimmed.split(/\s+/).length : 0;

  // Only the first 50 chunks are tabulated.
  const chunkRows = chunks.slice(0, 50).map((c) => {
    // A chunk without a usable start time is shown at 00:00.
    const start = c.timestamp?.[0];
    const sec = Number.isFinite(start) ? Math.max(0, Math.floor(start)) : 0;
    const mm = String(Math.floor(sec / 60)).padStart(2, "0");
    const ss = String(sec % 60).padStart(2, "0");
    return `| ${mm}:${ss} | ${cell(truncate(c.text, 120))} |`;
  }).join("\n");

  const full = [
    "# Facebook Ad Transcript",
    `**Duration:** ${durSec}s \xB7 **${wordCount} words**`,
    ["", "## Full Transcript", text].join("\n"),
    chunks.length
      ? ["", "## Timestamped Chunks", "| Time | Text |", "|------|------|", chunkRows].join("\n")
      : "",
    ["", "---", "\u{1F4A1} Get more ads from this advertiser: use `facebook_page_intel`"].join("\n"),
  ].filter(Boolean).join("\n");

  return oneBlock(full);
}
844
+
845
+ // src/mcp/paa-mcp-server.ts
846
/**
 * Build the "mcp-scraper" MCP server and register every scraper tool on it.
 *
 * Each tool's handler forwards the tool input to the matching `executor`
 * method and passes the raw result, together with the input, to that tool's
 * formatter.
 *
 * @param {object} executor - Object exposing one async method per tool
 *   (`harvestPaa`, `searchSerp`, `extractUrl`, ...).
 * @returns {McpServer} The server with all tools registered.
 */
function buildPaaExtractorMcpServer(executor) {
  const server = new McpServer({ name: "mcp-scraper", version: "1.0.0" });

  // One entry per tool, in registration order.
  const tools = [
    {
      tool: "harvest_paa",
      description: 'Best default tool for Google search research. Extracts People Also Ask questions plus answers/source URLs, organic SERP, local pack when present, entity IDs (CID/GCID/KG MID), and AI Overview. Infer the user language: split topic from location (e.g. "best hvac company in Denver CO" => query "best hvac company", location "Denver, CO", gl "us", hl "en"). Use maxQuestions 30 normally, 100-150 for "full", "deep", "all", or comprehensive research. Credits are charged by extracted question; unused request hold is refunded. Saves a full Markdown report locally.',
      inputSchema: HarvestPaaInputSchema,
      run: (input) => executor.harvestPaa(input),
      format: (result, input) => formatHarvestPaa(result, input),
    },
    {
      tool: "search_serp",
      description: "Fast Google SERP lookup without PAA expansion. Use when the user asks for rankings, organic results, local pack, quick SERP, or positions. Split topic from location and infer gl/hl from the user request. Saves a full Markdown report locally.",
      inputSchema: SearchSerpInputSchema,
      run: (input) => executor.searchSerp(input),
      format: (result, input) => formatSearchSerp(result, input),
    },
    {
      tool: "extract_url",
      description: "Extract structured data from one public URL: page content as Markdown, heading structure, JSON-LD schema, entity details, NAP score, metadata, and missing schema fields. Use when the user provides a single URL or asks to inspect/scrape one page. Saves a full Markdown report locally.",
      inputSchema: ExtractUrlInputSchema,
      run: (input) => executor.extractUrl(input),
      format: (result, input) => formatExtractUrl(result, input),
    },
    {
      tool: "map_site_urls",
      description: "Map/crawl a public website to build a URL inventory with HTTP status codes, broken links, redirects, and site scope. Use before extract_site for audits or when the user asks for a sitemap/URL inventory. Saves a full Markdown report locally.",
      inputSchema: MapSiteUrlsInputSchema,
      run: (input) => executor.mapSiteUrls(input),
      format: (result, input) => formatMapSiteUrls(result, input),
    },
    {
      tool: "extract_site",
      description: "Run multi-page extraction across a public website. Returns per-page titles, H1s, metadata, headings, schema/entity data, canonical URLs, and content. Use for website audits, competitor audits, and full-site extraction. Saves a full Markdown report locally.",
      inputSchema: ExtractSiteInputSchema,
      run: (input) => executor.extractSite(input),
      format: (result, input) => formatExtractSite(result, input),
    },
    {
      tool: "youtube_harvest",
      description: 'Harvest YouTube video metadata by search query or channel handle/ID/URL. Use mode "search" for keyword/topic requests and mode "channel" for @handles, channel IDs, or channel URLs. Returns titles, views, dates, durations, URLs, thumbnails, and videoIds for follow-up transcription. Saves a full Markdown report locally.',
      inputSchema: YoutubeHarvestInputSchema,
      run: (input) => executor.youtubeHarvest(input),
      format: (result, input) => formatYoutubeHarvest(result, input),
    },
    {
      tool: "youtube_transcribe",
      description: "Fetch and transcribe captions from a YouTube video. Returns full transcript, timestamped chunks, and word count. Pass a videoId from youtube_harvest results or infer it from a YouTube URL if the user provided one. Saves a full Markdown report locally.",
      inputSchema: YoutubeTranscribeInputSchema,
      run: (input) => executor.youtubeTranscribe(input),
      format: (result, input) => formatYoutubeTranscribe(result, input),
    },
    {
      tool: "facebook_page_intel",
      description: "Harvest ads from a Facebook advertiser. Returns ad copy, headlines, CTAs, creative type, status, landing URLs, and video URLs ready for transcription. Accepts pageId, libraryId, or a brand/advertiser name as query. Use after facebook_ad_search when possible. Saves a full Markdown report locally.",
      inputSchema: FacebookPageIntelInputSchema,
      run: (input) => executor.facebookPageIntel(input),
      format: (result, input) => formatFacebookPageIntel(result, input),
    },
    {
      tool: "facebook_ad_search",
      description: "Search Facebook Ad Library by brand, advertiser, competitor, niche, or keyword. Returns advertisers with ad counts and library IDs. Use to discover competitors, then pass libraryId to facebook_page_intel. Saves a full Markdown report locally.",
      inputSchema: FacebookAdSearchInputSchema,
      run: (input) => executor.facebookAdSearch(input),
      format: (result, input) => formatFacebookAdSearch(result, input),
    },
    {
      tool: "facebook_ad_transcribe",
      description: "Transcribe audio from a Facebook ad video. Returns full transcript and timestamped chunks. Use the videoUrl value from facebook_page_intel results.",
      inputSchema: FacebookAdTranscribeInputSchema,
      run: (input) => executor.facebookAdTranscribe(input),
      format: (result, input) => formatFacebookAdTranscribe(result, input),
    },
    {
      tool: "maps_place_intel",
      description: 'Extract Google Maps business intelligence for a named business: rating, review count, category, address, phone, website, hours, booking URL, review histogram, review topics, about attributes, entity IDs, and optional review cards. Split business name from location (e.g. "Elite Roofing Denver CO" => businessName "Elite Roofing", location "Denver, CO"). Pass includeReviews true when the user asks for reviews/customer pain. Saves a full Markdown report locally.',
      inputSchema: MapsPlaceIntelInputSchema,
      run: (input) => executor.mapsPlaceIntel(input),
      format: (result, input) => formatMapsPlaceIntel(result, input),
    },
    {
      tool: "credits_info",
      description: "Answer questions about MCP Scraper credits: current credit balance, what a specific tool/action costs, the full cost table, and optionally recent credit ledger entries. Does not expose payment methods or credit card information.",
      inputSchema: CreditsInfoInputSchema,
      run: (input) => executor.creditsInfo(input),
      format: (result, input) => formatCreditsInfo(result, input),
    },
  ];

  for (const { tool, description, inputSchema, run, format } of tools) {
    server.registerTool(
      tool,
      { description, inputSchema },
      async (input) => format(await run(input), input),
    );
  }

  return server;
}
898
+
899
+ // src/mcp/http-mcp-tool-executor.ts
900
/**
 * Tool executor that forwards each MCP tool call to the scraper HTTP API.
 *
 * Every method POSTs its input as JSON to a fixed path under `baseUrl`, with
 * the API key in the `x-api-key` header, and resolves to an MCP-style result:
 * `{ content: [{ type: "text", text }] }`, plus `isError: true` on failure.
 * Failures are returned, never thrown.
 */
var HttpMcpToolExecutor = class {
  baseUrl;
  apiKey;
  timeoutMs;
  serpIntelligenceTimeoutMs;

  /**
   * @param {string} baseUrl - API base URL; one trailing slash is stripped.
   * @param {string} apiKey - Value sent in the `x-api-key` header.
   */
  constructor(baseUrl, apiKey) {
    this.baseUrl = baseUrl.replace(/\/$/, "");
    this.apiKey = apiKey;

    // Use the configured value only when it is a finite number above zero.
    const positiveOr = (value, fallback) => {
      const parsed = Number(value);
      return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
    };
    const DEFAULT_TIMEOUT_MS = 11e4;
    this.timeoutMs = positiveOr(process.env.MCP_SCRAPER_HTTP_TIMEOUT_MS ?? DEFAULT_TIMEOUT_MS, DEFAULT_TIMEOUT_MS);
    // SERP-intelligence calls have their own override and otherwise share the general timeout.
    this.serpIntelligenceTimeoutMs = positiveOr(
      process.env.MCP_SCRAPER_SERP_INTELLIGENCE_HTTP_TIMEOUT_MS ?? this.timeoutMs,
      this.timeoutMs,
    );
  }

  /**
   * POST `body` as JSON to `path` and wrap the response as an MCP result.
   *
   * @param {string} path - Path appended to `baseUrl`.
   * @param {*} body - JSON-serialisable request payload.
   * @param {number} [timeoutMs=this.timeoutMs] - Abort the request after this many milliseconds.
   * @returns {Promise<{content: Array<{type: string, text: string}>, isError?: boolean}>}
   */
  async call(path, body, timeoutMs = this.timeoutMs) {
    const asText = (text) => [{ type: "text", text }];
    try {
      const res = await fetch(`${this.baseUrl}${path}`, {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "x-api-key": this.apiKey,
        },
        body: JSON.stringify(body),
        signal: AbortSignal.timeout(timeoutMs),
      });

      // Read the body as text first: a non-JSON body (e.g. an HTML error page)
      // would otherwise surface as a bare parse error with no HTTP status.
      const rawBody = await res.text();
      let data;
      try {
        data = JSON.parse(rawBody);
      } catch {
        return {
          content: asText(JSON.stringify({
            error: "mcp_invalid_response",
            error_type: "invalid_response",
            retryable: res.status >= 500,
            path,
            status: res.status,
            message: `MCP Scraper API returned a non-JSON response (HTTP ${res.status}).`,
            body: rawBody.slice(0, 500),
          })),
          isError: true,
        };
      }

      if (!res.ok) {
        return { content: asText(JSON.stringify(data)), isError: true };
      }
      return { content: asText(JSON.stringify(data)) };
    } catch (err) {
      // AbortSignal.timeout() aborts with a DOMException named "TimeoutError".
      if (err instanceof DOMException && err.name === "TimeoutError") {
        return {
          content: asText(JSON.stringify({
            error: "mcp_request_timeout",
            error_type: "timeout",
            retryable: true,
            path,
            timeoutMs,
            message: `MCP Scraper request exceeded ${Math.round(timeoutMs / 1e3)}s and was cancelled. Retry with fewer results or use the async API for deep harvests.`,
          })),
          isError: true,
        };
      }
      const msg = err instanceof Error ? err.message : String(err);
      return { content: asText(msg), isError: true };
    }
  }

  harvestPaa(input) {
    return this.call("/harvest/sync", input);
  }

  // Same endpoint as harvestPaa, with `serpOnly: true` added to the payload.
  searchSerp(input) {
    return this.call("/harvest/sync", { ...input, serpOnly: true });
  }

  extractUrl(input) {
    return this.call("/extract-url", input);
  }

  mapSiteUrls(input) {
    return this.call("/map-urls", input);
  }

  extractSite(input) {
    return this.call("/extract-site", input);
  }

  youtubeHarvest(input) {
    return this.call("/youtube/harvest", input);
  }

  youtubeTranscribe(input) {
    return this.call("/youtube/transcribe", input);
  }

  facebookPageIntel(input) {
    return this.call("/facebook/page-intel", input);
  }

  facebookAdSearch(input) {
    return this.call("/facebook/search", input);
  }

  facebookAdTranscribe(input) {
    return this.call("/facebook/transcribe", input);
  }

  mapsPlaceIntel(input) {
    return this.call("/maps/place", input);
  }

  creditsInfo(input) {
    return this.call("/billing/credits", input);
  }

  captureSerpSnapshot(input) {
    return this.call("/serp-intelligence/capture", input, this.serpIntelligenceTimeoutMs);
  }

  captureSerpPageSnapshots(input) {
    return this.call("/serp-intelligence/page-snapshots", input, this.serpIntelligenceTimeoutMs);
  }
};
993
+
994
+ export {
995
+ CaptureSerpSnapshotInputSchema,
996
+ CaptureSerpPageSnapshotsInputSchema,
997
+ buildPaaExtractorMcpServer,
998
+ HttpMcpToolExecutor
999
+ };
1000
+ //# sourceMappingURL=chunk-JQKZWEON.js.map