mcp-scraper 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -0
- package/dist/bin/api-server.cjs +15730 -7780
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +3 -3
- package/dist/bin/mcp-stdio-server.cjs +300 -110
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +1 -1
- package/dist/bin/paa-harvest.cjs +1537 -165
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +1 -1
- package/dist/{chunk-ZBP4RHNW.js → chunk-4743MZHT.js} +298 -106
- package/dist/chunk-4743MZHT.js.map +1 -0
- package/dist/{chunk-LXZDJJXR.js → chunk-D4CJBZBY.js} +426 -29
- package/dist/chunk-D4CJBZBY.js.map +1 -0
- package/dist/chunk-HERFK7W6.js +2781 -0
- package/dist/chunk-HERFK7W6.js.map +1 -0
- package/dist/chunk-Y74EXABN.js +295 -0
- package/dist/chunk-Y74EXABN.js.map +1 -0
- package/dist/{db-IOYMX64U.js → db-YWCNHBLH.js} +36 -4
- package/dist/index.cjs +1660 -237
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +169 -2
- package/dist/index.d.ts +169 -2
- package/dist/index.js +120 -69
- package/dist/index.js.map +1 -1
- package/dist/server-N7Q6H4OR.js +11612 -0
- package/dist/server-N7Q6H4OR.js.map +1 -0
- package/dist/{worker-3ECJHPRE.js → worker-D4D2YQTA.js} +44 -9
- package/dist/worker-D4D2YQTA.js.map +1 -0
- package/package.json +17 -5
- package/dist/chunk-4API3ZCT.js +0 -1387
- package/dist/chunk-4API3ZCT.js.map +0 -1
- package/dist/chunk-LXZDJJXR.js.map +0 -1
- package/dist/chunk-ZBP4RHNW.js.map +0 -1
- package/dist/server-63DR2HE5.js +0 -6062
- package/dist/server-63DR2HE5.js.map +0 -1
- package/dist/worker-3ECJHPRE.js.map +0 -1
- /package/dist/{db-IOYMX64U.js.map → db-YWCNHBLH.js.map} +0 -0
|
@@ -4,28 +4,38 @@ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
|
4
4
|
// src/mcp/mcp-tool-schemas.ts
|
|
5
5
|
import { z } from "zod";
|
|
6
6
|
var HarvestPaaInputSchema = {
|
|
7
|
-
query: z.string().min(1).describe("
|
|
8
|
-
location: z.string().optional().describe(
|
|
9
|
-
maxQuestions: z.number().int().min(1).max(
|
|
10
|
-
gl: z.string().length(2).default("us"),
|
|
11
|
-
hl: z.string().default("en")
|
|
7
|
+
query: z.string().min(1).describe('Core search topic only. If the user says "best hvac company in Denver CO", use query="best hvac company" and location="Denver, CO". Do not include the location in query when it can be separated.'),
|
|
8
|
+
location: z.string().optional().describe('City, region, or country for geo-targeted results, inferred from the user request when present, e.g. "Denver, CO", "Tokyo, Japan", "London, UK".'),
|
|
9
|
+
maxQuestions: z.number().int().min(1).max(150).default(30).describe("Number of PAA questions to extract. Default 30. Maximum 150. Use 10 for quick probes, 30 for normal research, 100-150 when the user asks for everything/full/deep research. Credits are charged by extracted question; unused request hold is refunded."),
|
|
10
|
+
gl: z.string().length(2).default("us").describe("Google country code inferred from location or user language. Examples: United States us, United Kingdom gb, Japan jp, Canada ca, Australia au."),
|
|
11
|
+
hl: z.string().default("en").describe("Google interface/content language inferred from the user request. Use en unless the user asks for another language or locale."),
|
|
12
|
+
device: z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use desktop by default; use mobile only when the user asks for mobile rankings."),
|
|
13
|
+
proxyMode: z.enum(["location", "configured", "none"]).default("location").describe("Proxy targeting mode. Use location by default so city/state searches create or reuse a matching residential proxy. Use configured for the static configured proxy. Use none only for direct-network debugging."),
|
|
14
|
+
proxyZip: z.string().regex(/^\d{5}$/).optional().describe("Optional US ZIP override for residential location proxy targeting. Use only when the user gives a specific ZIP or city-center proxy targeting needs to be forced."),
|
|
15
|
+
debug: z.boolean().default(false).describe("Include sanitized browser/session/location diagnostics in the response. Use true when debugging localization, CAPTCHA, or proxy behavior.")
|
|
12
16
|
};
|
|
13
17
|
var ExtractUrlInputSchema = {
|
|
14
|
-
url: z.string().url()
|
|
18
|
+
url: z.string().url().describe("Public http/https URL to extract. Use this when the user provides one specific page URL."),
|
|
19
|
+
screenshot: z.boolean().default(false).describe("Also capture a full-page screenshot of the URL. Saved to ~/Downloads/mcp-scraper/screenshots/ and returned inline. Use when the user asks to see or capture the page visually."),
|
|
20
|
+
screenshotDevice: z.enum(["desktop", "mobile"]).default("desktop").describe("Viewport for screenshot. desktop = 1440\xD7900. mobile = 390\xD7844. Default desktop."),
|
|
21
|
+
extractBranding: z.boolean().default(false).describe("Extract brand colors, fonts, logo, and favicon using a rendered browser session. Returns colorScheme (light/dark), colors (primary/accent/background/text/heading as hex), fonts (heading/body family names), and assets (logo URL, favicon URL). Use when the user asks about brand colors, site theme, or brand assets."),
|
|
22
|
+
downloadMedia: z.boolean().default(false).describe("Extract and download all page media (images, video, audio) to ~/Downloads/mcp-scraper/media/. Ad networks, tracking pixels, and noise URLs are filtered automatically. Use when the user asks to download or harvest assets from a page."),
|
|
23
|
+
mediaTypes: z.array(z.enum(["image", "video", "audio"])).default(["image", "video", "audio"]).describe("Which media types to download. Default all three."),
|
|
24
|
+
allowLocal: z.boolean().default(false).describe("Allow localhost and private-network URLs. For local development only.")
|
|
15
25
|
};
|
|
16
26
|
var MapSiteUrlsInputSchema = {
|
|
17
|
-
url: z.string().url(),
|
|
18
|
-
maxUrls: z.number().int().min(1).max(500).optional()
|
|
27
|
+
url: z.string().url().describe("Public website URL or domain to crawl for internal URLs. Use before extract_site when the user asks to audit/map/crawl a site."),
|
|
28
|
+
maxUrls: z.number().int().min(1).max(500).optional().describe("Maximum URLs to discover. Use 100 for normal maps, higher when the user asks for a full inventory.")
|
|
19
29
|
};
|
|
20
30
|
var ExtractSiteInputSchema = {
|
|
21
|
-
url: z.string().url(),
|
|
22
|
-
maxPages: z.number().int().min(1).max(50).optional()
|
|
31
|
+
url: z.string().url().describe("Public website URL or domain to extract across multiple pages. Use when the user asks for a site audit, website crawl, or full-site content/schema extraction."),
|
|
32
|
+
maxPages: z.number().int().min(1).max(50).optional().describe("Maximum pages to extract. Use 50 when the user asks for full results or a complete crawl within MCP limits.")
|
|
23
33
|
};
|
|
24
34
|
var YoutubeHarvestInputSchema = {
|
|
25
|
-
mode: z.enum(["search", "channel"]),
|
|
26
|
-
query: z.string().optional().describe("Required when mode is search"),
|
|
27
|
-
channelHandle: z.string().optional().describe("YouTube channel handle,
|
|
28
|
-
maxVideos: z.number().int().min(1).max(500).default(50)
|
|
35
|
+
mode: z.enum(["search", "channel"]).describe("Use search for topic/keyword requests. Use channel when the user provides @handle, channel ID, or channel URL."),
|
|
36
|
+
query: z.string().optional().describe("Required when mode is search. The YouTube search topic in the user\u2019s words."),
|
|
37
|
+
channelHandle: z.string().optional().describe("YouTube channel handle, channel ID, or URL. Examples: @mkbhd, UC..., https://youtube.com/@mkbhd."),
|
|
38
|
+
maxVideos: z.number().int().min(1).max(500).default(50).describe("Number of videos to return. Default 50. Increase when user asks for full channel/history.")
|
|
29
39
|
};
|
|
30
40
|
var YoutubeTranscribeInputSchema = {
|
|
31
41
|
videoId: z.string().min(1).describe("YouTube video ID, e.g. dQw4w9WgXcQ")
|
|
@@ -33,12 +43,12 @@ var YoutubeTranscribeInputSchema = {
|
|
|
33
43
|
var FacebookPageIntelInputSchema = {
|
|
34
44
|
pageId: z.string().optional(),
|
|
35
45
|
libraryId: z.string().optional(),
|
|
36
|
-
query: z.string().optional().describe("One of pageId, libraryId, or query is required"),
|
|
46
|
+
query: z.string().optional().describe("Advertiser or brand name when pageId/libraryId is not known. One of pageId, libraryId, or query is required."),
|
|
37
47
|
maxAds: z.number().int().min(1).max(200).default(50),
|
|
38
48
|
country: z.string().length(2).default("US")
|
|
39
49
|
};
|
|
40
50
|
var FacebookAdSearchInputSchema = {
|
|
41
|
-
query: z.string().min(1),
|
|
51
|
+
query: z.string().min(1).describe("Advertiser, brand, competitor, niche, or keyword to search in Facebook Ad Library."),
|
|
42
52
|
country: z.string().length(2).default("US"),
|
|
43
53
|
maxResults: z.number().int().min(1).max(20).default(10)
|
|
44
54
|
};
|
|
@@ -46,10 +56,10 @@ var FacebookAdTranscribeInputSchema = {
|
|
|
46
56
|
videoUrl: z.string().url().describe("Facebook CDN video URL from a facebook_page_intel result")
|
|
47
57
|
};
|
|
48
58
|
var MapsPlaceIntelInputSchema = {
|
|
49
|
-
businessName: z.string().min(1).describe(
|
|
50
|
-
location: z.string().min(1).describe('City
|
|
51
|
-
gl: z.string().length(2).default("us"),
|
|
52
|
-
hl: z.string().length(2).default("en"),
|
|
59
|
+
businessName: z.string().min(1).describe('Business name only. If user says "Elite Roofing Denver CO", use businessName="Elite Roofing" and location="Denver, CO".'),
|
|
60
|
+
location: z.string().min(1).describe('City/region/country where the business should be searched, e.g. "Denver, CO". Infer from the user request when possible.'),
|
|
61
|
+
gl: z.string().length(2).default("us").describe("Google country code inferred from location."),
|
|
62
|
+
hl: z.string().length(2).default("en").describe("Language inferred from user request."),
|
|
53
63
|
includeReviews: z.boolean().default(false).describe("Whether to fetch individual review cards"),
|
|
54
64
|
maxReviews: z.number().int().min(1).max(500).default(50).describe("Max review cards to return (requires includeReviews: true)")
|
|
55
65
|
};
|
|
@@ -58,26 +68,98 @@ var CreditsInfoInputSchema = {
|
|
|
58
68
|
includeLedger: z.boolean().default(false).describe("Whether to include recent credit ledger entries")
|
|
59
69
|
};
|
|
60
70
|
var SearchSerpInputSchema = {
|
|
61
|
-
query: z.string().min(1).describe("
|
|
62
|
-
location: z.string().optional().describe("
|
|
63
|
-
gl: z.string().length(2).default("us"),
|
|
64
|
-
hl: z.string().default("en"),
|
|
71
|
+
query: z.string().min(1).describe('Core search topic only. Separate location when possible. If user says "best dentist in Brooklyn NY serp", use query="best dentist" and location="Brooklyn, NY".'),
|
|
72
|
+
location: z.string().optional().describe("City, region, or country for geo-targeted results, inferred from user request when present."),
|
|
73
|
+
gl: z.string().length(2).default("us").describe("Google country code inferred from location or user language."),
|
|
74
|
+
hl: z.string().default("en").describe("Google interface/content language inferred from user request."),
|
|
75
|
+
device: z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use desktop by default; use mobile only when the user asks for mobile rankings."),
|
|
76
|
+
proxyMode: z.enum(["location", "configured", "none"]).default("location").describe("Proxy targeting mode. Use location by default so city/state searches create or reuse a matching residential proxy. Use configured for the static configured proxy. Use none only for direct-network debugging."),
|
|
77
|
+
proxyZip: z.string().regex(/^\d{5}$/).optional().describe("Optional US ZIP override for residential location proxy targeting. Use only when the user gives a specific ZIP or city-center proxy targeting needs to be forced."),
|
|
78
|
+
debug: z.boolean().default(false).describe("Include sanitized browser/session/location diagnostics in the response. Use true when debugging localization, CAPTCHA, or proxy behavior."),
|
|
65
79
|
pages: z.number().int().min(1).max(2).default(1).describe("Number of result pages to fetch (1\u20132)")
|
|
66
80
|
};
|
|
81
|
+
// Field validators for a SERP snapshot capture request, keyed by input name.
// Only `query` is mandatory (non-empty string). `location` and `proxyZip`
// (exactly five digits) are optional; every other field carries a default:
// gl "us", hl "en", device "desktop", proxyMode "location", pages 1 (max 2),
// debug false, includePageSnapshots false, pageSnapshotLimit 0 (max 10).
// NOTE(review): because pageSnapshotLimit defaults to 0, a request that sets
// only includePageSnapshots: true still carries a limit of 0 after defaults
// are applied — confirm the consumer handles that combination as intended.
var CaptureSerpSnapshotInputSchema = {
|
|
82
|
+
query: z.string().min(1).describe("Core search query to capture as a structured SERP Intelligence snapshot. Separate the place into location when the user gives a city, region, country, or ZIP."),
|
|
83
|
+
location: z.string().optional().describe("City, region, country, or service area used for localized Google results. MCP Scraper records location evidence; UULE alone is not proof of localization."),
|
|
84
|
+
gl: z.string().length(2).default("us").describe("Google country code inferred from the requested market, e.g. us, gb, ca, au."),
|
|
85
|
+
hl: z.string().default("en").describe("Google interface/content language inferred from the user request."),
|
|
86
|
+
device: z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use mobile only when the user asks for mobile rankings or mobile SERP evidence."),
|
|
87
|
+
proxyMode: z.enum(["location", "configured", "none"]).default("location").describe("Proxy behavior for capture. Use location for localized residential proxy targeting, configured for the static residential proxy, and none only for direct-network debugging."),
|
|
88
|
+
proxyZip: z.string().regex(/^\d{5}$/).optional().describe("Optional US ZIP override for residential location proxy targeting when a precise city-center or ZIP proxy is needed."),
|
|
89
|
+
pages: z.number().int().min(1).max(2).default(1).describe("Number of Google result pages to capture. Use 1 normally and 2 only when the user needs deeper ranking evidence."),
|
|
90
|
+
debug: z.boolean().default(false).describe("Include sanitized browser, proxy, and location diagnostics. Use true when debugging localization, CAPTCHA, proxy selection, or capture reliability."),
|
|
91
|
+
includePageSnapshots: z.boolean().default(false).describe("Also capture ranking-page snapshots for selected SERP URLs through the same product capture path."),
|
|
92
|
+
pageSnapshotLimit: z.number().int().min(0).max(10).default(0).describe("Maximum ranking-page snapshots to capture when includePageSnapshots is true. Use 0 when only SERP evidence is needed.")
|
|
93
|
+
};
|
|
94
|
+
// Field validators for a screenshot request: a required URL, a viewport
// profile ("desktop" or "mobile", default "desktop"), and an allowLocal flag
// that defaults to false.
// NOTE(review): the http/https and private-network restrictions mentioned in
// the descriptions are not expressed by these validators (only `.url()` is
// applied) — presumably the consumer enforces them; confirm.
var ScreenshotInputSchema = {
|
|
95
|
+
url: z.string().url().describe("URL to capture as a full-page screenshot. Use http or https. Pass allowLocal: true to capture localhost or private-network URLs during development."),
|
|
96
|
+
device: z.enum(["desktop", "mobile"]).default("desktop").describe("Viewport profile. desktop = 1440\xD7900. mobile = 390\xD7844. Use desktop by default; use mobile when the user asks for a mobile view."),
|
|
97
|
+
allowLocal: z.boolean().default(false).describe("Allow localhost and private-network URLs (127.x, 192.168.x, 10.x, etc.). For local development only \u2014 not for production use.")
|
|
98
|
+
};
|
|
99
|
+
// Field validators for a batch page-snapshot request.
// - urls: 1 to 25 URL strings (no `.optional()`, so the field is required).
// - targets: optional list of 1 to 25 strict objects ({ url, sourceKind,
//   sourcePosition }); sourceKind defaults to "configured_target".
// - maxConcurrency: integer 1-5, default 2.
// - timeoutMs: integer 1000-60000, default 15000.
// - debug: boolean, default false.
// NOTE(review): the `targets` description tells callers to use it "instead of
// urls", but `urls` is declared as required with at least one entry, so a
// targets-only request would presumably be rejected on `urls`. Confirm whether
// `urls` should be optional (with the consumer requiring one of the two).
var CaptureSerpPageSnapshotsInputSchema = {
|
|
100
|
+
urls: z.array(z.string().url()).min(1).max(25).describe("Public HTTP/HTTPS URLs to capture as SERP Intelligence page snapshots. Do not pass localhost, private IPs, file URLs, or internal admin URLs."),
|
|
101
|
+
targets: z.array(z.object({
|
|
102
|
+
url: z.string().url().describe("Public HTTP/HTTPS URL to capture."),
|
|
103
|
+
sourceKind: z.enum(["organic", "ai_citation", "local_pack_website", "configured_target", "site_subject"]).default("configured_target").describe("Why this page is being captured for SERP Intelligence evidence."),
|
|
104
|
+
sourcePosition: z.number().int().min(1).optional().describe("Ranking or citation position when the page came from SERP evidence.")
|
|
105
|
+
}).strict()).min(1).max(25).optional().describe("Structured page snapshot targets. Use this instead of urls when source kind or position should be preserved."),
|
|
106
|
+
maxConcurrency: z.number().int().min(1).max(5).default(2).describe("Parallel page captures. Use 2 normally; higher values can increase proxy/browser pressure."),
|
|
107
|
+
timeoutMs: z.number().int().min(1e3).max(6e4).default(15e3).describe("Per-page capture timeout in milliseconds. Increase for slow pages; timeout artifacts are returned as structured capture failures."),
|
|
108
|
+
debug: z.boolean().default(false).describe("Include sanitized browser/proxy diagnostics for page snapshot debugging. Use true for capture, network, or proxy troubleshooting.")
|
|
109
|
+
};
|
|
67
110
|
|
|
68
111
|
// src/mcp/mcp-response-formatter.ts
|
|
69
|
-
|
|
70
|
-
|
|
112
|
+
import { mkdirSync, writeFileSync, readFileSync } from "fs";
|
|
113
|
+
import { homedir } from "os";
|
|
114
|
+
import { join } from "path";
|
|
115
|
+
/**
 * Converts a report title into a short, filesystem-friendly slug.
 *
 * The input is lower-cased, every run of characters outside `a-z0-9` is
 * collapsed into a single hyphen, and the result is capped at 80 characters.
 *
 * @param {string} input - Human-readable report title.
 * @returns {string} The slug, or "mcp-scraper-report" when no usable
 *   characters remain.
 */
function slugifyReportName(input) {
  const slug = input
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-+/, "")
    .slice(0, 80)
    // Trim trailing hyphens only after truncating: the 80-character cut can
    // land directly on a separator and would otherwise leave a dangling "-".
    .replace(/-+$/, "");
  return slug || "mcp-scraper-report";
}
|
|
118
|
+
/**
 * Pulls the report title out of a Markdown document.
 *
 * Only the first line that begins with "# " is considered; its marker and
 * surrounding whitespace are removed.
 *
 * @param {string} full - Complete Markdown report text.
 * @returns {string} The heading text, or "MCP Scraper Report" when there is
 *   no such line or the heading is blank.
 */
function reportTitle(full) {
  for (const row of full.split("\n")) {
    if (!row.startsWith("# ")) continue;
    const heading = row.replace(/^#\s+/, "").trim();
    return heading === "" ? "MCP Scraper Report" : heading;
  }
  return "MCP Scraper Report";
}
|
|
122
|
+
/**
 * Best-effort persistence of a full Markdown report to disk.
 *
 * Saving is skipped when the MCP_SCRAPER_SAVE_REPORTS environment variable is
 * "false" (compared after trimming and lower-casing). The target directory is
 * MCP_SCRAPER_OUTPUT_DIR (trimmed) when set, otherwise
 * `<home>/Downloads/mcp-scraper`; it is created if missing. The file name is
 * an ISO timestamp (with ":" and "." replaced by "-") followed by a slug of
 * the report title and the ".md" extension.
 *
 * @param {string} full - Complete Markdown report text.
 * @returns {string | null} Path of the written file, or null when saving is
 *   disabled or any filesystem step fails.
 */
function saveFullReport(full) {
  // Normalize the opt-out flag the same way the output directory is
  // normalized, so values such as "False" or " false " also disable saving.
  const saveFlag = process.env.MCP_SCRAPER_SAVE_REPORTS?.trim().toLowerCase();
  if (saveFlag === "false") return null;
  const outDir = process.env.MCP_SCRAPER_OUTPUT_DIR?.trim() || join(homedir(), "Downloads", "mcp-scraper");
  try {
    mkdirSync(outDir, { recursive: true });
    const stamp = new Date().toISOString().replace(/[:.]/g, "-");
    const file = join(outDir, `${stamp}-${slugifyReportName(reportTitle(full))}.md`);
    writeFileSync(file, full, "utf8");
    return file;
  } catch {
    // Saving is a convenience: a filesystem failure must not fail the caller,
    // so it is reported as "nothing saved".
    return null;
  }
}
|
|
135
|
+
/**
 * Wraps report text in a single-text-block tool result.
 *
 * The report is first handed to saveFullReport; when that yields a path, a
 * "Saved" footer naming the file is appended after a blank line.
 *
 * @param {string} content - Full report text.
 * @returns {{ content: Array<{ type: "text", text: string }> }} Result object
 *   holding exactly one text block.
 */
function oneBlock(content) {
  const savedTo = saveFullReport(content);
  let body = content;
  if (savedTo) {
    body = `${content}\n\n\u{1F4C4} Saved: \`${savedTo}\``;
  }
  return { content: [{ type: "text", text: body }] };
}
|
|
142
|
+
/**
 * Builds a one-line, human-readable message from a structured error payload.
 *
 * Checked in this order:
 *  1. error "insufficient_balance" - balance / required credits / top-up URL.
 *  2. error "mcp_request_timeout" - the payload message, or a stock sentence.
 *  3. string error_code - "<code>: <detail>", an optional retryable note, and
 *     the per-attempt breakdown from errorAttemptsSection.
 *  4. string error - returned as is.
 *  5. otherwise the fallback, or "Tool error" when the fallback is empty.
 *
 * @param {object} body - Parsed error payload.
 * @param {string} fallback - Text used when the payload carries no message.
 * @returns {string} Message to show to the caller.
 */
function formatStructuredError(body, fallback) {
  const { error, message, error_code: code } = body;

  switch (error) {
    case "insufficient_balance":
      return `Insufficient credits. Balance: ${body.balance_credits} credits. This call requires ${body.required_credits} credits. Top up at ${body.topup_url}`;
    case "mcp_request_timeout":
      if (typeof message === "string") return message;
      return "MCP Scraper request timed out and was cancelled.";
    default:
      break;
  }

  const errorIsText = typeof error === "string";

  if (typeof code === "string") {
    let detail = fallback;
    if (errorIsText) {
      detail = error;
    } else if (typeof message === "string") {
      detail = message;
    }
    const retryNote = body.retryable === true ? " Retryable: yes." : "";
    return `${code}: ${detail}${retryNote}${errorAttemptsSection(body)}`;
  }

  if (errorIsText) return error;
  return fallback || "Tool error";
}
|
|
72
157
|
function parseData(raw) {
|
|
73
158
|
const first = raw.content.find((b) => b.type === "text");
|
|
74
159
|
const text = first?.type === "text" ? first.text : "";
|
|
75
160
|
try {
|
|
76
161
|
const parsed = JSON.parse(text || "{}");
|
|
77
|
-
if (parsed.error
|
|
78
|
-
return { error: `Insufficient credits. Balance: ${parsed.balance_credits} credits. This call requires ${parsed.required_credits} credits. Top up at ${parsed.topup_url}` };
|
|
79
|
-
}
|
|
80
|
-
if (raw.isError) return { error: text || "Tool error" };
|
|
162
|
+
if (raw.isError || parsed.error || parsed.error_code) return { error: formatStructuredError(parsed, text) };
|
|
81
163
|
const data = parsed.result ?? parsed;
|
|
82
164
|
return { data };
|
|
83
165
|
} catch {
|
|
@@ -108,8 +190,48 @@ function truncate(s, max) {
|
|
|
108
190
|
if (!s) return "";
|
|
109
191
|
return s.length > max ? s.slice(0, max) + "\u2026" : s;
|
|
110
192
|
}
|
|
111
|
-
|
|
112
|
-
|
|
193
|
+
/**
 * Renders the optional "## Debug" Markdown section from a diagnostics payload.
 *
 * Reads `request`, `browser.kernel`, `browser.networkLocation`,
 * `browser.serpNavigation` and `locationEvidence` from the payload and emits
 * one bullet per topic. Every field is optional: missing values are shown as
 * "unknown" (or left out) instead of being interpolated as the literal text
 * "undefined".
 *
 * @param {object | null | undefined} debug - Diagnostics payload.
 * @returns {string} Markdown starting with a newline and the "## Debug"
 *   heading, or "" when `debug` is missing or not an object.
 */
function debugSection(debug) {
  if (!debug || typeof debug !== "object") return "";
  const request = debug.request ?? {};
  const browser = debug.browser ?? {};
  const kernel = browser.kernel ?? {};
  const network = browser.networkLocation ?? {};
  const nav = browser.serpNavigation ?? {};
  const proxyResolution = kernel.proxyResolution ?? {};
  const locationEvidence = debug.locationEvidence;

  // Joins only the place parts that are present, e.g. "Denver, CO" or "CO".
  const place = (...parts) => parts.filter(Boolean).join(", ");
  const triState = (flag) => (flag === true ? "yes" : flag === false ? "no" : "unknown");

  // At most four candidates are listed; nullish entries are ignored so a
  // malformed list cannot make the formatter throw.
  const candidates = Array.isArray(locationEvidence?.candidates)
    ? locationEvidence.candidates
        .filter((c) => c != null)
        .slice(0, 4)
        .map((c) => `${place(c.city, c.regionCode) || "unknown"}${c.count == null ? "" : ` (${c.count})`}`)
        .join(", ")
    : "";

  // A proxy target may carry only part of a place (no city, or no state), so
  // the place is assembled from whatever is present.
  const target = proxyResolution.target;
  const targetPlace = target ? place(target.city, target.state) : "";
  const targetPart = target ? ` \xB7 ${target.level ?? "city"}${targetPlace ? ` ${targetPlace}` : ""}` : "";
  const resolutionError = proxyResolution.error ? ` \xB7 ${truncate(proxyResolution.error, 180)}` : "";

  const lines = [
    "\n## Debug",
    `- Proxy mode: ${request.proxyMode ?? kernel.proxyMode ?? "unknown"} \xB7 requested proxy: ${kernel.requestedProxyIdPresent === true ? `yes (${kernel.requestedProxyIdSuffix ?? "redacted"})` : "no"}`,
    `- Proxy resolution: ${proxyResolution.source ?? "unknown"}${targetPart}${resolutionError}`,
    `- Browser session: ${kernel.sessionId ?? "unknown"} \xB7 retrieved proxy: ${kernel.retrievedProxyIdPresent === true ? `yes (${kernel.retrievedProxyIdSuffix ?? "redacted"})` : kernel.retrievedProxyIdPresent === false ? "no" : "unknown"}`,
    `- Browser IP geo: ${[network.ip, network.city, network.region, network.country].filter(Boolean).join(" \xB7 ") || network.error || "unknown"}`,
    `- Google URL: ${truncate(nav.requestedUrl, 240) || "unknown"}`,
    `- Final URL: ${truncate(nav.finalUrl, 240) || "unknown"} \xB7 CAPTCHA: ${triState(nav.captchaDetected)} \xB7 redirected: ${triState(nav.redirected)}`
  ];
  if (locationEvidence) {
    const expected = locationEvidence.expected;
    const expectedPart = expected ? ` \xB7 expected ${place(expected.city, expected.regionCode) || "unknown"}` : "";
    lines.push(`- Location evidence: ${locationEvidence.status ?? "unknown"}${expectedPart}${candidates ? ` \xB7 candidates ${candidates}` : ""}`);
  }
  return lines.join("\n");
}
|
|
217
|
+
/**
 * Renders an "Attempts:" list for an error payload that records retries.
 *
 * One bullet is produced for each of the first five entries of
 * `body.attempts`, giving the attempt number, outcome, session, proxy mode
 * (plus resolution source), IP/geo, CAPTCHA state and whether the session was
 * deleted. Missing fields are printed as "unknown" (or "?" for the number).
 *
 * @param {object} body - Parsed error payload; `attempts` may be absent.
 * @returns {string} "" when there are no attempts, otherwise two newlines,
 *   the "Attempts:" label and one bullet per line.
 */
function errorAttemptsSection(body) {
  const attempts = Array.isArray(body?.attempts) ? body.attempts : [];
  if (attempts.length === 0) return "";
  const triState = (flag) => (flag === true ? "yes" : flag === false ? "no" : "unknown");
  const lines = attempts.slice(0, 5).map((entry) => {
    // A null/undefined entry must not throw here: this runs while an error is
    // already being reported, and a crash would hide that original error.
    const attempt = entry ?? {};
    const debug = attempt.debug ?? {};
    const browser = debug.browser ?? {};
    const kernel = browser.kernel ?? {};
    const proxyResolution = kernel.proxyResolution ?? {};
    const network = browser.networkLocation ?? {};
    const nav = browser.serpNavigation ?? {};
    const geo = [network.ip, network.city, network.region].filter(Boolean).join(" / ") || "geo unknown";
    const proxySource = proxyResolution.source ? `/${proxyResolution.source}` : "";
    return [
      `- Attempt ${attempt.attempt_number ?? "?"}: ${attempt.outcome ?? attempt.status ?? "unknown"}`,
      `session ${attempt.kernel_session_id ?? kernel.sessionId ?? "unknown"}`,
      `proxy ${debug.request?.proxyMode ?? kernel.proxyMode ?? "unknown"}${proxySource}`,
      geo,
      `CAPTCHA ${triState(nav.captchaDetected)}`,
      `deleted ${triState(attempt.kernel_delete_succeeded)}`
    ].join(" \xB7 ");
  });
  return `\n\nAttempts:\n${lines.join("\n")}`;
}
|
|
113
235
|
function formatHarvestPaa(raw, input) {
|
|
114
236
|
const parsed = parseData(raw);
|
|
115
237
|
if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
|
|
@@ -118,6 +240,7 @@ function formatHarvestPaa(raw, input) {
|
|
|
118
240
|
const organic = d.organicResults ?? [];
|
|
119
241
|
const entityIds = d.entityIds;
|
|
120
242
|
const aiOvw = d.aiOverview;
|
|
243
|
+
const diagnostics = d.diagnostics;
|
|
121
244
|
const durationMs = d.stats?.durationMs;
|
|
122
245
|
const paaRows = flat.map(
|
|
123
246
|
(r, i) => `| ${i + 1} | ${r.question} | ${truncate(r.answer, 120)} | ${r.source_site ?? ""} |`
|
|
@@ -125,7 +248,7 @@ function formatHarvestPaa(raw, input) {
|
|
|
125
248
|
const paaTable = flat.length ? `## People Also Ask (${flat.length} questions)
|
|
126
249
|
| # | Question | Answer | Source |
|
|
127
250
|
|---|----------|--------|--------|
|
|
128
|
-
${paaRows}` : "## People Also Ask\n*
|
|
251
|
+
${paaRows}` : "## People Also Ask\n*Google did not return a People Also Ask block for this query/location. SERP data was extracted successfully when available.*";
|
|
129
252
|
const serpRows = organic.map(
|
|
130
253
|
(r) => `| ${r.position} | ${r.title} | [${r.domain}](${r.url}) | ${truncate(r.snippet, 100)} |`
|
|
131
254
|
).join("\n");
|
|
@@ -139,20 +262,20 @@ ${serpRows}` : "";
|
|
|
139
262
|
> ${truncate(aiOvw.text, 600)}` : "";
|
|
140
263
|
const statsLine = durationMs ? `
|
|
141
264
|
## Stats
|
|
142
|
-
- Questions: ${flat.length} \xB7 Duration: ${(durationMs / 1e3).toFixed(1)}s` : "";
|
|
265
|
+
- Status: ${diagnostics?.completionStatus ?? (flat.length ? "paa_found" : "no_paa")} \xB7 Questions: ${flat.length} \xB7 Duration: ${(durationMs / 1e3).toFixed(1)}s` : "";
|
|
143
266
|
const tips = `
|
|
144
267
|
---
|
|
145
268
|
\u{1F4A1} **Tips**
|
|
146
|
-
- Max questions: \`maxQuestions:
|
|
269
|
+
- Max questions: \`maxQuestions: 150\` (current: ${input.maxQuestions ?? 30})
|
|
147
270
|
- Organic results only: use \`search_serp\`
|
|
148
271
|
- Dig into a result: use \`extract_url\` on any organic URL`;
|
|
149
|
-
const full =
|
|
272
|
+
const full = `# PAA Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
|
|
150
273
|
|
|
151
|
-
${paaTable}${serpTable}${entityIdsSection(entityIds)}${aiSection}${statsLine}${tips}`;
|
|
274
|
+
${paaTable}${serpTable}${entityIdsSection(entityIds)}${aiSection}${statsLine}${debugSection(diagnostics?.debug)}${tips}`;
|
|
152
275
|
const topQ = flat.slice(0, 10).map((r, i) => `${i + 1}. ${r.question}`).join("\n");
|
|
153
276
|
const topO = organic.slice(0, 5).map((r) => `${r.position}. [${r.title}](${r.url}) \u2014 ${r.domain}`).join("\n");
|
|
154
277
|
const summary = [
|
|
155
|
-
|
|
278
|
+
`**PAA: "${input.query}"** \u2014 ${flat.length} questions extracted`,
|
|
156
279
|
topQ ? `
|
|
157
280
|
**Top questions:**
|
|
158
281
|
${topQ}` : "",
|
|
@@ -161,9 +284,9 @@ ${topQ}` : "",
|
|
|
161
284
|
${topO}` : "",
|
|
162
285
|
entityIdsSummaryLine(entityIds),
|
|
163
286
|
`
|
|
164
|
-
\u{1F4A1} \`maxQuestions\` up to
|
|
287
|
+
\u{1F4A1} \`maxQuestions\` up to 150 \xB7Use \`extract_url\` to dig into any result`
|
|
165
288
|
].filter(Boolean).join("\n");
|
|
166
|
-
return
|
|
289
|
+
return oneBlock(full);
|
|
167
290
|
}
|
|
168
291
|
function formatSearchSerp(raw, input) {
|
|
169
292
|
const parsed = parseData(raw);
|
|
@@ -173,6 +296,7 @@ function formatSearchSerp(raw, input) {
|
|
|
173
296
|
const localPack = d.localPack ?? [];
|
|
174
297
|
const entityIds = d.entityIds;
|
|
175
298
|
const aiOvw = d.aiOverview;
|
|
299
|
+
const diagnostics = d.diagnostics;
|
|
176
300
|
const serpRows = organic.map(
|
|
177
301
|
(r) => `| ${r.position} | ${r.title} | [${r.domain}](${r.url}) | ${truncate(r.snippet, 100)} |`
|
|
178
302
|
).join("\n");
|
|
@@ -197,12 +321,12 @@ ${localRows}` : "";
|
|
|
197
321
|
- Get PAA questions: use \`harvest_paa\` for this query
|
|
198
322
|
- Scrape any result: use \`extract_url\`
|
|
199
323
|
- Business entity IDs (CID/GCID/KG MID) shown above if found`;
|
|
200
|
-
const full =
|
|
324
|
+
const full = `# SERP Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
|
|
201
325
|
|
|
202
|
-
${serpTable}${localSection}${entityIdsSection(entityIds)}${aiSection}${tips}`;
|
|
326
|
+
${serpTable}${localSection}${entityIdsSection(entityIds)}${aiSection}${debugSection(diagnostics?.debug)}${tips}`;
|
|
203
327
|
const topO = organic.slice(0, 5).map((r) => `${r.position}. [${r.title}](${r.url}) \u2014 ${r.domain}`).join("\n");
|
|
204
328
|
const summary = [
|
|
205
|
-
|
|
329
|
+
`**SERP: "${input.query}"** \u2014 ${organic.length} organic results`,
|
|
206
330
|
topO ? `
|
|
207
331
|
**Top results:**
|
|
208
332
|
${topO}` : "",
|
|
@@ -212,7 +336,7 @@ ${topO}` : "",
|
|
|
212
336
|
`
|
|
213
337
|
\u{1F4A1} Use \`harvest_paa\` for questions \xB7 \`extract_url\` to scrape any result`
|
|
214
338
|
].filter(Boolean).join("\n");
|
|
215
|
-
return
|
|
339
|
+
return oneBlock(full);
|
|
216
340
|
}
|
|
217
341
|
function formatExtractUrl(raw, input) {
|
|
218
342
|
const parsed = parseData(raw);
|
|
@@ -224,6 +348,9 @@ function formatExtractUrl(raw, input) {
|
|
|
224
348
|
const kpo = d.kpo;
|
|
225
349
|
const bodyMd = d.bodyMarkdown ?? "";
|
|
226
350
|
const schema = d.schema;
|
|
351
|
+
const screenshotMeta = d.screenshot;
|
|
352
|
+
const branding = d.branding;
|
|
353
|
+
const media = d.media;
|
|
227
354
|
const h1Lines = headings.filter((h) => h.level === 1).map((h) => `- ${h.text}`).join("\n");
|
|
228
355
|
const h2Lines = headings.filter((h) => h.level === 2).map((h) => ` - ${h.text}`).join("\n");
|
|
229
356
|
const headingSection = h1Lines || h2Lines ? `
|
|
@@ -246,6 +373,26 @@ ${[h1Lines, h2Lines].filter(Boolean).join("\n")}` : "";
|
|
|
246
373
|
const bodySection = bodyMd ? `
|
|
247
374
|
## Page Content
|
|
248
375
|
${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
|
|
376
|
+
const screenshotSection = screenshotMeta ? `
|
|
377
|
+
## Screenshot
|
|
378
|
+
- **File:** ${screenshotMeta.savedPath}
|
|
379
|
+
- **Size:** ${(screenshotMeta.sizeBytes / 1024).toFixed(1)} KB
|
|
380
|
+
- **Device:** ${screenshotMeta.device}` : "";
|
|
381
|
+
const brandingSection = branding ? [
|
|
382
|
+
`
|
|
383
|
+
## Branding`,
|
|
384
|
+
branding.colorScheme ? `- **Color scheme:** ${branding.colorScheme}` : "",
|
|
385
|
+
`- **Colors:**${Object.entries(branding.colors ?? {}).filter(([, v]) => v).map(([k, v]) => ` ${k}=${v}`).join(",") || " (none extracted)"}`,
|
|
386
|
+
`- **Fonts:**${Object.entries(branding.fonts ?? {}).filter(([, v]) => v).map(([k, v]) => ` ${k}=${v}`).join(",") || " (none extracted)"}`,
|
|
387
|
+
branding.assets?.logo ? `- **Logo:** ${branding.assets.logo}` : "",
|
|
388
|
+
branding.assets?.favicon ? `- **Favicon:** ${branding.assets.favicon}` : ""
|
|
389
|
+
].filter(Boolean).join("\n") : "";
|
|
390
|
+
const mediaSection = media ? [
|
|
391
|
+
`
|
|
392
|
+
## Media Assets`,
|
|
393
|
+
`- **Found:** ${media.totalFound} total, ${media.filteredCount} filtered (ads/noise), ${media.assets.length} downloaded`,
|
|
394
|
+
media.outputDir ? `- **Saved to:** ${media.outputDir}` : ""
|
|
395
|
+
].filter(Boolean).join("\n") : "";
|
|
249
396
|
const schemaCount = Array.isArray(schema) ? schema.length : 0;
|
|
250
397
|
const tips = `
|
|
251
398
|
---
|
|
@@ -253,19 +400,23 @@ ${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
|
|
|
253
400
|
- Crawl entire site: use \`extract_site\`
|
|
254
401
|
- Map all URLs: use \`map_site_urls\`
|
|
255
402
|
- ${schemaCount} JSON-LD schema block(s) detected`;
|
|
256
|
-
const full =
|
|
403
|
+
const full = `# URL Extract: ${url}
|
|
257
404
|
**${title}**
|
|
258
|
-
${headingSection}${kpoSection}${bodySection}${tips}`;
|
|
259
|
-
const
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
405
|
+
${headingSection}${kpoSection}${brandingSection}${bodySection}${screenshotSection}${mediaSection}${tips}`;
|
|
406
|
+
const textResult = oneBlock(full);
|
|
407
|
+
if (screenshotMeta?.savedPath) {
|
|
408
|
+
try {
|
|
409
|
+
const imgBuf = readFileSync(screenshotMeta.savedPath);
|
|
410
|
+
return {
|
|
411
|
+
content: [
|
|
412
|
+
...textResult.content,
|
|
413
|
+
{ type: "image", data: imgBuf.toString("base64"), mimeType: "image/png" }
|
|
414
|
+
]
|
|
415
|
+
};
|
|
416
|
+
} catch {
|
|
417
|
+
}
|
|
418
|
+
}
|
|
419
|
+
return textResult;
|
|
269
420
|
}
|
|
270
421
|
function formatMapSiteUrls(raw, input) {
|
|
271
422
|
const parsed = parseData(raw);
|
|
@@ -277,7 +428,7 @@ function formatMapSiteUrls(raw, input) {
|
|
|
277
428
|
const redirects = urls.filter((u) => u.status !== null && u.status >= 300 && u.status < 400);
|
|
278
429
|
const urlRows = urls.slice(0, 200).map((u, i) => `| ${i + 1} | ${u.url} | ${u.status ?? "\u2014"} |`).join("\n");
|
|
279
430
|
const full = [
|
|
280
|
-
|
|
431
|
+
`# URL Map: ${input.url}`,
|
|
281
432
|
`**${d.totalFound} URLs** \xB7 ${(d.durationMs / 1e3).toFixed(1)}s${d.truncated ? " \xB7 *truncated*" : ""}`,
|
|
282
433
|
`
|
|
283
434
|
## Summary
|
|
@@ -299,14 +450,14 @@ ${broken.map((u) => `- ${u.url} (${u.status})`).join("\n")}` : "",
|
|
|
299
450
|
- Scrape a single page: use \`extract_url\``
|
|
300
451
|
].filter(Boolean).join("\n");
|
|
301
452
|
const summary = [
|
|
302
|
-
|
|
453
|
+
`**URL Map: ${input.url}**`,
|
|
303
454
|
`${d.totalFound} URLs \u2014 ${ok.length} OK \xB7 ${broken.length} broken \xB7 ${redirects.length} redirects`,
|
|
304
455
|
broken.length ? `
|
|
305
456
|
**Broken URLs:** ${broken.slice(0, 3).map((u) => u.url).join(", ")}` : "",
|
|
306
457
|
`
|
|
307
458
|
\u{1F4A1} Use \`extract_site\` to extract content from all pages`
|
|
308
459
|
].filter(Boolean).join("\n");
|
|
309
|
-
return
|
|
460
|
+
return oneBlock(full);
|
|
310
461
|
}
|
|
311
462
|
function formatExtractSite(raw, input) {
|
|
312
463
|
const parsed = parseData(raw);
|
|
@@ -318,7 +469,7 @@ function formatExtractSite(raw, input) {
|
|
|
318
469
|
return `| ${i + 1} | ${p.title ?? "Untitled"} | ${p.url} | ${schemaInfo} |`;
|
|
319
470
|
}).join("\n");
|
|
320
471
|
const full = [
|
|
321
|
-
|
|
472
|
+
`# Site Extract: ${input.url}`,
|
|
322
473
|
`**${pages.length} pages** \xB7 ${((d.durationMs ?? 0) / 1e3).toFixed(1)}s`,
|
|
323
474
|
`
|
|
324
475
|
## Pages
|
|
@@ -332,13 +483,13 @@ ${pageRows}`,
|
|
|
332
483
|
- Inspect a single page: use \`extract_url\``
|
|
333
484
|
].join("\n");
|
|
334
485
|
const summary = [
|
|
335
|
-
|
|
486
|
+
`**Site Extract: ${input.url}** \u2014 ${pages.length} pages`,
|
|
336
487
|
pages.slice(0, 5).map((p) => `- ${p.title ?? p.url}`).join("\n"),
|
|
337
488
|
pages.length > 5 ? `- \u2026 and ${pages.length - 5} more` : "",
|
|
338
489
|
`
|
|
339
490
|
\u{1F4A1} Use \`extract_url\` to inspect any individual page`
|
|
340
491
|
].filter(Boolean).join("\n");
|
|
341
|
-
return
|
|
492
|
+
return oneBlock(full);
|
|
342
493
|
}
|
|
343
494
|
function formatYoutubeHarvest(raw, input) {
|
|
344
495
|
const parsed = parseData(raw);
|
|
@@ -354,7 +505,7 @@ function formatYoutubeHarvest(raw, input) {
|
|
|
354
505
|
- **Name:** ${d.channelMeta.title ?? "\u2014"}
|
|
355
506
|
- **Subscribers:** ${d.channelMeta.subscriberCount ?? "\u2014"}` : "";
|
|
356
507
|
const full = [
|
|
357
|
-
|
|
508
|
+
`# YouTube Harvest: ${label}`,
|
|
358
509
|
`**${videos.length} videos** \xB7 ${(d.stats.durationMs / 1e3).toFixed(1)}s`,
|
|
359
510
|
channelSection,
|
|
360
511
|
`
|
|
@@ -370,14 +521,14 @@ ${videoRows}`,
|
|
|
370
521
|
].filter(Boolean).join("\n");
|
|
371
522
|
const top5 = videos.slice(0, 5).map((v, i) => `${i + 1}. ${v.title} (\`${v.videoId}\`)`).join("\n");
|
|
372
523
|
const summary = [
|
|
373
|
-
|
|
524
|
+
`**YouTube: ${label}** \u2014 ${videos.length} videos`,
|
|
374
525
|
`
|
|
375
526
|
**Top videos:**
|
|
376
527
|
${top5}`,
|
|
377
528
|
`
|
|
378
529
|
\u{1F4A1} Transcribe any video: \`youtube_transcribe\` with its videoId`
|
|
379
530
|
].join("\n");
|
|
380
|
-
return
|
|
531
|
+
return oneBlock(full);
|
|
381
532
|
}
|
|
382
533
|
function formatYoutubeTranscribe(raw, input) {
|
|
383
534
|
const parsed = parseData(raw);
|
|
@@ -387,13 +538,13 @@ function formatYoutubeTranscribe(raw, input) {
|
|
|
387
538
|
const chunks = d.chunks ?? [];
|
|
388
539
|
const durSec = d.durationMs ? (d.durationMs / 1e3).toFixed(0) : "\u2014";
|
|
389
540
|
const chunkRows = chunks.slice(0, 50).map((c) => {
|
|
390
|
-
const sec = Math.floor(c.
|
|
541
|
+
const sec = Number.isFinite(c.timestamp[0]) ? Math.floor(c.timestamp[0]) : 0;
|
|
391
542
|
const mm = String(Math.floor(sec / 60)).padStart(2, "0");
|
|
392
543
|
const ss = String(sec % 60).padStart(2, "0");
|
|
393
544
|
return `| ${mm}:${ss} | ${truncate(c.text, 120)} |`;
|
|
394
545
|
}).join("\n");
|
|
395
546
|
const full = [
|
|
396
|
-
|
|
547
|
+
`# YouTube Transcript: \`${input.videoId}\``,
|
|
397
548
|
`**Duration:** ${durSec}s \xB7 **${text.split(" ").length} words**`,
|
|
398
549
|
`
|
|
399
550
|
## Full Transcript
|
|
@@ -408,14 +559,14 @@ ${chunkRows}` : "",
|
|
|
408
559
|
\u{1F4A1} Harvest more from this channel: use \`youtube_harvest\` with \`mode: "channel"\``
|
|
409
560
|
].filter(Boolean).join("\n");
|
|
410
561
|
const summary = [
|
|
411
|
-
|
|
562
|
+
`**YouTube Transcript: \`${input.videoId}\`** \u2014 ${text.split(" ").length} words \xB7 ${durSec}s`,
|
|
412
563
|
`
|
|
413
564
|
**Preview:**
|
|
414
565
|
> ${truncate(text, 300)}`,
|
|
415
566
|
`
|
|
416
567
|
\u{1F4A1} Full transcript in artifact above`
|
|
417
568
|
].join("\n");
|
|
418
|
-
return
|
|
569
|
+
return oneBlock(full);
|
|
419
570
|
}
|
|
420
571
|
function formatFacebookPageIntel(raw, input) {
|
|
421
572
|
const parsed = parseData(raw);
|
|
@@ -433,7 +584,7 @@ function formatFacebookPageIntel(raw, input) {
|
|
|
433
584
|
ad.variations ? `**Variations:** ${ad.variations}` : ""
|
|
434
585
|
].filter(Boolean).join("\n")).join("\n\n---\n\n");
|
|
435
586
|
const full = [
|
|
436
|
-
|
|
587
|
+
`# Facebook Ad Intel: ${advertiser}`,
|
|
437
588
|
`**${s.totalAds} ads** \xB7 ${s.activeCount} active \xB7 ${s.videoCount} video \xB7 ${s.imageCount} image`,
|
|
438
589
|
`
|
|
439
590
|
${adBlocks}`,
|
|
@@ -447,7 +598,7 @@ ${adBlocks}`,
|
|
|
447
598
|
const adSummary = activeAds.map((a, i) => `${i + 1}. ${truncate(a.headline ?? a.primaryText, 80)} (${a.creativeType ?? "\u2014"})`).join("\n");
|
|
448
599
|
const videoCount = ads.filter((a) => a.videoUrl).length;
|
|
449
600
|
const summary = [
|
|
450
|
-
|
|
601
|
+
`**Facebook Ads: ${advertiser}** \u2014 ${s.totalAds} ads (${s.activeCount} active)`,
|
|
451
602
|
adSummary ? `
|
|
452
603
|
**Active ads:**
|
|
453
604
|
${adSummary}` : "",
|
|
@@ -455,7 +606,7 @@ ${adSummary}` : "",
|
|
|
455
606
|
videoCount ? `
|
|
456
607
|
\u{1F4A1} ${videoCount} video ads \u2014 transcribe with \`facebook_ad_transcribe\` using the videoUrl` : ""
|
|
457
608
|
].filter(Boolean).join("\n");
|
|
458
|
-
return
|
|
609
|
+
return oneBlock(full);
|
|
459
610
|
}
|
|
460
611
|
function formatFacebookAdSearch(raw, input) {
|
|
461
612
|
const parsed = parseData(raw);
|
|
@@ -466,7 +617,7 @@ function formatFacebookAdSearch(raw, input) {
|
|
|
466
617
|
(a, i) => `| ${i + 1} | ${a.name} | ${a.adCount ?? "\u2014"} | \`${a.libraryId ?? "\u2014"}\` |`
|
|
467
618
|
).join("\n");
|
|
468
619
|
const full = [
|
|
469
|
-
|
|
620
|
+
`# Facebook Ad Library Search: "${input.query}"`,
|
|
470
621
|
`**${advertisers.length} advertisers found**`,
|
|
471
622
|
`
|
|
472
623
|
## Advertisers
|
|
@@ -480,14 +631,14 @@ ${rows}`,
|
|
|
480
631
|
- Or pass the advertiser name as \`query\` in \`facebook_page_intel\``
|
|
481
632
|
].join("\n");
|
|
482
633
|
const summary = [
|
|
483
|
-
|
|
634
|
+
`**Facebook Ad Search: "${input.query}"** \u2014 ${advertisers.length} advertisers`,
|
|
484
635
|
advertisers.slice(0, 5).map(
|
|
485
636
|
(a, i) => `${i + 1}. ${a.name}${a.adCount ? ` (${a.adCount} ads)` : ""} \u2014 \`${a.libraryId ?? "\u2014"}\``
|
|
486
637
|
).join("\n"),
|
|
487
638
|
`
|
|
488
639
|
\u{1F4A1} Scan ads with \`facebook_page_intel\` using \`libraryId\``
|
|
489
640
|
].filter(Boolean).join("\n");
|
|
490
|
-
return
|
|
641
|
+
return oneBlock(full);
|
|
491
642
|
}
|
|
492
643
|
function formatCreditsInfo(raw, input) {
|
|
493
644
|
const parsed = parseData(raw);
|
|
@@ -513,7 +664,7 @@ ${matched.notes}` : ""}` : input.item ? `
|
|
|
513
664
|
## Matched Cost
|
|
514
665
|
No exact cost match found for "${input.item}". See the full cost table below.` : "";
|
|
515
666
|
const full = [
|
|
516
|
-
|
|
667
|
+
`# Credits`,
|
|
517
668
|
`**Balance:** ${balance ?? "unknown"} credits`,
|
|
518
669
|
matchedSection,
|
|
519
670
|
costs.length ? `
|
|
@@ -528,13 +679,13 @@ ${costRows}` : "",
|
|
|
528
679
|
${ledgerRows}` : ""
|
|
529
680
|
].filter(Boolean).join("\n");
|
|
530
681
|
const summary = [
|
|
531
|
-
|
|
682
|
+
`**Credit balance:** ${balance ?? "unknown"} credits`,
|
|
532
683
|
matched ? `
|
|
533
684
|
**${matched.label}:** ${matched.credits} credits ${matched.unit}` : null,
|
|
534
685
|
input.includeLedger && ledger.length ? `
|
|
535
686
|
Recent ledger entries included in the full report.` : null
|
|
536
687
|
].filter(Boolean).join("\n");
|
|
537
|
-
return
|
|
688
|
+
return oneBlock(full);
|
|
538
689
|
}
|
|
539
690
|
function formatMapsPlaceIntel(raw, input) {
|
|
540
691
|
const parsed = parseData(raw);
|
|
@@ -560,6 +711,7 @@ function formatMapsPlaceIntel(raw, input) {
|
|
|
560
711
|
const topics = d.reviewTopics ?? [];
|
|
561
712
|
const about = d.aboutAttributes ?? [];
|
|
562
713
|
const reviews = d.reviews ?? [];
|
|
714
|
+
const reviewsStatus = d.reviewsStatus ?? "not_requested";
|
|
563
715
|
const hoursTable = d.hoursTable ?? [];
|
|
564
716
|
const ratingLine = [rating, reviewCount ? `(${reviewCount} reviews)` : null].filter(Boolean).join(" ");
|
|
565
717
|
const basicLines = [
|
|
@@ -598,18 +750,24 @@ ${attrs.map((a) => `- ${a}`).join("\n")}`).join("\n\n")}` : "";
|
|
|
598
750
|
cidUrl ? `- **Maps CID URL:** ${cidUrl}` : null,
|
|
599
751
|
lat != null && lng != null ? `- **Coordinates:** ${lat}, ${lng}` : null
|
|
600
752
|
].filter(Boolean).join("\n");
|
|
601
|
-
const reviewsSection =
|
|
753
|
+
const reviewsSection = (() => {
|
|
754
|
+
if (reviewsStatus === "not_requested") return "";
|
|
755
|
+
if (reviewsStatus === "unavailable") return "\n## Reviews\n> Reviews could not be retrieved this run \u2014 retry with `includeReviews: true`.";
|
|
756
|
+
if (reviewsStatus === "none_exist") return "\n## Reviews\n*This business has no reviews on Google Maps.*";
|
|
757
|
+
if (reviews.length === 0) return "\n## Reviews\n*0 reviews collected.*";
|
|
758
|
+
return `
|
|
602
759
|
## Reviews (${reviews.length})
|
|
603
760
|
${reviews.map((r, i) => {
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
761
|
+
const starsN = parseInt(r.stars ?? "0");
|
|
762
|
+
const stars = "\u2605".repeat(starsN) + "\u2606".repeat(5 - starsN);
|
|
763
|
+
return `### ${i + 1}. ${r.author ?? "Anonymous"} \u2014 ${stars}
|
|
607
764
|
*${r.date ?? ""}*
|
|
608
765
|
|
|
609
766
|
${r.text ?? ""}`;
|
|
610
|
-
|
|
767
|
+
}).join("\n\n")}`;
|
|
768
|
+
})();
|
|
611
769
|
const full = [
|
|
612
|
-
|
|
770
|
+
`# ${name}`,
|
|
613
771
|
category ? `*${category}*` : null,
|
|
614
772
|
ratingLine ? `
|
|
615
773
|
**Rating:** ${ratingLine}` : null,
|
|
@@ -628,15 +786,19 @@ ${entitySection}` : null,
|
|
|
628
786
|
*Extracted in ${(durationMs / 1e3).toFixed(1)}s*` : null
|
|
629
787
|
].filter(Boolean).join("\n");
|
|
630
788
|
const summary = [
|
|
631
|
-
|
|
789
|
+
`**${name}** \u2014 ${category ?? "Business"} \xB7 ${ratingLine || "No rating"}`,
|
|
632
790
|
address ? `\u{1F4CD} ${address}` : null,
|
|
633
791
|
phone ? `\u{1F4DE} ${phone}` : null,
|
|
634
792
|
hoursSummary ? `\u{1F550} ${hoursSummary}` : null,
|
|
635
793
|
website ? `\u{1F310} ${website}` : null,
|
|
636
|
-
reviews.length ? `
|
|
637
|
-
\u{1F4AC} ${reviews.length} reviews fetched \u2014 full list in artifact above` : null
|
|
794
|
+
reviewsStatus === "collected" && reviews.length ? `
|
|
795
|
+
\u{1F4AC} ${reviews.length} reviews fetched \u2014 full list in artifact above` : null,
|
|
796
|
+
reviewsStatus === "unavailable" ? `
|
|
797
|
+
\u26A0\uFE0F Reviews could not be retrieved this run` : null,
|
|
798
|
+
reviewsStatus === "none_exist" ? `
|
|
799
|
+
\u{1F4AC} No reviews on Google Maps` : null
|
|
638
800
|
].filter(Boolean).join("\n");
|
|
639
|
-
return
|
|
801
|
+
return oneBlock(full);
|
|
640
802
|
}
|
|
641
803
|
function formatFacebookAdTranscribe(raw, input) {
|
|
642
804
|
const parsed = parseData(raw);
|
|
@@ -646,13 +808,13 @@ function formatFacebookAdTranscribe(raw, input) {
|
|
|
646
808
|
const chunks = d.chunks ?? [];
|
|
647
809
|
const durSec = d.durationMs ? (d.durationMs / 1e3).toFixed(0) : "\u2014";
|
|
648
810
|
const chunkRows = chunks.slice(0, 50).map((c) => {
|
|
649
|
-
const sec = Math.floor(c.
|
|
811
|
+
const sec = Number.isFinite(c.timestamp[0]) ? Math.floor(c.timestamp[0]) : 0;
|
|
650
812
|
const mm = String(Math.floor(sec / 60)).padStart(2, "0");
|
|
651
813
|
const ss = String(sec % 60).padStart(2, "0");
|
|
652
814
|
return `| ${mm}:${ss} | ${truncate(c.text, 120)} |`;
|
|
653
815
|
}).join("\n");
|
|
654
816
|
const full = [
|
|
655
|
-
|
|
817
|
+
`# Facebook Ad Transcript`,
|
|
656
818
|
`**Duration:** ${durSec}s \xB7 **${text.split(" ").length} words**`,
|
|
657
819
|
`
|
|
658
820
|
## Full Transcript
|
|
@@ -667,53 +829,53 @@ ${chunkRows}` : "",
|
|
|
667
829
|
\u{1F4A1} Get more ads from this advertiser: use \`facebook_page_intel\``
|
|
668
830
|
].filter(Boolean).join("\n");
|
|
669
831
|
const summary = [
|
|
670
|
-
|
|
832
|
+
`**Facebook Ad Transcript** \u2014 ${text.split(" ").length} words \xB7 ${durSec}s`,
|
|
671
833
|
`
|
|
672
834
|
**Preview:**
|
|
673
835
|
> ${truncate(text, 300)}`,
|
|
674
836
|
`
|
|
675
837
|
\u{1F4A1} Full transcript in artifact above`
|
|
676
838
|
].join("\n");
|
|
677
|
-
return
|
|
839
|
+
return oneBlock(full);
|
|
678
840
|
}
|
|
679
841
|
|
|
680
842
|
// src/mcp/paa-mcp-server.ts
|
|
681
843
|
function buildPaaExtractorMcpServer(executor) {
|
|
682
|
-
const server = new McpServer({ name: "
|
|
844
|
+
const server = new McpServer({ name: "mcp-scraper", version: "1.0.0" });
|
|
683
845
|
server.registerTool("harvest_paa", {
|
|
684
|
-
description:
|
|
846
|
+
description: 'Best default tool for Google search research. Extracts People Also Ask questions plus answers/source URLs, organic SERP, local pack when present, entity IDs (CID/GCID/KG MID), and AI Overview. Infer the user language: split topic from location (e.g. "best hvac company in Denver CO" => query "best hvac company", location "Denver, CO", gl "us", hl "en"). Use maxQuestions 30 normally, 100-150 for "full", "deep", "all", or comprehensive research. Credits are charged by extracted question; unused request hold is refunded. Saves a full Markdown report locally.',
|
|
685
847
|
inputSchema: HarvestPaaInputSchema
|
|
686
848
|
}, async (input) => formatHarvestPaa(await executor.harvestPaa(input), input));
|
|
687
849
|
server.registerTool("search_serp", {
|
|
688
|
-
description: "
|
|
850
|
+
description: "Fast Google SERP lookup without PAA expansion. Use when the user asks for rankings, organic results, local pack, quick SERP, or positions. Split topic from location and infer gl/hl from the user request. Saves a full Markdown report locally.",
|
|
689
851
|
inputSchema: SearchSerpInputSchema
|
|
690
852
|
}, async (input) => formatSearchSerp(await executor.searchSerp(input), input));
|
|
691
853
|
server.registerTool("extract_url", {
|
|
692
|
-
description: "Extract structured data from
|
|
854
|
+
description: "Extract structured data from one public URL: page content as Markdown, heading structure, JSON-LD schema, entity details, NAP score, metadata, and missing schema fields. Use when the user provides a single URL or asks to inspect/scrape one page. Saves a full Markdown report locally.",
|
|
693
855
|
inputSchema: ExtractUrlInputSchema
|
|
694
856
|
}, async (input) => formatExtractUrl(await executor.extractUrl(input), input));
|
|
695
857
|
server.registerTool("map_site_urls", {
|
|
696
|
-
description: "
|
|
858
|
+
description: "Map/crawl a public website to build a URL inventory with HTTP status codes, broken links, redirects, and site scope. Use before extract_site for audits or when the user asks for a sitemap/URL inventory. Saves a full Markdown report locally.",
|
|
697
859
|
inputSchema: MapSiteUrlsInputSchema
|
|
698
860
|
}, async (input) => formatMapSiteUrls(await executor.mapSiteUrls(input), input));
|
|
699
861
|
server.registerTool("extract_site", {
|
|
700
|
-
description: "Run multi-page extraction across
|
|
862
|
+
description: "Run multi-page extraction across a public website. Returns per-page titles, H1s, metadata, headings, schema/entity data, canonical URLs, and content. Use for website audits, competitor audits, and full-site extraction. Saves a full Markdown report locally.",
|
|
701
863
|
inputSchema: ExtractSiteInputSchema
|
|
702
864
|
}, async (input) => formatExtractSite(await executor.extractSite(input), input));
|
|
703
865
|
server.registerTool("youtube_harvest", {
|
|
704
|
-
description: 'Harvest YouTube video metadata by search query or channel handle.
|
|
866
|
+
description: 'Harvest YouTube video metadata by search query or channel handle/ID/URL. Use mode "search" for keyword/topic requests and mode "channel" for @handles, channel IDs, or channel URLs. Returns titles, views, dates, durations, URLs, thumbnails, and videoIds for follow-up transcription. Saves a full Markdown report locally.',
|
|
705
867
|
inputSchema: YoutubeHarvestInputSchema
|
|
706
868
|
}, async (input) => formatYoutubeHarvest(await executor.youtubeHarvest(input), input));
|
|
707
869
|
server.registerTool("youtube_transcribe", {
|
|
708
|
-
description: "Fetch and transcribe captions from a YouTube video. Returns full transcript, timestamped chunks, and word count. Pass a videoId from youtube_harvest results.",
|
|
870
|
+
description: "Fetch and transcribe captions from a YouTube video. Returns full transcript, timestamped chunks, and word count. Pass a videoId from youtube_harvest results or infer it from a YouTube URL if the user provided one. Saves a full Markdown report locally.",
|
|
709
871
|
inputSchema: YoutubeTranscribeInputSchema
|
|
710
872
|
}, async (input) => formatYoutubeTranscribe(await executor.youtubeTranscribe(input), input));
|
|
711
873
|
server.registerTool("facebook_page_intel", {
|
|
712
|
-
description: "Harvest
|
|
874
|
+
description: "Harvest ads from a Facebook advertiser. Returns ad copy, headlines, CTAs, creative type, status, landing URLs, and video URLs ready for transcription. Accepts pageId, libraryId, or a brand/advertiser name as query. Use after facebook_ad_search when possible. Saves a full Markdown report locally.",
|
|
713
875
|
inputSchema: FacebookPageIntelInputSchema
|
|
714
876
|
}, async (input) => formatFacebookPageIntel(await executor.facebookPageIntel(input), input));
|
|
715
877
|
server.registerTool("facebook_ad_search", {
|
|
716
|
-
description: "Search Facebook Ad Library by keyword. Returns advertisers with ad counts and library IDs. Use to discover competitors, then pass libraryId to facebook_page_intel
|
|
878
|
+
description: "Search Facebook Ad Library by brand, advertiser, competitor, niche, or keyword. Returns advertisers with ad counts and library IDs. Use to discover competitors, then pass libraryId to facebook_page_intel. Saves a full Markdown report locally.",
|
|
717
879
|
inputSchema: FacebookAdSearchInputSchema
|
|
718
880
|
}, async (input) => formatFacebookAdSearch(await executor.facebookAdSearch(input), input));
|
|
719
881
|
server.registerTool("facebook_ad_transcribe", {
|
|
@@ -721,7 +883,7 @@ function buildPaaExtractorMcpServer(executor) {
|
|
|
721
883
|
inputSchema: FacebookAdTranscribeInputSchema
|
|
722
884
|
}, async (input) => formatFacebookAdTranscribe(await executor.facebookAdTranscribe(input), input));
|
|
723
885
|
server.registerTool("maps_place_intel", {
|
|
724
|
-
description:
|
|
886
|
+
description: 'Extract Google Maps business intelligence for a named business: rating, review count, category, address, phone, website, hours, booking URL, review histogram, review topics, about attributes, entity IDs, and optional review cards. Split business name from location (e.g. "Elite Roofing Denver CO" => businessName "Elite Roofing", location "Denver, CO"). Pass includeReviews true when the user asks for reviews/customer pain. Saves a full Markdown report locally.',
|
|
725
887
|
inputSchema: MapsPlaceIntelInputSchema
|
|
726
888
|
}, async (input) => formatMapsPlaceIntel(await executor.mapsPlaceIntel(input), input));
|
|
727
889
|
server.registerTool("credits_info", {
|
|
@@ -735,11 +897,17 @@ function buildPaaExtractorMcpServer(executor) {
|
|
|
735
897
|
var HttpMcpToolExecutor = class {
|
|
736
898
|
baseUrl;
|
|
737
899
|
apiKey;
|
|
900
|
+
timeoutMs;
|
|
901
|
+
serpIntelligenceTimeoutMs;
|
|
738
902
|
constructor(baseUrl, apiKey) {
|
|
739
903
|
this.baseUrl = baseUrl.replace(/\/$/, "");
|
|
740
904
|
this.apiKey = apiKey;
|
|
905
|
+
const configuredTimeoutMs = Number(process.env.MCP_SCRAPER_HTTP_TIMEOUT_MS ?? 11e4);
|
|
906
|
+
this.timeoutMs = Number.isFinite(configuredTimeoutMs) && configuredTimeoutMs > 0 ? configuredTimeoutMs : 11e4;
|
|
907
|
+
const configuredSerpIntelligenceTimeoutMs = Number(process.env.MCP_SCRAPER_SERP_INTELLIGENCE_HTTP_TIMEOUT_MS ?? this.timeoutMs);
|
|
908
|
+
this.serpIntelligenceTimeoutMs = Number.isFinite(configuredSerpIntelligenceTimeoutMs) && configuredSerpIntelligenceTimeoutMs > 0 ? configuredSerpIntelligenceTimeoutMs : this.timeoutMs;
|
|
741
909
|
}
|
|
742
|
-
async call(path, body) {
|
|
910
|
+
async call(path, body, timeoutMs = this.timeoutMs) {
|
|
743
911
|
try {
|
|
744
912
|
const res = await fetch(`${this.baseUrl}${path}`, {
|
|
745
913
|
method: "POST",
|
|
@@ -748,7 +916,7 @@ var HttpMcpToolExecutor = class {
|
|
|
748
916
|
"x-api-key": this.apiKey
|
|
749
917
|
},
|
|
750
918
|
body: JSON.stringify(body),
|
|
751
|
-
signal: AbortSignal.timeout(
|
|
919
|
+
signal: AbortSignal.timeout(timeoutMs)
|
|
752
920
|
});
|
|
753
921
|
const data = await res.json();
|
|
754
922
|
if (!res.ok) {
|
|
@@ -757,6 +925,22 @@ var HttpMcpToolExecutor = class {
|
|
|
757
925
|
return { content: [{ type: "text", text: JSON.stringify(data) }] };
|
|
758
926
|
} catch (err) {
|
|
759
927
|
const msg = err instanceof Error ? err.message : String(err);
|
|
928
|
+
if (err instanceof DOMException && err.name === "TimeoutError") {
|
|
929
|
+
return {
|
|
930
|
+
content: [{
|
|
931
|
+
type: "text",
|
|
932
|
+
text: JSON.stringify({
|
|
933
|
+
error: "mcp_request_timeout",
|
|
934
|
+
error_type: "timeout",
|
|
935
|
+
retryable: true,
|
|
936
|
+
path,
|
|
937
|
+
timeoutMs,
|
|
938
|
+
message: `MCP Scraper request exceeded ${Math.round(timeoutMs / 1e3)}s and was cancelled. Retry with fewer results or use the async API for deep harvests.`
|
|
939
|
+
})
|
|
940
|
+
}],
|
|
941
|
+
isError: true
|
|
942
|
+
};
|
|
943
|
+
}
|
|
760
944
|
return { content: [{ type: "text", text: msg }], isError: true };
|
|
761
945
|
}
|
|
762
946
|
}
|
|
@@ -796,10 +980,18 @@ var HttpMcpToolExecutor = class {
|
|
|
796
980
|
creditsInfo(input) {
|
|
797
981
|
return this.call("/billing/credits", input);
|
|
798
982
|
}
|
|
983
|
+
captureSerpSnapshot(input) {
|
|
984
|
+
return this.call("/serp-intelligence/capture", input, this.serpIntelligenceTimeoutMs);
|
|
985
|
+
}
|
|
986
|
+
captureSerpPageSnapshots(input) {
|
|
987
|
+
return this.call("/serp-intelligence/page-snapshots", input, this.serpIntelligenceTimeoutMs);
|
|
988
|
+
}
|
|
799
989
|
};
|
|
800
990
|
|
|
801
991
|
export {
|
|
992
|
+
CaptureSerpSnapshotInputSchema,
|
|
993
|
+
CaptureSerpPageSnapshotsInputSchema,
|
|
802
994
|
buildPaaExtractorMcpServer,
|
|
803
995
|
HttpMcpToolExecutor
|
|
804
996
|
};
|
|
805
|
-
//# sourceMappingURL=chunk-
|
|
997
|
+
//# sourceMappingURL=chunk-4743MZHT.js.map
|