mcp-scraper 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -0
- package/dist/bin/api-server.cjs +15553 -7587
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +3 -3
- package/dist/bin/mcp-stdio-server.cjs +312 -119
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +1 -1
- package/dist/bin/paa-harvest.cjs +1537 -165
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +1 -1
- package/dist/{chunk-LXZDJJXR.js → chunk-D4CJBZBY.js} +426 -29
- package/dist/chunk-D4CJBZBY.js.map +1 -0
- package/dist/chunk-HERFK7W6.js +2781 -0
- package/dist/chunk-HERFK7W6.js.map +1 -0
- package/dist/chunk-JQKZWEON.js +1000 -0
- package/dist/chunk-JQKZWEON.js.map +1 -0
- package/dist/chunk-Y74EXABN.js +295 -0
- package/dist/chunk-Y74EXABN.js.map +1 -0
- package/dist/{db-IOYMX64U.js → db-YWCNHBLH.js} +36 -4
- package/dist/index.cjs +1660 -237
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +169 -2
- package/dist/index.d.ts +169 -2
- package/dist/index.js +120 -69
- package/dist/index.js.map +1 -1
- package/dist/server-W5NWH5KF.js +11625 -0
- package/dist/server-W5NWH5KF.js.map +1 -0
- package/dist/{worker-3ECJHPRE.js → worker-D4D2YQTA.js} +44 -9
- package/dist/worker-D4D2YQTA.js.map +1 -0
- package/package.json +17 -5
- package/dist/chunk-4API3ZCT.js +0 -1387
- package/dist/chunk-4API3ZCT.js.map +0 -1
- package/dist/chunk-LXZDJJXR.js.map +0 -1
- package/dist/chunk-ZBP4RHNW.js +0 -805
- package/dist/chunk-ZBP4RHNW.js.map +0 -1
- package/dist/server-63DR2HE5.js +0 -6062
- package/dist/server-63DR2HE5.js.map +0 -1
- package/dist/worker-3ECJHPRE.js.map +0 -1
- /package/dist/{db-IOYMX64U.js.map → db-YWCNHBLH.js.map} +0 -0
|
@@ -2,20 +2,26 @@
|
|
|
2
2
|
"use strict";
|
|
3
3
|
|
|
4
4
|
// bin/mcp-stdio-server.ts
|
|
5
|
-
var
|
|
6
|
-
var
|
|
7
|
-
var
|
|
5
|
+
var import_node_fs2 = require("fs");
|
|
6
|
+
var import_node_os2 = require("os");
|
|
7
|
+
var import_node_path2 = require("path");
|
|
8
8
|
var import_stdio = require("@modelcontextprotocol/sdk/server/stdio.js");
|
|
9
9
|
|
|
10
10
|
// src/mcp/http-mcp-tool-executor.ts
|
|
11
11
|
var HttpMcpToolExecutor = class {
|
|
12
12
|
baseUrl;
|
|
13
13
|
apiKey;
|
|
14
|
+
timeoutMs;
|
|
15
|
+
serpIntelligenceTimeoutMs;
|
|
14
16
|
constructor(baseUrl2, apiKey2) {
|
|
15
17
|
this.baseUrl = baseUrl2.replace(/\/$/, "");
|
|
16
18
|
this.apiKey = apiKey2;
|
|
19
|
+
const configuredTimeoutMs = Number(process.env.MCP_SCRAPER_HTTP_TIMEOUT_MS ?? 11e4);
|
|
20
|
+
this.timeoutMs = Number.isFinite(configuredTimeoutMs) && configuredTimeoutMs > 0 ? configuredTimeoutMs : 11e4;
|
|
21
|
+
const configuredSerpIntelligenceTimeoutMs = Number(process.env.MCP_SCRAPER_SERP_INTELLIGENCE_HTTP_TIMEOUT_MS ?? this.timeoutMs);
|
|
22
|
+
this.serpIntelligenceTimeoutMs = Number.isFinite(configuredSerpIntelligenceTimeoutMs) && configuredSerpIntelligenceTimeoutMs > 0 ? configuredSerpIntelligenceTimeoutMs : this.timeoutMs;
|
|
17
23
|
}
|
|
18
|
-
async call(path, body) {
|
|
24
|
+
async call(path, body, timeoutMs = this.timeoutMs) {
|
|
19
25
|
try {
|
|
20
26
|
const res = await fetch(`${this.baseUrl}${path}`, {
|
|
21
27
|
method: "POST",
|
|
@@ -24,7 +30,7 @@ var HttpMcpToolExecutor = class {
|
|
|
24
30
|
"x-api-key": this.apiKey
|
|
25
31
|
},
|
|
26
32
|
body: JSON.stringify(body),
|
|
27
|
-
signal: AbortSignal.timeout(
|
|
33
|
+
signal: AbortSignal.timeout(timeoutMs)
|
|
28
34
|
});
|
|
29
35
|
const data = await res.json();
|
|
30
36
|
if (!res.ok) {
|
|
@@ -33,6 +39,22 @@ var HttpMcpToolExecutor = class {
|
|
|
33
39
|
return { content: [{ type: "text", text: JSON.stringify(data) }] };
|
|
34
40
|
} catch (err) {
|
|
35
41
|
const msg = err instanceof Error ? err.message : String(err);
|
|
42
|
+
if (err instanceof DOMException && err.name === "TimeoutError") {
|
|
43
|
+
return {
|
|
44
|
+
content: [{
|
|
45
|
+
type: "text",
|
|
46
|
+
text: JSON.stringify({
|
|
47
|
+
error: "mcp_request_timeout",
|
|
48
|
+
error_type: "timeout",
|
|
49
|
+
retryable: true,
|
|
50
|
+
path,
|
|
51
|
+
timeoutMs,
|
|
52
|
+
message: `MCP Scraper request exceeded ${Math.round(timeoutMs / 1e3)}s and was cancelled. Retry with fewer results or use the async API for deep harvests.`
|
|
53
|
+
})
|
|
54
|
+
}],
|
|
55
|
+
isError: true
|
|
56
|
+
};
|
|
57
|
+
}
|
|
36
58
|
return { content: [{ type: "text", text: msg }], isError: true };
|
|
37
59
|
}
|
|
38
60
|
}
|
|
@@ -72,6 +94,12 @@ var HttpMcpToolExecutor = class {
|
|
|
72
94
|
creditsInfo(input) {
|
|
73
95
|
return this.call("/billing/credits", input);
|
|
74
96
|
}
|
|
97
|
+
captureSerpSnapshot(input) {
|
|
98
|
+
return this.call("/serp-intelligence/capture", input, this.serpIntelligenceTimeoutMs);
|
|
99
|
+
}
|
|
100
|
+
captureSerpPageSnapshots(input) {
|
|
101
|
+
return this.call("/serp-intelligence/page-snapshots", input, this.serpIntelligenceTimeoutMs);
|
|
102
|
+
}
|
|
75
103
|
};
|
|
76
104
|
|
|
77
105
|
// src/mcp/paa-mcp-server.ts
|
|
@@ -80,28 +108,38 @@ var import_mcp = require("@modelcontextprotocol/sdk/server/mcp.js");
|
|
|
80
108
|
// src/mcp/mcp-tool-schemas.ts
|
|
81
109
|
var import_zod = require("zod");
|
|
82
110
|
var HarvestPaaInputSchema = {
|
|
83
|
-
query: import_zod.z.string().min(1).describe("
|
|
84
|
-
location: import_zod.z.string().optional().describe(
|
|
85
|
-
maxQuestions: import_zod.z.number().int().min(1).max(
|
|
86
|
-
gl: import_zod.z.string().length(2).default("us"),
|
|
87
|
-
hl: import_zod.z.string().default("en")
|
|
111
|
+
query: import_zod.z.string().min(1).describe('Core search topic only. If the user says "best hvac company in Denver CO", use query="best hvac company" and location="Denver, CO". Do not include the location in query when it can be separated.'),
|
|
112
|
+
location: import_zod.z.string().optional().describe('City, region, or country for geo-targeted results, inferred from the user request when present, e.g. "Denver, CO", "Tokyo, Japan", "London, UK".'),
|
|
113
|
+
maxQuestions: import_zod.z.number().int().min(1).max(150).default(30).describe("Number of PAA questions to extract. Default 30. Maximum 150. Use 10 for quick probes, 30 for normal research, 100-150 when the user asks for everything/full/deep research. Credits are charged by extracted question; unused request hold is refunded."),
|
|
114
|
+
gl: import_zod.z.string().length(2).default("us").describe("Google country code inferred from location or user language. Examples: United States us, United Kingdom gb, Japan jp, Canada ca, Australia au."),
|
|
115
|
+
hl: import_zod.z.string().default("en").describe("Google interface/content language inferred from the user request. Use en unless the user asks for another language or locale."),
|
|
116
|
+
device: import_zod.z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use desktop by default; use mobile only when the user asks for mobile rankings."),
|
|
117
|
+
proxyMode: import_zod.z.enum(["location", "configured", "none"]).default("location").describe("Proxy targeting mode. Use location by default so city/state searches create or reuse a matching residential proxy. Use configured for the static configured proxy. Use none only for direct-network debugging."),
|
|
118
|
+
proxyZip: import_zod.z.string().regex(/^\d{5}$/).optional().describe("Optional US ZIP override for residential location proxy targeting. Use only when the user gives a specific ZIP or city-center proxy targeting needs to be forced."),
|
|
119
|
+
debug: import_zod.z.boolean().default(false).describe("Include sanitized browser/session/location diagnostics in the response. Use true when debugging localization, CAPTCHA, or proxy behavior.")
|
|
88
120
|
};
|
|
89
121
|
var ExtractUrlInputSchema = {
|
|
90
|
-
url: import_zod.z.string().url()
|
|
122
|
+
url: import_zod.z.string().url().describe("Public http/https URL to extract. Use this when the user provides one specific page URL."),
|
|
123
|
+
screenshot: import_zod.z.boolean().default(false).describe("Also capture a full-page screenshot of the URL. Saved to ~/Downloads/mcp-scraper/screenshots/ and returned inline. Use when the user asks to see or capture the page visually."),
|
|
124
|
+
screenshotDevice: import_zod.z.enum(["desktop", "mobile"]).default("desktop").describe("Viewport for screenshot. desktop = 1440\xD7900. mobile = 390\xD7844. Default desktop."),
|
|
125
|
+
extractBranding: import_zod.z.boolean().default(false).describe("Extract brand colors, fonts, logo, and favicon using a rendered browser session. Returns colorScheme (light/dark), colors (primary/accent/background/text/heading as hex), fonts (heading/body family names), and assets (logo URL, favicon URL). Use when the user asks about brand colors, site theme, or brand assets."),
|
|
126
|
+
downloadMedia: import_zod.z.boolean().default(false).describe("Extract and download all page media (images, video, audio) to ~/Downloads/mcp-scraper/media/. Ad networks, tracking pixels, and noise URLs are filtered automatically. Use when the user asks to download or harvest assets from a page."),
|
|
127
|
+
mediaTypes: import_zod.z.array(import_zod.z.enum(["image", "video", "audio"])).default(["image", "video", "audio"]).describe("Which media types to download. Default all three."),
|
|
128
|
+
allowLocal: import_zod.z.boolean().default(false).describe("Allow localhost and private-network URLs. For local development only.")
|
|
91
129
|
};
|
|
92
130
|
var MapSiteUrlsInputSchema = {
|
|
93
|
-
url: import_zod.z.string().url(),
|
|
94
|
-
maxUrls: import_zod.z.number().int().min(1).max(500).optional()
|
|
131
|
+
url: import_zod.z.string().url().describe("Public website URL or domain to crawl for internal URLs. Use before extract_site when the user asks to audit/map/crawl a site."),
|
|
132
|
+
maxUrls: import_zod.z.number().int().min(1).max(500).optional().describe("Maximum URLs to discover. Use 100 for normal maps, higher when the user asks for a full inventory.")
|
|
95
133
|
};
|
|
96
134
|
var ExtractSiteInputSchema = {
|
|
97
|
-
url: import_zod.z.string().url(),
|
|
98
|
-
maxPages: import_zod.z.number().int().min(1).max(50).optional()
|
|
135
|
+
url: import_zod.z.string().url().describe("Public website URL or domain to extract across multiple pages. Use when the user asks for a site audit, website crawl, or full-site content/schema extraction."),
|
|
136
|
+
maxPages: import_zod.z.number().int().min(1).max(50).optional().describe("Maximum pages to extract. Use 50 when the user asks for full results or a complete crawl within MCP limits.")
|
|
99
137
|
};
|
|
100
138
|
var YoutubeHarvestInputSchema = {
|
|
101
|
-
mode: import_zod.z.enum(["search", "channel"]),
|
|
102
|
-
query: import_zod.z.string().optional().describe("Required when mode is search"),
|
|
103
|
-
channelHandle: import_zod.z.string().optional().describe("YouTube channel handle,
|
|
104
|
-
maxVideos: import_zod.z.number().int().min(1).max(500).default(50)
|
|
139
|
+
mode: import_zod.z.enum(["search", "channel"]).describe("Use search for topic/keyword requests. Use channel when the user provides @handle, channel ID, or channel URL."),
|
|
140
|
+
query: import_zod.z.string().optional().describe("Required when mode is search. The YouTube search topic in the user\u2019s words."),
|
|
141
|
+
channelHandle: import_zod.z.string().optional().describe("YouTube channel handle, channel ID, or URL. Examples: @mkbhd, UC..., https://youtube.com/@mkbhd."),
|
|
142
|
+
maxVideos: import_zod.z.number().int().min(1).max(500).default(50).describe("Number of videos to return. Default 50. Increase when user asks for full channel/history.")
|
|
105
143
|
};
|
|
106
144
|
var YoutubeTranscribeInputSchema = {
|
|
107
145
|
videoId: import_zod.z.string().min(1).describe("YouTube video ID, e.g. dQw4w9WgXcQ")
|
|
@@ -109,12 +147,12 @@ var YoutubeTranscribeInputSchema = {
|
|
|
109
147
|
var FacebookPageIntelInputSchema = {
|
|
110
148
|
pageId: import_zod.z.string().optional(),
|
|
111
149
|
libraryId: import_zod.z.string().optional(),
|
|
112
|
-
query: import_zod.z.string().optional().describe("One of pageId, libraryId, or query is required"),
|
|
150
|
+
query: import_zod.z.string().optional().describe("Advertiser or brand name when pageId/libraryId is not known. One of pageId, libraryId, or query is required."),
|
|
113
151
|
maxAds: import_zod.z.number().int().min(1).max(200).default(50),
|
|
114
152
|
country: import_zod.z.string().length(2).default("US")
|
|
115
153
|
};
|
|
116
154
|
var FacebookAdSearchInputSchema = {
|
|
117
|
-
query: import_zod.z.string().min(1),
|
|
155
|
+
query: import_zod.z.string().min(1).describe("Advertiser, brand, competitor, niche, or keyword to search in Facebook Ad Library."),
|
|
118
156
|
country: import_zod.z.string().length(2).default("US"),
|
|
119
157
|
maxResults: import_zod.z.number().int().min(1).max(20).default(10)
|
|
120
158
|
};
|
|
@@ -122,10 +160,10 @@ var FacebookAdTranscribeInputSchema = {
|
|
|
122
160
|
videoUrl: import_zod.z.string().url().describe("Facebook CDN video URL from a facebook_page_intel result")
|
|
123
161
|
};
|
|
124
162
|
var MapsPlaceIntelInputSchema = {
|
|
125
|
-
businessName: import_zod.z.string().min(1).describe(
|
|
126
|
-
location: import_zod.z.string().min(1).describe('City
|
|
127
|
-
gl: import_zod.z.string().length(2).default("us"),
|
|
128
|
-
hl: import_zod.z.string().length(2).default("en"),
|
|
163
|
+
businessName: import_zod.z.string().min(1).describe('Business name only. If user says "Elite Roofing Denver CO", use businessName="Elite Roofing" and location="Denver, CO".'),
|
|
164
|
+
location: import_zod.z.string().min(1).describe('City/region/country where the business should be searched, e.g. "Denver, CO". Infer from the user request when possible.'),
|
|
165
|
+
gl: import_zod.z.string().length(2).default("us").describe("Google country code inferred from location."),
|
|
166
|
+
hl: import_zod.z.string().length(2).default("en").describe("Language inferred from user request."),
|
|
129
167
|
includeReviews: import_zod.z.boolean().default(false).describe("Whether to fetch individual review cards"),
|
|
130
168
|
maxReviews: import_zod.z.number().int().min(1).max(500).default(50).describe("Max review cards to return (requires includeReviews: true)")
|
|
131
169
|
};
|
|
@@ -134,26 +172,98 @@ var CreditsInfoInputSchema = {
|
|
|
134
172
|
includeLedger: import_zod.z.boolean().default(false).describe("Whether to include recent credit ledger entries")
|
|
135
173
|
};
|
|
136
174
|
var SearchSerpInputSchema = {
|
|
137
|
-
query: import_zod.z.string().min(1).describe("
|
|
138
|
-
location: import_zod.z.string().optional().describe("
|
|
139
|
-
gl: import_zod.z.string().length(2).default("us"),
|
|
140
|
-
hl: import_zod.z.string().default("en"),
|
|
175
|
+
query: import_zod.z.string().min(1).describe('Core search topic only. Separate location when possible. If user says "best dentist in Brooklyn NY serp", use query="best dentist" and location="Brooklyn, NY".'),
|
|
176
|
+
location: import_zod.z.string().optional().describe("City, region, or country for geo-targeted results, inferred from user request when present."),
|
|
177
|
+
gl: import_zod.z.string().length(2).default("us").describe("Google country code inferred from location or user language."),
|
|
178
|
+
hl: import_zod.z.string().default("en").describe("Google interface/content language inferred from user request."),
|
|
179
|
+
device: import_zod.z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use desktop by default; use mobile only when the user asks for mobile rankings."),
|
|
180
|
+
proxyMode: import_zod.z.enum(["location", "configured", "none"]).default("location").describe("Proxy targeting mode. Use location by default so city/state searches create or reuse a matching residential proxy. Use configured for the static configured proxy. Use none only for direct-network debugging."),
|
|
181
|
+
proxyZip: import_zod.z.string().regex(/^\d{5}$/).optional().describe("Optional US ZIP override for residential location proxy targeting. Use only when the user gives a specific ZIP or city-center proxy targeting needs to be forced."),
|
|
182
|
+
debug: import_zod.z.boolean().default(false).describe("Include sanitized browser/session/location diagnostics in the response. Use true when debugging localization, CAPTCHA, or proxy behavior."),
|
|
141
183
|
pages: import_zod.z.number().int().min(1).max(2).default(1).describe("Number of result pages to fetch (1\u20132)")
|
|
142
184
|
};
|
|
185
|
+
/**
 * Zod field validators describing the input of a SERP Intelligence snapshot capture.
 * The `.describe()` texts are runtime strings attached to each field.
 * NOTE(review): presumably registered as the input shape of the tool that calls
 * `captureSerpSnapshot` — confirm at the tool registration site.
 */
var CaptureSerpSnapshotInputSchema = {
  // Required, non-empty search query.
  query: import_zod.z.string().min(1).describe("Core search query to capture as a structured SERP Intelligence snapshot. Separate the place into location when the user gives a city, region, country, or ZIP."),
  // Optional free-text location.
  location: import_zod.z.string().optional().describe("City, region, country, or service area used for localized Google results. MCP Scraper records location evidence; UULE alone is not proof of localization."),
  // Exactly two characters; defaults to "us".
  gl: import_zod.z.string().length(2).default("us").describe("Google country code inferred from the requested market, e.g. us, gb, ca, au."),
  // No length constraint here (unlike gl); defaults to "en".
  hl: import_zod.z.string().default("en").describe("Google interface/content language inferred from the user request."),
  device: import_zod.z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use mobile only when the user asks for mobile rankings or mobile SERP evidence."),
  proxyMode: import_zod.z.enum(["location", "configured", "none"]).default("location").describe("Proxy behavior for capture. Use location for localized residential proxy targeting, configured for the static residential proxy, and none only for direct-network debugging."),
  // Exactly five digits when present.
  proxyZip: import_zod.z.string().regex(/^\d{5}$/).optional().describe("Optional US ZIP override for residential location proxy targeting when a precise city-center or ZIP proxy is needed."),
  // Integer 1..2; defaults to 1.
  pages: import_zod.z.number().int().min(1).max(2).default(1).describe("Number of Google result pages to capture. Use 1 normally and 2 only when the user needs deeper ranking evidence."),
  debug: import_zod.z.boolean().default(false).describe("Include sanitized browser, proxy, and location diagnostics. Use true when debugging localization, CAPTCHA, proxy selection, or capture reliability."),
  includePageSnapshots: import_zod.z.boolean().default(false).describe("Also capture ranking-page snapshots for selected SERP URLs through the same product capture path."),
  // Integer 0..10; defaults to 0.
  // NOTE(review): with includePageSnapshots=true and this left at its default of 0,
  // presumably no page snapshots are captured — confirm against the server handler.
  pageSnapshotLimit: import_zod.z.number().int().min(0).max(10).default(0).describe("Maximum ranking-page snapshots to capture when includePageSnapshots is true. Use 0 when only SERP evidence is needed.")
};
|
|
198
|
+
/**
 * Zod field validators describing the input of a screenshot capture request.
 * The `.describe()` texts are runtime strings attached to each field.
 */
var ScreenshotInputSchema = {
  // Must parse as a URL. The scheme is not restricted by this validator itself.
  // NOTE(review): the http/https requirement stated in the description is presumably
  // enforced elsewhere — confirm.
  url: import_zod.z.string().url().describe("URL to capture as a full-page screenshot. Use http or https. Pass allowLocal: true to capture localhost or private-network URLs during development."),
  // "desktop" or "mobile"; defaults to "desktop".
  device: import_zod.z.enum(["desktop", "mobile"]).default("desktop").describe("Viewport profile. desktop = 1440\xD7900. mobile = 390\xD7844. Use desktop by default; use mobile when the user asks for a mobile view."),
  // Defaults to false.
  allowLocal: import_zod.z.boolean().default(false).describe("Allow localhost and private-network URLs (127.x, 192.168.x, 10.x, etc.). For local development only \u2014 not for production use.")
};
|
|
203
|
+
/**
 * Zod field validators describing the input of a batch page-snapshot capture.
 * The `.describe()` texts are runtime strings attached to each field.
 * NOTE(review): presumably registered as the input shape of the tool that calls
 * `captureSerpPageSnapshots` — confirm at the tool registration site.
 */
var CaptureSerpPageSnapshotsInputSchema = {
  // Required array of 1..25 URL strings.
  // NOTE(review): this field is NOT optional, yet the `targets` description below says
  // to use targets "instead of urls". As written, a targets-only request fails
  // validation — confirm whether `urls` should be `.optional()`.
  urls: import_zod.z.array(import_zod.z.string().url()).min(1).max(25).describe("Public HTTP/HTTPS URLs to capture as SERP Intelligence page snapshots. Do not pass localhost, private IPs, file URLs, or internal admin URLs."),
  // Optional array of 1..25 strict objects (unknown keys are rejected by `.strict()`).
  targets: import_zod.z.array(import_zod.z.object({
    url: import_zod.z.string().url().describe("Public HTTP/HTTPS URL to capture."),
    // Defaults to "configured_target" when omitted.
    sourceKind: import_zod.z.enum(["organic", "ai_citation", "local_pack_website", "configured_target", "site_subject"]).default("configured_target").describe("Why this page is being captured for SERP Intelligence evidence."),
    // Positive integer when present.
    sourcePosition: import_zod.z.number().int().min(1).optional().describe("Ranking or citation position when the page came from SERP evidence.")
  }).strict()).min(1).max(25).optional().describe("Structured page snapshot targets. Use this instead of urls when source kind or position should be preserved."),
  // Integer 1..5; defaults to 2.
  maxConcurrency: import_zod.z.number().int().min(1).max(5).default(2).describe("Parallel page captures. Use 2 normally; higher values can increase proxy/browser pressure."),
  // Integer 1000..60000 (milliseconds); defaults to 15000.
  timeoutMs: import_zod.z.number().int().min(1e3).max(6e4).default(15e3).describe("Per-page capture timeout in milliseconds. Increase for slow pages; timeout artifacts are returned as structured capture failures."),
  debug: import_zod.z.boolean().default(false).describe("Include sanitized browser/proxy diagnostics for page snapshot debugging. Use true for capture, network, or proxy troubleshooting.")
};
|
|
143
214
|
|
|
144
215
|
// src/mcp/mcp-response-formatter.ts
|
|
145
|
-
|
|
146
|
-
|
|
216
|
+
var import_node_fs = require("fs");
|
|
217
|
+
var import_node_os = require("os");
|
|
218
|
+
var import_node_path = require("path");
|
|
219
|
+
/**
 * Turn a report title into a filesystem-friendly slug.
 *
 * Lower-cases the text, folds accented Latin letters to their base letter,
 * collapses every run of characters outside [a-z0-9] into a single hyphen,
 * and caps the result at 80 characters with no leading or trailing hyphen.
 *
 * @param {unknown} input - Title text; null/undefined are treated as empty.
 * @returns {string} The slug, or "mcp-scraper-report" when nothing usable remains.
 */
function slugifyReportName(input) {
  const MAX_LENGTH = 80;
  const slug = String(input ?? "")
    // Decompose accented letters so "é" becomes "e" + combining mark, then drop the marks.
    .normalize("NFKD")
    .replace(/\p{M}+/gu, "")
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-+|-+$/g, "")
    .slice(0, MAX_LENGTH)
    // The cut can land right after a separator; never end the slug with a hyphen.
    .replace(/-+$/, "");
  return slug || "mcp-scraper-report";
}
|
|
222
|
+
/**
 * Extract the title of a Markdown report.
 *
 * Only the first line that begins with "# " is considered; its heading marker
 * and surrounding whitespace are removed.
 *
 * @param {string} full - Complete Markdown text of the report.
 * @returns {string} The heading text, or "MCP Scraper Report" when there is no
 *   such line or the heading is empty.
 */
function reportTitle(full) {
  for (const line of full.split("\n")) {
    if (!line.startsWith("# ")) continue;
    // The first matching line decides the outcome, even if it turns out empty.
    const heading = line.replace(/^#\s+/, "").trim();
    return heading === "" ? "MCP Scraper Report" : heading;
  }
  return "MCP Scraper Report";
}
|
|
226
|
+
/**
 * Persist a full Markdown report to disk on a best-effort basis.
 *
 * Saving is skipped when MCP_SCRAPER_SAVE_REPORTS is exactly "false". The target
 * directory is MCP_SCRAPER_OUTPUT_DIR (trimmed) when set and non-empty, otherwise
 * `<home>/Downloads/mcp-scraper`. The file name is an ISO timestamp (":" and "."
 * replaced by "-") followed by the slugified report title and ".md".
 *
 * @param {string} full - Complete Markdown text of the report.
 * @returns {string|null} Path of the written file, or null when saving is
 *   disabled or any filesystem step fails.
 */
function saveFullReport(full) {
  const { MCP_SCRAPER_SAVE_REPORTS: saveFlag, MCP_SCRAPER_OUTPUT_DIR: configuredDir } = process.env;
  if (saveFlag === "false") return null;

  let targetDir = configuredDir?.trim();
  if (!targetDir) {
    targetDir = (0, import_node_path.join)((0, import_node_os.homedir)(), "Downloads", "mcp-scraper");
  }

  try {
    (0, import_node_fs.mkdirSync)(targetDir, { recursive: true });
    const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
    const slug = slugifyReportName(reportTitle(full));
    const reportPath = (0, import_node_path.join)(targetDir, `${timestamp}-${slug}.md`);
    (0, import_node_fs.writeFileSync)(reportPath, full, "utf8");
    return reportPath;
  } catch {
    // Deliberate best-effort: a failed save must not fail the tool call.
    return null;
  }
}
|
|
239
|
+
/**
 * Wrap report text in a single-text-block tool result, saving it to disk first.
 *
 * When the report was saved, a blank line and a "Saved" footer naming the file
 * path are appended; otherwise the content is returned unchanged.
 *
 * @param {string} content - Complete Markdown text of the report.
 * @returns {{content: Array<{type: "text", text: string}>}} Tool result.
 */
function oneBlock(content) {
  const savedPath = saveFullReport(content);
  let text = content;
  if (savedPath) {
    text = `${content}\n\n\u{1F4C4} Saved: \`${savedPath}\``;
  }
  return { content: [{ type: "text", text }] };
}
|
|
246
|
+
/**
 * Convert a parsed error payload into a single human-readable message.
 *
 * Precedence:
 *   1. `error === "insufficient_balance"` -> credit balance message.
 *   2. `error === "mcp_request_timeout"`  -> the payload's message, or a generic timeout text.
 *   3. string `error_code`                -> "<code>: <message>" plus retry hint and attempt list.
 *   4. string `error`                     -> that string.
 *   5. otherwise                          -> the fallback text.
 *
 * @param {unknown} body - Parsed JSON payload. JSON.parse may yield null or a
 *   primitive, so anything that is not a non-null object resolves to the fallback.
 * @param {string} [fallback] - Text used when the payload carries no usable message.
 * @returns {string} Message to show to the caller; never empty.
 */
function formatStructuredError(body, fallback) {
  const fallbackText = fallback || "Tool error";
  // `null` is valid JSON; reading properties off it would throw and hide the real error.
  if (body === null || typeof body !== "object") return fallbackText;

  if (body.error === "insufficient_balance") {
    // Missing fields must not surface as the literal text "undefined".
    const balance = body.balance_credits ?? "unknown";
    const required = body.required_credits ?? "unknown";
    const topUp = body.topup_url ? ` Top up at ${body.topup_url}` : "";
    return `Insufficient credits. Balance: ${balance} credits. This call requires ${required} credits.${topUp}`;
  }

  if (body.error === "mcp_request_timeout") {
    return typeof body.message === "string" ? body.message : "MCP Scraper request timed out and was cancelled.";
  }

  if (typeof body.error_code === "string") {
    let message = fallbackText;
    if (typeof body.error === "string") {
      message = body.error;
    } else if (typeof body.message === "string") {
      message = body.message;
    }
    const retryable = body.retryable === true ? " Retryable: yes." : "";
    return `${body.error_code}: ${message}${retryable}${errorAttemptsSection(body)}`;
  }

  if (typeof body.error === "string") return body.error;
  return fallbackText;
}
|
|
148
261
|
function parseData(raw) {
|
|
149
262
|
const first = raw.content.find((b) => b.type === "text");
|
|
150
263
|
const text = first?.type === "text" ? first.text : "";
|
|
151
264
|
try {
|
|
152
265
|
const parsed = JSON.parse(text || "{}");
|
|
153
|
-
if (parsed.error
|
|
154
|
-
return { error: `Insufficient credits. Balance: ${parsed.balance_credits} credits. This call requires ${parsed.required_credits} credits. Top up at ${parsed.topup_url}` };
|
|
155
|
-
}
|
|
156
|
-
if (raw.isError) return { error: text || "Tool error" };
|
|
266
|
+
if (raw.isError || parsed.error || parsed.error_code) return { error: formatStructuredError(parsed, text) };
|
|
157
267
|
const data = parsed.result ?? parsed;
|
|
158
268
|
return { data };
|
|
159
269
|
} catch {
|
|
@@ -184,8 +294,51 @@ function truncate(s, max) {
|
|
|
184
294
|
if (!s) return "";
|
|
185
295
|
return s.length > max ? s.slice(0, max) + "\u2026" : s;
|
|
186
296
|
}
|
|
187
|
-
|
|
188
|
-
|
|
297
|
+
/**
 * Make a value safe to place inside a single Markdown table cell.
 *
 * Pipes are backslash-escaped, every run of whitespace (including line breaks)
 * becomes one space, and leading/trailing whitespace is removed.
 *
 * @param {unknown} s - Value to render; null/undefined become an empty string.
 * @returns {string} Single-line cell text.
 */
function cell(s) {
  const raw = s == null ? "" : String(s);
  const escaped = raw.split("|").join("\\|");
  // Splitting on whitespace runs and dropping the empty edge pieces collapses and trims in one pass.
  return escaped.split(/\s+/).filter((piece) => piece !== "").join(" ");
}
|
|
300
|
+
/**
 * Render the "## Debug" Markdown section from a diagnostics object.
 *
 * Reads `request`, `browser.kernel`, `browser.networkLocation`,
 * `browser.serpNavigation` and `locationEvidence` from the given object and
 * emits one bullet per topic; absent values are shown as "unknown".
 * URLs and the proxy-resolution error are shortened with the file's `truncate` helper.
 *
 * @param {unknown} debug - Diagnostics object; anything falsy or non-object yields "".
 * @returns {string} Markdown starting with a blank line and the "## Debug" heading, or "".
 */
function debugSection(debug) {
  if (!debug || typeof debug !== "object") return "";

  // true -> "yes", false -> "no", anything else -> "unknown"
  const triState = (flag) => {
    if (flag === true) return "yes";
    if (flag === false) return "no";
    return "unknown";
  };
  const proxyIdLabel = (suffix) => `yes (${suffix ?? "redacted"})`;

  const request = debug.request ?? {};
  const browser = debug.browser ?? {};
  const kernel = browser.kernel ?? {};
  const network = browser.networkLocation ?? {};
  const nav = browser.serpNavigation ?? {};
  const proxyResolution = kernel.proxyResolution ?? {};
  const locationEvidence = debug.locationEvidence;

  const proxyMode = request.proxyMode ?? kernel.proxyMode ?? "unknown";
  // The requested proxy is reported as yes/no only; the retrieved proxy is tri-state.
  const requestedProxy = kernel.requestedProxyIdPresent === true
    ? proxyIdLabel(kernel.requestedProxyIdSuffix)
    : "no";
  const retrievedProxy = kernel.retrievedProxyIdPresent === true
    ? proxyIdLabel(kernel.retrievedProxyIdSuffix)
    : triState(kernel.retrievedProxyIdPresent);

  let resolution = `- Proxy resolution: ${proxyResolution.source ?? "unknown"}`;
  const target = proxyResolution.target;
  if (target) {
    resolution += ` \xB7 ${target.level ?? "city"} ${target.city}, ${target.state}`;
  }
  if (proxyResolution.error) {
    resolution += ` \xB7 ${truncate(proxyResolution.error, 180)}`;
  }

  const geo = [network.ip, network.city, network.region, network.country].filter(Boolean).join(" \xB7 ");

  const lines = [
    "\n## Debug",
    `- Proxy mode: ${proxyMode} \xB7 requested proxy: ${requestedProxy}`,
    resolution,
    `- Browser session: ${kernel.sessionId ?? "unknown"} \xB7 retrieved proxy: ${retrievedProxy}`,
    `- Browser IP geo: ${geo || network.error || "unknown"}`,
    `- Google URL: ${truncate(nav.requestedUrl, 240) || "unknown"}`,
    `- Final URL: ${truncate(nav.finalUrl, 240) || "unknown"} \xB7 CAPTCHA: ${triState(nav.captchaDetected)} \xB7 redirected: ${triState(nav.redirected)}`
  ];

  if (locationEvidence) {
    let evidence = `- Location evidence: ${locationEvidence.status}`;
    const expected = locationEvidence.expected;
    if (expected) {
      evidence += ` \xB7 expected ${expected.city}`;
      if (expected.regionCode) evidence += `, ${expected.regionCode}`;
    }
    // At most four candidates are listed.
    const candidates = Array.isArray(locationEvidence.candidates)
      ? locationEvidence.candidates.slice(0, 4).map((c) => `${c.city}, ${c.regionCode} (${c.count})`).join(", ")
      : "";
    if (candidates) evidence += ` \xB7 candidates ${candidates}`;
    lines.push(evidence);
  }

  return lines.join("\n");
}
|
|
324
|
+
/**
 * Render a plain-text "Attempts:" list for an error payload.
 *
 * Only the first five entries of `body.attempts` are rendered, one bullet each,
 * showing outcome, browser session, proxy mode/source, IP geo, CAPTCHA state and
 * whether the session was deleted. Missing values are shown as "unknown" / "?".
 *
 * This runs while an error is already being formatted, so it must not throw:
 * a null body or null/non-object attempt entries are tolerated.
 *
 * @param {unknown} body - Parsed error payload, possibly carrying an `attempts` array.
 * @returns {string} "" when there are no attempts; otherwise two line breaks,
 *   the "Attempts:" label and the bullets.
 */
function errorAttemptsSection(body) {
  const attempts = Array.isArray(body?.attempts) ? body.attempts : [];
  if (attempts.length === 0) return "";

  // true -> "yes", false -> "no", anything else -> "unknown"
  const triState = (flag) => {
    if (flag === true) return "yes";
    if (flag === false) return "no";
    return "unknown";
  };

  const lines = attempts.slice(0, 5).map((entry) => {
    // A null entry would otherwise throw on property access and mask the real error.
    const attempt = entry !== null && typeof entry === "object" ? entry : {};
    const debug = attempt.debug ?? {};
    const browser = debug.browser ?? {};
    const kernel = browser.kernel ?? {};
    const proxyResolution = kernel.proxyResolution ?? {};
    const network = browser.networkLocation ?? {};
    const nav = browser.serpNavigation ?? {};

    const geo = [network.ip, network.city, network.region].filter(Boolean).join(" / ") || "geo unknown";
    const outcome = attempt.outcome ?? attempt.status ?? "unknown";
    const session = attempt.kernel_session_id ?? kernel.sessionId ?? "unknown";
    const proxyMode = debug.request?.proxyMode ?? kernel.proxyMode ?? "unknown";
    const proxySource = proxyResolution.source ? `/${proxyResolution.source}` : "";

    return `- Attempt ${attempt.attempt_number ?? "?"}: ${outcome} \xB7 session ${session} \xB7 proxy ${proxyMode}${proxySource} \xB7 ${geo} \xB7 CAPTCHA ${triState(nav.captchaDetected)} \xB7 deleted ${triState(attempt.kernel_delete_succeeded)}`;
  });

  return `\n\nAttempts:\n${lines.join("\n")}`;
}
|
|
189
342
|
function formatHarvestPaa(raw, input) {
|
|
190
343
|
const parsed = parseData(raw);
|
|
191
344
|
if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
|
|
@@ -194,16 +347,17 @@ function formatHarvestPaa(raw, input) {
|
|
|
194
347
|
const organic = d.organicResults ?? [];
|
|
195
348
|
const entityIds = d.entityIds;
|
|
196
349
|
const aiOvw = d.aiOverview;
|
|
350
|
+
const diagnostics = d.diagnostics;
|
|
197
351
|
const durationMs = d.stats?.durationMs;
|
|
198
352
|
const paaRows = flat.map(
|
|
199
|
-
(r, i) => `| ${i + 1} | ${r.question} | ${truncate(r.answer, 120)} | ${r.source_site
|
|
353
|
+
(r, i) => `| ${i + 1} | ${cell(r.question)} | ${cell(truncate(r.answer, 120))} | ${cell(r.source_title || r.source_site || "")} |`
|
|
200
354
|
).join("\n");
|
|
201
355
|
const paaTable = flat.length ? `## People Also Ask (${flat.length} questions)
|
|
202
356
|
| # | Question | Answer | Source |
|
|
203
357
|
|---|----------|--------|--------|
|
|
204
|
-
${paaRows}` : "## People Also Ask\n*
|
|
358
|
+
${paaRows}` : "## People Also Ask\n*Google did not return a People Also Ask block for this query/location. SERP data was extracted successfully when available.*";
|
|
205
359
|
const serpRows = organic.map(
|
|
206
|
-
(r) => `| ${r.position} | ${r.title} | [${r.domain}](${r.url}) | ${truncate(r.snippet, 100)} |`
|
|
360
|
+
(r) => `| ${r.position} | ${cell(r.title)} | [${cell(r.domain)}](${r.url}) | ${cell(truncate(r.snippet, 100))} |`
|
|
207
361
|
).join("\n");
|
|
208
362
|
const serpTable = organic.length ? `
|
|
209
363
|
## Organic Results (${organic.length})
|
|
@@ -215,20 +369,20 @@ ${serpRows}` : "";
|
|
|
215
369
|
> ${truncate(aiOvw.text, 600)}` : "";
|
|
216
370
|
const statsLine = durationMs ? `
|
|
217
371
|
## Stats
|
|
218
|
-
- Questions: ${flat.length} \xB7 Duration: ${(durationMs / 1e3).toFixed(1)}s` : "";
|
|
372
|
+
- Status: ${diagnostics?.completionStatus ?? (flat.length ? "paa_found" : "no_paa")} \xB7 Questions: ${flat.length} \xB7 Duration: ${(durationMs / 1e3).toFixed(1)}s` : "";
|
|
219
373
|
const tips = `
|
|
220
374
|
---
|
|
221
375
|
\u{1F4A1} **Tips**
|
|
222
|
-
- Max questions: \`maxQuestions:
|
|
376
|
+
- Max questions: \`maxQuestions: 150\` (current: ${input.maxQuestions ?? 30})
|
|
223
377
|
- Organic results only: use \`search_serp\`
|
|
224
378
|
- Dig into a result: use \`extract_url\` on any organic URL`;
|
|
225
|
-
const full =
|
|
379
|
+
const full = `# PAA Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
|
|
226
380
|
|
|
227
|
-
${paaTable}${serpTable}${entityIdsSection(entityIds)}${aiSection}${statsLine}${tips}`;
|
|
381
|
+
${paaTable}${serpTable}${entityIdsSection(entityIds)}${aiSection}${statsLine}${debugSection(diagnostics?.debug)}${tips}`;
|
|
228
382
|
const topQ = flat.slice(0, 10).map((r, i) => `${i + 1}. ${r.question}`).join("\n");
|
|
229
383
|
const topO = organic.slice(0, 5).map((r) => `${r.position}. [${r.title}](${r.url}) \u2014 ${r.domain}`).join("\n");
|
|
230
384
|
const summary = [
|
|
231
|
-
|
|
385
|
+
`**PAA: "${input.query}"** \u2014 ${flat.length} questions extracted`,
|
|
232
386
|
topQ ? `
|
|
233
387
|
**Top questions:**
|
|
234
388
|
${topQ}` : "",
|
|
@@ -237,9 +391,9 @@ ${topQ}` : "",
|
|
|
237
391
|
${topO}` : "",
|
|
238
392
|
entityIdsSummaryLine(entityIds),
|
|
239
393
|
`
|
|
240
|
-
\u{1F4A1} \`maxQuestions\` up to
|
|
394
|
+
\u{1F4A1} \`maxQuestions\` up to 150 \xB7Use \`extract_url\` to dig into any result`
|
|
241
395
|
].filter(Boolean).join("\n");
|
|
242
|
-
return
|
|
396
|
+
return oneBlock(full);
|
|
243
397
|
}
|
|
244
398
|
function formatSearchSerp(raw, input) {
|
|
245
399
|
const parsed = parseData(raw);
|
|
@@ -249,15 +403,16 @@ function formatSearchSerp(raw, input) {
|
|
|
249
403
|
const localPack = d.localPack ?? [];
|
|
250
404
|
const entityIds = d.entityIds;
|
|
251
405
|
const aiOvw = d.aiOverview;
|
|
406
|
+
const diagnostics = d.diagnostics;
|
|
252
407
|
const serpRows = organic.map(
|
|
253
|
-
(r) => `| ${r.position} | ${r.title} | [${r.domain}](${r.url}) | ${truncate(r.snippet, 100)} |`
|
|
408
|
+
(r) => `| ${r.position} | ${cell(r.title)} | [${cell(r.domain)}](${r.url}) | ${cell(truncate(r.snippet, 100))} |`
|
|
254
409
|
).join("\n");
|
|
255
410
|
const serpTable = organic.length ? `## Organic Results (${organic.length})
|
|
256
411
|
| # | Title | URL | Snippet |
|
|
257
412
|
|---|-------|-----|----------|
|
|
258
413
|
${serpRows}` : "## Organic Results\n*None found*";
|
|
259
414
|
const localRows = localPack.map(
|
|
260
|
-
(b) => `| ${b.position} | ${b.name} | ${b.rating ?? "\u2014"} (${b.reviewCount ?? "0"}) | ${b.websiteUrl ? `[link](${b.websiteUrl})` : "\u2014"} |`
|
|
415
|
+
(b) => `| ${b.position} | ${cell(b.name)} | ${b.rating ?? "\u2014"} (${b.reviewCount ?? "0"}) | ${b.websiteUrl ? `[link](${b.websiteUrl})` : "\u2014"} |`
|
|
261
416
|
).join("\n");
|
|
262
417
|
const localSection = localPack.length ? `
|
|
263
418
|
## Local Pack (${localPack.length})
|
|
@@ -273,12 +428,12 @@ ${localRows}` : "";
|
|
|
273
428
|
- Get PAA questions: use \`harvest_paa\` for this query
|
|
274
429
|
- Scrape any result: use \`extract_url\`
|
|
275
430
|
- Business entity IDs (CID/GCID/KG MID) shown above if found`;
|
|
276
|
-
const full =
|
|
431
|
+
const full = `# SERP Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
|
|
277
432
|
|
|
278
|
-
${serpTable}${localSection}${entityIdsSection(entityIds)}${aiSection}${tips}`;
|
|
433
|
+
${serpTable}${localSection}${entityIdsSection(entityIds)}${aiSection}${debugSection(diagnostics?.debug)}${tips}`;
|
|
279
434
|
const topO = organic.slice(0, 5).map((r) => `${r.position}. [${r.title}](${r.url}) \u2014 ${r.domain}`).join("\n");
|
|
280
435
|
const summary = [
|
|
281
|
-
|
|
436
|
+
`**SERP: "${input.query}"** \u2014 ${organic.length} organic results`,
|
|
282
437
|
topO ? `
|
|
283
438
|
**Top results:**
|
|
284
439
|
${topO}` : "",
|
|
@@ -288,7 +443,7 @@ ${topO}` : "",
|
|
|
288
443
|
`
|
|
289
444
|
\u{1F4A1} Use \`harvest_paa\` for questions \xB7 \`extract_url\` to scrape any result`
|
|
290
445
|
].filter(Boolean).join("\n");
|
|
291
|
-
return
|
|
446
|
+
return oneBlock(full);
|
|
292
447
|
}
|
|
293
448
|
function formatExtractUrl(raw, input) {
|
|
294
449
|
const parsed = parseData(raw);
|
|
@@ -300,6 +455,9 @@ function formatExtractUrl(raw, input) {
|
|
|
300
455
|
const kpo = d.kpo;
|
|
301
456
|
const bodyMd = d.bodyMarkdown ?? "";
|
|
302
457
|
const schema = d.schema;
|
|
458
|
+
const screenshotMeta = d.screenshot;
|
|
459
|
+
const branding = d.branding;
|
|
460
|
+
const media = d.media;
|
|
303
461
|
const h1Lines = headings.filter((h) => h.level === 1).map((h) => `- ${h.text}`).join("\n");
|
|
304
462
|
const h2Lines = headings.filter((h) => h.level === 2).map((h) => ` - ${h.text}`).join("\n");
|
|
305
463
|
const headingSection = h1Lines || h2Lines ? `
|
|
@@ -322,6 +480,26 @@ ${[h1Lines, h2Lines].filter(Boolean).join("\n")}` : "";
|
|
|
322
480
|
const bodySection = bodyMd ? `
|
|
323
481
|
## Page Content
|
|
324
482
|
${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
|
|
483
|
+
const screenshotSection = screenshotMeta ? `
|
|
484
|
+
## Screenshot
|
|
485
|
+
- **File:** ${screenshotMeta.savedPath}
|
|
486
|
+
- **Size:** ${(screenshotMeta.sizeBytes / 1024).toFixed(1)} KB
|
|
487
|
+
- **Device:** ${screenshotMeta.device}` : "";
|
|
488
|
+
const brandingSection = branding ? [
|
|
489
|
+
`
|
|
490
|
+
## Branding`,
|
|
491
|
+
branding.colorScheme ? `- **Color scheme:** ${branding.colorScheme}` : "",
|
|
492
|
+
`- **Colors:**${Object.entries(branding.colors ?? {}).filter(([, v]) => v).map(([k, v]) => ` ${k}=${v}`).join(",") || " (none extracted)"}`,
|
|
493
|
+
`- **Fonts:**${Object.entries(branding.fonts ?? {}).filter(([, v]) => v).map(([k, v]) => ` ${k}=${v}`).join(",") || " (none extracted)"}`,
|
|
494
|
+
branding.assets?.logo ? `- **Logo:** ${branding.assets.logo}` : "",
|
|
495
|
+
branding.assets?.favicon ? `- **Favicon:** ${branding.assets.favicon}` : ""
|
|
496
|
+
].filter(Boolean).join("\n") : "";
|
|
497
|
+
const mediaSection = media ? [
|
|
498
|
+
`
|
|
499
|
+
## Media Assets`,
|
|
500
|
+
`- **Found:** ${media.totalFound} total, ${media.filteredCount} filtered (ads/noise), ${media.assets.length} downloaded`,
|
|
501
|
+
media.outputDir ? `- **Saved to:** ${media.outputDir}` : ""
|
|
502
|
+
].filter(Boolean).join("\n") : "";
|
|
325
503
|
const schemaCount = Array.isArray(schema) ? schema.length : 0;
|
|
326
504
|
const tips = `
|
|
327
505
|
---
|
|
@@ -329,19 +507,23 @@ ${bodyMd.slice(0, 3e3)}${bodyMd.length > 3e3 ? "\n\n*(truncated)*" : ""}` : "";
|
|
|
329
507
|
- Crawl entire site: use \`extract_site\`
|
|
330
508
|
- Map all URLs: use \`map_site_urls\`
|
|
331
509
|
- ${schemaCount} JSON-LD schema block(s) detected`;
|
|
332
|
-
const full =
|
|
510
|
+
const full = `# URL Extract: ${url}
|
|
333
511
|
**${title}**
|
|
334
|
-
${headingSection}${kpoSection}${bodySection}${tips}`;
|
|
335
|
-
const
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
512
|
+
${headingSection}${kpoSection}${brandingSection}${bodySection}${screenshotSection}${mediaSection}${tips}`;
|
|
513
|
+
const textResult = oneBlock(full);
|
|
514
|
+
if (screenshotMeta?.savedPath) {
|
|
515
|
+
try {
|
|
516
|
+
const imgBuf = (0, import_node_fs.readFileSync)(screenshotMeta.savedPath);
|
|
517
|
+
return {
|
|
518
|
+
content: [
|
|
519
|
+
...textResult.content,
|
|
520
|
+
{ type: "image", data: imgBuf.toString("base64"), mimeType: "image/png" }
|
|
521
|
+
]
|
|
522
|
+
};
|
|
523
|
+
} catch {
|
|
524
|
+
}
|
|
525
|
+
}
|
|
526
|
+
return textResult;
|
|
345
527
|
}
|
|
346
528
|
function formatMapSiteUrls(raw, input) {
|
|
347
529
|
const parsed = parseData(raw);
|
|
@@ -353,7 +535,7 @@ function formatMapSiteUrls(raw, input) {
|
|
|
353
535
|
const redirects = urls.filter((u) => u.status !== null && u.status >= 300 && u.status < 400);
|
|
354
536
|
const urlRows = urls.slice(0, 200).map((u, i) => `| ${i + 1} | ${u.url} | ${u.status ?? "\u2014"} |`).join("\n");
|
|
355
537
|
const full = [
|
|
356
|
-
|
|
538
|
+
`# URL Map: ${input.url}`,
|
|
357
539
|
`**${d.totalFound} URLs** \xB7 ${(d.durationMs / 1e3).toFixed(1)}s${d.truncated ? " \xB7 *truncated*" : ""}`,
|
|
358
540
|
`
|
|
359
541
|
## Summary
|
|
@@ -375,14 +557,14 @@ ${broken.map((u) => `- ${u.url} (${u.status})`).join("\n")}` : "",
|
|
|
375
557
|
- Scrape a single page: use \`extract_url\``
|
|
376
558
|
].filter(Boolean).join("\n");
|
|
377
559
|
const summary = [
|
|
378
|
-
|
|
560
|
+
`**URL Map: ${input.url}**`,
|
|
379
561
|
`${d.totalFound} URLs \u2014 ${ok.length} OK \xB7 ${broken.length} broken \xB7 ${redirects.length} redirects`,
|
|
380
562
|
broken.length ? `
|
|
381
563
|
**Broken URLs:** ${broken.slice(0, 3).map((u) => u.url).join(", ")}` : "",
|
|
382
564
|
`
|
|
383
565
|
\u{1F4A1} Use \`extract_site\` to extract content from all pages`
|
|
384
566
|
].filter(Boolean).join("\n");
|
|
385
|
-
return
|
|
567
|
+
return oneBlock(full);
|
|
386
568
|
}
|
|
387
569
|
function formatExtractSite(raw, input) {
|
|
388
570
|
const parsed = parseData(raw);
|
|
@@ -391,10 +573,10 @@ function formatExtractSite(raw, input) {
|
|
|
391
573
|
const pages = d.pages ?? [];
|
|
392
574
|
const pageRows = pages.map((p, i) => {
|
|
393
575
|
const schemaInfo = p.kpo?.type?.join(", ") ?? (Array.isArray(p.schema) && p.schema.length ? `${p.schema.length} block(s)` : "\u2014");
|
|
394
|
-
return `| ${i + 1} | ${p.title ?? "Untitled"} | ${p.url} | ${schemaInfo} |`;
|
|
576
|
+
return `| ${i + 1} | ${cell(p.title ?? "Untitled")} | ${p.url} | ${schemaInfo} |`;
|
|
395
577
|
}).join("\n");
|
|
396
578
|
const full = [
|
|
397
|
-
|
|
579
|
+
`# Site Extract: ${input.url}`,
|
|
398
580
|
`**${pages.length} pages** \xB7 ${((d.durationMs ?? 0) / 1e3).toFixed(1)}s`,
|
|
399
581
|
`
|
|
400
582
|
## Pages
|
|
@@ -408,13 +590,13 @@ ${pageRows}`,
|
|
|
408
590
|
- Inspect a single page: use \`extract_url\``
|
|
409
591
|
].join("\n");
|
|
410
592
|
const summary = [
|
|
411
|
-
|
|
593
|
+
`**Site Extract: ${input.url}** \u2014 ${pages.length} pages`,
|
|
412
594
|
pages.slice(0, 5).map((p) => `- ${p.title ?? p.url}`).join("\n"),
|
|
413
595
|
pages.length > 5 ? `- \u2026 and ${pages.length - 5} more` : "",
|
|
414
596
|
`
|
|
415
597
|
\u{1F4A1} Use \`extract_url\` to inspect any individual page`
|
|
416
598
|
].filter(Boolean).join("\n");
|
|
417
|
-
return
|
|
599
|
+
return oneBlock(full);
|
|
418
600
|
}
|
|
419
601
|
function formatYoutubeHarvest(raw, input) {
|
|
420
602
|
const parsed = parseData(raw);
|
|
@@ -423,14 +605,14 @@ function formatYoutubeHarvest(raw, input) {
|
|
|
423
605
|
const videos = d.videos ?? [];
|
|
424
606
|
const label = input.mode === "channel" ? input.channelHandle ?? "channel" : `"${input.query ?? ""}"`;
|
|
425
607
|
const videoRows = videos.map(
|
|
426
|
-
(v, i) => `| ${i + 1} | ${truncate(v.title, 70)} | ${v.channelName} | ${v.views ?? "\u2014"} | ${v.duration ?? "\u2014"} | \`${v.videoId}\` |`
|
|
608
|
+
(v, i) => `| ${i + 1} | ${cell(truncate(v.title, 70))} | ${cell(v.channelName)} | ${v.views ?? "\u2014"} | ${v.duration ?? "\u2014"} | \`${v.videoId}\` |`
|
|
427
609
|
).join("\n");
|
|
428
610
|
const channelSection = d.channelMeta ? `
|
|
429
611
|
## Channel
|
|
430
612
|
- **Name:** ${d.channelMeta.title ?? "\u2014"}
|
|
431
613
|
- **Subscribers:** ${d.channelMeta.subscriberCount ?? "\u2014"}` : "";
|
|
432
614
|
const full = [
|
|
433
|
-
|
|
615
|
+
`# YouTube Harvest: ${label}`,
|
|
434
616
|
`**${videos.length} videos** \xB7 ${(d.stats.durationMs / 1e3).toFixed(1)}s`,
|
|
435
617
|
channelSection,
|
|
436
618
|
`
|
|
@@ -446,14 +628,14 @@ ${videoRows}`,
|
|
|
446
628
|
].filter(Boolean).join("\n");
|
|
447
629
|
const top5 = videos.slice(0, 5).map((v, i) => `${i + 1}. ${v.title} (\`${v.videoId}\`)`).join("\n");
|
|
448
630
|
const summary = [
|
|
449
|
-
|
|
631
|
+
`**YouTube: ${label}** \u2014 ${videos.length} videos`,
|
|
450
632
|
`
|
|
451
633
|
**Top videos:**
|
|
452
634
|
${top5}`,
|
|
453
635
|
`
|
|
454
636
|
\u{1F4A1} Transcribe any video: \`youtube_transcribe\` with its videoId`
|
|
455
637
|
].join("\n");
|
|
456
|
-
return
|
|
638
|
+
return oneBlock(full);
|
|
457
639
|
}
|
|
458
640
|
function formatYoutubeTranscribe(raw, input) {
|
|
459
641
|
const parsed = parseData(raw);
|
|
@@ -463,13 +645,13 @@ function formatYoutubeTranscribe(raw, input) {
|
|
|
463
645
|
const chunks = d.chunks ?? [];
|
|
464
646
|
const durSec = d.durationMs ? (d.durationMs / 1e3).toFixed(0) : "\u2014";
|
|
465
647
|
const chunkRows = chunks.slice(0, 50).map((c) => {
|
|
466
|
-
const sec = Math.floor(c.
|
|
648
|
+
const sec = Number.isFinite(c.timestamp[0]) ? Math.floor(c.timestamp[0]) : 0;
|
|
467
649
|
const mm = String(Math.floor(sec / 60)).padStart(2, "0");
|
|
468
650
|
const ss = String(sec % 60).padStart(2, "0");
|
|
469
|
-
return `| ${mm}:${ss} | ${truncate(c.text, 120)} |`;
|
|
651
|
+
return `| ${mm}:${ss} | ${cell(truncate(c.text, 120))} |`;
|
|
470
652
|
}).join("\n");
|
|
471
653
|
const full = [
|
|
472
|
-
|
|
654
|
+
`# YouTube Transcript: \`${input.videoId}\``,
|
|
473
655
|
`**Duration:** ${durSec}s \xB7 **${text.split(" ").length} words**`,
|
|
474
656
|
`
|
|
475
657
|
## Full Transcript
|
|
@@ -484,14 +666,14 @@ ${chunkRows}` : "",
|
|
|
484
666
|
\u{1F4A1} Harvest more from this channel: use \`youtube_harvest\` with \`mode: "channel"\``
|
|
485
667
|
].filter(Boolean).join("\n");
|
|
486
668
|
const summary = [
|
|
487
|
-
|
|
669
|
+
`**YouTube Transcript: \`${input.videoId}\`** \u2014 ${text.split(" ").length} words \xB7 ${durSec}s`,
|
|
488
670
|
`
|
|
489
671
|
**Preview:**
|
|
490
672
|
> ${truncate(text, 300)}`,
|
|
491
673
|
`
|
|
492
674
|
\u{1F4A1} Full transcript in artifact above`
|
|
493
675
|
].join("\n");
|
|
494
|
-
return
|
|
676
|
+
return oneBlock(full);
|
|
495
677
|
}
|
|
496
678
|
function formatFacebookPageIntel(raw, input) {
|
|
497
679
|
const parsed = parseData(raw);
|
|
@@ -509,7 +691,7 @@ function formatFacebookPageIntel(raw, input) {
|
|
|
509
691
|
ad.variations ? `**Variations:** ${ad.variations}` : ""
|
|
510
692
|
].filter(Boolean).join("\n")).join("\n\n---\n\n");
|
|
511
693
|
const full = [
|
|
512
|
-
|
|
694
|
+
`# Facebook Ad Intel: ${advertiser}`,
|
|
513
695
|
`**${s.totalAds} ads** \xB7 ${s.activeCount} active \xB7 ${s.videoCount} video \xB7 ${s.imageCount} image`,
|
|
514
696
|
`
|
|
515
697
|
${adBlocks}`,
|
|
@@ -523,7 +705,7 @@ ${adBlocks}`,
|
|
|
523
705
|
const adSummary = activeAds.map((a, i) => `${i + 1}. ${truncate(a.headline ?? a.primaryText, 80)} (${a.creativeType ?? "\u2014"})`).join("\n");
|
|
524
706
|
const videoCount = ads.filter((a) => a.videoUrl).length;
|
|
525
707
|
const summary = [
|
|
526
|
-
|
|
708
|
+
`**Facebook Ads: ${advertiser}** \u2014 ${s.totalAds} ads (${s.activeCount} active)`,
|
|
527
709
|
adSummary ? `
|
|
528
710
|
**Active ads:**
|
|
529
711
|
${adSummary}` : "",
|
|
@@ -531,7 +713,7 @@ ${adSummary}` : "",
|
|
|
531
713
|
videoCount ? `
|
|
532
714
|
\u{1F4A1} ${videoCount} video ads \u2014 transcribe with \`facebook_ad_transcribe\` using the videoUrl` : ""
|
|
533
715
|
].filter(Boolean).join("\n");
|
|
534
|
-
return
|
|
716
|
+
return oneBlock(full);
|
|
535
717
|
}
|
|
536
718
|
function formatFacebookAdSearch(raw, input) {
|
|
537
719
|
const parsed = parseData(raw);
|
|
@@ -539,10 +721,10 @@ function formatFacebookAdSearch(raw, input) {
|
|
|
539
721
|
const d = parsed.data;
|
|
540
722
|
const advertisers = d.results ?? d.advertisers ?? [];
|
|
541
723
|
const rows = advertisers.map(
|
|
542
|
-
(a, i) => `| ${i + 1} | ${a.name} | ${a.adCount ?? "\u2014"} | \`${a.libraryId ?? "\u2014"}\` |`
|
|
724
|
+
(a, i) => `| ${i + 1} | ${cell(a.name)} | ${a.adCount ?? "\u2014"} | \`${a.libraryId ?? "\u2014"}\` |`
|
|
543
725
|
).join("\n");
|
|
544
726
|
const full = [
|
|
545
|
-
|
|
727
|
+
`# Facebook Ad Library Search: "${input.query}"`,
|
|
546
728
|
`**${advertisers.length} advertisers found**`,
|
|
547
729
|
`
|
|
548
730
|
## Advertisers
|
|
@@ -556,14 +738,14 @@ ${rows}`,
|
|
|
556
738
|
- Or pass the advertiser name as \`query\` in \`facebook_page_intel\``
|
|
557
739
|
].join("\n");
|
|
558
740
|
const summary = [
|
|
559
|
-
|
|
741
|
+
`**Facebook Ad Search: "${input.query}"** \u2014 ${advertisers.length} advertisers`,
|
|
560
742
|
advertisers.slice(0, 5).map(
|
|
561
743
|
(a, i) => `${i + 1}. ${a.name}${a.adCount ? ` (${a.adCount} ads)` : ""} \u2014 \`${a.libraryId ?? "\u2014"}\``
|
|
562
744
|
).join("\n"),
|
|
563
745
|
`
|
|
564
746
|
\u{1F4A1} Scan ads with \`facebook_page_intel\` using \`libraryId\``
|
|
565
747
|
].filter(Boolean).join("\n");
|
|
566
|
-
return
|
|
748
|
+
return oneBlock(full);
|
|
567
749
|
}
|
|
568
750
|
function formatCreditsInfo(raw, input) {
|
|
569
751
|
const parsed = parseData(raw);
|
|
@@ -589,7 +771,7 @@ ${matched.notes}` : ""}` : input.item ? `
|
|
|
589
771
|
## Matched Cost
|
|
590
772
|
No exact cost match found for "${input.item}". See the full cost table below.` : "";
|
|
591
773
|
const full = [
|
|
592
|
-
|
|
774
|
+
`# Credits`,
|
|
593
775
|
`**Balance:** ${balance ?? "unknown"} credits`,
|
|
594
776
|
matchedSection,
|
|
595
777
|
costs.length ? `
|
|
@@ -604,13 +786,13 @@ ${costRows}` : "",
|
|
|
604
786
|
${ledgerRows}` : ""
|
|
605
787
|
].filter(Boolean).join("\n");
|
|
606
788
|
const summary = [
|
|
607
|
-
|
|
789
|
+
`**Credit balance:** ${balance ?? "unknown"} credits`,
|
|
608
790
|
matched ? `
|
|
609
791
|
**${matched.label}:** ${matched.credits} credits ${matched.unit}` : null,
|
|
610
792
|
input.includeLedger && ledger.length ? `
|
|
611
793
|
Recent ledger entries included in the full report.` : null
|
|
612
794
|
].filter(Boolean).join("\n");
|
|
613
|
-
return
|
|
795
|
+
return oneBlock(full);
|
|
614
796
|
}
|
|
615
797
|
function formatMapsPlaceIntel(raw, input) {
|
|
616
798
|
const parsed = parseData(raw);
|
|
@@ -636,6 +818,7 @@ function formatMapsPlaceIntel(raw, input) {
|
|
|
636
818
|
const topics = d.reviewTopics ?? [];
|
|
637
819
|
const about = d.aboutAttributes ?? [];
|
|
638
820
|
const reviews = d.reviews ?? [];
|
|
821
|
+
const reviewsStatus = d.reviewsStatus ?? "not_requested";
|
|
639
822
|
const hoursTable = d.hoursTable ?? [];
|
|
640
823
|
const ratingLine = [rating, reviewCount ? `(${reviewCount} reviews)` : null].filter(Boolean).join(" ");
|
|
641
824
|
const basicLines = [
|
|
@@ -674,18 +857,24 @@ ${attrs.map((a) => `- ${a}`).join("\n")}`).join("\n\n")}` : "";
|
|
|
674
857
|
cidUrl ? `- **Maps CID URL:** ${cidUrl}` : null,
|
|
675
858
|
lat != null && lng != null ? `- **Coordinates:** ${lat}, ${lng}` : null
|
|
676
859
|
].filter(Boolean).join("\n");
|
|
677
|
-
const reviewsSection =
|
|
860
|
+
const reviewsSection = (() => {
|
|
861
|
+
if (reviewsStatus === "not_requested") return "";
|
|
862
|
+
if (reviewsStatus === "unavailable") return "\n## Reviews\n> Reviews could not be retrieved this run \u2014 retry with `includeReviews: true`.";
|
|
863
|
+
if (reviewsStatus === "none_exist") return "\n## Reviews\n*This business has no reviews on Google Maps.*";
|
|
864
|
+
if (reviews.length === 0) return "\n## Reviews\n*0 reviews collected.*";
|
|
865
|
+
return `
|
|
678
866
|
## Reviews (${reviews.length})
|
|
679
867
|
${reviews.map((r, i) => {
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
868
|
+
const starsN = parseInt(r.stars ?? "0");
|
|
869
|
+
const stars = "\u2605".repeat(starsN) + "\u2606".repeat(5 - starsN);
|
|
870
|
+
return `### ${i + 1}. ${r.author ?? "Anonymous"} \u2014 ${stars}
|
|
683
871
|
*${r.date ?? ""}*
|
|
684
872
|
|
|
685
873
|
${r.text ?? ""}`;
|
|
686
|
-
|
|
874
|
+
}).join("\n\n")}`;
|
|
875
|
+
})();
|
|
687
876
|
const full = [
|
|
688
|
-
|
|
877
|
+
`# ${name}`,
|
|
689
878
|
category ? `*${category}*` : null,
|
|
690
879
|
ratingLine ? `
|
|
691
880
|
**Rating:** ${ratingLine}` : null,
|
|
@@ -704,15 +893,19 @@ ${entitySection}` : null,
|
|
|
704
893
|
*Extracted in ${(durationMs / 1e3).toFixed(1)}s*` : null
|
|
705
894
|
].filter(Boolean).join("\n");
|
|
706
895
|
const summary = [
|
|
707
|
-
|
|
896
|
+
`**${name}** \u2014 ${category ?? "Business"} \xB7 ${ratingLine || "No rating"}`,
|
|
708
897
|
address ? `\u{1F4CD} ${address}` : null,
|
|
709
898
|
phone ? `\u{1F4DE} ${phone}` : null,
|
|
710
899
|
hoursSummary ? `\u{1F550} ${hoursSummary}` : null,
|
|
711
900
|
website ? `\u{1F310} ${website}` : null,
|
|
712
|
-
reviews.length ? `
|
|
713
|
-
\u{1F4AC} ${reviews.length} reviews fetched \u2014 full list in artifact above` : null
|
|
901
|
+
reviewsStatus === "collected" && reviews.length ? `
|
|
902
|
+
\u{1F4AC} ${reviews.length} reviews fetched \u2014 full list in artifact above` : null,
|
|
903
|
+
reviewsStatus === "unavailable" ? `
|
|
904
|
+
\u26A0\uFE0F Reviews could not be retrieved this run` : null,
|
|
905
|
+
reviewsStatus === "none_exist" ? `
|
|
906
|
+
\u{1F4AC} No reviews on Google Maps` : null
|
|
714
907
|
].filter(Boolean).join("\n");
|
|
715
|
-
return
|
|
908
|
+
return oneBlock(full);
|
|
716
909
|
}
|
|
717
910
|
function formatFacebookAdTranscribe(raw, input) {
|
|
718
911
|
const parsed = parseData(raw);
|
|
@@ -722,13 +915,13 @@ function formatFacebookAdTranscribe(raw, input) {
|
|
|
722
915
|
const chunks = d.chunks ?? [];
|
|
723
916
|
const durSec = d.durationMs ? (d.durationMs / 1e3).toFixed(0) : "\u2014";
|
|
724
917
|
const chunkRows = chunks.slice(0, 50).map((c) => {
|
|
725
|
-
const sec = Math.floor(c.
|
|
918
|
+
const sec = Number.isFinite(c.timestamp[0]) ? Math.floor(c.timestamp[0]) : 0;
|
|
726
919
|
const mm = String(Math.floor(sec / 60)).padStart(2, "0");
|
|
727
920
|
const ss = String(sec % 60).padStart(2, "0");
|
|
728
|
-
return `| ${mm}:${ss} | ${truncate(c.text, 120)} |`;
|
|
921
|
+
return `| ${mm}:${ss} | ${cell(truncate(c.text, 120))} |`;
|
|
729
922
|
}).join("\n");
|
|
730
923
|
const full = [
|
|
731
|
-
|
|
924
|
+
`# Facebook Ad Transcript`,
|
|
732
925
|
`**Duration:** ${durSec}s \xB7 **${text.split(" ").length} words**`,
|
|
733
926
|
`
|
|
734
927
|
## Full Transcript
|
|
@@ -743,53 +936,53 @@ ${chunkRows}` : "",
|
|
|
743
936
|
\u{1F4A1} Get more ads from this advertiser: use \`facebook_page_intel\``
|
|
744
937
|
].filter(Boolean).join("\n");
|
|
745
938
|
const summary = [
|
|
746
|
-
|
|
939
|
+
`**Facebook Ad Transcript** \u2014 ${text.split(" ").length} words \xB7 ${durSec}s`,
|
|
747
940
|
`
|
|
748
941
|
**Preview:**
|
|
749
942
|
> ${truncate(text, 300)}`,
|
|
750
943
|
`
|
|
751
944
|
\u{1F4A1} Full transcript in artifact above`
|
|
752
945
|
].join("\n");
|
|
753
|
-
return
|
|
946
|
+
return oneBlock(full);
|
|
754
947
|
}
|
|
755
948
|
|
|
756
949
|
// src/mcp/paa-mcp-server.ts
|
|
757
950
|
function buildPaaExtractorMcpServer(executor2) {
|
|
758
|
-
const server2 = new import_mcp.McpServer({ name: "
|
|
951
|
+
const server2 = new import_mcp.McpServer({ name: "mcp-scraper", version: "1.0.0" });
|
|
759
952
|
server2.registerTool("harvest_paa", {
|
|
760
|
-
description:
|
|
953
|
+
description: 'Best default tool for Google search research. Extracts People Also Ask questions plus answers/source URLs, organic SERP, local pack when present, entity IDs (CID/GCID/KG MID), and AI Overview. Infer the user language: split topic from location (e.g. "best hvac company in Denver CO" => query "best hvac company", location "Denver, CO", gl "us", hl "en"). Use maxQuestions 30 normally, 100-150 for "full", "deep", "all", or comprehensive research. Credits are charged by extracted question; unused request hold is refunded. Saves a full Markdown report locally.',
|
|
761
954
|
inputSchema: HarvestPaaInputSchema
|
|
762
955
|
}, async (input) => formatHarvestPaa(await executor2.harvestPaa(input), input));
|
|
763
956
|
server2.registerTool("search_serp", {
|
|
764
|
-
description: "
|
|
957
|
+
description: "Fast Google SERP lookup without PAA expansion. Use when the user asks for rankings, organic results, local pack, quick SERP, or positions. Split topic from location and infer gl/hl from the user request. Saves a full Markdown report locally.",
|
|
765
958
|
inputSchema: SearchSerpInputSchema
|
|
766
959
|
}, async (input) => formatSearchSerp(await executor2.searchSerp(input), input));
|
|
767
960
|
server2.registerTool("extract_url", {
|
|
768
|
-
description: "Extract structured data from
|
|
961
|
+
description: "Extract structured data from one public URL: page content as Markdown, heading structure, JSON-LD schema, entity details, NAP score, metadata, and missing schema fields. Use when the user provides a single URL or asks to inspect/scrape one page. Saves a full Markdown report locally.",
|
|
769
962
|
inputSchema: ExtractUrlInputSchema
|
|
770
963
|
}, async (input) => formatExtractUrl(await executor2.extractUrl(input), input));
|
|
771
964
|
server2.registerTool("map_site_urls", {
|
|
772
|
-
description: "
|
|
965
|
+
description: "Map/crawl a public website to build a URL inventory with HTTP status codes, broken links, redirects, and site scope. Use before extract_site for audits or when the user asks for a sitemap/URL inventory. Saves a full Markdown report locally.",
|
|
773
966
|
inputSchema: MapSiteUrlsInputSchema
|
|
774
967
|
}, async (input) => formatMapSiteUrls(await executor2.mapSiteUrls(input), input));
|
|
775
968
|
server2.registerTool("extract_site", {
|
|
776
|
-
description: "Run multi-page extraction across
|
|
969
|
+
description: "Run multi-page extraction across a public website. Returns per-page titles, H1s, metadata, headings, schema/entity data, canonical URLs, and content. Use for website audits, competitor audits, and full-site extraction. Saves a full Markdown report locally.",
|
|
777
970
|
inputSchema: ExtractSiteInputSchema
|
|
778
971
|
}, async (input) => formatExtractSite(await executor2.extractSite(input), input));
|
|
779
972
|
server2.registerTool("youtube_harvest", {
|
|
780
|
-
description: 'Harvest YouTube video metadata by search query or channel handle.
|
|
973
|
+
description: 'Harvest YouTube video metadata by search query or channel handle/ID/URL. Use mode "search" for keyword/topic requests and mode "channel" for @handles, channel IDs, or channel URLs. Returns titles, views, dates, durations, URLs, thumbnails, and videoIds for follow-up transcription. Saves a full Markdown report locally.',
|
|
781
974
|
inputSchema: YoutubeHarvestInputSchema
|
|
782
975
|
}, async (input) => formatYoutubeHarvest(await executor2.youtubeHarvest(input), input));
|
|
783
976
|
server2.registerTool("youtube_transcribe", {
|
|
784
|
-
description: "Fetch and transcribe captions from a YouTube video. Returns full transcript, timestamped chunks, and word count. Pass a videoId from youtube_harvest results.",
|
|
977
|
+
description: "Fetch and transcribe captions from a YouTube video. Returns full transcript, timestamped chunks, and word count. Pass a videoId from youtube_harvest results or infer it from a YouTube URL if the user provided one. Saves a full Markdown report locally.",
|
|
785
978
|
inputSchema: YoutubeTranscribeInputSchema
|
|
786
979
|
}, async (input) => formatYoutubeTranscribe(await executor2.youtubeTranscribe(input), input));
|
|
787
980
|
server2.registerTool("facebook_page_intel", {
|
|
788
|
-
description: "Harvest
|
|
981
|
+
description: "Harvest ads from a Facebook advertiser. Returns ad copy, headlines, CTAs, creative type, status, landing URLs, and video URLs ready for transcription. Accepts pageId, libraryId, or a brand/advertiser name as query. Use after facebook_ad_search when possible. Saves a full Markdown report locally.",
|
|
789
982
|
inputSchema: FacebookPageIntelInputSchema
|
|
790
983
|
}, async (input) => formatFacebookPageIntel(await executor2.facebookPageIntel(input), input));
|
|
791
984
|
server2.registerTool("facebook_ad_search", {
|
|
792
|
-
description: "Search Facebook Ad Library by keyword. Returns advertisers with ad counts and library IDs. Use to discover competitors, then pass libraryId to facebook_page_intel
|
|
985
|
+
description: "Search Facebook Ad Library by brand, advertiser, competitor, niche, or keyword. Returns advertisers with ad counts and library IDs. Use to discover competitors, then pass libraryId to facebook_page_intel. Saves a full Markdown report locally.",
|
|
793
986
|
inputSchema: FacebookAdSearchInputSchema
|
|
794
987
|
}, async (input) => formatFacebookAdSearch(await executor2.facebookAdSearch(input), input));
|
|
795
988
|
server2.registerTool("facebook_ad_transcribe", {
|
|
@@ -797,7 +990,7 @@ function buildPaaExtractorMcpServer(executor2) {
|
|
|
797
990
|
inputSchema: FacebookAdTranscribeInputSchema
|
|
798
991
|
}, async (input) => formatFacebookAdTranscribe(await executor2.facebookAdTranscribe(input), input));
|
|
799
992
|
server2.registerTool("maps_place_intel", {
|
|
800
|
-
description:
|
|
993
|
+
description: 'Extract Google Maps business intelligence for a named business: rating, review count, category, address, phone, website, hours, booking URL, review histogram, review topics, about attributes, entity IDs, and optional review cards. Split business name from location (e.g. "Elite Roofing Denver CO" => businessName "Elite Roofing", location "Denver, CO"). Pass includeReviews true when the user asks for reviews/customer pain. Saves a full Markdown report locally.',
|
|
801
994
|
inputSchema: MapsPlaceIntelInputSchema
|
|
802
995
|
}, async (input) => formatMapsPlaceIntel(await executor2.mapsPlaceIntel(input), input));
|
|
803
996
|
server2.registerTool("credits_info", {
|
|
@@ -810,10 +1003,10 @@ function buildPaaExtractorMcpServer(executor2) {
|
|
|
810
1003
|
// bin/mcp-stdio-server.ts
|
|
811
1004
|
function readApiKeyFile() {
|
|
812
1005
|
const explicitPath = process.env.MCP_SCRAPER_KEY_PATH?.trim();
|
|
813
|
-
const paths = [explicitPath, (0,
|
|
1006
|
+
const paths = [explicitPath, (0, import_node_path2.join)((0, import_node_os2.homedir)(), ".mcp-scraper-key")].filter(Boolean);
|
|
814
1007
|
for (const path of paths) {
|
|
815
1008
|
try {
|
|
816
|
-
const value = (0,
|
|
1009
|
+
const value = (0, import_node_fs2.readFileSync)(path, "utf8").trim();
|
|
817
1010
|
if (value) return value;
|
|
818
1011
|
} catch {
|
|
819
1012
|
}
|