mcp-scraper 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/dist/bin/api-server.cjs +572 -171
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +2 -2
- package/dist/bin/mcp-stdio-server.cjs +299 -149
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +2 -1
- package/dist/bin/mcp-stdio-server.js.map +1 -1
- package/dist/bin/paa-harvest.cjs +22 -1
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +2 -1
- package/dist/bin/paa-harvest.js.map +1 -1
- package/dist/{chunk-6TWZS2FQ.js → chunk-3OIRNUF5.js} +302 -150
- package/dist/chunk-3OIRNUF5.js.map +1 -0
- package/dist/{chunk-W4P2U5VF.js → chunk-LUBDFS67.js} +32 -32
- package/dist/chunk-LUBDFS67.js.map +1 -0
- package/dist/{chunk-7HB7NDOY.js → chunk-ZK456YXN.js} +12 -2
- package/dist/chunk-ZK456YXN.js.map +1 -0
- package/dist/chunk-ZMOWIBMK.js +36 -0
- package/dist/chunk-ZMOWIBMK.js.map +1 -0
- package/dist/index.cjs +22 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +2 -1
- package/dist/index.js.map +1 -1
- package/dist/{server-2Y27U4TO.js → server-YNJHP5PU.js} +235 -22
- package/dist/server-YNJHP5PU.js.map +1 -0
- package/dist/{worker-UT4ZQU2T.js → worker-PBG6LGET.js} +4 -3
- package/dist/{worker-UT4ZQU2T.js.map → worker-PBG6LGET.js.map} +1 -1
- package/docs/adr/0001-in-page-graphql-interception-for-anti-bot-scraping.md +58 -0
- package/docs/adr/README.md +11 -0
- package/docs/mcp-tool-quality-spec.md +238 -0
- package/package.json +5 -4
- package/dist/chunk-6TWZS2FQ.js.map +0 -1
- package/dist/chunk-7HB7NDOY.js.map +0 -1
- package/dist/chunk-W4P2U5VF.js.map +0 -1
- package/dist/server-2Y27U4TO.js.map +0 -1
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
import {
|
|
2
|
+
sanitizeVendorName
|
|
3
|
+
} from "./chunk-ZMOWIBMK.js";
|
|
4
|
+
|
|
1
5
|
// src/harvest-timeout.ts
|
|
2
6
|
var VERCEL_FUNCTION_MAX_MS = 3e5;
|
|
3
7
|
var CLIENT_OVER_SERVER_MARGIN_MS = 15e3;
|
|
@@ -15,6 +19,9 @@ function harvestTimeoutBudget(maxQuestions, serpOnly = false) {
|
|
|
15
19
|
// src/mcp/paa-mcp-server.ts
|
|
16
20
|
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
17
21
|
|
|
22
|
+
// src/version.ts
|
|
23
|
+
var PACKAGE_VERSION = "0.1.7";
|
|
24
|
+
|
|
18
25
|
// src/mcp/mcp-tool-schemas.ts
|
|
19
26
|
import { z } from "zod";
|
|
20
27
|
var HarvestPaaInputSchema = {
|
|
@@ -77,6 +84,93 @@ var MapsPlaceIntelInputSchema = {
|
|
|
77
84
|
includeReviews: z.boolean().default(false).describe("Whether to fetch individual review cards"),
|
|
78
85
|
maxReviews: z.number().int().min(1).max(500).default(50).describe("Max review cards to return (requires includeReviews: true)")
|
|
79
86
|
};
|
|
87
|
+
var MapsSearchInputSchema = {
|
|
88
|
+
query: z.string().min(1).describe('Business category, niche, keyword, or search term. If the user says "roofers in Denver CO", use query="roofers" and location="Denver, CO". Do not put the location here when it can be separated.'),
|
|
89
|
+
location: z.string().optional().describe('City, region, country, or service area for the Maps search, e.g. "Denver, CO". Infer from the user request when present.'),
|
|
90
|
+
gl: z.string().length(2).default("us").describe("Google country code inferred from location."),
|
|
91
|
+
hl: z.string().length(2).default("en").describe("Language inferred from user request."),
|
|
92
|
+
maxResults: z.number().int().min(1).max(50).default(10).describe("Number of Google Maps business/profile candidates to return. Default 10. Maximum 50. Use 10 unless the user asks for more.")
|
|
93
|
+
};
|
|
94
|
+
var NullableString = z.string().nullable();
|
|
95
|
+
var MapsSearchOutputSchema = {
|
|
96
|
+
query: z.string(),
|
|
97
|
+
location: z.string().nullable(),
|
|
98
|
+
searchQuery: z.string(),
|
|
99
|
+
searchUrl: z.string().url(),
|
|
100
|
+
extractedAt: z.string(),
|
|
101
|
+
requestedMaxResults: z.number().int().min(1).max(50),
|
|
102
|
+
resultCount: z.number().int().min(0).max(50),
|
|
103
|
+
results: z.array(z.object({
|
|
104
|
+
position: z.number().int().min(1),
|
|
105
|
+
name: z.string(),
|
|
106
|
+
placeUrl: z.string().url(),
|
|
107
|
+
cid: NullableString,
|
|
108
|
+
cidDecimal: NullableString,
|
|
109
|
+
rating: NullableString,
|
|
110
|
+
reviewCount: NullableString,
|
|
111
|
+
category: NullableString,
|
|
112
|
+
address: NullableString,
|
|
113
|
+
websiteUrl: NullableString,
|
|
114
|
+
directionsUrl: NullableString,
|
|
115
|
+
metadata: z.array(z.string())
|
|
116
|
+
})),
|
|
117
|
+
durationMs: z.number().int().min(0)
|
|
118
|
+
};
|
|
119
|
+
var MapSiteUrlsOutputSchema = {
|
|
120
|
+
startUrl: z.string(),
|
|
121
|
+
totalFound: z.number().int().min(0),
|
|
122
|
+
truncated: z.boolean(),
|
|
123
|
+
okCount: z.number().int().min(0),
|
|
124
|
+
redirectCount: z.number().int().min(0),
|
|
125
|
+
brokenCount: z.number().int().min(0),
|
|
126
|
+
urls: z.array(z.object({
|
|
127
|
+
url: z.string(),
|
|
128
|
+
status: z.number().int().nullable()
|
|
129
|
+
})),
|
|
130
|
+
durationMs: z.number().min(0)
|
|
131
|
+
};
|
|
132
|
+
var YoutubeHarvestOutputSchema = {
|
|
133
|
+
mode: z.string(),
|
|
134
|
+
videoCount: z.number().int().min(0),
|
|
135
|
+
channel: z.object({
|
|
136
|
+
title: NullableString,
|
|
137
|
+
subscriberCount: NullableString
|
|
138
|
+
}).nullable(),
|
|
139
|
+
videos: z.array(z.object({
|
|
140
|
+
videoId: z.string(),
|
|
141
|
+
title: z.string(),
|
|
142
|
+
channelName: NullableString,
|
|
143
|
+
views: NullableString,
|
|
144
|
+
duration: NullableString,
|
|
145
|
+
url: NullableString
|
|
146
|
+
}))
|
|
147
|
+
};
|
|
148
|
+
var FacebookAdSearchOutputSchema = {
|
|
149
|
+
query: z.string(),
|
|
150
|
+
advertiserCount: z.number().int().min(0),
|
|
151
|
+
advertisers: z.array(z.object({
|
|
152
|
+
name: NullableString,
|
|
153
|
+
adCount: z.number().int().nullable(),
|
|
154
|
+
libraryId: NullableString
|
|
155
|
+
}))
|
|
156
|
+
};
|
|
157
|
+
var FacebookPageIntelOutputSchema = {
|
|
158
|
+
advertiserName: NullableString,
|
|
159
|
+
totalAds: z.number().int().min(0),
|
|
160
|
+
activeCount: z.number().int().min(0),
|
|
161
|
+
videoCount: z.number().int().min(0),
|
|
162
|
+
imageCount: z.number().int().min(0),
|
|
163
|
+
ads: z.array(z.object({
|
|
164
|
+
libraryId: NullableString,
|
|
165
|
+
status: NullableString,
|
|
166
|
+
creativeType: NullableString,
|
|
167
|
+
headline: NullableString,
|
|
168
|
+
cta: NullableString,
|
|
169
|
+
startDate: NullableString,
|
|
170
|
+
videoUrl: NullableString,
|
|
171
|
+
variations: z.number().int().nullable()
|
|
172
|
+
}))
|
|
173
|
+
};
|
|
80
174
|
var CreditsInfoInputSchema = {
|
|
81
175
|
item: z.string().optional().describe('Optional tool, action, or feature to look up, e.g. "maps reviews", "extract_url", or "YouTube transcription"'),
|
|
82
176
|
includeLedger: z.boolean().default(false).describe("Whether to include recent credit ledger entries")
|
|
@@ -126,6 +220,15 @@ var CaptureSerpPageSnapshotsInputSchema = {
|
|
|
126
220
|
import { mkdirSync, writeFileSync } from "fs";
|
|
127
221
|
import { homedir } from "os";
|
|
128
222
|
import { join } from "path";
|
|
223
|
+
var reportSavingEnabled = true;
|
|
224
|
+
function configureReportSaving(enabled) {
|
|
225
|
+
reportSavingEnabled = enabled;
|
|
226
|
+
}
|
|
227
|
+
function sanitizeVendorText(text) {
|
|
228
|
+
return sanitizeVendorName(
|
|
229
|
+
text.replace(/kernel_session_id/gi, "browser_session_id").replace(/kernel_delete_succeeded/gi, "session_cleanup_succeeded").replace(/kernel_delete_started/gi, "session_cleanup_started").replace(/kernel_delete_error/gi, "session_cleanup_error").replace(/kernelSessionId/g, "browserSessionId").replace(/kernelProxyId/g, "proxyId").replace(/KERNEL_API_KEY/g, "BROWSER_SERVICE_API_KEY").replace(/"kernel"\s*:/gi, '"browserRuntime":')
|
|
230
|
+
);
|
|
231
|
+
}
|
|
129
232
|
function slugifyReportName(input) {
|
|
130
233
|
return input.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 80) || "mcp-scraper-report";
|
|
131
234
|
}
|
|
@@ -137,7 +240,7 @@ function outputBaseDir() {
|
|
|
137
240
|
return process.env.MCP_SCRAPER_OUTPUT_DIR?.trim() || join(homedir(), "Downloads", "mcp-scraper");
|
|
138
241
|
}
|
|
139
242
|
function saveFullReport(full) {
|
|
140
|
-
if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
|
|
243
|
+
if (!reportSavingEnabled || process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
|
|
141
244
|
const outDir = outputBaseDir();
|
|
142
245
|
try {
|
|
143
246
|
mkdirSync(outDir, { recursive: true });
|
|
@@ -150,7 +253,7 @@ function saveFullReport(full) {
|
|
|
150
253
|
}
|
|
151
254
|
}
|
|
152
255
|
function persistScreenshotLocally(base64, url) {
|
|
153
|
-
if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
|
|
256
|
+
if (!reportSavingEnabled || process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
|
|
154
257
|
try {
|
|
155
258
|
const dir = join(outputBaseDir(), "screenshots");
|
|
156
259
|
mkdirSync(dir, { recursive: true });
|
|
@@ -190,11 +293,11 @@ function parseData(raw) {
|
|
|
190
293
|
const text = first?.type === "text" ? first.text : "";
|
|
191
294
|
try {
|
|
192
295
|
const parsed = JSON.parse(text || "{}");
|
|
193
|
-
if (raw.isError || parsed.error || parsed.error_code) return { error: formatStructuredError(parsed, text) };
|
|
296
|
+
if (raw.isError || parsed.error || parsed.error_code) return { error: sanitizeVendorText(formatStructuredError(parsed, text)) };
|
|
194
297
|
const data = parsed.result ?? parsed;
|
|
195
298
|
return { data };
|
|
196
299
|
} catch {
|
|
197
|
-
if (raw.isError) return { error: text || "Tool error" };
|
|
300
|
+
if (raw.isError) return { error: sanitizeVendorText(text || "Tool error") };
|
|
198
301
|
return { error: "Failed to parse tool response" };
|
|
199
302
|
}
|
|
200
303
|
}
|
|
@@ -208,15 +311,6 @@ function entityIdsSection(ids) {
|
|
|
208
311
|
## Entity IDs
|
|
209
312
|
${lines.join("\n")}` : "";
|
|
210
313
|
}
|
|
211
|
-
function entityIdsSummaryLine(ids) {
|
|
212
|
-
if (!ids) return "";
|
|
213
|
-
const parts = [];
|
|
214
|
-
if (ids.kgIds?.length) parts.push(`KG MID: ${ids.kgIds[0]}`);
|
|
215
|
-
if (ids.cids?.length) parts.push(`CID: ${ids.cids[0]}`);
|
|
216
|
-
if (ids.gcids?.length) parts.push(`GCID: ${ids.gcids[0]}`);
|
|
217
|
-
return parts.length ? `
|
|
218
|
-
**Entity IDs:** ${parts.join(" \xB7 ")}` : "";
|
|
219
|
-
}
|
|
220
314
|
function truncate(s, max) {
|
|
221
315
|
if (!s) return "";
|
|
222
316
|
return s.length > max ? s.slice(0, max) + "\u2026" : s;
|
|
@@ -246,7 +340,7 @@ function debugSection(debug) {
|
|
|
246
340
|
if (locationEvidence) {
|
|
247
341
|
lines.push(`- Location evidence: ${locationEvidence.status}${locationEvidence.expected ? ` \xB7 expected ${locationEvidence.expected.city}${locationEvidence.expected.regionCode ? `, ${locationEvidence.expected.regionCode}` : ""}` : ""}${candidates ? ` \xB7 candidates ${candidates}` : ""}`);
|
|
248
342
|
}
|
|
249
|
-
return lines.join("\n");
|
|
343
|
+
return sanitizeVendorText(lines.join("\n"));
|
|
250
344
|
}
|
|
251
345
|
function errorAttemptsSection(body) {
|
|
252
346
|
const attempts = Array.isArray(body.attempts) ? body.attempts : [];
|
|
@@ -300,26 +394,12 @@ ${serpRows}` : "";
|
|
|
300
394
|
const tips = `
|
|
301
395
|
---
|
|
302
396
|
\u{1F4A1} **Tips**
|
|
303
|
-
- Max questions: \`maxQuestions:
|
|
397
|
+
- Max questions: \`maxQuestions: 200\` (current: ${input.maxQuestions ?? 30})
|
|
304
398
|
- Organic results only: use \`search_serp\`
|
|
305
399
|
- Dig into a result: use \`extract_url\` on any organic URL`;
|
|
306
400
|
const full = `# PAA Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
|
|
307
401
|
|
|
308
402
|
${paaTable}${serpTable}${entityIdsSection(entityIds)}${aiSection}${statsLine}${debugSection(diagnostics?.debug)}${tips}`;
|
|
309
|
-
const topQ = flat.slice(0, 10).map((r, i) => `${i + 1}. ${r.question}`).join("\n");
|
|
310
|
-
const topO = organic.slice(0, 5).map((r) => `${r.position}. [${r.title}](${r.url}) \u2014 ${r.domain}`).join("\n");
|
|
311
|
-
const summary = [
|
|
312
|
-
`**PAA: "${input.query}"** \u2014 ${flat.length} questions extracted`,
|
|
313
|
-
topQ ? `
|
|
314
|
-
**Top questions:**
|
|
315
|
-
${topQ}` : "",
|
|
316
|
-
organic.length ? `
|
|
317
|
-
**Top organic results:**
|
|
318
|
-
${topO}` : "",
|
|
319
|
-
entityIdsSummaryLine(entityIds),
|
|
320
|
-
`
|
|
321
|
-
\u{1F4A1} \`maxQuestions\` up to 150 \xB7Use \`extract_url\` to dig into any result`
|
|
322
|
-
].filter(Boolean).join("\n");
|
|
323
403
|
return oneBlock(full);
|
|
324
404
|
}
|
|
325
405
|
function formatSearchSerp(raw, input) {
|
|
@@ -358,18 +438,6 @@ ${localRows}` : "";
|
|
|
358
438
|
const full = `# SERP Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
|
|
359
439
|
|
|
360
440
|
${serpTable}${localSection}${entityIdsSection(entityIds)}${aiSection}${debugSection(diagnostics?.debug)}${tips}`;
|
|
361
|
-
const topO = organic.slice(0, 5).map((r) => `${r.position}. [${r.title}](${r.url}) \u2014 ${r.domain}`).join("\n");
|
|
362
|
-
const summary = [
|
|
363
|
-
`**SERP: "${input.query}"** \u2014 ${organic.length} organic results`,
|
|
364
|
-
topO ? `
|
|
365
|
-
**Top results:**
|
|
366
|
-
${topO}` : "",
|
|
367
|
-
localPack.length ? `
|
|
368
|
-
**Local Pack:** ${localPack.map((b) => b.name).join(", ")}` : "",
|
|
369
|
-
entityIdsSummaryLine(entityIds),
|
|
370
|
-
`
|
|
371
|
-
\u{1F4A1} Use \`harvest_paa\` for questions \xB7 \`extract_url\` to scrape any result`
|
|
372
|
-
].filter(Boolean).join("\n");
|
|
373
441
|
return oneBlock(full);
|
|
374
442
|
}
|
|
375
443
|
function formatExtractUrl(raw, input) {
|
|
@@ -480,15 +548,19 @@ ${broken.map((u) => `- ${u.url} (${u.status})`).join("\n")}` : "",
|
|
|
480
548
|
- Extract content from all pages: use \`extract_site\`
|
|
481
549
|
- Scrape a single page: use \`extract_url\``
|
|
482
550
|
].filter(Boolean).join("\n");
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
551
|
+
return {
|
|
552
|
+
...oneBlock(full),
|
|
553
|
+
structuredContent: {
|
|
554
|
+
startUrl: d.startUrl ?? input.url,
|
|
555
|
+
totalFound: d.totalFound ?? urls.length,
|
|
556
|
+
truncated: d.truncated === true,
|
|
557
|
+
okCount: ok.length,
|
|
558
|
+
redirectCount: redirects.length,
|
|
559
|
+
brokenCount: broken.length,
|
|
560
|
+
urls: urls.map((u) => ({ url: u.url, status: u.status ?? null })),
|
|
561
|
+
durationMs: d.durationMs ?? 0
|
|
562
|
+
}
|
|
563
|
+
};
|
|
492
564
|
}
|
|
493
565
|
function formatExtractSite(raw, input) {
|
|
494
566
|
const parsed = parseData(raw);
|
|
@@ -513,13 +585,6 @@ ${pageRows}`,
|
|
|
513
585
|
- Map URLs first: use \`map_site_urls\`
|
|
514
586
|
- Inspect a single page: use \`extract_url\``
|
|
515
587
|
].join("\n");
|
|
516
|
-
const summary = [
|
|
517
|
-
`**Site Extract: ${input.url}** \u2014 ${pages.length} pages`,
|
|
518
|
-
pages.slice(0, 5).map((p) => `- ${p.title ?? p.url}`).join("\n"),
|
|
519
|
-
pages.length > 5 ? `- \u2026 and ${pages.length - 5} more` : "",
|
|
520
|
-
`
|
|
521
|
-
\u{1F4A1} Use \`extract_url\` to inspect any individual page`
|
|
522
|
-
].filter(Boolean).join("\n");
|
|
523
588
|
return oneBlock(full);
|
|
524
589
|
}
|
|
525
590
|
function formatYoutubeHarvest(raw, input) {
|
|
@@ -550,16 +615,22 @@ ${videoRows}`,
|
|
|
550
615
|
- Transcribe a video: use \`youtube_transcribe\` with the \`videoId\` above
|
|
551
616
|
- Switch mode: \`mode: "channel"\` with \`channelHandle\` or \`mode: "search"\` with \`query\``
|
|
552
617
|
].filter(Boolean).join("\n");
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
618
|
+
return {
|
|
619
|
+
...oneBlock(full),
|
|
620
|
+
structuredContent: {
|
|
621
|
+
mode: input.mode,
|
|
622
|
+
videoCount: videos.length,
|
|
623
|
+
channel: d.channelMeta ? { title: d.channelMeta.title ?? null, subscriberCount: d.channelMeta.subscriberCount ?? null } : null,
|
|
624
|
+
videos: videos.map((v) => ({
|
|
625
|
+
videoId: String(v.videoId ?? ""),
|
|
626
|
+
title: String(v.title ?? ""),
|
|
627
|
+
channelName: v.channelName ?? null,
|
|
628
|
+
views: v.views ?? null,
|
|
629
|
+
duration: v.duration ?? null,
|
|
630
|
+
url: v.url ?? null
|
|
631
|
+
}))
|
|
632
|
+
}
|
|
633
|
+
};
|
|
563
634
|
}
|
|
564
635
|
function formatYoutubeTranscribe(raw, input) {
|
|
565
636
|
const parsed = parseData(raw);
|
|
@@ -589,14 +660,6 @@ ${chunkRows}` : "",
|
|
|
589
660
|
---
|
|
590
661
|
\u{1F4A1} Harvest more from this channel: use \`youtube_harvest\` with \`mode: "channel"\``
|
|
591
662
|
].filter(Boolean).join("\n");
|
|
592
|
-
const summary = [
|
|
593
|
-
`**YouTube Transcript: \`${input.videoId}\`** \u2014 ${text.split(" ").length} words \xB7 ${durSec}s`,
|
|
594
|
-
`
|
|
595
|
-
**Preview:**
|
|
596
|
-
> ${truncate(text, 300)}`,
|
|
597
|
-
`
|
|
598
|
-
\u{1F4A1} Full transcript in artifact above`
|
|
599
|
-
].join("\n");
|
|
600
663
|
return oneBlock(full);
|
|
601
664
|
}
|
|
602
665
|
function formatFacebookPageIntel(raw, input) {
|
|
@@ -625,19 +688,26 @@ ${adBlocks}`,
|
|
|
625
688
|
- Transcribe video ads: use \`facebook_ad_transcribe\` with the \`videoUrl\` above
|
|
626
689
|
- Find other advertisers: use \`facebook_ad_search\``
|
|
627
690
|
].filter(Boolean).join("\n");
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
691
|
+
return {
|
|
692
|
+
...oneBlock(full),
|
|
693
|
+
structuredContent: {
|
|
694
|
+
advertiserName: d.advertiserName ?? null,
|
|
695
|
+
totalAds: s.totalAds ?? 0,
|
|
696
|
+
activeCount: s.activeCount ?? 0,
|
|
697
|
+
videoCount: s.videoCount ?? 0,
|
|
698
|
+
imageCount: s.imageCount ?? 0,
|
|
699
|
+
ads: ads.map((ad) => ({
|
|
700
|
+
libraryId: ad.libraryId ?? null,
|
|
701
|
+
status: ad.status ?? null,
|
|
702
|
+
creativeType: ad.creativeType ?? null,
|
|
703
|
+
headline: ad.headline ?? null,
|
|
704
|
+
cta: ad.cta ?? null,
|
|
705
|
+
startDate: ad.startDate ?? null,
|
|
706
|
+
videoUrl: ad.videoUrl ?? null,
|
|
707
|
+
variations: typeof ad.variations === "number" ? ad.variations : null
|
|
708
|
+
}))
|
|
709
|
+
}
|
|
710
|
+
};
|
|
641
711
|
}
|
|
642
712
|
function formatFacebookAdSearch(raw, input) {
|
|
643
713
|
const parsed = parseData(raw);
|
|
@@ -661,15 +731,18 @@ ${rows}`,
|
|
|
661
731
|
- Scan all ads: use \`facebook_page_intel\` with \`libraryId\`
|
|
662
732
|
- Or pass the advertiser name as \`query\` in \`facebook_page_intel\``
|
|
663
733
|
].join("\n");
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
734
|
+
return {
|
|
735
|
+
...oneBlock(full),
|
|
736
|
+
structuredContent: {
|
|
737
|
+
query: input.query,
|
|
738
|
+
advertiserCount: advertisers.length,
|
|
739
|
+
advertisers: advertisers.map((a) => ({
|
|
740
|
+
name: a.pageName ?? a.name ?? null,
|
|
741
|
+
adCount: typeof a.adCount === "number" ? a.adCount : null,
|
|
742
|
+
libraryId: a.sampleLibraryId ?? a.libraryId ?? null
|
|
743
|
+
}))
|
|
744
|
+
}
|
|
745
|
+
};
|
|
673
746
|
}
|
|
674
747
|
function formatCreditsInfo(raw, input) {
|
|
675
748
|
const parsed = parseData(raw);
|
|
@@ -708,16 +781,58 @@ ${costRows}` : "",
|
|
|
708
781
|
| Date | Operation | Credits | Description |
|
|
709
782
|
|------|-----------|---------|-------------|
|
|
710
783
|
${ledgerRows}` : ""
|
|
711
|
-
].filter(Boolean).join("\n");
|
|
712
|
-
const summary = [
|
|
713
|
-
`**Credit balance:** ${balance ?? "unknown"} credits`,
|
|
714
|
-
matched ? `
|
|
715
|
-
**${matched.label}:** ${matched.credits} credits ${matched.unit}` : null,
|
|
716
|
-
input.includeLedger && ledger.length ? `
|
|
717
|
-
Recent ledger entries included in the full report.` : null
|
|
718
784
|
].filter(Boolean).join("\n");
|
|
719
785
|
return oneBlock(full);
|
|
720
786
|
}
|
|
787
|
+
function formatMapsSearch(raw, input) {
|
|
788
|
+
const parsed = parseData(raw);
|
|
789
|
+
if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
|
|
790
|
+
const d = parsed.data;
|
|
791
|
+
const results = d.results ?? [];
|
|
792
|
+
const searchQuery = d.searchQuery ?? [input.query, input.location].filter(Boolean).join(" ");
|
|
793
|
+
const requestedMax = d.requestedMaxResults ?? input.maxResults ?? 10;
|
|
794
|
+
const durationMs = d.durationMs;
|
|
795
|
+
const rows = results.map((r) => {
|
|
796
|
+
const rating = [r.rating, r.reviewCount ? `(${r.reviewCount})` : null].filter(Boolean).join(" ");
|
|
797
|
+
return `| ${r.position} | ${cell(r.name)} | ${cell(r.category)} | ${cell(rating)} | ${cell(r.address)} | ${r.cidDecimal ? `\`${r.cidDecimal}\`` : "\u2014"} | ${r.websiteUrl ? `[site](${r.websiteUrl})` : "\u2014"} | [maps](${r.placeUrl}) |`;
|
|
798
|
+
}).join("\n");
|
|
799
|
+
const metadataSection = results.length ? `
|
|
800
|
+
## Candidate Metadata
|
|
801
|
+
${results.map((r) => {
|
|
802
|
+
const meta = r.metadata?.length ? r.metadata.slice(0, 8).map((m) => ` - ${m}`).join("\n") : " - none";
|
|
803
|
+
return `### ${r.position}. ${r.name}
|
|
804
|
+
${meta}`;
|
|
805
|
+
}).join("\n\n")}` : "";
|
|
806
|
+
const full = [
|
|
807
|
+
`# Google Maps Search: "${searchQuery}"`,
|
|
808
|
+
`**Returned:** ${results.length} profile candidate${results.length === 1 ? "" : "s"} \xB7 **Requested max:** ${requestedMax} \xB7 **Limit:** 50`,
|
|
809
|
+
`
|
|
810
|
+
## Results
|
|
811
|
+
| # | Name | Category | Rating | Address | CID | Website | Maps |
|
|
812
|
+
|---|------|----------|--------|---------|-----|---------|------|
|
|
813
|
+
${rows}`,
|
|
814
|
+
metadataSection,
|
|
815
|
+
`
|
|
816
|
+
---
|
|
817
|
+
\u{1F4A1} **Next step:** use \`maps_place_intel\` with a selected business name and location to hydrate full hours, phone, review topics, and optional review cards.`,
|
|
818
|
+
durationMs != null ? `
|
|
819
|
+
*Extracted in ${(durationMs / 1e3).toFixed(1)}s*` : null
|
|
820
|
+
].filter(Boolean).join("\n");
|
|
821
|
+
return {
|
|
822
|
+
...oneBlock(full),
|
|
823
|
+
structuredContent: {
|
|
824
|
+
query: d.query,
|
|
825
|
+
location: d.location ?? null,
|
|
826
|
+
searchQuery: d.searchQuery,
|
|
827
|
+
searchUrl: d.searchUrl,
|
|
828
|
+
extractedAt: d.extractedAt,
|
|
829
|
+
requestedMaxResults: requestedMax,
|
|
830
|
+
resultCount: results.length,
|
|
831
|
+
results,
|
|
832
|
+
durationMs: durationMs ?? 0
|
|
833
|
+
}
|
|
834
|
+
};
|
|
835
|
+
}
|
|
721
836
|
function formatMapsPlaceIntel(raw, input) {
|
|
722
837
|
const parsed = parseData(raw);
|
|
723
838
|
if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
|
|
@@ -815,19 +930,6 @@ ${entitySection}` : null,
|
|
|
815
930
|
durationMs != null ? `
|
|
816
931
|
---
|
|
817
932
|
*Extracted in ${(durationMs / 1e3).toFixed(1)}s*` : null
|
|
818
|
-
].filter(Boolean).join("\n");
|
|
819
|
-
const summary = [
|
|
820
|
-
`**${name}** \u2014 ${category ?? "Business"} \xB7 ${ratingLine || "No rating"}`,
|
|
821
|
-
address ? `\u{1F4CD} ${address}` : null,
|
|
822
|
-
phone ? `\u{1F4DE} ${phone}` : null,
|
|
823
|
-
hoursSummary ? `\u{1F550} ${hoursSummary}` : null,
|
|
824
|
-
website ? `\u{1F310} ${website}` : null,
|
|
825
|
-
reviewsStatus === "collected" && reviews.length ? `
|
|
826
|
-
\u{1F4AC} ${reviews.length} reviews fetched \u2014 full list in artifact above` : null,
|
|
827
|
-
reviewsStatus === "unavailable" ? `
|
|
828
|
-
\u26A0\uFE0F Reviews could not be retrieved this run` : null,
|
|
829
|
-
reviewsStatus === "none_exist" ? `
|
|
830
|
-
\u{1F4AC} No reviews on Google Maps` : null
|
|
831
933
|
].filter(Boolean).join("\n");
|
|
832
934
|
return oneBlock(full);
|
|
833
935
|
}
|
|
@@ -859,67 +961,112 @@ ${chunkRows}` : "",
|
|
|
859
961
|
---
|
|
860
962
|
\u{1F4A1} Get more ads from this advertiser: use \`facebook_page_intel\``
|
|
861
963
|
].filter(Boolean).join("\n");
|
|
862
|
-
const summary = [
|
|
863
|
-
`**Facebook Ad Transcript** \u2014 ${text.split(" ").length} words \xB7 ${durSec}s`,
|
|
864
|
-
`
|
|
865
|
-
**Preview:**
|
|
866
|
-
> ${truncate(text, 300)}`,
|
|
867
|
-
`
|
|
868
|
-
\u{1F4A1} Full transcript in artifact above`
|
|
869
|
-
].join("\n");
|
|
870
964
|
return oneBlock(full);
|
|
871
965
|
}
|
|
872
966
|
|
|
873
967
|
// src/mcp/paa-mcp-server.ts
|
|
874
|
-
function
|
|
875
|
-
|
|
968
|
+
function liveWebToolAnnotations(title) {
|
|
969
|
+
return {
|
|
970
|
+
title,
|
|
971
|
+
readOnlyHint: true,
|
|
972
|
+
destructiveHint: false,
|
|
973
|
+
idempotentHint: false,
|
|
974
|
+
openWorldHint: true
|
|
975
|
+
};
|
|
976
|
+
}
|
|
977
|
+
function buildPaaExtractorMcpServer(executor, options = {}) {
|
|
978
|
+
const savesReports = options.savesReportsLocally !== false;
|
|
979
|
+
const reportNote = savesReports ? " Saves a full Markdown report locally." : " Reports are returned inline; no files are saved on this hosted endpoint.";
|
|
980
|
+
const withReportNote = (description) => `${description}${reportNote}`;
|
|
981
|
+
const server = new McpServer({ name: "mcp-scraper", version: PACKAGE_VERSION });
|
|
876
982
|
server.registerTool("harvest_paa", {
|
|
877
|
-
|
|
878
|
-
|
|
983
|
+
title: "Google PAA + SERP Harvest",
|
|
984
|
+
description: withReportNote('Best default tool for Google search research. Extracts People Also Ask questions plus answers/source URLs, organic SERP, local pack when present, entity IDs (CID/GCID/KG MID), and AI Overview. Infer the user language: split topic from location (e.g. "best hvac company in Denver CO" => query "best hvac company", location "Denver, CO", gl "us", hl "en"). Use maxQuestions 30 normally, 100-150 for "full", "deep", "all", or comprehensive research. Credits are charged by extracted question; unused request hold is refunded.'),
|
|
985
|
+
inputSchema: HarvestPaaInputSchema,
|
|
986
|
+
annotations: liveWebToolAnnotations("Google PAA + SERP Harvest")
|
|
879
987
|
}, async (input) => formatHarvestPaa(await executor.harvestPaa(input), input));
|
|
880
988
|
server.registerTool("search_serp", {
|
|
881
|
-
|
|
882
|
-
|
|
989
|
+
title: "Google SERP Lookup",
|
|
990
|
+
description: withReportNote("Fast Google SERP lookup without PAA expansion. Use when the user asks for rankings, organic results, local pack, quick SERP, or positions. Split topic from location and infer gl/hl from the user request."),
|
|
991
|
+
inputSchema: SearchSerpInputSchema,
|
|
992
|
+
annotations: liveWebToolAnnotations("Google SERP Lookup")
|
|
883
993
|
}, async (input) => formatSearchSerp(await executor.searchSerp(input), input));
|
|
884
994
|
server.registerTool("extract_url", {
|
|
885
|
-
|
|
886
|
-
|
|
995
|
+
title: "Single URL Extract",
|
|
996
|
+
description: withReportNote("Extract structured data from one public URL: page content as Markdown, heading structure, JSON-LD schema, entity details, NAP score, metadata, and missing schema fields. Use when the user provides a single URL or asks to inspect/scrape one page."),
|
|
997
|
+
inputSchema: ExtractUrlInputSchema,
|
|
998
|
+
annotations: liveWebToolAnnotations("Single URL Extract")
|
|
887
999
|
}, async (input) => formatExtractUrl(await executor.extractUrl(input), input));
|
|
888
1000
|
server.registerTool("map_site_urls", {
|
|
889
|
-
|
|
890
|
-
|
|
1001
|
+
title: "Site URL Map",
|
|
1002
|
+
description: withReportNote("Map/crawl a public website to build a URL inventory with HTTP status codes, broken links, redirects, and site scope. Use before extract_site for audits or when the user asks for a sitemap/URL inventory."),
|
|
1003
|
+
inputSchema: MapSiteUrlsInputSchema,
|
|
1004
|
+
outputSchema: MapSiteUrlsOutputSchema,
|
|
1005
|
+
annotations: liveWebToolAnnotations("Site URL Map")
|
|
891
1006
|
}, async (input) => formatMapSiteUrls(await executor.mapSiteUrls(input), input));
|
|
892
1007
|
server.registerTool("extract_site", {
|
|
893
|
-
|
|
894
|
-
|
|
1008
|
+
title: "Multi-Page Site Extract",
|
|
1009
|
+
description: withReportNote("Run multi-page extraction across a public website. Returns per-page titles, H1s, metadata, headings, schema/entity data, canonical URLs, and content. Use for website audits, competitor audits, and full-site extraction."),
|
|
1010
|
+
inputSchema: ExtractSiteInputSchema,
|
|
1011
|
+
annotations: liveWebToolAnnotations("Multi-Page Site Extract")
|
|
895
1012
|
}, async (input) => formatExtractSite(await executor.extractSite(input), input));
|
|
896
1013
|
server.registerTool("youtube_harvest", {
|
|
897
|
-
|
|
898
|
-
|
|
1014
|
+
title: "YouTube Video Harvest",
|
|
1015
|
+
description: withReportNote('Harvest YouTube video metadata by search query or channel handle/ID/URL. Use mode "search" for keyword/topic requests and mode "channel" for @handles, channel IDs, or channel URLs. Returns titles, views, dates, durations, URLs, thumbnails, and videoIds for follow-up transcription.'),
|
|
1016
|
+
inputSchema: YoutubeHarvestInputSchema,
|
|
1017
|
+
outputSchema: YoutubeHarvestOutputSchema,
|
|
1018
|
+
annotations: liveWebToolAnnotations("YouTube Video Harvest")
|
|
899
1019
|
}, async (input) => formatYoutubeHarvest(await executor.youtubeHarvest(input), input));
|
|
900
1020
|
server.registerTool("youtube_transcribe", {
|
|
901
|
-
|
|
902
|
-
|
|
1021
|
+
title: "YouTube Transcription",
|
|
1022
|
+
description: withReportNote("Fetch and transcribe captions from a YouTube video. Returns full transcript, timestamped chunks, and word count. Pass a videoId from youtube_harvest results or infer it from a YouTube URL if the user provided one."),
|
|
1023
|
+
inputSchema: YoutubeTranscribeInputSchema,
|
|
1024
|
+
annotations: liveWebToolAnnotations("YouTube Transcription")
|
|
903
1025
|
}, async (input) => formatYoutubeTranscribe(await executor.youtubeTranscribe(input), input));
|
|
904
1026
|
server.registerTool("facebook_page_intel", {
|
|
905
|
-
|
|
906
|
-
|
|
1027
|
+
title: "Facebook Advertiser Ad Intel",
|
|
1028
|
+
description: withReportNote("Harvest ads from a Facebook advertiser. Returns ad copy, headlines, CTAs, creative type, status, landing URLs, and video URLs ready for transcription. Accepts pageId, libraryId, or a brand/advertiser name as query. Use after facebook_ad_search when possible."),
|
|
1029
|
+
inputSchema: FacebookPageIntelInputSchema,
|
|
1030
|
+
outputSchema: FacebookPageIntelOutputSchema,
|
|
1031
|
+
annotations: liveWebToolAnnotations("Facebook Advertiser Ad Intel")
|
|
907
1032
|
}, async (input) => formatFacebookPageIntel(await executor.facebookPageIntel(input), input));
|
|
908
1033
|
server.registerTool("facebook_ad_search", {
|
|
909
|
-
|
|
910
|
-
|
|
1034
|
+
title: "Facebook Ad Library Search",
|
|
1035
|
+
description: withReportNote("Search Facebook Ad Library by brand, advertiser, competitor, niche, or keyword. Returns advertisers with ad counts and library IDs. Use to discover competitors, then pass libraryId to facebook_page_intel."),
|
|
1036
|
+
inputSchema: FacebookAdSearchInputSchema,
|
|
1037
|
+
outputSchema: FacebookAdSearchOutputSchema,
|
|
1038
|
+
annotations: liveWebToolAnnotations("Facebook Ad Library Search")
|
|
911
1039
|
}, async (input) => formatFacebookAdSearch(await executor.facebookAdSearch(input), input));
|
|
912
1040
|
server.registerTool("facebook_ad_transcribe", {
|
|
1041
|
+
title: "Facebook Ad Transcription",
|
|
913
1042
|
description: "Transcribe audio from a Facebook ad video. Returns full transcript and timestamped chunks. Use the videoUrl value from facebook_page_intel results.",
|
|
914
|
-
inputSchema: FacebookAdTranscribeInputSchema
|
|
1043
|
+
inputSchema: FacebookAdTranscribeInputSchema,
|
|
1044
|
+
annotations: liveWebToolAnnotations("Facebook Ad Transcription")
|
|
915
1045
|
}, async (input) => formatFacebookAdTranscribe(await executor.facebookAdTranscribe(input), input));
|
|
916
1046
|
server.registerTool("maps_place_intel", {
|
|
917
|
-
|
|
918
|
-
|
|
1047
|
+
title: "Google Maps Business Profile Details",
|
|
1048
|
+
description: withReportNote('Extract Google Maps business intelligence for one known/named business: rating, review count, category, address, phone, website, hours, booking URL, review histogram, review topics, about attributes, entity IDs, and optional review cards. Do not use this for category searches, local market prospect lists, or requests for multiple GMB/GBP profiles; use maps_search first for those. Split business name from location (e.g. "Elite Roofing Denver CO" => businessName "Elite Roofing", location "Denver, CO"). Pass includeReviews true when the user asks for reviews/customer pain.'),
|
|
1049
|
+
inputSchema: MapsPlaceIntelInputSchema,
|
|
1050
|
+
annotations: liveWebToolAnnotations("Google Maps Business Profile Details")
|
|
919
1051
|
}, async (input) => formatMapsPlaceIntel(await executor.mapsPlaceIntel(input), input));
|
|
1052
|
+
server.registerTool("maps_search", {
|
|
1053
|
+
title: "Google Maps Business Search",
|
|
1054
|
+
description: withReportNote('Search Google Maps for multiple businesses/profiles by category, niche, keyword, or local market. Use this when the user asks for several Google Business Profiles, GMBs, GBPs, leads, prospects, competitors, or "more than the 3-pack." Returns up to 50 candidates with names, place URLs, CIDs when available, ratings, review counts, and profile metadata. Default maxResults is 10; maximum is 50. Use maps_place_intel afterward only when a selected business needs full details and reviews.'),
|
|
1055
|
+
inputSchema: MapsSearchInputSchema,
|
|
1056
|
+
outputSchema: MapsSearchOutputSchema,
|
|
1057
|
+
annotations: liveWebToolAnnotations("Google Maps Business Search")
|
|
1058
|
+
}, async (input) => formatMapsSearch(await executor.mapsSearch(input), input));
|
|
920
1059
|
server.registerTool("credits_info", {
|
|
1060
|
+
title: "MCP Scraper Credits & Costs",
|
|
921
1061
|
description: "Answer questions about MCP Scraper credits: current credit balance, what a specific tool/action costs, the full cost table, and optionally recent credit ledger entries. Does not expose payment methods or credit card information.",
|
|
922
|
-
inputSchema: CreditsInfoInputSchema
|
|
1062
|
+
inputSchema: CreditsInfoInputSchema,
|
|
1063
|
+
annotations: {
|
|
1064
|
+
title: "MCP Scraper Credits & Costs",
|
|
1065
|
+
readOnlyHint: true,
|
|
1066
|
+
destructiveHint: false,
|
|
1067
|
+
idempotentHint: true,
|
|
1068
|
+
openWorldHint: false
|
|
1069
|
+
}
|
|
923
1070
|
}, async (input) => formatCreditsInfo(await executor.creditsInfo(input), input));
|
|
924
1071
|
return server;
|
|
925
1072
|
}
|
|
@@ -1013,6 +1160,9 @@ var HttpMcpToolExecutor = class {
|
|
|
1013
1160
|
mapsPlaceIntel(input) {
|
|
1014
1161
|
return this.call("/maps/place", input);
|
|
1015
1162
|
}
|
|
1163
|
+
mapsSearch(input) {
|
|
1164
|
+
return this.call("/maps/search", input);
|
|
1165
|
+
}
|
|
1016
1166
|
creditsInfo(input) {
|
|
1017
1167
|
return this.call("/billing/credits", input);
|
|
1018
1168
|
}
|
|
@@ -1028,7 +1178,9 @@ export {
|
|
|
1028
1178
|
harvestTimeoutBudget,
|
|
1029
1179
|
CaptureSerpSnapshotInputSchema,
|
|
1030
1180
|
CaptureSerpPageSnapshotsInputSchema,
|
|
1181
|
+
configureReportSaving,
|
|
1182
|
+
liveWebToolAnnotations,
|
|
1031
1183
|
buildPaaExtractorMcpServer,
|
|
1032
1184
|
HttpMcpToolExecutor
|
|
1033
1185
|
};
|
|
1034
|
-
//# sourceMappingURL=chunk-
|
|
1186
|
+
//# sourceMappingURL=chunk-3OIRNUF5.js.map
|