mcp-scraper 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/dist/bin/api-server.cjs +573 -172
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +2 -2
- package/dist/bin/mcp-stdio-server.cjs +300 -150
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +2 -1
- package/dist/bin/mcp-stdio-server.js.map +1 -1
- package/dist/bin/paa-harvest.cjs +22 -1
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +2 -1
- package/dist/bin/paa-harvest.js.map +1 -1
- package/dist/{chunk-4OHPDEZM.js → chunk-3OIRNUF5.js} +303 -151
- package/dist/chunk-3OIRNUF5.js.map +1 -0
- package/dist/{chunk-W4P2U5VF.js → chunk-LUBDFS67.js} +32 -32
- package/dist/chunk-LUBDFS67.js.map +1 -0
- package/dist/{chunk-7HB7NDOY.js → chunk-ZK456YXN.js} +12 -2
- package/dist/chunk-ZK456YXN.js.map +1 -0
- package/dist/chunk-ZMOWIBMK.js +36 -0
- package/dist/chunk-ZMOWIBMK.js.map +1 -0
- package/dist/index.cjs +22 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +2 -1
- package/dist/index.js.map +1 -1
- package/dist/{server-V5XMVRYE.js → server-YNJHP5PU.js} +235 -22
- package/dist/server-YNJHP5PU.js.map +1 -0
- package/dist/{worker-UT4ZQU2T.js → worker-PBG6LGET.js} +4 -3
- package/dist/{worker-UT4ZQU2T.js.map → worker-PBG6LGET.js.map} +1 -1
- package/docs/adr/0001-in-page-graphql-interception-for-anti-bot-scraping.md +58 -0
- package/docs/adr/README.md +11 -0
- package/docs/mcp-tool-quality-spec.md +238 -0
- package/package.json +5 -4
- package/dist/chunk-4OHPDEZM.js.map +0 -1
- package/dist/chunk-7HB7NDOY.js.map +0 -1
- package/dist/chunk-W4P2U5VF.js.map +0 -1
- package/dist/server-V5XMVRYE.js.map +0 -1
package/dist/bin/api-server.js
CHANGED
|
@@ -17,8 +17,8 @@ loadDotEnv();
|
|
|
17
17
|
async function main() {
|
|
18
18
|
const [{ serve }, { app }, { startWorker }, { migrate }] = await Promise.all([
|
|
19
19
|
import("@hono/node-server"),
|
|
20
|
-
import("../server-
|
|
21
|
-
import("../worker-
|
|
20
|
+
import("../server-YNJHP5PU.js"),
|
|
21
|
+
import("../worker-PBG6LGET.js"),
|
|
22
22
|
import("../db-YWCNHBLH.js")
|
|
23
23
|
]);
|
|
24
24
|
const PORT = parseInt(process.env.PORT ?? "3001");
|
|
@@ -110,6 +110,9 @@ var HttpMcpToolExecutor = class {
|
|
|
110
110
|
mapsPlaceIntel(input) {
|
|
111
111
|
return this.call("/maps/place", input);
|
|
112
112
|
}
|
|
113
|
+
mapsSearch(input) {
|
|
114
|
+
return this.call("/maps/search", input);
|
|
115
|
+
}
|
|
113
116
|
creditsInfo(input) {
|
|
114
117
|
return this.call("/billing/credits", input);
|
|
115
118
|
}
|
|
@@ -124,12 +127,15 @@ var HttpMcpToolExecutor = class {
|
|
|
124
127
|
// src/mcp/paa-mcp-server.ts
|
|
125
128
|
var import_mcp = require("@modelcontextprotocol/sdk/server/mcp.js");
|
|
126
129
|
|
|
130
|
+
// src/version.ts
|
|
131
|
+
var PACKAGE_VERSION = "0.1.7";
|
|
132
|
+
|
|
127
133
|
// src/mcp/mcp-tool-schemas.ts
|
|
128
134
|
var import_zod = require("zod");
|
|
129
135
|
var HarvestPaaInputSchema = {
|
|
130
136
|
query: import_zod.z.string().min(1).describe('Core search topic only. If the user says "best hvac company in Denver CO", use query="best hvac company" and location="Denver, CO". Do not include the location in query when it can be separated.'),
|
|
131
137
|
location: import_zod.z.string().optional().describe('City, region, or country for geo-targeted results, inferred from the user request when present, e.g. "Denver, CO", "Tokyo, Japan", "London, UK".'),
|
|
132
|
-
maxQuestions: import_zod.z.number().int().min(1).max(
|
|
138
|
+
maxQuestions: import_zod.z.number().int().min(1).max(200).default(30).describe("Number of PAA questions to extract. Default 30. Maximum 200. Use 10 for quick probes, 30 for normal research, 100-200 when the user asks for everything/full/deep research. Larger harvests get a longer server time budget (151-200 questions \u2192 up to 280s). Credits are charged by extracted question; unused request hold is refunded."),
|
|
133
139
|
gl: import_zod.z.string().length(2).default("us").describe("Google country code inferred from location or user language. Examples: United States us, United Kingdom gb, Japan jp, Canada ca, Australia au."),
|
|
134
140
|
hl: import_zod.z.string().default("en").describe("Google interface/content language inferred from the user request. Use en unless the user asks for another language or locale."),
|
|
135
141
|
device: import_zod.z.enum(["desktop", "mobile"]).default("desktop").describe("SERP device context. Use desktop by default; use mobile only when the user asks for mobile rankings."),
|
|
@@ -186,6 +192,93 @@ var MapsPlaceIntelInputSchema = {
|
|
|
186
192
|
includeReviews: import_zod.z.boolean().default(false).describe("Whether to fetch individual review cards"),
|
|
187
193
|
maxReviews: import_zod.z.number().int().min(1).max(500).default(50).describe("Max review cards to return (requires includeReviews: true)")
|
|
188
194
|
};
|
|
195
|
+
// Zod input shape for a Google Maps business search: a search term, an
// optional location, Google country/language hints, and a result cap.
var MapsSearchInputSchema = {
  query: import_zod.z.string().min(1).describe('Business category, niche, keyword, or search term. If the user says "roofers in Denver CO", use query="roofers" and location="Denver, CO". Do not put the location here when it can be separated.'),
  location: import_zod.z.string().optional().describe('City, region, country, or service area for the Maps search, e.g. "Denver, CO". Infer from the user request when present.'),
  gl: import_zod.z.string().length(2).default("us").describe("Google country code inferred from location."),
  // Language codes are not always two characters (e.g. "pt-BR", "zh-TW"), so
  // only a minimum length is enforced; HarvestPaaInputSchema.hl is likewise
  // not pinned to exactly two characters.
  hl: import_zod.z.string().min(2).default("en").describe("Language inferred from user request."),
  maxResults: import_zod.z.number().int().min(1).max(50).default(10).describe("Number of Google Maps business/profile candidates to return. Default 10. Maximum 50. Use 10 unless the user asks for more.")
};
|
|
202
|
+
var NullableString = import_zod.z.string().nullable();
|
|
203
|
+
var MapsSearchOutputSchema = {
|
|
204
|
+
query: import_zod.z.string(),
|
|
205
|
+
location: import_zod.z.string().nullable(),
|
|
206
|
+
searchQuery: import_zod.z.string(),
|
|
207
|
+
searchUrl: import_zod.z.string().url(),
|
|
208
|
+
extractedAt: import_zod.z.string(),
|
|
209
|
+
requestedMaxResults: import_zod.z.number().int().min(1).max(50),
|
|
210
|
+
resultCount: import_zod.z.number().int().min(0).max(50),
|
|
211
|
+
results: import_zod.z.array(import_zod.z.object({
|
|
212
|
+
position: import_zod.z.number().int().min(1),
|
|
213
|
+
name: import_zod.z.string(),
|
|
214
|
+
placeUrl: import_zod.z.string().url(),
|
|
215
|
+
cid: NullableString,
|
|
216
|
+
cidDecimal: NullableString,
|
|
217
|
+
rating: NullableString,
|
|
218
|
+
reviewCount: NullableString,
|
|
219
|
+
category: NullableString,
|
|
220
|
+
address: NullableString,
|
|
221
|
+
websiteUrl: NullableString,
|
|
222
|
+
directionsUrl: NullableString,
|
|
223
|
+
metadata: import_zod.z.array(import_zod.z.string())
|
|
224
|
+
})),
|
|
225
|
+
durationMs: import_zod.z.number().int().min(0)
|
|
226
|
+
};
|
|
227
|
+
var MapSiteUrlsOutputSchema = {
|
|
228
|
+
startUrl: import_zod.z.string(),
|
|
229
|
+
totalFound: import_zod.z.number().int().min(0),
|
|
230
|
+
truncated: import_zod.z.boolean(),
|
|
231
|
+
okCount: import_zod.z.number().int().min(0),
|
|
232
|
+
redirectCount: import_zod.z.number().int().min(0),
|
|
233
|
+
brokenCount: import_zod.z.number().int().min(0),
|
|
234
|
+
urls: import_zod.z.array(import_zod.z.object({
|
|
235
|
+
url: import_zod.z.string(),
|
|
236
|
+
status: import_zod.z.number().int().nullable()
|
|
237
|
+
})),
|
|
238
|
+
durationMs: import_zod.z.number().min(0)
|
|
239
|
+
};
|
|
240
|
+
var YoutubeHarvestOutputSchema = {
|
|
241
|
+
mode: import_zod.z.string(),
|
|
242
|
+
videoCount: import_zod.z.number().int().min(0),
|
|
243
|
+
channel: import_zod.z.object({
|
|
244
|
+
title: NullableString,
|
|
245
|
+
subscriberCount: NullableString
|
|
246
|
+
}).nullable(),
|
|
247
|
+
videos: import_zod.z.array(import_zod.z.object({
|
|
248
|
+
videoId: import_zod.z.string(),
|
|
249
|
+
title: import_zod.z.string(),
|
|
250
|
+
channelName: NullableString,
|
|
251
|
+
views: NullableString,
|
|
252
|
+
duration: NullableString,
|
|
253
|
+
url: NullableString
|
|
254
|
+
}))
|
|
255
|
+
};
|
|
256
|
+
var FacebookAdSearchOutputSchema = {
|
|
257
|
+
query: import_zod.z.string(),
|
|
258
|
+
advertiserCount: import_zod.z.number().int().min(0),
|
|
259
|
+
advertisers: import_zod.z.array(import_zod.z.object({
|
|
260
|
+
name: NullableString,
|
|
261
|
+
adCount: import_zod.z.number().int().nullable(),
|
|
262
|
+
libraryId: NullableString
|
|
263
|
+
}))
|
|
264
|
+
};
|
|
265
|
+
var FacebookPageIntelOutputSchema = {
|
|
266
|
+
advertiserName: NullableString,
|
|
267
|
+
totalAds: import_zod.z.number().int().min(0),
|
|
268
|
+
activeCount: import_zod.z.number().int().min(0),
|
|
269
|
+
videoCount: import_zod.z.number().int().min(0),
|
|
270
|
+
imageCount: import_zod.z.number().int().min(0),
|
|
271
|
+
ads: import_zod.z.array(import_zod.z.object({
|
|
272
|
+
libraryId: NullableString,
|
|
273
|
+
status: NullableString,
|
|
274
|
+
creativeType: NullableString,
|
|
275
|
+
headline: NullableString,
|
|
276
|
+
cta: NullableString,
|
|
277
|
+
startDate: NullableString,
|
|
278
|
+
videoUrl: NullableString,
|
|
279
|
+
variations: import_zod.z.number().int().nullable()
|
|
280
|
+
}))
|
|
281
|
+
};
|
|
189
282
|
var CreditsInfoInputSchema = {
|
|
190
283
|
item: import_zod.z.string().optional().describe('Optional tool, action, or feature to look up, e.g. "maps reviews", "extract_url", or "YouTube transcription"'),
|
|
191
284
|
includeLedger: import_zod.z.boolean().default(false).describe("Whether to include recent credit ledger entries")
|
|
@@ -235,6 +328,19 @@ var CaptureSerpPageSnapshotsInputSchema = {
|
|
|
235
328
|
var import_node_fs = require("fs");
|
|
236
329
|
var import_node_os = require("os");
|
|
237
330
|
var import_node_path = require("path");
|
|
331
|
+
|
|
332
|
+
// src/errors.ts
|
|
333
|
+
/**
 * Strips the browser-runtime vendor name ("kernel" / "kernel.sh") from a
 * message and substitutes neutral wording.
 *
 * Replacement order matters:
 *   1. "<vendor> sessions" -> "sessions"      (plural first, so the singular
 *   2. "<vendor> session"  -> "this session"   rule below is reachable)
 *   3. "kernel.sh"         -> "the service"   (before the bare-word rule, so
 *   4. "kernel" (whole word) -> "the service"  ".sh" is not left behind)
 * Runs of spaces are then collapsed and the result is trimmed.
 *
 * @param {string} message - Text that may mention the vendor; null/undefined
 *   is treated as an empty string.
 * @returns {string} The message with vendor mentions replaced.
 */
function sanitizeVendorName(message) {
  return String(message ?? "")
    .replace(/kernel(?:\.sh)?\s+sessions/gi, "sessions")
    .replace(/kernel(?:\.sh)?\s+session/gi, "this session")
    .replace(/kernel\.sh/gi, "the service")
    .replace(/\bkernel\b/gi, "the service")
    .replace(/ +/g, " ")
    .trim();
}
|
|
336
|
+
|
|
337
|
+
// src/mcp/mcp-response-formatter.ts
|
|
338
|
+
var reportSavingEnabled = true;
|
|
339
|
+
/**
 * Rewrites vendor-specific identifiers (field names, env var name, JSON key)
 * to neutral equivalents, then hands the result to sanitizeVendorName for the
 * remaining vendor-name wording.
 *
 * @param {string} text - Text to scrub.
 * @returns {string} The scrubbed text.
 */
function sanitizeVendorText(text) {
  // Applied in order; each pair is [pattern, replacement].
  const identifierRenames = [
    [/kernel_session_id/gi, "browser_session_id"],
    [/kernel_delete_succeeded/gi, "session_cleanup_succeeded"],
    [/kernel_delete_started/gi, "session_cleanup_started"],
    [/kernel_delete_error/gi, "session_cleanup_error"],
    [/kernelSessionId/g, "browserSessionId"],
    [/kernelProxyId/g, "proxyId"],
    [/KERNEL_API_KEY/g, "BROWSER_SERVICE_API_KEY"],
    [/"kernel"\s*:/gi, '"browserRuntime":']
  ];
  let renamed = text;
  for (const [pattern, replacement] of identifierRenames) {
    renamed = renamed.replace(pattern, replacement);
  }
  return sanitizeVendorName(renamed);
}
|
|
238
344
|
function slugifyReportName(input) {
|
|
239
345
|
return input.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 80) || "mcp-scraper-report";
|
|
240
346
|
}
|
|
@@ -246,7 +352,7 @@ function outputBaseDir() {
|
|
|
246
352
|
return process.env.MCP_SCRAPER_OUTPUT_DIR?.trim() || (0, import_node_path.join)((0, import_node_os.homedir)(), "Downloads", "mcp-scraper");
|
|
247
353
|
}
|
|
248
354
|
function saveFullReport(full) {
|
|
249
|
-
if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
|
|
355
|
+
if (!reportSavingEnabled || process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
|
|
250
356
|
const outDir = outputBaseDir();
|
|
251
357
|
try {
|
|
252
358
|
(0, import_node_fs.mkdirSync)(outDir, { recursive: true });
|
|
@@ -259,7 +365,7 @@ function saveFullReport(full) {
|
|
|
259
365
|
}
|
|
260
366
|
}
|
|
261
367
|
function persistScreenshotLocally(base64, url) {
|
|
262
|
-
if (process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
|
|
368
|
+
if (!reportSavingEnabled || process.env.MCP_SCRAPER_SAVE_REPORTS === "false") return null;
|
|
263
369
|
try {
|
|
264
370
|
const dir = (0, import_node_path.join)(outputBaseDir(), "screenshots");
|
|
265
371
|
(0, import_node_fs.mkdirSync)(dir, { recursive: true });
|
|
@@ -299,11 +405,11 @@ function parseData(raw) {
|
|
|
299
405
|
const text = first?.type === "text" ? first.text : "";
|
|
300
406
|
try {
|
|
301
407
|
const parsed = JSON.parse(text || "{}");
|
|
302
|
-
if (raw.isError || parsed.error || parsed.error_code) return { error: formatStructuredError(parsed, text) };
|
|
408
|
+
if (raw.isError || parsed.error || parsed.error_code) return { error: sanitizeVendorText(formatStructuredError(parsed, text)) };
|
|
303
409
|
const data = parsed.result ?? parsed;
|
|
304
410
|
return { data };
|
|
305
411
|
} catch {
|
|
306
|
-
if (raw.isError) return { error: text || "Tool error" };
|
|
412
|
+
if (raw.isError) return { error: sanitizeVendorText(text || "Tool error") };
|
|
307
413
|
return { error: "Failed to parse tool response" };
|
|
308
414
|
}
|
|
309
415
|
}
|
|
@@ -317,15 +423,6 @@ function entityIdsSection(ids) {
|
|
|
317
423
|
## Entity IDs
|
|
318
424
|
${lines.join("\n")}` : "";
|
|
319
425
|
}
|
|
320
|
-
function entityIdsSummaryLine(ids) {
|
|
321
|
-
if (!ids) return "";
|
|
322
|
-
const parts = [];
|
|
323
|
-
if (ids.kgIds?.length) parts.push(`KG MID: ${ids.kgIds[0]}`);
|
|
324
|
-
if (ids.cids?.length) parts.push(`CID: ${ids.cids[0]}`);
|
|
325
|
-
if (ids.gcids?.length) parts.push(`GCID: ${ids.gcids[0]}`);
|
|
326
|
-
return parts.length ? `
|
|
327
|
-
**Entity IDs:** ${parts.join(" \xB7 ")}` : "";
|
|
328
|
-
}
|
|
329
426
|
function truncate(s, max) {
|
|
330
427
|
if (!s) return "";
|
|
331
428
|
return s.length > max ? s.slice(0, max) + "\u2026" : s;
|
|
@@ -355,7 +452,7 @@ function debugSection(debug) {
|
|
|
355
452
|
if (locationEvidence) {
|
|
356
453
|
lines.push(`- Location evidence: ${locationEvidence.status}${locationEvidence.expected ? ` \xB7 expected ${locationEvidence.expected.city}${locationEvidence.expected.regionCode ? `, ${locationEvidence.expected.regionCode}` : ""}` : ""}${candidates ? ` \xB7 candidates ${candidates}` : ""}`);
|
|
357
454
|
}
|
|
358
|
-
return lines.join("\n");
|
|
455
|
+
return sanitizeVendorText(lines.join("\n"));
|
|
359
456
|
}
|
|
360
457
|
function errorAttemptsSection(body) {
|
|
361
458
|
const attempts = Array.isArray(body.attempts) ? body.attempts : [];
|
|
@@ -409,26 +506,12 @@ ${serpRows}` : "";
|
|
|
409
506
|
const tips = `
|
|
410
507
|
---
|
|
411
508
|
\u{1F4A1} **Tips**
|
|
412
|
-
- Max questions: \`maxQuestions:
|
|
509
|
+
- Max questions: \`maxQuestions: 200\` (current: ${input.maxQuestions ?? 30})
|
|
413
510
|
- Organic results only: use \`search_serp\`
|
|
414
511
|
- Dig into a result: use \`extract_url\` on any organic URL`;
|
|
415
512
|
const full = `# PAA Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
|
|
416
513
|
|
|
417
514
|
${paaTable}${serpTable}${entityIdsSection(entityIds)}${aiSection}${statsLine}${debugSection(diagnostics?.debug)}${tips}`;
|
|
418
|
-
const topQ = flat.slice(0, 10).map((r, i) => `${i + 1}. ${r.question}`).join("\n");
|
|
419
|
-
const topO = organic.slice(0, 5).map((r) => `${r.position}. [${r.title}](${r.url}) \u2014 ${r.domain}`).join("\n");
|
|
420
|
-
const summary = [
|
|
421
|
-
`**PAA: "${input.query}"** \u2014 ${flat.length} questions extracted`,
|
|
422
|
-
topQ ? `
|
|
423
|
-
**Top questions:**
|
|
424
|
-
${topQ}` : "",
|
|
425
|
-
organic.length ? `
|
|
426
|
-
**Top organic results:**
|
|
427
|
-
${topO}` : "",
|
|
428
|
-
entityIdsSummaryLine(entityIds),
|
|
429
|
-
`
|
|
430
|
-
\u{1F4A1} \`maxQuestions\` up to 150 \xB7Use \`extract_url\` to dig into any result`
|
|
431
|
-
].filter(Boolean).join("\n");
|
|
432
515
|
return oneBlock(full);
|
|
433
516
|
}
|
|
434
517
|
function formatSearchSerp(raw, input) {
|
|
@@ -467,18 +550,6 @@ ${localRows}` : "";
|
|
|
467
550
|
const full = `# SERP Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}
|
|
468
551
|
|
|
469
552
|
${serpTable}${localSection}${entityIdsSection(entityIds)}${aiSection}${debugSection(diagnostics?.debug)}${tips}`;
|
|
470
|
-
const topO = organic.slice(0, 5).map((r) => `${r.position}. [${r.title}](${r.url}) \u2014 ${r.domain}`).join("\n");
|
|
471
|
-
const summary = [
|
|
472
|
-
`**SERP: "${input.query}"** \u2014 ${organic.length} organic results`,
|
|
473
|
-
topO ? `
|
|
474
|
-
**Top results:**
|
|
475
|
-
${topO}` : "",
|
|
476
|
-
localPack.length ? `
|
|
477
|
-
**Local Pack:** ${localPack.map((b) => b.name).join(", ")}` : "",
|
|
478
|
-
entityIdsSummaryLine(entityIds),
|
|
479
|
-
`
|
|
480
|
-
\u{1F4A1} Use \`harvest_paa\` for questions \xB7 \`extract_url\` to scrape any result`
|
|
481
|
-
].filter(Boolean).join("\n");
|
|
482
553
|
return oneBlock(full);
|
|
483
554
|
}
|
|
484
555
|
function formatExtractUrl(raw, input) {
|
|
@@ -589,15 +660,19 @@ ${broken.map((u) => `- ${u.url} (${u.status})`).join("\n")}` : "",
|
|
|
589
660
|
- Extract content from all pages: use \`extract_site\`
|
|
590
661
|
- Scrape a single page: use \`extract_url\``
|
|
591
662
|
].filter(Boolean).join("\n");
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
663
|
+
return {
|
|
664
|
+
...oneBlock(full),
|
|
665
|
+
structuredContent: {
|
|
666
|
+
startUrl: d.startUrl ?? input.url,
|
|
667
|
+
totalFound: d.totalFound ?? urls.length,
|
|
668
|
+
truncated: d.truncated === true,
|
|
669
|
+
okCount: ok.length,
|
|
670
|
+
redirectCount: redirects.length,
|
|
671
|
+
brokenCount: broken.length,
|
|
672
|
+
urls: urls.map((u) => ({ url: u.url, status: u.status ?? null })),
|
|
673
|
+
durationMs: d.durationMs ?? 0
|
|
674
|
+
}
|
|
675
|
+
};
|
|
601
676
|
}
|
|
602
677
|
function formatExtractSite(raw, input) {
|
|
603
678
|
const parsed = parseData(raw);
|
|
@@ -622,13 +697,6 @@ ${pageRows}`,
|
|
|
622
697
|
- Map URLs first: use \`map_site_urls\`
|
|
623
698
|
- Inspect a single page: use \`extract_url\``
|
|
624
699
|
].join("\n");
|
|
625
|
-
const summary = [
|
|
626
|
-
`**Site Extract: ${input.url}** \u2014 ${pages.length} pages`,
|
|
627
|
-
pages.slice(0, 5).map((p) => `- ${p.title ?? p.url}`).join("\n"),
|
|
628
|
-
pages.length > 5 ? `- \u2026 and ${pages.length - 5} more` : "",
|
|
629
|
-
`
|
|
630
|
-
\u{1F4A1} Use \`extract_url\` to inspect any individual page`
|
|
631
|
-
].filter(Boolean).join("\n");
|
|
632
700
|
return oneBlock(full);
|
|
633
701
|
}
|
|
634
702
|
function formatYoutubeHarvest(raw, input) {
|
|
@@ -659,16 +727,22 @@ ${videoRows}`,
|
|
|
659
727
|
- Transcribe a video: use \`youtube_transcribe\` with the \`videoId\` above
|
|
660
728
|
- Switch mode: \`mode: "channel"\` with \`channelHandle\` or \`mode: "search"\` with \`query\``
|
|
661
729
|
].filter(Boolean).join("\n");
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
730
|
+
return {
|
|
731
|
+
...oneBlock(full),
|
|
732
|
+
structuredContent: {
|
|
733
|
+
mode: input.mode,
|
|
734
|
+
videoCount: videos.length,
|
|
735
|
+
channel: d.channelMeta ? { title: d.channelMeta.title ?? null, subscriberCount: d.channelMeta.subscriberCount ?? null } : null,
|
|
736
|
+
videos: videos.map((v) => ({
|
|
737
|
+
videoId: String(v.videoId ?? ""),
|
|
738
|
+
title: String(v.title ?? ""),
|
|
739
|
+
channelName: v.channelName ?? null,
|
|
740
|
+
views: v.views ?? null,
|
|
741
|
+
duration: v.duration ?? null,
|
|
742
|
+
url: v.url ?? null
|
|
743
|
+
}))
|
|
744
|
+
}
|
|
745
|
+
};
|
|
672
746
|
}
|
|
673
747
|
function formatYoutubeTranscribe(raw, input) {
|
|
674
748
|
const parsed = parseData(raw);
|
|
@@ -698,14 +772,6 @@ ${chunkRows}` : "",
|
|
|
698
772
|
---
|
|
699
773
|
\u{1F4A1} Harvest more from this channel: use \`youtube_harvest\` with \`mode: "channel"\``
|
|
700
774
|
].filter(Boolean).join("\n");
|
|
701
|
-
const summary = [
|
|
702
|
-
`**YouTube Transcript: \`${input.videoId}\`** \u2014 ${text.split(" ").length} words \xB7 ${durSec}s`,
|
|
703
|
-
`
|
|
704
|
-
**Preview:**
|
|
705
|
-
> ${truncate(text, 300)}`,
|
|
706
|
-
`
|
|
707
|
-
\u{1F4A1} Full transcript in artifact above`
|
|
708
|
-
].join("\n");
|
|
709
775
|
return oneBlock(full);
|
|
710
776
|
}
|
|
711
777
|
function formatFacebookPageIntel(raw, input) {
|
|
@@ -734,19 +800,26 @@ ${adBlocks}`,
|
|
|
734
800
|
- Transcribe video ads: use \`facebook_ad_transcribe\` with the \`videoUrl\` above
|
|
735
801
|
- Find other advertisers: use \`facebook_ad_search\``
|
|
736
802
|
].filter(Boolean).join("\n");
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
803
|
+
return {
|
|
804
|
+
...oneBlock(full),
|
|
805
|
+
structuredContent: {
|
|
806
|
+
advertiserName: d.advertiserName ?? null,
|
|
807
|
+
totalAds: s.totalAds ?? 0,
|
|
808
|
+
activeCount: s.activeCount ?? 0,
|
|
809
|
+
videoCount: s.videoCount ?? 0,
|
|
810
|
+
imageCount: s.imageCount ?? 0,
|
|
811
|
+
ads: ads.map((ad) => ({
|
|
812
|
+
libraryId: ad.libraryId ?? null,
|
|
813
|
+
status: ad.status ?? null,
|
|
814
|
+
creativeType: ad.creativeType ?? null,
|
|
815
|
+
headline: ad.headline ?? null,
|
|
816
|
+
cta: ad.cta ?? null,
|
|
817
|
+
startDate: ad.startDate ?? null,
|
|
818
|
+
videoUrl: ad.videoUrl ?? null,
|
|
819
|
+
variations: typeof ad.variations === "number" ? ad.variations : null
|
|
820
|
+
}))
|
|
821
|
+
}
|
|
822
|
+
};
|
|
750
823
|
}
|
|
751
824
|
function formatFacebookAdSearch(raw, input) {
|
|
752
825
|
const parsed = parseData(raw);
|
|
@@ -770,15 +843,18 @@ ${rows}`,
|
|
|
770
843
|
- Scan all ads: use \`facebook_page_intel\` with \`libraryId\`
|
|
771
844
|
- Or pass the advertiser name as \`query\` in \`facebook_page_intel\``
|
|
772
845
|
].join("\n");
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
846
|
+
return {
|
|
847
|
+
...oneBlock(full),
|
|
848
|
+
structuredContent: {
|
|
849
|
+
query: input.query,
|
|
850
|
+
advertiserCount: advertisers.length,
|
|
851
|
+
advertisers: advertisers.map((a) => ({
|
|
852
|
+
name: a.pageName ?? a.name ?? null,
|
|
853
|
+
adCount: typeof a.adCount === "number" ? a.adCount : null,
|
|
854
|
+
libraryId: a.sampleLibraryId ?? a.libraryId ?? null
|
|
855
|
+
}))
|
|
856
|
+
}
|
|
857
|
+
};
|
|
782
858
|
}
|
|
783
859
|
function formatCreditsInfo(raw, input) {
|
|
784
860
|
const parsed = parseData(raw);
|
|
@@ -817,16 +893,58 @@ ${costRows}` : "",
|
|
|
817
893
|
| Date | Operation | Credits | Description |
|
|
818
894
|
|------|-----------|---------|-------------|
|
|
819
895
|
${ledgerRows}` : ""
|
|
820
|
-
].filter(Boolean).join("\n");
|
|
821
|
-
const summary = [
|
|
822
|
-
`**Credit balance:** ${balance ?? "unknown"} credits`,
|
|
823
|
-
matched ? `
|
|
824
|
-
**${matched.label}:** ${matched.credits} credits ${matched.unit}` : null,
|
|
825
|
-
input.includeLedger && ledger.length ? `
|
|
826
|
-
Recent ledger entries included in the full report.` : null
|
|
827
896
|
].filter(Boolean).join("\n");
|
|
828
897
|
return oneBlock(full);
|
|
829
898
|
}
|
|
899
|
+
/**
 * Formats a Maps search tool response as a Markdown report plus a
 * machine-readable `structuredContent` payload.
 *
 * @param {object} raw - Raw tool result, decoded via parseData.
 * @param {object} input - The tool input (query, location, maxResults are read
 *   as fallbacks when the response omits them).
 * @returns {object} An error result (`isError: true`) when parseData reports
 *   an error; otherwise the oneBlock(...) result extended with
 *   `structuredContent`.
 */
function formatMapsSearch(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
  const d = parsed.data;
  const results = d.results ?? [];
  // Fallbacks computed once so the Markdown report and structuredContent agree.
  const query = d.query ?? input.query;
  const searchQuery = d.searchQuery ?? [input.query, input.location].filter(Boolean).join(" ");
  const requestedMax = d.requestedMaxResults ?? input.maxResults ?? 10;
  const durationMs = d.durationMs;
  const rows = results.map((r) => {
    const rating = [r.rating, r.reviewCount ? `(${r.reviewCount})` : null].filter(Boolean).join(" ");
    return `| ${r.position} | ${cell(r.name)} | ${cell(r.category)} | ${cell(rating)} | ${cell(r.address)} | ${r.cidDecimal ? `\`${r.cidDecimal}\`` : "\u2014"} | ${r.websiteUrl ? `[site](${r.websiteUrl})` : "\u2014"} | [maps](${r.placeUrl}) |`;
  }).join("\n");
  // At most the first 8 metadata strings are listed per candidate.
  const metadataSection = results.length ? `
## Candidate Metadata
${results.map((r) => {
    const meta = r.metadata?.length ? r.metadata.slice(0, 8).map((m) => ` - ${m}`).join("\n") : " - none";
    return `### ${r.position}. ${r.name}
${meta}`;
  }).join("\n\n")}` : "";
  const full = [
    `# Google Maps Search: "${searchQuery}"`,
    `**Returned:** ${results.length} profile candidate${results.length === 1 ? "" : "s"} \xB7 **Requested max:** ${requestedMax} \xB7 **Limit:** 50`,
    `
## Results
| # | Name | Category | Rating | Address | CID | Website | Maps |
|---|------|----------|--------|---------|-----|---------|------|
${rows}`,
    metadataSection,
    `
---
\u{1F4A1} **Next step:** use \`maps_place_intel\` with a selected business name and location to hydrate full hours, phone, review topics, and optional review cards.`,
    durationMs != null ? `
*Extracted in ${(durationMs / 1e3).toFixed(1)}s*` : null
  ].filter(Boolean).join("\n");
  return {
    ...oneBlock(full),
    structuredContent: {
      // Previously d.query / d.searchQuery were passed through raw and could be
      // undefined even though a fallback was already shown in the report.
      query,
      location: d.location ?? null,
      searchQuery,
      // NOTE(review): searchUrl and extractedAt are passed through unchanged;
      // presumably the API always returns them — confirm against the server.
      searchUrl: d.searchUrl,
      extractedAt: d.extractedAt,
      requestedMaxResults: requestedMax,
      resultCount: results.length,
      results,
      durationMs: durationMs ?? 0
    }
  };
}
|
|
830
948
|
function formatMapsPlaceIntel(raw, input) {
|
|
831
949
|
const parsed = parseData(raw);
|
|
832
950
|
if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
|
|
@@ -924,19 +1042,6 @@ ${entitySection}` : null,
|
|
|
924
1042
|
durationMs != null ? `
|
|
925
1043
|
---
|
|
926
1044
|
*Extracted in ${(durationMs / 1e3).toFixed(1)}s*` : null
|
|
927
|
-
].filter(Boolean).join("\n");
|
|
928
|
-
const summary = [
|
|
929
|
-
`**${name}** \u2014 ${category ?? "Business"} \xB7 ${ratingLine || "No rating"}`,
|
|
930
|
-
address ? `\u{1F4CD} ${address}` : null,
|
|
931
|
-
phone ? `\u{1F4DE} ${phone}` : null,
|
|
932
|
-
hoursSummary ? `\u{1F550} ${hoursSummary}` : null,
|
|
933
|
-
website ? `\u{1F310} ${website}` : null,
|
|
934
|
-
reviewsStatus === "collected" && reviews.length ? `
|
|
935
|
-
\u{1F4AC} ${reviews.length} reviews fetched \u2014 full list in artifact above` : null,
|
|
936
|
-
reviewsStatus === "unavailable" ? `
|
|
937
|
-
\u26A0\uFE0F Reviews could not be retrieved this run` : null,
|
|
938
|
-
reviewsStatus === "none_exist" ? `
|
|
939
|
-
\u{1F4AC} No reviews on Google Maps` : null
|
|
940
1045
|
].filter(Boolean).join("\n");
|
|
941
1046
|
return oneBlock(full);
|
|
942
1047
|
}
|
|
@@ -968,67 +1073,112 @@ ${chunkRows}` : "",
|
|
|
968
1073
|
---
|
|
969
1074
|
\u{1F4A1} Get more ads from this advertiser: use \`facebook_page_intel\``
|
|
970
1075
|
].filter(Boolean).join("\n");
|
|
971
|
-
const summary = [
|
|
972
|
-
`**Facebook Ad Transcript** \u2014 ${text.split(" ").length} words \xB7 ${durSec}s`,
|
|
973
|
-
`
|
|
974
|
-
**Preview:**
|
|
975
|
-
> ${truncate(text, 300)}`,
|
|
976
|
-
`
|
|
977
|
-
\u{1F4A1} Full transcript in artifact above`
|
|
978
|
-
].join("\n");
|
|
979
1076
|
return oneBlock(full);
|
|
980
1077
|
}
|
|
981
1078
|
|
|
982
1079
|
// src/mcp/paa-mcp-server.ts
|
|
983
|
-
function
|
|
984
|
-
|
|
1080
|
+
/**
 * Builds the annotation object attached to a tool registration: the given
 * title plus a fixed set of hint flags.
 *
 * @param {string} title - Display title for the tool.
 * @returns {{title: string, readOnlyHint: boolean, destructiveHint: boolean,
 *   idempotentHint: boolean, openWorldHint: boolean}} The annotations.
 */
function liveWebToolAnnotations(title) {
  const annotations = { title };
  annotations.readOnlyHint = true;
  annotations.destructiveHint = false;
  annotations.idempotentHint = false;
  annotations.openWorldHint = true;
  return annotations;
}
|
|
1089
|
+
function buildPaaExtractorMcpServer(executor2, options = {}) {
|
|
1090
|
+
const savesReports = options.savesReportsLocally !== false;
|
|
1091
|
+
const reportNote = savesReports ? " Saves a full Markdown report locally." : " Reports are returned inline; no files are saved on this hosted endpoint.";
|
|
1092
|
+
const withReportNote = (description) => `${description}${reportNote}`;
|
|
1093
|
+
const server2 = new import_mcp.McpServer({ name: "mcp-scraper", version: PACKAGE_VERSION });
|
|
985
1094
|
server2.registerTool("harvest_paa", {
|
|
986
|
-
|
|
987
|
-
|
|
1095
|
+
title: "Google PAA + SERP Harvest",
|
|
1096
|
+
description: withReportNote('Best default tool for Google search research. Extracts People Also Ask questions plus answers/source URLs, organic SERP, local pack when present, entity IDs (CID/GCID/KG MID), and AI Overview. Infer the user language: split topic from location (e.g. "best hvac company in Denver CO" => query "best hvac company", location "Denver, CO", gl "us", hl "en"). Use maxQuestions 30 normally, 100-150 for "full", "deep", "all", or comprehensive research. Credits are charged by extracted question; unused request hold is refunded.'),
|
|
1097
|
+
inputSchema: HarvestPaaInputSchema,
|
|
1098
|
+
annotations: liveWebToolAnnotations("Google PAA + SERP Harvest")
|
|
988
1099
|
}, async (input) => formatHarvestPaa(await executor2.harvestPaa(input), input));
|
|
989
1100
|
server2.registerTool("search_serp", {
|
|
990
|
-
|
|
991
|
-
|
|
1101
|
+
title: "Google SERP Lookup",
|
|
1102
|
+
description: withReportNote("Fast Google SERP lookup without PAA expansion. Use when the user asks for rankings, organic results, local pack, quick SERP, or positions. Split topic from location and infer gl/hl from the user request."),
|
|
1103
|
+
inputSchema: SearchSerpInputSchema,
|
|
1104
|
+
annotations: liveWebToolAnnotations("Google SERP Lookup")
|
|
992
1105
|
}, async (input) => formatSearchSerp(await executor2.searchSerp(input), input));
|
|
993
1106
|
server2.registerTool("extract_url", {
|
|
994
|
-
|
|
995
|
-
|
|
1107
|
+
title: "Single URL Extract",
|
|
1108
|
+
description: withReportNote("Extract structured data from one public URL: page content as Markdown, heading structure, JSON-LD schema, entity details, NAP score, metadata, and missing schema fields. Use when the user provides a single URL or asks to inspect/scrape one page."),
|
|
1109
|
+
inputSchema: ExtractUrlInputSchema,
|
|
1110
|
+
annotations: liveWebToolAnnotations("Single URL Extract")
|
|
996
1111
|
}, async (input) => formatExtractUrl(await executor2.extractUrl(input), input));
|
|
997
1112
|
server2.registerTool("map_site_urls", {
|
|
998
|
-
|
|
999
|
-
|
|
1113
|
+
title: "Site URL Map",
|
|
1114
|
+
description: withReportNote("Map/crawl a public website to build a URL inventory with HTTP status codes, broken links, redirects, and site scope. Use before extract_site for audits or when the user asks for a sitemap/URL inventory."),
|
|
1115
|
+
inputSchema: MapSiteUrlsInputSchema,
|
|
1116
|
+
outputSchema: MapSiteUrlsOutputSchema,
|
|
1117
|
+
annotations: liveWebToolAnnotations("Site URL Map")
|
|
1000
1118
|
}, async (input) => formatMapSiteUrls(await executor2.mapSiteUrls(input), input));
|
|
1001
1119
|
server2.registerTool("extract_site", {
|
|
1002
|
-
|
|
1003
|
-
|
|
1120
|
+
title: "Multi-Page Site Extract",
|
|
1121
|
+
description: withReportNote("Run multi-page extraction across a public website. Returns per-page titles, H1s, metadata, headings, schema/entity data, canonical URLs, and content. Use for website audits, competitor audits, and full-site extraction."),
|
|
1122
|
+
inputSchema: ExtractSiteInputSchema,
|
|
1123
|
+
annotations: liveWebToolAnnotations("Multi-Page Site Extract")
|
|
1004
1124
|
}, async (input) => formatExtractSite(await executor2.extractSite(input), input));
|
|
1005
1125
|
server2.registerTool("youtube_harvest", {
|
|
1006
|
-
|
|
1007
|
-
|
|
1126
|
+
title: "YouTube Video Harvest",
|
|
1127
|
+
description: withReportNote('Harvest YouTube video metadata by search query or channel handle/ID/URL. Use mode "search" for keyword/topic requests and mode "channel" for @handles, channel IDs, or channel URLs. Returns titles, views, dates, durations, URLs, thumbnails, and videoIds for follow-up transcription.'),
|
|
1128
|
+
inputSchema: YoutubeHarvestInputSchema,
|
|
1129
|
+
outputSchema: YoutubeHarvestOutputSchema,
|
|
1130
|
+
annotations: liveWebToolAnnotations("YouTube Video Harvest")
|
|
1008
1131
|
}, async (input) => formatYoutubeHarvest(await executor2.youtubeHarvest(input), input));
|
|
1009
1132
|
server2.registerTool("youtube_transcribe", {
|
|
1010
|
-
|
|
1011
|
-
|
|
1133
|
+
title: "YouTube Transcription",
|
|
1134
|
+
description: withReportNote("Fetch and transcribe captions from a YouTube video. Returns full transcript, timestamped chunks, and word count. Pass a videoId from youtube_harvest results or infer it from a YouTube URL if the user provided one."),
|
|
1135
|
+
inputSchema: YoutubeTranscribeInputSchema,
|
|
1136
|
+
annotations: liveWebToolAnnotations("YouTube Transcription")
|
|
1012
1137
|
}, async (input) => formatYoutubeTranscribe(await executor2.youtubeTranscribe(input), input));
|
|
1013
1138
|
server2.registerTool("facebook_page_intel", {
|
|
1014
|
-
|
|
1015
|
-
|
|
1139
|
+
title: "Facebook Advertiser Ad Intel",
|
|
1140
|
+
description: withReportNote("Harvest ads from a Facebook advertiser. Returns ad copy, headlines, CTAs, creative type, status, landing URLs, and video URLs ready for transcription. Accepts pageId, libraryId, or a brand/advertiser name as query. Use after facebook_ad_search when possible."),
|
|
1141
|
+
inputSchema: FacebookPageIntelInputSchema,
|
|
1142
|
+
outputSchema: FacebookPageIntelOutputSchema,
|
|
1143
|
+
annotations: liveWebToolAnnotations("Facebook Advertiser Ad Intel")
|
|
1016
1144
|
}, async (input) => formatFacebookPageIntel(await executor2.facebookPageIntel(input), input));
|
|
1017
1145
|
server2.registerTool("facebook_ad_search", {
|
|
1018
|
-
|
|
1019
|
-
|
|
1146
|
+
title: "Facebook Ad Library Search",
|
|
1147
|
+
description: withReportNote("Search Facebook Ad Library by brand, advertiser, competitor, niche, or keyword. Returns advertisers with ad counts and library IDs. Use to discover competitors, then pass libraryId to facebook_page_intel."),
|
|
1148
|
+
inputSchema: FacebookAdSearchInputSchema,
|
|
1149
|
+
outputSchema: FacebookAdSearchOutputSchema,
|
|
1150
|
+
annotations: liveWebToolAnnotations("Facebook Ad Library Search")
|
|
1020
1151
|
}, async (input) => formatFacebookAdSearch(await executor2.facebookAdSearch(input), input));
|
|
1021
1152
|
server2.registerTool("facebook_ad_transcribe", {
|
|
1153
|
+
title: "Facebook Ad Transcription",
|
|
1022
1154
|
description: "Transcribe audio from a Facebook ad video. Returns full transcript and timestamped chunks. Use the videoUrl value from facebook_page_intel results.",
|
|
1023
|
-
inputSchema: FacebookAdTranscribeInputSchema
|
|
1155
|
+
inputSchema: FacebookAdTranscribeInputSchema,
|
|
1156
|
+
annotations: liveWebToolAnnotations("Facebook Ad Transcription")
|
|
1024
1157
|
}, async (input) => formatFacebookAdTranscribe(await executor2.facebookAdTranscribe(input), input));
|
|
1025
1158
|
server2.registerTool("maps_place_intel", {
|
|
1026
|
-
|
|
1027
|
-
|
|
1159
|
+
title: "Google Maps Business Profile Details",
|
|
1160
|
+
description: withReportNote('Extract Google Maps business intelligence for one known/named business: rating, review count, category, address, phone, website, hours, booking URL, review histogram, review topics, about attributes, entity IDs, and optional review cards. Do not use this for category searches, local market prospect lists, or requests for multiple GMB/GBP profiles; use maps_search first for those. Split business name from location (e.g. "Elite Roofing Denver CO" => businessName "Elite Roofing", location "Denver, CO"). Pass includeReviews true when the user asks for reviews/customer pain.'),
|
|
1161
|
+
inputSchema: MapsPlaceIntelInputSchema,
|
|
1162
|
+
annotations: liveWebToolAnnotations("Google Maps Business Profile Details")
|
|
1028
1163
|
}, async (input) => formatMapsPlaceIntel(await executor2.mapsPlaceIntel(input), input));
|
|
1164
|
+
server2.registerTool("maps_search", {
|
|
1165
|
+
title: "Google Maps Business Search",
|
|
1166
|
+
description: withReportNote('Search Google Maps for multiple businesses/profiles by category, niche, keyword, or local market. Use this when the user asks for several Google Business Profiles, GMBs, GBPs, leads, prospects, competitors, or "more than the 3-pack." Returns up to 50 candidates with names, place URLs, CIDs when available, ratings, review counts, and profile metadata. Default maxResults is 10; maximum is 50. Use maps_place_intel afterward only when a selected business needs full details and reviews.'),
|
|
1167
|
+
inputSchema: MapsSearchInputSchema,
|
|
1168
|
+
outputSchema: MapsSearchOutputSchema,
|
|
1169
|
+
annotations: liveWebToolAnnotations("Google Maps Business Search")
|
|
1170
|
+
}, async (input) => formatMapsSearch(await executor2.mapsSearch(input), input));
|
|
1029
1171
|
server2.registerTool("credits_info", {
|
|
1172
|
+
title: "MCP Scraper Credits & Costs",
|
|
1030
1173
|
description: "Answer questions about MCP Scraper credits: current credit balance, what a specific tool/action costs, the full cost table, and optionally recent credit ledger entries. Does not expose payment methods or credit card information.",
|
|
1031
|
-
inputSchema: CreditsInfoInputSchema
|
|
1174
|
+
inputSchema: CreditsInfoInputSchema,
|
|
1175
|
+
annotations: {
|
|
1176
|
+
title: "MCP Scraper Credits & Costs",
|
|
1177
|
+
readOnlyHint: true,
|
|
1178
|
+
destructiveHint: false,
|
|
1179
|
+
idempotentHint: true,
|
|
1180
|
+
openWorldHint: false
|
|
1181
|
+
}
|
|
1032
1182
|
}, async (input) => formatCreditsInfo(await executor2.creditsInfo(input), input));
|
|
1033
1183
|
return server2;
|
|
1034
1184
|
}
|