mcp-scraper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +56 -0
- package/dist/bin/api-server.cjs +9256 -0
- package/dist/bin/api-server.cjs.map +1 -0
- package/dist/bin/api-server.d.cts +1 -0
- package/dist/bin/api-server.d.ts +1 -0
- package/dist/bin/api-server.js +38 -0
- package/dist/bin/api-server.js.map +1 -0
- package/dist/bin/mcp-stdio-server.cjs +840 -0
- package/dist/bin/mcp-stdio-server.cjs.map +1 -0
- package/dist/bin/mcp-stdio-server.d.cts +1 -0
- package/dist/bin/mcp-stdio-server.d.ts +1 -0
- package/dist/bin/mcp-stdio-server.js +41 -0
- package/dist/bin/mcp-stdio-server.js.map +1 -0
- package/dist/bin/paa-harvest.cjs +1438 -0
- package/dist/bin/paa-harvest.cjs.map +1 -0
- package/dist/bin/paa-harvest.d.cts +1 -0
- package/dist/bin/paa-harvest.d.ts +1 -0
- package/dist/bin/paa-harvest.js +37 -0
- package/dist/bin/paa-harvest.js.map +1 -0
- package/dist/chunk-4API3ZCT.js +1387 -0
- package/dist/chunk-4API3ZCT.js.map +1 -0
- package/dist/chunk-LXZDJJXR.js +476 -0
- package/dist/chunk-LXZDJJXR.js.map +1 -0
- package/dist/chunk-ZBP4RHNW.js +805 -0
- package/dist/chunk-ZBP4RHNW.js.map +1 -0
- package/dist/db-IOYMX64U.js +87 -0
- package/dist/db-IOYMX64U.js.map +1 -0
- package/dist/index.cjs +1689 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +210 -0
- package/dist/index.d.ts +210 -0
- package/dist/index.js +275 -0
- package/dist/index.js.map +1 -0
- package/dist/server-63DR2HE5.js +6062 -0
- package/dist/server-63DR2HE5.js.map +1 -0
- package/dist/worker-3ECJHPRE.js +88 -0
- package/dist/worker-3ECJHPRE.js.map +1 -0
- package/package.json +76 -0
|
@@ -0,0 +1,840 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
"use strict";
|
|
3
|
+
|
|
4
|
+
// bin/mcp-stdio-server.ts
|
|
5
|
+
var import_node_fs = require("fs");
|
|
6
|
+
var import_node_os = require("os");
|
|
7
|
+
var import_node_path = require("path");
|
|
8
|
+
var import_stdio = require("@modelcontextprotocol/sdk/server/stdio.js");
|
|
9
|
+
|
|
10
|
+
// src/mcp/http-mcp-tool-executor.ts
|
|
11
|
+
/**
 * Executes MCP tools by POSTing their input to a remote HTTP API.
 *
 * Each public tool method is a one-line wrapper that forwards its input to a
 * fixed endpoint via {@link HttpMcpToolExecutor#call}. Results always use the
 * MCP tool-result shape `{ content: [{ type: "text", text }], isError? }`.
 */
var HttpMcpToolExecutor = class {
  baseUrl;
  apiKey;
  /**
   * @param {string} baseUrl2 - API base URL; one trailing slash is stripped.
   * @param {string} apiKey2 - Value sent in the `x-api-key` request header.
   */
  constructor(baseUrl2, apiKey2) {
    this.baseUrl = baseUrl2.replace(/\/$/, "");
    this.apiKey = apiKey2;
  }
  /**
   * POSTs `body` as JSON to `baseUrl + path` and wraps the response.
   *
   * Never throws: network failures, timeouts, serialization errors and
   * unparseable responses are all returned as `isError: true` results.
   *
   * @param {string} path - Endpoint path beginning with "/".
   * @param {unknown} body - JSON-serializable request payload.
   * @returns {Promise<{content: {type: "text", text: string}[], isError?: boolean}>}
   */
  async call(path, body) {
    try {
      const res = await fetch(`${this.baseUrl}${path}`, {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "x-api-key": this.apiKey
        },
        body: JSON.stringify(body),
        // 290 000 ms request timeout.
        signal: AbortSignal.timeout(29e4)
      });
      // Read the body as text first so that a non-JSON response (gateway error
      // page, empty body) can be reported with its HTTP status instead of a
      // bare "Unexpected token ..." parse error.
      const rawBody = await res.text();
      let data;
      try {
        data = JSON.parse(rawBody);
      } catch {
        const status = `HTTP ${res.status}${res.statusText ? ` ${res.statusText}` : ""}`;
        const snippet = rawBody.trim().slice(0, 200);
        const text = snippet ? `${status}: ${snippet}` : `${status}: empty or non-JSON response body`;
        return { content: [{ type: "text", text }], isError: true };
      }
      if (!res.ok) {
        return { content: [{ type: "text", text: JSON.stringify(data) }], isError: true };
      }
      return { content: [{ type: "text", text: JSON.stringify(data) }] };
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
      return { content: [{ type: "text", text: msg }], isError: true };
    }
  }
  /** Forwards to POST /harvest/sync. */
  harvestPaa(input) {
    return this.call("/harvest/sync", input);
  }
  /** Forwards to POST /harvest/sync with `serpOnly: true` added to the input. */
  searchSerp(input) {
    return this.call("/harvest/sync", { ...input, serpOnly: true });
  }
  /** Forwards to POST /extract-url. */
  extractUrl(input) {
    return this.call("/extract-url", input);
  }
  /** Forwards to POST /map-urls. */
  mapSiteUrls(input) {
    return this.call("/map-urls", input);
  }
  /** Forwards to POST /extract-site. */
  extractSite(input) {
    return this.call("/extract-site", input);
  }
  /** Forwards to POST /youtube/harvest. */
  youtubeHarvest(input) {
    return this.call("/youtube/harvest", input);
  }
  /** Forwards to POST /youtube/transcribe. */
  youtubeTranscribe(input) {
    return this.call("/youtube/transcribe", input);
  }
  /** Forwards to POST /facebook/page-intel. */
  facebookPageIntel(input) {
    return this.call("/facebook/page-intel", input);
  }
  /** Forwards to POST /facebook/search. */
  facebookAdSearch(input) {
    return this.call("/facebook/search", input);
  }
  /** Forwards to POST /facebook/transcribe. */
  facebookAdTranscribe(input) {
    return this.call("/facebook/transcribe", input);
  }
  /** Forwards to POST /maps/place. */
  mapsPlaceIntel(input) {
    return this.call("/maps/place", input);
  }
  /** Forwards to POST /billing/credits. */
  creditsInfo(input) {
    return this.call("/billing/credits", input);
  }
};
|
|
76
|
+
|
|
77
|
+
// src/mcp/paa-mcp-server.ts
|
|
78
|
+
var import_mcp = require("@modelcontextprotocol/sdk/server/mcp.js");
|
|
79
|
+
|
|
80
|
+
// src/mcp/mcp-tool-schemas.ts
|
|
81
|
+
var import_zod = require("zod");
|
|
82
|
+
// Zod raw shapes (plain objects of field validators) describing each tool's input.

// Input for the PAA harvest tool.
var HarvestPaaInputSchema = {
  query: import_zod.z
    .string()
    .min(1)
    .describe("Search query to harvest PAA questions for"),
  location: import_zod.z
    .string()
    .optional()
    .describe("Location name for geo-targeted results"),
  maxQuestions: import_zod.z
    .number()
    .int()
    .min(1)
    .max(100)
    .default(30)
    .describe("Number of PAA questions to extract (max 100)"),
  gl: import_zod.z
    .string()
    .length(2)
    .default("us"),
  hl: import_zod.z
    .string()
    .default("en")
};

// Input for single-page extraction.
var ExtractUrlInputSchema = {
  url: import_zod.z
    .string()
    .url()
};

// Input for URL mapping; maxUrls is optional and bounded to 1..500.
var MapSiteUrlsInputSchema = {
  url: import_zod.z
    .string()
    .url(),
  maxUrls: import_zod.z
    .number()
    .int()
    .min(1)
    .max(500)
    .optional()
};

// Input for multi-page extraction; maxPages is optional and bounded to 1..50.
var ExtractSiteInputSchema = {
  url: import_zod.z
    .string()
    .url(),
  maxPages: import_zod.z
    .number()
    .int()
    .min(1)
    .max(50)
    .optional()
};

// Input for YouTube harvesting. The shape itself does not enforce that `query`
// accompanies mode "search"; both `query` and `channelHandle` are optional here.
var YoutubeHarvestInputSchema = {
  mode: import_zod.z
    .enum(["search", "channel"]),
  query: import_zod.z
    .string()
    .optional()
    .describe("Required when mode is search"),
  channelHandle: import_zod.z
    .string()
    .optional()
    .describe("YouTube channel handle, e.g. @mkbhd"),
  maxVideos: import_zod.z
    .number()
    .int()
    .min(1)
    .max(500)
    .default(50)
};

// Input for YouTube transcription.
var YoutubeTranscribeInputSchema = {
  videoId: import_zod.z
    .string()
    .min(1)
    .describe("YouTube video ID, e.g. dQw4w9WgXcQ")
};

// Input for Facebook page intel. All three identifiers are optional at the
// schema level; the "one of" requirement is only stated in the description.
var FacebookPageIntelInputSchema = {
  pageId: import_zod.z
    .string()
    .optional(),
  libraryId: import_zod.z
    .string()
    .optional(),
  query: import_zod.z
    .string()
    .optional()
    .describe("One of pageId, libraryId, or query is required"),
  maxAds: import_zod.z
    .number()
    .int()
    .min(1)
    .max(200)
    .default(50),
  country: import_zod.z
    .string()
    .length(2)
    .default("US")
};

// Input for Facebook ad search.
var FacebookAdSearchInputSchema = {
  query: import_zod.z
    .string()
    .min(1),
  country: import_zod.z
    .string()
    .length(2)
    .default("US"),
  maxResults: import_zod.z
    .number()
    .int()
    .min(1)
    .max(20)
    .default(10)
};

// Input for Facebook ad video transcription.
var FacebookAdTranscribeInputSchema = {
  videoUrl: import_zod.z
    .string()
    .url()
    .describe("Facebook CDN video URL from a facebook_page_intel result")
};

// Input for Google Maps place lookup.
// NOTE(review): `hl` is limited to exactly 2 characters here but is unconstrained
// in the PAA and SERP shapes — confirm which is intended.
var MapsPlaceIntelInputSchema = {
  businessName: import_zod.z
    .string()
    .min(1)
    .describe("Business name to search for on Google Maps"),
  location: import_zod.z
    .string()
    .min(1)
    .describe('City and state, e.g. "Denver, CO"'),
  gl: import_zod.z
    .string()
    .length(2)
    .default("us"),
  hl: import_zod.z
    .string()
    .length(2)
    .default("en"),
  includeReviews: import_zod.z
    .boolean()
    .default(false)
    .describe("Whether to fetch individual review cards"),
  maxReviews: import_zod.z
    .number()
    .int()
    .min(1)
    .max(500)
    .default(50)
    .describe("Max review cards to return (requires includeReviews: true)")
};

// Input for the credits lookup.
var CreditsInfoInputSchema = {
  item: import_zod.z
    .string()
    .optional()
    .describe('Optional tool, action, or feature to look up, e.g. "maps reviews", "extract_url", or "YouTube transcription"'),
  includeLedger: import_zod.z
    .boolean()
    .default(false)
    .describe("Whether to include recent credit ledger entries")
};

// Input for the organic SERP search.
var SearchSerpInputSchema = {
  query: import_zod.z
    .string()
    .min(1)
    .describe("Search query to retrieve organic Google results for"),
  location: import_zod.z
    .string()
    .optional()
    .describe("Location name for geo-targeted results"),
  gl: import_zod.z
    .string()
    .length(2)
    .default("us"),
  hl: import_zod.z
    .string()
    .default("en"),
  pages: import_zod.z
    .number()
    .int()
    .min(1)
    .max(2)
    .default(1)
    .describe("Number of result pages to fetch (1\u20132)")
};
|
|
143
|
+
|
|
144
|
+
// src/mcp/mcp-response-formatter.ts
|
|
145
|
+
/**
 * Wraps two strings as an MCP tool result with two text content blocks,
 * the full report first and the summary second.
 *
 * @param {string} full - Text of the first block.
 * @param {string} summary - Text of the second block.
 * @returns {{content: {type: "text", text: string}[]}}
 */
function twoBlocks(full, summary) {
  const asTextBlock = (text) => ({ type: "text", text });
  return { content: [asTextBlock(full), asTextBlock(summary)] };
}
|
|
148
|
+
/**
 * Converts a raw tool result into either `{ data }` or `{ error }`.
 *
 * The first text block of `raw.content` is parsed as JSON (an absent or empty
 * text is treated as "{}"). Outcomes, in order:
 *  - payload `error === "insufficient_balance"` -> a readable credits message;
 *  - `raw.isError` -> the raw text (or "Tool error" when the text is empty);
 *  - otherwise -> `payload.result` when present, else the payload itself.
 * When parsing fails, the raw text is returned as the error for `isError`
 * results and a generic parse-failure message otherwise.
 *
 * @param {{content: {type: string, text?: string}[], isError?: boolean}} raw
 * @returns {{data: any} | {error: string}}
 */
function parseData(raw) {
  const textBlock = raw.content.find((block) => block.type === "text");
  const text = textBlock ? textBlock.text : "";
  const toolFailure = () => ({ error: text || "Tool error" });

  let payload;
  try {
    payload = JSON.parse(text || "{}");
    // Property access stays inside the try: a JSON `null` payload throws here
    // and is reported through the catch branch.
    if (payload.error === "insufficient_balance") {
      return { error: `Insufficient credits. Balance: ${payload.balance_credits} credits. This call requires ${payload.required_credits} credits. Top up at ${payload.topup_url}` };
    }
  } catch {
    return raw.isError ? toolFailure() : { error: "Failed to parse tool response" };
  }

  if (raw.isError) {
    return toolFailure();
  }
  return { data: payload.result ?? payload };
}
|
|
164
|
+
/**
 * Renders a "## Entity IDs" Markdown section listing every id of each kind.
 *
 * @param {{kgIds?: string[], cids?: string[], gcids?: string[]} | null | undefined} ids
 * @returns {string} The section with a leading newline, or "" when `ids` is
 *   falsy or every list is missing/empty.
 */
function entityIdsSection(ids) {
  if (!ids) return "";
  const labelled = [
    ["Knowledge Graph MID", ids.kgIds],
    ["CID", ids.cids],
    ["GCID", ids.gcids]
  ];
  const bullets = [];
  for (const [label, values] of labelled) {
    if (values?.length) {
      bullets.push(`- **${label}:** ${values.join(", ")}`);
    }
  }
  if (bullets.length === 0) return "";
  return `\n## Entity IDs\n${bullets.join("\n")}`;
}
|
|
174
|
+
/**
 * Renders a one-line "**Entity IDs:**" summary using only the first id of
 * each kind, separated by a middle dot.
 *
 * @param {{kgIds?: string[], cids?: string[], gcids?: string[]} | null | undefined} ids
 * @returns {string} The line with a leading newline, or "" when `ids` is
 *   falsy or every list is missing/empty.
 */
function entityIdsSummaryLine(ids) {
  if (!ids) return "";
  const firstOfEach = [];
  for (const [label, values] of [["KG MID", ids.kgIds], ["CID", ids.cids], ["GCID", ids.gcids]]) {
    if (values?.length) {
      firstOfEach.push(`${label}: ${values[0]}`);
    }
  }
  if (firstOfEach.length === 0) return "";
  return `\n**Entity IDs:** ${firstOfEach.join(" \xB7 ")}`;
}
|
|
183
|
+
/**
 * Shortens `s` to at most `max` UTF-16 code units and appends an ellipsis
 * when anything was removed.
 *
 * The cut never lands between the two halves of a surrogate pair: if the last
 * kept unit would be a lone high surrogate, it is dropped as well, so the
 * result never contains a broken character in front of the ellipsis.
 *
 * @param {string | null | undefined} s - Text to shorten; falsy values yield "".
 * @param {number} max - Maximum number of UTF-16 code units to keep.
 * @returns {string} `s` unchanged when it fits, otherwise the shortened text
 *   followed by "\u2026".
 */
function truncate(s, max) {
  if (!s) return "";
  // Written as a negated ">" so values without a numeric length pass through
  // untouched rather than reaching the string methods below.
  if (!(s.length > max)) return s;
  let cut = max;
  if (cut > 0) {
    const lastKept = s.charCodeAt(cut - 1);
    const firstDropped = s.charCodeAt(cut);
    const splitsPair = lastKept >= 0xd800 && lastKept <= 0xdbff && firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
    if (splitsPair) cut -= 1;
  }
  return s.slice(0, cut) + "\u2026";
}
|
|
187
|
+
// Markdown blockquote banner prepended to the full-report text block by the format* functions.
var DIRECTIVE_FULL = "> \u{1F4C4} **FULL REPORT** \u2014 render as artifact or expandable block\n\n";
|
|
188
|
+
// Markdown blockquote banner prepended to the summary text block by the format* functions.
var DIRECTIVE_SUMMARY = "> \u{1F4AC} **SUMMARY** \u2014 present this inline; offer the full report above as an artifact\n\n";
|
|
189
|
+
/**
 * Formats a PAA harvest result as two text blocks: a full Markdown report
 * (PAA table, organic results table, entity ids, AI overview, stats, tips)
 * and a short summary (top 10 questions, top 5 organic results).
 *
 * Text placed inside Markdown table cells is sanitized so that a literal "|"
 * or a line break in scraped content cannot split or terminate a table row.
 *
 * @param {object} raw - Raw tool result, as accepted by `parseData`.
 * @param {{query: string, location?: string, maxQuestions?: number}} input - Original tool input.
 * @returns {object} A two-block result, or a single-block `isError` result
 *   when `parseData` reports an error.
 */
function formatHarvestPaa(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
  // Escape pipes and flatten line breaks for safe use inside a table cell.
  const cell = (value) => String(value ?? "").replace(/\|/g, "\\|").replace(/\s*[\r\n]+\s*/g, " ");
  const d = parsed.data;
  const flat = d.flat ?? [];
  const organic = d.organicResults ?? [];
  const entityIds = d.entityIds;
  const aiOvw = d.aiOverview;
  const durationMs = d.stats?.durationMs;
  const paaRows = flat.map(
    (r, i) => `| ${i + 1} | ${cell(r.question)} | ${cell(truncate(r.answer, 120))} | ${cell(r.source_site)} |`
  ).join("\n");
  const paaTable = flat.length ? `## People Also Ask (${flat.length} questions)
| # | Question | Answer | Source |
|---|----------|--------|--------|
${paaRows}` : "## People Also Ask\n*No questions extracted*";
  const serpRows = organic.map(
    (r) => `| ${r.position} | ${cell(r.title)} | [${r.domain}](${r.url}) | ${cell(truncate(r.snippet, 100))} |`
  ).join("\n");
  const serpTable = organic.length ? `
## Organic Results (${organic.length})
| # | Title | URL | Snippet |
|---|-------|-----|----------|
${serpRows}` : "";
  const aiSection = aiOvw?.detected && aiOvw.text ? `
## AI Overview
> ${truncate(aiOvw.text, 600)}` : "";
  // The stats section is omitted when no (or a zero) duration was reported.
  const statsLine = durationMs ? `
## Stats
- Questions: ${flat.length} \xB7 Duration: ${(durationMs / 1e3).toFixed(1)}s` : "";
  const tips = `
---
\u{1F4A1} **Tips**
- Max questions: \`maxQuestions: 100\` (current: ${input.maxQuestions ?? 30})
- Organic results only: use \`search_serp\`
- Dig into a result: use \`extract_url\` on any organic URL`;
  const full = `${DIRECTIVE_FULL}# PAA Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}

${paaTable}${serpTable}${entityIdsSection(entityIds)}${aiSection}${statsLine}${tips}`;
  const topQ = flat.slice(0, 10).map((r, i) => `${i + 1}. ${r.question}`).join("\n");
  const topO = organic.slice(0, 5).map((r) => `${r.position}. [${r.title}](${r.url}) \u2014 ${r.domain}`).join("\n");
  const summary = [
    `${DIRECTIVE_SUMMARY}**PAA: "${input.query}"** \u2014 ${flat.length} questions extracted`,
    topQ ? `
**Top questions:**
${topQ}` : "",
    organic.length ? `
**Top organic results:**
${topO}` : "",
    entityIdsSummaryLine(entityIds),
    `
\u{1F4A1} \`maxQuestions\` up to 100 \xB7Use \`extract_url\` to dig into any result`
  ].filter(Boolean).join("\n");
  return twoBlocks(full, summary);
}
|
|
244
|
+
/**
 * Formats a SERP-only result as two text blocks: a full Markdown report
 * (organic results table, local pack table, entity ids, AI overview, tips)
 * and a short summary (top 5 organic results and local pack names).
 *
 * Text placed inside Markdown table cells is sanitized so that a literal "|"
 * (common in page titles such as "Page | Brand") or a line break cannot split
 * or terminate a table row.
 *
 * @param {object} raw - Raw tool result, as accepted by `parseData`.
 * @param {{query: string, location?: string}} input - Original tool input.
 * @returns {object} A two-block result, or a single-block `isError` result
 *   when `parseData` reports an error.
 */
function formatSearchSerp(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
  // Escape pipes and flatten line breaks for safe use inside a table cell.
  const cell = (value) => String(value ?? "").replace(/\|/g, "\\|").replace(/\s*[\r\n]+\s*/g, " ");
  const d = parsed.data;
  const organic = d.organicResults ?? [];
  const localPack = d.localPack ?? [];
  const entityIds = d.entityIds;
  const aiOvw = d.aiOverview;
  const serpRows = organic.map(
    (r) => `| ${r.position} | ${cell(r.title)} | [${r.domain}](${r.url}) | ${cell(truncate(r.snippet, 100))} |`
  ).join("\n");
  const serpTable = organic.length ? `## Organic Results (${organic.length})
| # | Title | URL | Snippet |
|---|-------|-----|----------|
${serpRows}` : "## Organic Results\n*None found*";
  const localRows = localPack.map(
    (b) => `| ${b.position} | ${cell(b.name)} | ${b.rating ?? "\u2014"} (${b.reviewCount ?? "0"}) | ${b.websiteUrl ? `[link](${b.websiteUrl})` : "\u2014"} |`
  ).join("\n");
  const localSection = localPack.length ? `
## Local Pack (${localPack.length})
| # | Name | Rating | Website |
|---|------|--------|---------|
${localRows}` : "";
  const aiSection = aiOvw?.detected && aiOvw.text ? `
## AI Overview
> ${truncate(aiOvw.text, 600)}` : "";
  const tips = `
---
\u{1F4A1} **Tips**
- Get PAA questions: use \`harvest_paa\` for this query
- Scrape any result: use \`extract_url\`
- Business entity IDs (CID/GCID/KG MID) shown above if found`;
  const full = `${DIRECTIVE_FULL}# SERP Report: "${input.query}"${input.location ? ` \xB7 ${input.location}` : ""}

${serpTable}${localSection}${entityIdsSection(entityIds)}${aiSection}${tips}`;
  const topO = organic.slice(0, 5).map((r) => `${r.position}. [${r.title}](${r.url}) \u2014 ${r.domain}`).join("\n");
  const summary = [
    `${DIRECTIVE_SUMMARY}**SERP: "${input.query}"** \u2014 ${organic.length} organic results`,
    topO ? `
**Top results:**
${topO}` : "",
    localPack.length ? `
**Local Pack:** ${localPack.map((b) => b.name).join(", ")}` : "",
    entityIdsSummaryLine(entityIds),
    `
\u{1F4A1} Use \`harvest_paa\` for questions \xB7 \`extract_url\` to scrape any result`
  ].filter(Boolean).join("\n");
  return twoBlocks(full, summary);
}
|
|
293
|
+
/**
 * Formats a single-page extraction result as two text blocks: a full Markdown
 * report (H1/H2 outline, entity/schema details, up to 3000 characters of page
 * content, tips) and a short summary.
 *
 * @param {object} raw - Raw tool result, as accepted by `parseData`.
 * @param {{url: string}} input - Original tool input; `url` is the fallback
 *   when the result carries no URL.
 * @returns {object} A two-block result, or a single-block `isError` result
 *   when `parseData` reports an error.
 */
function formatExtractUrl(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) {
    return { content: [{ type: "text", text: parsed.error }], isError: true };
  }
  const page = parsed.data;
  const pageUrl = page.url ?? input.url;
  const pageTitle = page.title ?? "Untitled";
  const headings = page.headings ?? [];
  const entity = page.kpo;
  const markdown = page.bodyMarkdown ?? "";

  // Heading outline: level-1 bullets first, then level-2 bullets.
  const bulletsForLevel = (level, bullet) =>
    headings.filter((h) => h.level === level).map((h) => `${bullet}${h.text}`).join("\n");
  const headingGroups = [bulletsForLevel(1, "- "), bulletsForLevel(2, " - ")].filter(Boolean);
  const headingSection = headingGroups.length > 0 ? `\n## Heading Structure\n${headingGroups.join("\n")}` : "";

  // Entity / schema details: one bullet per field that is present.
  let entitySection = "";
  if (entity) {
    const entityLines = ["\n## Entity / Schema"];
    if (entity.entityName) entityLines.push(`- **Entity:** ${entity.entityName}`);
    if (entity.type?.length) entityLines.push(`- **@type:** ${entity.type.join(", ")}`);
    if (entity.napScore !== void 0) entityLines.push(`- **NAP Score:** ${entity.napScore}/5`);
    if (entity.address) entityLines.push(`- **Address:** ${entity.address}`);
    if (entity.phone) entityLines.push(`- **Phone:** ${entity.phone}`);
    if (entity.email) entityLines.push(`- **Email:** ${entity.email}`);
    if (entity.faqCount) entityLines.push(`- **FAQ items:** ${entity.faqCount}`);
    if (entity.sameAs?.length) entityLines.push(`- **sameAs:** ${entity.sameAs.slice(0, 5).join(", ")}`);
    if (entity.missingFields?.length) {
      entityLines.push(`\n**Missing schema fields:** ${entity.missingFields.slice(0, 5).join(", ")}`);
    }
    entitySection = entityLines.join("\n");
  }

  // Page body, capped, with a marker when content was cut.
  let bodySection = "";
  if (markdown) {
    const bodyLimit = 3e3;
    const cutMarker = markdown.length > bodyLimit ? "\n\n*(truncated)*" : "";
    bodySection = `\n## Page Content\n${markdown.slice(0, bodyLimit)}${cutMarker}`;
  }

  const schemaCount = Array.isArray(page.schema) ? page.schema.length : 0;
  const tips = [
    "",
    "---",
    "\u{1F4A1} **Tips**",
    "- Crawl entire site: use `extract_site`",
    "- Map all URLs: use `map_site_urls`",
    `- ${schemaCount} JSON-LD schema block(s) detected`
  ].join("\n");

  const full = [
    `${DIRECTIVE_FULL}# URL Extract: ${pageUrl}`,
    `**${pageTitle}**`,
    headingSection + entitySection + bodySection + tips
  ].join("\n");

  const summaryLines = [
    `${DIRECTIVE_SUMMARY}**Extracted:** ${pageTitle}`,
    `**URL:** ${pageUrl}`
  ];
  if (entity?.entityName) {
    summaryLines.push(`**Entity:** ${entity.entityName} (${entity.type?.join(", ") ?? "unknown"})`);
  }
  if (entity?.napScore !== void 0) {
    summaryLines.push(`**NAP Score:** ${entity.napScore}/5`);
  }
  if (headings.length) {
    summaryLines.push(`**${headings.length} headings**`);
  }
  summaryLines.push("\n\u{1F4A1} Use `extract_site` to crawl the full domain");

  return twoBlocks(full, summaryLines.join("\n"));
}
|
|
346
|
+
/**
 * Formats a URL-map result as two text blocks: a full Markdown report
 * (status-class counts, URL inventory table, broken-URL list, tips) and a
 * short summary.
 *
 * Missing numeric fields no longer leak into the output: `totalFound` falls
 * back to the number of URLs received and a missing `durationMs` is shown as
 * 0.0s instead of "NaNs". The inventory table is capped at 200 rows, and a
 * note is added whenever rows were left out.
 *
 * @param {object} raw - Raw tool result, as accepted by `parseData`.
 * @param {{url: string}} input - Original tool input.
 * @returns {object} A two-block result, or a single-block `isError` result
 *   when `parseData` reports an error.
 */
function formatMapSiteUrls(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
  const d = parsed.data;
  const urls = d.urls ?? [];
  const maxRows = 200;
  const totalFound = d.totalFound ?? urls.length;
  const seconds = ((d.durationMs ?? 0) / 1e3).toFixed(1);
  const ok = urls.filter((u) => (u.status ?? 0) >= 200 && (u.status ?? 0) < 300);
  const broken = urls.filter((u) => u.status !== null && u.status >= 400);
  const redirects = urls.filter((u) => u.status !== null && u.status >= 300 && u.status < 400);
  const urlRows = urls.slice(0, maxRows).map((u, i) => `| ${i + 1} | ${u.url} | ${u.status ?? "\u2014"} |`).join("\n");
  // A blank line separates the note from the table so it is not parsed as a row.
  const cappedNote = urls.length > maxRows ? `\n\n*Showing first ${maxRows} of ${urls.length} URLs*` : "";
  const full = [
    `${DIRECTIVE_FULL}# URL Map: ${input.url}`,
    `**${totalFound} URLs** \xB7 ${seconds}s${d.truncated ? " \xB7 *truncated*" : ""}`,
    `
## Summary
- \u2705 2xx: ${ok.length}
- \u{1F500} 3xx: ${redirects.length}
- \u274C 4xx+: ${broken.length}`,
    `
## URL Inventory
| # | URL | Status |
|---|-----|--------|
${urlRows}${cappedNote}`,
    broken.length ? `
## Broken URLs
${broken.map((u) => `- ${u.url} (${u.status})`).join("\n")}` : "",
    `
---
\u{1F4A1} **Tips**
- Extract content from all pages: use \`extract_site\`
- Scrape a single page: use \`extract_url\``
  ].filter(Boolean).join("\n");
  const summary = [
    `${DIRECTIVE_SUMMARY}**URL Map: ${input.url}**`,
    `${totalFound} URLs \u2014 ${ok.length} OK \xB7 ${broken.length} broken \xB7 ${redirects.length} redirects`,
    broken.length ? `
**Broken URLs:** ${broken.slice(0, 3).map((u) => u.url).join(", ")}` : "",
    `
\u{1F4A1} Use \`extract_site\` to extract content from all pages`
  ].filter(Boolean).join("\n");
  return twoBlocks(full, summary);
}
|
|
387
|
+
/**
 * Formats a multi-page extraction result as two text blocks: a full Markdown
 * report (one table row per page with title, URL and schema info, plus tips)
 * and a short summary listing the first five pages.
 *
 * Page titles are sanitized before being placed in the table so that a
 * literal "|" or a line break in a title cannot split or terminate a row.
 *
 * @param {object} raw - Raw tool result, as accepted by `parseData`.
 * @param {{url: string}} input - Original tool input.
 * @returns {object} A two-block result, or a single-block `isError` result
 *   when `parseData` reports an error.
 */
function formatExtractSite(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
  // Escape pipes and flatten line breaks for safe use inside a table cell.
  const cell = (value) => String(value ?? "").replace(/\|/g, "\\|").replace(/\s*[\r\n]+\s*/g, " ");
  const d = parsed.data;
  const pages = d.pages ?? [];
  const pageRows = pages.map((p, i) => {
    // Prefer the entity @type list; otherwise the schema block count; otherwise a dash.
    const schemaInfo = p.kpo?.type?.join(", ") ?? (Array.isArray(p.schema) && p.schema.length ? `${p.schema.length} block(s)` : "\u2014");
    return `| ${i + 1} | ${cell(p.title ?? "Untitled")} | ${p.url} | ${cell(schemaInfo)} |`;
  }).join("\n");
  const full = [
    `${DIRECTIVE_FULL}# Site Extract: ${input.url}`,
    `**${pages.length} pages** \xB7 ${((d.durationMs ?? 0) / 1e3).toFixed(1)}s`,
    `
## Pages
| # | Title | URL | Schema |
|---|-------|-----|--------|
${pageRows}`,
    `
---
\u{1F4A1} **Tips**
- Map URLs first: use \`map_site_urls\`
- Inspect a single page: use \`extract_url\``
  ].join("\n");
  const summary = [
    `${DIRECTIVE_SUMMARY}**Site Extract: ${input.url}** \u2014 ${pages.length} pages`,
    pages.slice(0, 5).map((p) => `- ${p.title ?? p.url}`).join("\n"),
    pages.length > 5 ? `- \u2026 and ${pages.length - 5} more` : "",
    `
\u{1F4A1} Use \`extract_url\` to inspect any individual page`
  ].filter(Boolean).join("\n");
  return twoBlocks(full, summary);
}
|
|
419
|
+
/**
 * Formats a YouTube harvest result as two text blocks: a full Markdown report
 * (optional channel section, videos table, tips) and a short summary listing
 * the first five videos.
 *
 * Robustness notes:
 *  - a result without `stats` no longer throws; the duration is shown as 0.0s;
 *  - the "Top videos" heading is omitted from the summary when there are none;
 *  - titles and channel names are sanitized for use in table cells so a
 *    literal "|" or a line break cannot split or terminate a row.
 *
 * @param {object} raw - Raw tool result, as accepted by `parseData`.
 * @param {{mode: "search" | "channel", query?: string, channelHandle?: string}} input - Original tool input.
 * @returns {object} A two-block result, or a single-block `isError` result
 *   when `parseData` reports an error.
 */
function formatYoutubeHarvest(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
  // Escape pipes and flatten line breaks for safe use inside a table cell.
  const cell = (value) => String(value ?? "").replace(/\|/g, "\\|").replace(/\s*[\r\n]+\s*/g, " ");
  const d = parsed.data;
  const videos = d.videos ?? [];
  const label = input.mode === "channel" ? input.channelHandle ?? "channel" : `"${input.query ?? ""}"`;
  const seconds = ((d.stats?.durationMs ?? 0) / 1e3).toFixed(1);
  const videoRows = videos.map(
    (v, i) => `| ${i + 1} | ${cell(truncate(v.title, 70))} | ${cell(v.channelName ?? "\u2014")} | ${v.views ?? "\u2014"} | ${v.duration ?? "\u2014"} | \`${v.videoId}\` |`
  ).join("\n");
  const channelSection = d.channelMeta ? `
## Channel
- **Name:** ${d.channelMeta.title ?? "\u2014"}
- **Subscribers:** ${d.channelMeta.subscriberCount ?? "\u2014"}` : "";
  const full = [
    `${DIRECTIVE_FULL}# YouTube Harvest: ${label}`,
    `**${videos.length} videos** \xB7 ${seconds}s`,
    channelSection,
    `
## Videos
| # | Title | Channel | Views | Duration | Video ID |
|---|-------|---------|-------|----------|----------|
${videoRows}`,
    `
---
\u{1F4A1} **Tips**
- Transcribe a video: use \`youtube_transcribe\` with the \`videoId\` above
- Switch mode: \`mode: "channel"\` with \`channelHandle\` or \`mode: "search"\` with \`query\``
  ].filter(Boolean).join("\n");
  const top5 = videos.slice(0, 5).map((v, i) => `${i + 1}. ${v.title} (\`${v.videoId}\`)`).join("\n");
  const summary = [
    `${DIRECTIVE_SUMMARY}**YouTube: ${label}** \u2014 ${videos.length} videos`,
    top5 ? `
**Top videos:**
${top5}` : "",
    `
\u{1F4A1} Transcribe any video: \`youtube_transcribe\` with its videoId`
  ].filter(Boolean).join("\n");
  return twoBlocks(full, summary);
}
|
|
458
|
+
/**
 * Formats a YouTube transcript result as two text blocks: a full Markdown
 * report (entire transcript plus a table of up to 50 timestamped chunks) and
 * a short summary with a 300-character preview.
 *
 * The word count splits on any run of whitespace and reports 0 for an empty
 * transcript (splitting on a single space reported 1 for "" and miscounted
 * text containing newlines or repeated spaces). Chunk text is sanitized for
 * table cells so a "|" or line break cannot split or terminate a row.
 *
 * @param {object} raw - Raw tool result, as accepted by `parseData`.
 * @param {{videoId: string}} input - Original tool input.
 * @returns {object} A two-block result, or a single-block `isError` result
 *   when `parseData` reports an error.
 */
function formatYoutubeTranscribe(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
  // Escape pipes and flatten line breaks for safe use inside a table cell.
  const cell = (value) => String(value ?? "").replace(/\|/g, "\\|").replace(/\s*[\r\n]+\s*/g, " ");
  const d = parsed.data;
  const text = d.text ?? "";
  const chunks = d.chunks ?? [];
  const durSec = d.durationMs ? (d.durationMs / 1e3).toFixed(0) : "\u2014";
  const trimmed = text.trim();
  const wordCount = trimmed ? trimmed.split(/\s+/).length : 0;
  const chunkRows = chunks.slice(0, 50).map((c) => {
    // Chunk start rendered as MM:SS; minutes are not wrapped at 60.
    const sec = Math.floor(c.startMs / 1e3);
    const mm = String(Math.floor(sec / 60)).padStart(2, "0");
    const ss = String(sec % 60).padStart(2, "0");
    return `| ${mm}:${ss} | ${cell(truncate(c.text, 120))} |`;
  }).join("\n");
  const full = [
    `${DIRECTIVE_FULL}# YouTube Transcript: \`${input.videoId}\``,
    `**Duration:** ${durSec}s \xB7 **${wordCount} words**`,
    `
## Full Transcript
${text}`,
    chunks.length ? `
## Timestamped Chunks
| Time | Text |
|------|------|
${chunkRows}` : "",
    `
---
\u{1F4A1} Harvest more from this channel: use \`youtube_harvest\` with \`mode: "channel"\``
  ].filter(Boolean).join("\n");
  const summary = [
    `${DIRECTIVE_SUMMARY}**YouTube Transcript: \`${input.videoId}\`** \u2014 ${wordCount} words \xB7 ${durSec}s`,
    `
**Preview:**
> ${truncate(text, 300)}`,
    `
\u{1F4A1} Full transcript in artifact above`
  ].join("\n");
  return twoBlocks(full, summary);
}
|
|
496
|
+
/**
 * Formats a Facebook page-intel result as two text blocks: a full Markdown
 * report (one section per ad, separated by horizontal rules, plus tips) and
 * a short summary (up to five active ads and the creative mix).
 *
 * @param {object} raw - Raw tool result, as accepted by `parseData`.
 * @param {{query?: string, pageId?: string, libraryId?: string}} input -
 *   Original tool input; used, in that order, as the advertiser label when
 *   the result carries no advertiser name.
 * @returns {object} A two-block result, or a single-block `isError` result
 *   when `parseData` reports an error.
 */
function formatFacebookPageIntel(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) {
    return { content: [{ type: "text", text: parsed.error }], isError: true };
  }
  const result = parsed.data;
  const advertiser = result.advertiserName ?? input.query ?? input.pageId ?? input.libraryId ?? "Advertiser";
  const ads = result.ads ?? [];
  const counts = result.summary ?? { totalAds: 0, activeCount: 0, videoCount: 0, imageCount: 0 };
  const orDash = (value) => value ?? "\u2014";

  // One Markdown section per ad: a header line followed by whichever fields exist.
  const renderAd = (ad, index) => {
    const idTag = ad.libraryId ? ` \xB7 \`${ad.libraryId}\`` : "";
    const lines = [
      `### Ad ${index + 1}${idTag} \u2014 ${orDash(ad.status)} \xB7 ${orDash(ad.creativeType)} \xB7 ${orDash(ad.startDate)}`
    ];
    if (ad.headline) lines.push(`**Headline:** ${ad.headline}`);
    if (ad.primaryText) lines.push(`**Copy:** ${truncate(ad.primaryText, 200)}`);
    if (ad.cta) lines.push(`**CTA:** ${ad.cta}`);
    if (ad.videoUrl) lines.push(`**Video URL:** \`${ad.videoUrl}\``);
    if (ad.variations) lines.push(`**Variations:** ${ad.variations}`);
    return lines.join("\n");
  };
  const adBlocks = ads.map(renderAd).join("\n\n---\n\n");

  const tips = [
    "",
    "---",
    "\u{1F4A1} **Tips**",
    "- Transcribe video ads: use `facebook_ad_transcribe` with the `videoUrl` above",
    "- Find other advertisers: use `facebook_ad_search`"
  ].join("\n");
  const full = [
    `${DIRECTIVE_FULL}# Facebook Ad Intel: ${advertiser}`,
    `**${counts.totalAds} ads** \xB7 ${counts.activeCount} active \xB7 ${counts.videoCount} video \xB7 ${counts.imageCount} image`,
    `\n${adBlocks}`,
    tips
  ].join("\n");

  // Summary: the first five ads whose status is "active" (case-insensitive).
  const activeLines = [];
  for (const ad of ads) {
    if (ad.status?.toLowerCase() !== "active") continue;
    if (activeLines.length === 5) break;
    activeLines.push(`${activeLines.length + 1}. ${truncate(ad.headline ?? ad.primaryText, 80)} (${orDash(ad.creativeType)})`);
  }
  const adSummary = activeLines.join("\n");
  // Counted from the ads themselves, independently of the reported summary.
  const adsWithVideo = ads.filter((ad) => ad.videoUrl).length;

  const summaryLines = [
    `${DIRECTIVE_SUMMARY}**Facebook Ads: ${advertiser}** \u2014 ${counts.totalAds} ads (${counts.activeCount} active)`
  ];
  if (adSummary) {
    summaryLines.push(`\n**Active ads:**\n${adSummary}`);
  }
  summaryLines.push(`**Creative mix:** ${counts.videoCount} video \xB7 ${counts.imageCount} image`);
  if (adsWithVideo) {
    summaryLines.push(`\n\u{1F4A1} ${adsWithVideo} video ads \u2014 transcribe with \`facebook_ad_transcribe\` using the videoUrl`);
  }
  return twoBlocks(full, summaryLines.join("\n"));
}
|
|
536
|
+
/**
 * Formats a Facebook Ad Library search result as two text blocks: a full
 * Markdown report (advertisers table plus tips) and a short summary listing
 * the first five advertisers.
 *
 * Advertisers are read from `results`, falling back to `advertisers`.
 *
 * @param {object} raw - Raw tool result, as accepted by `parseData`.
 * @param {{query: string}} input - Original tool input.
 * @returns {object} A two-block result, or a single-block `isError` result
 *   when `parseData` reports an error.
 */
function formatFacebookAdSearch(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) {
    return { content: [{ type: "text", text: parsed.error }], isError: true };
  }
  const result = parsed.data;
  const advertisers = result.results ?? result.advertisers ?? [];
  const orDash = (value) => value ?? "\u2014";

  const tableRows = [];
  advertisers.forEach((entry, index) => {
    tableRows.push(`| ${index + 1} | ${entry.name} | ${orDash(entry.adCount)} | \`${orDash(entry.libraryId)}\` |`);
  });

  const table = [
    "",
    "## Advertisers",
    "| # | Name | Ad Count | Library ID |",
    "|---|------|----------|------------|",
    tableRows.join("\n")
  ].join("\n");
  const tips = [
    "",
    "---",
    "\u{1F4A1} **Tips**",
    "- Scan all ads: use `facebook_page_intel` with `libraryId`",
    "- Or pass the advertiser name as `query` in `facebook_page_intel`"
  ].join("\n");
  const full = [
    `${DIRECTIVE_FULL}# Facebook Ad Library Search: "${input.query}"`,
    `**${advertisers.length} advertisers found**`,
    table,
    tips
  ].join("\n");

  const topLines = advertisers.slice(0, 5).map((entry, index) => {
    const adCountNote = entry.adCount ? ` (${entry.adCount} ads)` : "";
    return `${index + 1}. ${entry.name}${adCountNote} \u2014 \`${orDash(entry.libraryId)}\``;
  });
  const summaryLines = [
    `${DIRECTIVE_SUMMARY}**Facebook Ad Search: "${input.query}"** \u2014 ${advertisers.length} advertisers`
  ];
  // The advertiser list is left out entirely when there are no results.
  if (topLines.length > 0) {
    summaryLines.push(topLines.join("\n"));
  }
  summaryLines.push("\n\u{1F4A1} Scan ads with `facebook_page_intel` using `libraryId`");
  return twoBlocks(full, summaryLines.join("\n"));
}
|
|
568
|
+
/**
 * Formats a credits lookup result as two text blocks: a full Markdown report
 * (balance, matched cost, cost table, recent ledger) and a short summary.
 *
 * Ledger amounts are read from `amount_mc` and divided by 1000 for display.
 *
 * @param {object} raw - Raw tool result, as accepted by `parseData`.
 * @param {{item?: string, includeLedger?: boolean}} input - Original tool
 *   input; `item` triggers a "no exact match" note when no matched cost was
 *   returned.
 * @returns {object} A two-block result, or a single-block `isError` result
 *   when `parseData` reports an error.
 */
function formatCreditsInfo(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) {
    return { content: [{ type: "text", text: parsed.error }], isError: true };
  }
  const info = parsed.data;
  const balanceText = `${info.balance_credits ?? "unknown"} credits`;
  const costs = info.costs ?? [];
  const matched = info.matched_cost;
  const ledger = info.ledger ?? [];

  const costRows = costs
    .map((cost) => `| ${cost.label} | ${cost.credits} | ${cost.unit}${cost.notes ? ` ${cost.notes}` : ""} |`)
    .join("\n");
  const ledgerRows = ledger
    .map((entry) => `| ${entry.created_at} | ${entry.operation} | ${entry.amount_mc / 1e3} | ${entry.description ?? ""} |`)
    .join("\n");

  // Matched-cost section: the match itself, or a "not found" note when an
  // item was asked for, or nothing at all.
  let matchedSection = "";
  if (matched) {
    const notesSuffix = matched.notes ? `\n\n${matched.notes}` : "";
    matchedSection = `\n## Matched Cost\n**${matched.label}:** ${matched.credits} credits ${matched.unit}${notesSuffix}`;
  } else if (input.item) {
    matchedSection = `\n## Matched Cost\nNo exact cost match found for "${input.item}". See the full cost table below.`;
  }

  const fullParts = [`${DIRECTIVE_FULL}# Credits`, `**Balance:** ${balanceText}`];
  if (matchedSection) {
    fullParts.push(matchedSection);
  }
  if (costs.length) {
    fullParts.push(`\n## Cost Table\n| Item | Credits | Unit |\n|------|---------|------|\n${costRows}`);
  }
  if (ledger.length) {
    fullParts.push(`\n## Recent Ledger\n| Date | Operation | Credits | Description |\n|------|-----------|---------|-------------|\n${ledgerRows}`);
  }

  const summaryParts = [`${DIRECTIVE_SUMMARY}**Credit balance:** ${balanceText}`];
  if (matched) {
    summaryParts.push(`\n**${matched.label}:** ${matched.credits} credits ${matched.unit}`);
  }
  if (input.includeLedger && ledger.length) {
    summaryParts.push("\nRecent ledger entries included in the full report.");
  }
  return twoBlocks(fullParts.join("\n"), summaryParts.join("\n"));
}
|
|
615
|
+
/**
 * Render a Google Maps place payload as a full Markdown report plus a short
 * summary.
 *
 * @param raw   Executor response, decoded with `parseData`.
 * @param input Tool input; `input.businessName` is the fallback title when the
 *              payload has no `name`.
 * @returns The `twoBlocks(full, summary)` result, or an `isError` text result
 *          when `parseData` reports an error.
 */
function formatMapsPlaceIntel(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
  const d = parsed.data;
  const name = d.name ?? input.businessName;
  const rating = d.rating;
  const reviewCount = d.reviewCount;
  const category = d.category;
  const address = d.address;
  const phone = d.phoneDisplay;
  const website = d.website;
  const hoursSummary = d.hoursSummary;
  const plusCode = d.plusCode;
  const bookingUrl = d.bookingUrl;
  const kgmid = d.kgmid;
  const cidDecimal = d.cidDecimal;
  const cidUrl = d.cidUrl;
  const lat = d.lat;
  const lng = d.lng;
  const durationMs = d.durationMs;
  const histogram = d.reviewHistogram ?? [];
  const topics = d.reviewTopics ?? [];
  const about = d.aboutAttributes ?? [];
  const reviews = d.reviews ?? [];
  const hoursTable = d.hoursTable ?? [];

  // Draw a 5-slot star bar. The count is clamped to 0..5 because
  // String.prototype.repeat throws RangeError on a negative count, which an
  // out-of-range value (e.g. 6) would produce for the empty-star half.
  // An unparseable value yields an empty string.
  const starBar = (value) => {
    const n = Number.parseInt(String(value), 10);
    if (Number.isNaN(n)) return "";
    const filled = Math.min(5, Math.max(0, n));
    return "\u2605".repeat(filled) + "\u2606".repeat(5 - filled);
  };

  const ratingLine = [rating, reviewCount ? `(${reviewCount} reviews)` : null].filter(Boolean).join(" ");
  const basicLines = [
    address ? `- **Address:** ${address}` : null,
    phone ? `- **Phone:** ${phone}` : null,
    website ? `- **Website:** ${website}` : null,
    hoursSummary ? `- **Hours:** ${hoursSummary}` : null,
    plusCode ? `- **Plus Code:** ${plusCode}` : null,
    bookingUrl ? `- **Book:** ${bookingUrl}` : null
  ].filter(Boolean).join("\n");
  const hoursSection = hoursTable.length
    ? `\n## Hours\n| Day | Hours |\n|-----|-------|\n${hoursTable.map((r) => `| ${r.day} | ${r.hours} |`).join("\n")}`
    : "";
  const histSection = histogram.length
    ? `\n## Rating Distribution\n| Stars | Count |\n|-------|-------|\n${histogram.map((r) => `| ${starBar(r.stars)} | ${r.count} |`).join("\n")}`
    : "";
  const topicsSection = topics.length
    ? `\n## Review Topics\n${topics.map((t) => `- **${t.label}:** ${t.count} mentions`).join("\n")}`
    : "";

  // Group attributes by section in first-seen order. A Map is used instead of
  // a plain object so section names such as "constructor" or "__proto__"
  // cannot collide with inherited Object.prototype members.
  const aboutBySection = new Map();
  for (const a of about) {
    const attrs = aboutBySection.get(a.section);
    if (attrs) {
      attrs.push(a.attribute);
    } else {
      aboutBySection.set(a.section, [a.attribute]);
    }
  }
  const aboutSection = aboutBySection.size
    ? `\n## About\n${[...aboutBySection].map(([s, attrs]) => `**${s}**\n${attrs.map((a) => `- ${a}`).join("\n")}`).join("\n\n")}`
    : "";

  const entitySection = [
    kgmid ? `- **KGMID:** \`${kgmid}\`` : null,
    cidDecimal ? `- **CID:** \`${cidDecimal}\`` : null,
    cidUrl ? `- **Maps CID URL:** ${cidUrl}` : null,
    lat != null && lng != null ? `- **Coordinates:** ${lat}, ${lng}` : null
  ].filter(Boolean).join("\n");
  const reviewsSection = reviews.length
    ? `\n## Reviews (${reviews.length})\n${reviews.map((r, i) => {
        // A review without a star value is drawn as zero filled stars.
        const stars = starBar(r.stars ?? "0");
        return `### ${i + 1}. ${r.author ?? "Anonymous"} \u2014 ${stars}\n*${r.date ?? ""}*\n\n${r.text ?? ""}`;
      }).join("\n\n")}`
    : "";

  // Empty sections are dropped by filter(Boolean) before joining.
  const full = [
    `${DIRECTIVE_FULL}# ${name}`,
    category ? `*${category}*` : null,
    ratingLine ? `\n**Rating:** ${ratingLine}` : null,
    basicLines ? `\n${basicLines}` : null,
    hoursSection,
    histSection,
    topicsSection,
    aboutSection,
    entitySection ? `\n## Entity IDs\n${entitySection}` : null,
    reviewsSection,
    durationMs != null ? `\n---\n*Extracted in ${(durationMs / 1e3).toFixed(1)}s*` : null
  ].filter(Boolean).join("\n");
  const summary = [
    `${DIRECTIVE_SUMMARY}**${name}** \u2014 ${category ?? "Business"} \xB7 ${ratingLine || "No rating"}`,
    address ? `\u{1F4CD} ${address}` : null,
    phone ? `\u{1F4DE} ${phone}` : null,
    hoursSummary ? `\u{1F550} ${hoursSummary}` : null,
    website ? `\u{1F310} ${website}` : null,
    reviews.length ? `\n\u{1F4AC} ${reviews.length} reviews fetched \u2014 full list in artifact above` : null
  ].filter(Boolean).join("\n");
  return twoBlocks(full, summary);
}
|
|
717
|
+
/**
 * Render a Facebook ad transcription as a full Markdown report plus a short
 * summary.
 *
 * @param raw   Executor response, decoded with `parseData`.
 * @param input Tool input (not read by this formatter).
 * @returns The `twoBlocks(full, summary)` result, or an `isError` text result
 *          when `parseData` reports an error.
 */
function formatFacebookAdTranscribe(raw, input) {
  const parsed = parseData(raw);
  if ("error" in parsed) return { content: [{ type: "text", text: parsed.error }], isError: true };
  const d = parsed.data;
  const text = d.text ?? "";
  const chunks = d.chunks ?? [];
  const durSec = d.durationMs ? (d.durationMs / 1e3).toFixed(0) : "\u2014";

  // Count words on any whitespace run and ignore empty tokens, so an empty
  // transcript reports 0 words and newlines or double spaces do not inflate
  // the count.
  const wordCount = text.split(/\s+/).filter(Boolean).length;

  // Only the first 50 chunks are tabulated.
  const chunkRows = chunks.slice(0, 50).map((c) => {
    const startMs = Number(c.startMs);
    // A chunk without a usable start time gets a dash instead of "NaN:NaN".
    let stamp = "\u2014";
    if (Number.isFinite(startMs)) {
      const sec = Math.floor(startMs / 1e3);
      const mm = String(Math.floor(sec / 60)).padStart(2, "0");
      const ss = String(sec % 60).padStart(2, "0");
      stamp = `${mm}:${ss}`;
    }
    return `| ${stamp} | ${truncate(c.text, 120)} |`;
  }).join("\n");

  const full = [
    `${DIRECTIVE_FULL}# Facebook Ad Transcript`,
    `**Duration:** ${durSec}s \xB7 **${wordCount} words**`,
    `\n## Full Transcript\n${text}`,
    chunks.length ? `\n## Timestamped Chunks\n| Time | Text |\n|------|------|\n${chunkRows}` : "",
    `\n---\n\u{1F4A1} Get more ads from this advertiser: use \`facebook_page_intel\``
  ].filter(Boolean).join("\n");
  const summary = [
    `${DIRECTIVE_SUMMARY}**Facebook Ad Transcript** \u2014 ${wordCount} words \xB7 ${durSec}s`,
    `\n**Preview:**\n> ${truncate(text, 300)}`,
    `\n\u{1F4A1} Full transcript in artifact above`
  ].join("\n");
  return twoBlocks(full, summary);
}
|
|
755
|
+
|
|
756
|
+
// src/mcp/paa-mcp-server.ts
|
|
757
|
+
/**
 * Create the "paa-extractor" MCP server and register every tool on it.
 *
 * Each tool handler forwards the tool input to one method of `executor2`,
 * then passes the awaited result and the same input to that tool's formatter.
 *
 * @param executor2 Object exposing one async method per tool (see the table).
 * @returns The configured `McpServer` instance.
 */
function buildPaaExtractorMcpServer(executor2) {
  const server2 = new import_mcp.McpServer({ name: "paa-extractor", version: "1.0.0" });

  // [tool name, description, input schema, executor method, formatter]
  // Tools are registered in table order.
  const toolTable = [
    [
      "harvest_paa",
      "Extract PAA (People Also Ask) questions from Google Search. Returns full question list with answers, organic SERP, entity IDs (CID/GCID/KG MID), and AI Overview. Use maxQuestions to control volume (up to 40).",
      HarvestPaaInputSchema,
      "harvestPaa",
      formatHarvestPaa
    ],
    [
      "search_serp",
      "Fetch organic Google search results. Returns ranked URLs, titles, snippets, local pack, entity IDs (CID/GCID/KG MID), and AI Overview. Use when you need SERP positions without PAA expansion.",
      SearchSerpInputSchema,
      "searchSerp",
      formatSearchSerp
    ],
    [
      "extract_url",
      "Extract structured data from a single URL: page content as Markdown, heading structure, JSON-LD schema, entity details, NAP score, and missing schema fields. Use for SEO audits and entity validation.",
      ExtractUrlInputSchema,
      "extractUrl",
      formatExtractUrl
    ],
    [
      "map_site_urls",
      "Spider a website to build a complete URL inventory with HTTP status codes. Identifies broken links and redirect chains. Use before extract_site to understand site scope.",
      MapSiteUrlsInputSchema,
      "mapSiteUrls",
      formatMapSiteUrls
    ],
    [
      "extract_site",
      "Run multi-page extraction across an entire website. Returns schema, entity data, headings, and content from each page. Use map_site_urls first to check scope.",
      ExtractSiteInputSchema,
      "extractSite",
      formatExtractSite
    ],
    [
      "youtube_harvest",
      'Harvest YouTube video metadata by search query or channel handle. Returns titles, view counts, durations, and videoIds. Use mode "search" for keyword results or "channel" for a specific creator.',
      YoutubeHarvestInputSchema,
      "youtubeHarvest",
      formatYoutubeHarvest
    ],
    [
      "youtube_transcribe",
      "Fetch and transcribe captions from a YouTube video. Returns full transcript, timestamped chunks, and word count. Pass a videoId from youtube_harvest results.",
      YoutubeTranscribeInputSchema,
      "youtubeTranscribe",
      formatYoutubeTranscribe
    ],
    [
      "facebook_page_intel",
      "Harvest all ads from a Facebook advertiser. Returns ad copy, headlines, CTAs, creative type, status, and video URLs ready for transcription. Accepts pageId, libraryId, or a brand name as query.",
      FacebookPageIntelInputSchema,
      "facebookPageIntel",
      formatFacebookPageIntel
    ],
    [
      "facebook_ad_search",
      "Search Facebook Ad Library by keyword. Returns advertisers with ad counts and library IDs. Use to discover competitors, then pass libraryId to facebook_page_intel to get their full ad list.",
      FacebookAdSearchInputSchema,
      "facebookAdSearch",
      formatFacebookAdSearch
    ],
    [
      "facebook_ad_transcribe",
      "Transcribe audio from a Facebook ad video. Returns full transcript and timestamped chunks. Use the videoUrl value from facebook_page_intel results.",
      FacebookAdTranscribeInputSchema,
      "facebookAdTranscribe",
      formatFacebookAdTranscribe
    ],
    [
      "maps_place_intel",
      "Extract Google Maps business intelligence for a named business: rating, review count, category, address, phone, website, hours, booking URL, review histogram, review topics, about attributes, and optional review cards. Pass includeReviews: true and maxReviews to fetch individual review text.",
      MapsPlaceIntelInputSchema,
      "mapsPlaceIntel",
      formatMapsPlaceIntel
    ],
    [
      "credits_info",
      "Answer questions about MCP Scraper credits: current credit balance, what a specific tool/action costs, the full cost table, and optionally recent credit ledger entries. Does not expose payment methods or credit card information.",
      CreditsInfoInputSchema,
      "creditsInfo",
      formatCreditsInfo
    ]
  ];

  for (const [toolName, description, inputSchema, methodName, render] of toolTable) {
    server2.registerTool(
      toolName,
      { description, inputSchema },
      // The method is looked up on the executor at call time, so `this` stays bound.
      async (input) => render(await executor2[methodName](input), input)
    );
  }
  return server2;
}
|
|
809
|
+
|
|
810
|
+
// bin/mcp-stdio-server.ts
|
|
811
|
+
/**
 * Look up the API key in a key file on disk.
 *
 * Candidate files are tried in order: the path in the `MCP_SCRAPER_KEY_PATH`
 * environment variable (when set and non-blank), then `.mcp-scraper-key` in
 * the user's home directory. The first file whose trimmed content is
 * non-empty wins.
 *
 * @returns The trimmed key, or `undefined` when no candidate yields one.
 */
function readApiKeyFile() {
  const explicitPath = process.env.MCP_SCRAPER_KEY_PATH?.trim();
  const defaultPath = (0, import_node_path.join)((0, import_node_os.homedir)(), ".mcp-scraper-key");
  const paths = [explicitPath, defaultPath].filter(Boolean);
  for (const path of paths) {
    try {
      const value = (0, import_node_fs.readFileSync)(path, "utf8").trim();
      if (value) return value;
    } catch (err) {
      // Reading stays best-effort: a missing default key file is normal.
      // A path the user configured explicitly is different, so report why it
      // could not be read (on stderr, like the other diagnostics in this
      // file) and continue with the fallback.
      if (path === explicitPath) {
        const reason = err instanceof Error ? err.message : String(err);
        process.stderr.write(`Warning: could not read MCP_SCRAPER_KEY_PATH (${path}): ${reason}\n`);
      }
    }
  }
  return void 0;
}
|
|
823
|
+
// Resolve the API key from the first source that yields a non-blank value:
// MCP_SCRAPER_API_KEY, MCP_SCRAPER_KEY, MCP_API_KEY, then the key file.
// `||` is used rather than `??` so a variable that is set but empty or
// whitespace-only does not block the later sources. The key file is only
// read when no environment variable supplies a key.
const apiKey =
  process.env.MCP_SCRAPER_API_KEY?.trim() ||
  process.env.MCP_SCRAPER_KEY?.trim() ||
  process.env.MCP_API_KEY?.trim() ||
  readApiKeyFile()?.trim();
if (!apiKey) {
  process.stderr.write("MCP_SCRAPER_API_KEY env var or ~/.mcp-scraper-key is required\n");
  process.exit(1);
}

// Same rule for the base URL: a blank override falls through to the next
// source instead of producing an empty base URL.
const baseUrl =
  process.env.MCP_SCRAPER_BASE_URL?.trim() ||
  process.env.MCP_BASE_URL?.trim() ||
  "https://mcpscraper.dev";

const executor = new HttpMcpToolExecutor(baseUrl, apiKey);
const server = buildPaaExtractorMcpServer(executor);
const transport = new import_stdio.StdioServerTransport();

/** Connect the MCP server to the stdio transport. */
async function main() {
  await server.connect(transport);
}

// Any startup failure is reported on stderr and ends the process with exit code 1.
main().catch((err) => {
  process.stderr.write(`${err instanceof Error ? err.message : String(err)}\n`);
  process.exit(1);
});
|
|
840
|
+
//# sourceMappingURL=mcp-stdio-server.cjs.map
|